diff --git a/.gitattributes b/.gitattributes index efb059f169a..bcc7d57b904 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,2 @@ contrib/* linguist-vendored *.h linguist-language=C++ -# to avoid frequent conflicts -tests/queries/0_stateless/arcadia_skip_list.txt text merge=union diff --git a/.github/workflows/anchore-analysis.yml b/.github/workflows/anchore-analysis.yml index 1005c8f6c38..9f3f944c696 100644 --- a/.github/workflows/anchore-analysis.yml +++ b/.github/workflows/anchore-analysis.yml @@ -8,6 +8,10 @@ name: Docker Container Scan (clickhouse-server) +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + "on": pull_request: paths: diff --git a/.github/workflows/backport.yml b/.github/workflows/backport.yml index c83d3f6d5bd..05cfc6a9405 100644 --- a/.github/workflows/backport.yml +++ b/.github/workflows/backport.yml @@ -1,4 +1,9 @@ name: CherryPick + +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + concurrency: group: cherry-pick on: # yamllint disable-line rule:truthy @@ -8,18 +13,24 @@ jobs: CherryPick: runs-on: [self-hosted, style-checker] steps: + - name: Set envs + # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/cherry_pick + ROBOT_CLICKHOUSE_SSH_KEY<> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/compatibility_check + REPO_COPY=${{runner.temp}}/compatibility_check/ClickHouse + REPORTS_PATH=${{runner.temp}}/reports_dir + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} - name: CompatibilityCheck - env: - TEMP_PATH: ${{runner.temp}}/compatibility_check - REPO_COPY: ${{runner.temp}}/compatibility_check/ClickHouse - REPORTS_PATH: ${{runner.temp}}/reports_dir run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -51,154 +65,182 @@ jobs: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_release + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_release' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - 
path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebAsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_asan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_asan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebTsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_tsan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_tsan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: 
always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebDebug: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_debug + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_debug' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH ############################################################################################ ##################################### BUILD REPORTER ####################################### ############################################################################################ @@ -207,22 +249,26 @@ jobs: - BuilderDebRelease - BuilderDebAsan - BuilderDebTsan - - BuilderDebUBsan - - BuilderDebMsan - BuilderDebDebug runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse build check (actions) + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Report Builder - env: - TEMP_PATH: ${{runner.temp}}/report_check - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'ClickHouse build check (actions)' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -241,19 +287,25 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (address, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: 
Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (address, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -273,19 +325,25 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (debug, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -303,20 +361,30 @@ jobs: ############################################################################################## StressTestTsan: needs: [BuilderDebTsan] - runs-on: [self-hosted, stress-tester] + # func testers have 16 cores + 128 GB memory + # while stress testers have 36 cores + 72 GB memory + # It would be better to have something like 32 + 128, + # but such servers are almost unavailable as spot instances. 
+ runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_thread + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (thread, actions) + REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_thread - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (thread, actions)' - REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -333,21 +401,27 @@ jobs: ############################# INTEGRATION TESTS ############################################# ############################################################################################# IntegrationTestsRelease: - needs: [BuilderDebRelease, FunctionalStatelessTestRelease] + needs: [BuilderDebRelease] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release, actions) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (release, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_release/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -371,6 +445,9 @@ jobs: - CompatibilityCheck runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Finish label diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml index dccd27c4319..857cbf2c495 100644 --- a/.github/workflows/cancel.yml +++ b/.github/workflows/cancel.yml @@ -1,4 +1,9 @@ name: Cancel + +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + on: # yamllint disable-line rule:truthy workflow_run: workflows: ["CIGithubActions", "ReleaseCI", "DocsCheck", "BackportPR"] diff --git a/.github/workflows/docs_check.yml b/.github/workflows/docs_check.yml index 9e47f96320d..23c0840d379 100644 --- a/.github/workflows/docs_check.yml +++ b/.github/workflows/docs_check.yml @@ -1,4 +1,9 @@ name: DocsCheck + +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + on: # yamllint disable-line rule:truthy pull_request: types: @@ -14,6 +19,9 @@ jobs: CheckLabels: runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Labels check @@ -24,6 +32,9 @@ jobs: needs: CheckLabels runs-on: [self-hosted, style-checker] steps: + - name: Clear 
repository + run: | + sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Images check @@ -39,17 +50,23 @@ jobs: needs: DockerHubPush runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/docs_check + REPO_COPY=${{runner.temp}}/docs_check/ClickHouse + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/docs_check + path: ${{ env.TEMP_PATH }} + - name: Clear repository + run: | + sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Docs Check - env: - TEMP_PATH: ${{runner.temp}}/docs_check - REPO_COPY: ${{runner.temp}}/docs_check/ClickHouse run: | cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci diff --git a/.github/workflows/jepsen.yml b/.github/workflows/jepsen.yml new file mode 100644 index 00000000000..1b01b4d5074 --- /dev/null +++ b/.github/workflows/jepsen.yml @@ -0,0 +1,44 @@ +name: JepsenWorkflow +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 +concurrency: + group: jepsen +on: # yamllint disable-line rule:truthy + schedule: + - cron: '0 */6 * * *' + workflow_run: + workflows: ["CIGithubActions"] + types: + - completed + workflow_dispatch: +jobs: + KeeperJepsenRelease: + runs-on: [self-hosted, style-checker] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/keeper_jepsen + REPO_COPY=${{runner.temp}}/keeper_jepsen/ClickHouse + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 + - name: Jepsen Test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 keeper_jepsen_check.py + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2adfbce3577..c42513ff9a8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,4 +1,9 @@ name: CIGithubActions + +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + on: # yamllint disable-line rule:truthy pull_request: types: @@ -17,6 +22,9 @@ jobs: CheckLabels: runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Labels check @@ -27,6 +35,9 @@ jobs: needs: CheckLabels runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Images check @@ -42,16 +53,22 @@ jobs: needs: DockerHubPush runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{ runner.temp }}/style_check + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/style_check + path: ${{ env.TEMP_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: 
Style Check - env: - TEMP_PATH: ${{ runner.temp }}/style_check run: | cd $GITHUB_WORKSPACE/tests/ci python3 style_check.py @@ -65,13 +82,19 @@ jobs: needs: DockerHubPush runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/fasttest + REPO_COPY=${{runner.temp}}/fasttest/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fast Test - env: - TEMP_PATH: ${{runner.temp}}/fasttest - REPO_COPY: ${{runner.temp}}/fasttest/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -82,19 +105,25 @@ jobs: run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH PVSCheck: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/pvs_check + REPO_COPY=${{runner.temp}}/pvs_check/ClickHouse + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' - name: PVS Check - env: - TEMP_PATH: ${{runner.temp}}/pvs_check - REPO_COPY: ${{runner.temp}}/pvs_check/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -110,17 +139,23 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/compatibility_check + REPO_COPY=${{runner.temp}}/compatibility_check/ClickHouse + REPORTS_PATH=${{runner.temp}}/reports_dir + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} - name: CompatibilityCheck - env: - TEMP_PATH: ${{runner.temp}}/compatibility_check - REPO_COPY: ${{runner.temp}}/compatibility_check/ClickHouse - REPORTS_PATH: ${{runner.temp}}/reports_dir run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -136,17 +171,23 @@ jobs: needs: [BuilderDebSplitted] runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/split_build_check + REPO_COPY=${{runner.temp}}/split_build_check/ClickHouse + REPORTS_PATH=${{runner.temp}}/reports_dir + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} - name: Split build check - env: - TEMP_PATH: ${{runner.temp}}/split_build_check - REPO_COPY: ${{runner.temp}}/split_build_check/ClickHouse - REPORTS_PATH: ${{runner.temp}}/reports_dir run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -165,44 +206,151 @@ jobs: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + 
CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_release + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_release' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH + BuilderPerformance: + needs: [DockerHubPush, FastTest] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=performance + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + with: + submodules: 'true' + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_NAME }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinRelease: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_release + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + with: + submodules: 'true' + fetch-depth: 0 # otherwise we will have no info about contributors + - name: 
Build + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_NAME }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH $CACHES_PATH + BuilderDebAarch64: + needs: [DockerHubPush, FastTest] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_aarch64 + EOF - name: Download changed images uses: actions/download-artifact@v2 with: @@ -211,22 +359,16 @@ jobs: - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_release' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} @@ -236,197 +378,232 @@ jobs: run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebAsan: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_asan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_asan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: 
always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebUBsan: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_ubsan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_ubsan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebTsan: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_tsan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_tsan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm 
-fr $TEMP_PATH $CACHES_PATH BuilderDebMsan: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_msan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_msan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebDebug: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_debug + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_debug' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH ########################################################################################## 
##################################### SPECIAL BUILDS ##################################### ########################################################################################## @@ -434,268 +611,317 @@ jobs: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_splitted + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_splitted' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinTidy: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_tidy + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_tidy' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker 
ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinDarwin: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_darwin + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_darwin' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinAarch64: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_aarch64 + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_aarch64' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinFreeBSD: needs: 
[DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_freebsd + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_freebsd' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinDarwinAarch64: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_darwin_aarch64 + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_darwin_aarch64' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinPPC64: needs: [DockerHubPush, FastTest] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | 
+ cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_ppc64le + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_ppc64le' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH ############################################################################################ ##################################### BUILD REPORTER ####################################### ############################################################################################ @@ -703,25 +929,32 @@ jobs: needs: - BuilderDebRelease - BuilderBinRelease + - BuilderDebAarch64 - BuilderDebAsan - BuilderDebTsan - BuilderDebUBsan - BuilderDebMsan - BuilderDebDebug runs-on: [self-hosted, style-checker] - if: always() + if: ${{ success() || failure() }} steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse build check (actions) + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Report Builder - env: - TEMP_PATH: ${{runner.temp}}/report_check - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'ClickHouse build check (actions)' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -743,19 +976,25 @@ jobs: - BuilderBinDarwinAarch64 - BuilderBinPPC64 runs-on: [self-hosted, style-checker] - if: always() + if: ${{ success() || failure() }} steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse special build check (actions) + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: 
Report Builder - env: - TEMP_PATH: ${{runner.temp}}/report_check - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'ClickHouse special build check (actions)' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -774,19 +1013,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, actions) + REPO_COPY=${{runner.temp}}/stateless_release/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (release, actions)' - REPO_COPY: ${{runner.temp}}/stateless_release/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -799,23 +1044,68 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestReleaseDatabaseReplicated: + FunctionalStatelessTestReleaseDatabaseReplicated0: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_database_replicated + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, DatabaseReplicated, actions) + REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestReleaseDatabaseReplicated1: + needs: [BuilderDebRelease] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_database_replicated + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, DatabaseReplicated, actions) + REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_database_replicated - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (release, DatabaseReplicated, actions)' - REPO_COPY: ${{runner.temp}}/stateless_database_replicated/ClickHouse - 
KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -832,19 +1122,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_wide_parts + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, wide parts enabled, actions) + REPO_COPY=${{runner.temp}}/stateless_wide_parts/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_wide_parts - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (release, wide parts enabled, actions)' - REPO_COPY: ${{runner.temp}}/stateless_wide_parts/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -857,23 +1153,31 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestAsan: + FunctionalStatelessTestAsan0: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (address, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (address, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -886,23 +1190,142 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestTsan: - needs: [BuilderDebTsan] + FunctionalStatelessTestAsan1: + needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (address, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=2 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan0: + 
needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan1: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan2: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/stateless_tsan/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -919,19 +1342,25 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/stateless_ubsan/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: 
Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (ubsan, actions)' - REPO_COPY: ${{runner.temp}}/stateless_ubsan/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -944,23 +1373,31 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestMsan: + FunctionalStatelessTestMsan0: needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_memory - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (memory, actions)' - REPO_COPY: ${{runner.temp}}/stateless_memory/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -973,23 +1410,179 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestDebug: - needs: [BuilderDebDebug] + FunctionalStatelessTestMsan1: + needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestMsan2: + needs: [BuilderDebMsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: 
actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug0: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug1: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug2: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (debug, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1006,19 +1599,25 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs 
+ run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_flaky_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests flaky check (address, actions) + REPO_COPY=${{runner.temp}}/stateless_flaky_asan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_flaky_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests flaky check (address, actions)' - REPO_COPY: ${{runner.temp}}/stateless_flaky_asan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1038,19 +1637,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (release, actions) + REPO_COPY=${{runner.temp}}/stateful_release/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (release, actions)' - REPO_COPY: ${{runner.temp}}/stateful_release/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1067,19 +1672,25 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (address, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (address, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1096,19 +1707,25 @@ jobs: needs: [BuilderDebTsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateful_tsan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: 
${{runner.temp}}/stateful_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/stateful_tsan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1125,19 +1742,25 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_msan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateful_msan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_msan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (memory, actions)' - REPO_COPY: ${{runner.temp}}/stateful_msan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1154,19 +1777,25 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/stateful_ubsan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (ubsan, actions)' - REPO_COPY: ${{runner.temp}}/stateful_ubsan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1183,19 +1812,25 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (debug, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1215,18 +1850,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_thread + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (address, actions) + REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 
with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_thread - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (address, actions)' - REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1241,20 +1882,30 @@ jobs: sudo rm -fr $TEMP_PATH StressTestTsan: needs: [BuilderDebTsan] - runs-on: [self-hosted, stress-tester] + # func testers have 16 cores + 128 GB memory + # while stress testers have 36 cores + 72 GB memory + # It would be better to have something like 32 + 128, + # but such servers are almost unavailable as spot instances. + runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_thread + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (thread, actions) + REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_thread - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (thread, actions)' - REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1271,18 +1922,24 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (memory, actions) + REPO_COPY=${{runner.temp}}/stress_memory/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_memory - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (memory, actions)' - REPO_COPY: ${{runner.temp}}/stress_memory/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1299,18 +1956,24 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_undefined + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (undefined, actions) + REPO_COPY=${{runner.temp}}/stress_undefined/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_undefined - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (undefined, actions)' - REPO_COPY: ${{runner.temp}}/stress_undefined/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1327,18
+1990,24 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (debug, actions) + REPO_COPY=${{runner.temp}}/stress_debug/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (debug, actions)' - REPO_COPY: ${{runner.temp}}/stress_debug/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1358,18 +2027,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (ASan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_asan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (ASan, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1386,18 +2061,24 @@ jobs: needs: [BuilderDebTsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (TSan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_tsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (TSan, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_tsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1414,18 +2095,24 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (UBSan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_ubsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (UBSan, actions)' - REPO_COPY: 
${{runner.temp}}/ast_fuzzer_ubsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1442,18 +2129,24 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_msan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (MSan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_msan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_msan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (MSan, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_msan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1470,18 +2163,24 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (debug, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_debug/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (debug, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_debug/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1497,22 +2196,30 @@ jobs: ############################################################################################# ############################# INTEGRATION TESTS ############################################# ############################################################################################# - IntegrationTestsAsan: - needs: [BuilderDebAsan, FunctionalStatelessTestAsan] + IntegrationTestsAsan0: + needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (asan, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1525,22 +2232,30 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - IntegrationTestsTsan: - needs: [BuilderDebTsan, FunctionalStatelessTestTsan] + IntegrationTestsAsan1: + needs: 
[BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_tsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1553,22 +2268,246 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - IntegrationTestsRelease: - needs: [BuilderDebRelease, FunctionalStatelessTestRelease] + IntegrationTestsAsan2: + needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan0: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan1: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + 
REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan2: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan3: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsRelease0: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release, actions) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE 
$TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsRelease1: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release, actions) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (release, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_release/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1585,18 +2524,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan_flaky_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests flaky check (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan_flaky_check/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_asan_flaky_check - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests flaky check (asan, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_asan_flaky_check/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1616,18 +2561,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (asan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_asan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (asan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1644,18 +2595,24 @@ jobs: needs: [BuilderBinRelease] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (release-clang, actions) + REPO_COPY=${{runner.temp}}/unit_tests_asan/ClickHouse + EOF - name: Download 
json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (release-clang, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1672,18 +2629,24 @@ jobs: needs: [BuilderDebTsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (tsan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_tsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (tsan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_tsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1700,18 +2663,24 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_msan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (msan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_msan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_msan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (msan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_msan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1728,18 +2697,24 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_ubsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (ubsan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_ubsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1752,6 +2727,153 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH +############################################################################################# 
+#################################### PERFORMANCE TESTS ###################################### +############################################################################################# + PerformanceComparison0: + needs: [BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + PerformanceComparison1: + needs: [BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + PerformanceComparison2: + needs: [BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + PerformanceComparison3: + needs: [BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + 
CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH FinishCheck: needs: - StyleCheck @@ -1759,13 +2881,21 @@ jobs: - CheckLabels - BuilderReport - FastTest - - FunctionalStatelessTestDebug + - FunctionalStatelessTestDebug0 + - FunctionalStatelessTestDebug1 + - FunctionalStatelessTestDebug2 - FunctionalStatelessTestRelease - - FunctionalStatelessTestReleaseDatabaseReplicated + - FunctionalStatelessTestReleaseDatabaseReplicated0 + - FunctionalStatelessTestReleaseDatabaseReplicated1 - FunctionalStatelessTestReleaseWideParts - - FunctionalStatelessTestAsan - - FunctionalStatelessTestTsan - - FunctionalStatelessTestMsan + - FunctionalStatelessTestAsan0 + - FunctionalStatelessTestAsan1 + - FunctionalStatelessTestTsan0 + - FunctionalStatelessTestTsan1 + - FunctionalStatelessTestTsan2 + - FunctionalStatelessTestMsan0 + - FunctionalStatelessTestMsan1 + - FunctionalStatelessTestMsan2 - FunctionalStatelessTestUBsan - FunctionalStatefulTestDebug - FunctionalStatefulTestRelease @@ -1783,9 +2913,19 @@ jobs: - ASTFuzzerTestTsan - ASTFuzzerTestMSan - ASTFuzzerTestUBSan - - IntegrationTestsAsan - - IntegrationTestsRelease - - IntegrationTestsTsan + - IntegrationTestsAsan0 + - IntegrationTestsAsan1 + - IntegrationTestsAsan2 + - IntegrationTestsRelease0 + - IntegrationTestsRelease1 + - IntegrationTestsTsan0 + - IntegrationTestsTsan1 + - IntegrationTestsTsan2 + - IntegrationTestsTsan3 + - PerformanceComparison0 + - PerformanceComparison1 + - PerformanceComparison2 + - PerformanceComparison3 - PVSCheck - UnitTestsAsan - UnitTestsTsan @@ -1797,6 +2937,9 @@ jobs: - IntegrationTestsFlakyCheck runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Finish label diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 26921b8ea48..c2ed39224aa 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -1,4 +1,9 @@ name: MasterCI + +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + on: # yamllint disable-line rule:truthy push: branches: @@ -7,6 +12,9 @@ jobs: DockerHubPush: runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Images check @@ -22,16 +30,22 @@ jobs: needs: DockerHubPush runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{ runner.temp }}/style_check + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/style_check + path: ${{ env.TEMP_PATH }} + - name: Clear repository + run: | + 
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Style Check - env: - TEMP_PATH: ${{ runner.temp }}/style_check run: | cd $GITHUB_WORKSPACE/tests/ci python3 style_check.py @@ -45,17 +59,23 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/compatibility_check + REPO_COPY=${{runner.temp}}/compatibility_check/ClickHouse + REPORTS_PATH=${{runner.temp}}/reports_dir + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} - name: CompatibilityCheck - env: - TEMP_PATH: ${{runner.temp}}/compatibility_check - REPO_COPY: ${{runner.temp}}/compatibility_check/ClickHouse - REPORTS_PATH: ${{runner.temp}}/reports_dir run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -72,17 +92,23 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/split_build_check + REPO_COPY=${{runner.temp}}/split_build_check/ClickHouse + REPORTS_PATH=${{runner.temp}}/reports_dir + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} - name: Split build check - env: - TEMP_PATH: ${{runner.temp}}/split_build_check - REPO_COPY: ${{runner.temp}}/split_build_check/ClickHouse - REPORTS_PATH: ${{runner.temp}}/reports_dir run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -101,269 +127,363 @@ jobs: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_release + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_release' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ 
env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH + BuilderPerformance: + needs: DockerHubPush + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=performance + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + with: + submodules: 'true' + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_NAME }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinRelease: needs: [DockerHubPush] if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_release + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_release' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebAsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + 
TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_asan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_asan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebUBsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_ubsan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_ubsan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebTsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + 
CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_tsan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_tsan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebMsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_msan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_msan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebDebug: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_debug + EOF - name: Download changed 
images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_debug' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH ########################################################################################## ##################################### SPECIAL BUILDS ##################################### ########################################################################################## @@ -372,274 +492,323 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_splitted + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_splitted' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinTidy: needs: [DockerHubPush] if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && 
!contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_tidy + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_tidy' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinDarwin: needs: [DockerHubPush] if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_darwin + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_darwin' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + 
sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinAarch64: needs: [DockerHubPush] if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_aarch64 + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_aarch64' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinFreeBSD: needs: [DockerHubPush] if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_freebsd + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_freebsd' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ 
env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinDarwinAarch64: needs: [DockerHubPush] if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_darwin_aarch64 + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_darwin_aarch64' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderBinPPC64: needs: [DockerHubPush] if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=binary_ppc64le + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'binary_ppc64le' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to 
artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH ############################################################################################ ##################################### BUILD REPORTER ####################################### ############################################################################################ @@ -654,17 +823,23 @@ jobs: - BuilderDebDebug runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse build check (actions) + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Report Builder - env: - TEMP_PATH: ${{runner.temp}}/report_check - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'ClickHouse build check (actions)' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -687,17 +862,23 @@ jobs: - BuilderBinPPC64 runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse special build check (actions) + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Report Builder - env: - TEMP_PATH: ${{runner.temp}}/report_check - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'ClickHouse special build check (actions)' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -716,19 +897,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, actions) + REPO_COPY=${{runner.temp}}/stateless_release/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (release, actions)' - REPO_COPY: ${{runner.temp}}/stateless_release/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -745,19 +932,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_release_database_ordinary + REPORTS_PATH=${{runner.temp}}/reports_dir + 
CHECK_NAME=Stateless tests (release, DatabaseOrdinary, actions) + REPO_COPY=${{runner.temp}}/stateless_release_database_ordinary/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_release_database_ordinary - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (release, DatabaseOrdinary, actions)' - REPO_COPY: ${{runner.temp}}/stateless_release_database_ordinary/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -770,23 +963,31 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestAsan: + FunctionalStatelessTestAsan0: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (address, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (address, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -799,23 +1000,142 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestTsan: - needs: [BuilderDebTsan] + FunctionalStatelessTestAsan1: + needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (address, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=2 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan0: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + 
REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan1: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan2: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/stateless_tsan/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -832,19 +1152,25 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/stateless_ubsan/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - 
CHECK_NAME: 'Stateless tests (ubsan, actions)' - REPO_COPY: ${{runner.temp}}/stateless_ubsan/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -857,23 +1183,31 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestMsan: + FunctionalStatelessTestMsan0: needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_memory - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (memory, actions)' - REPO_COPY: ${{runner.temp}}/stateless_memory/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -886,23 +1220,179 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestDebug: - needs: [BuilderDebDebug] + FunctionalStatelessTestMsan1: + needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestMsan2: + needs: [BuilderDebMsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill 
$(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug0: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug1: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug2: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (debug, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -922,19 +1412,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (release, actions) + REPO_COPY=${{runner.temp}}/stateful_release/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download 
json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (release, actions)' - REPO_COPY: ${{runner.temp}}/stateful_release/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -951,19 +1447,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_release_database_ordinary + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (release, DatabaseOrdinary, actions) + REPO_COPY=${{runner.temp}}/stateful_release_database_ordinary/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_release_database_ordinary - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (release, DatabaseOrdinary, actions)' - REPO_COPY: ${{runner.temp}}/stateful_release_database_ordinary/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -980,19 +1482,25 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (address, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (address, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1009,19 +1517,25 @@ jobs: needs: [BuilderDebTsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateful_tsan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/stateful_tsan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm 
-fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1038,19 +1552,25 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_msan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateful_msan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_msan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (memory, actions)' - REPO_COPY: ${{runner.temp}}/stateful_msan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1067,19 +1587,25 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/stateful_ubsan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (ubsan, actions)' - REPO_COPY: ${{runner.temp}}/stateful_ubsan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1096,19 +1622,25 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (debug, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1128,18 +1660,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_thread + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (address, actions) + REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: 
actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_thread - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (address, actions)' - REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1154,20 +1692,30 @@ jobs: sudo rm -fr $TEMP_PATH StressTestTsan: needs: [BuilderDebTsan] - runs-on: [self-hosted, stress-tester] + # func testers have 16 cores + 128 GB memory + # while stress testers have 36 cores + 72 GB memory + # It would be better to have something like 32 cores + 128 GB, + # but such servers are almost unavailable as spot instances. + runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_thread + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (thread, actions) + REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_thread - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (thread, actions)' - REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1184,18 +1732,24 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (memory, actions) + REPO_COPY=${{runner.temp}}/stress_memory/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_memory - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (memory, actions)' - REPO_COPY: ${{runner.temp}}/stress_memory/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1212,18 +1766,24 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_undefined + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (undefined, actions) + REPO_COPY=${{runner.temp}}/stress_undefined/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_undefined - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (undefined, actions)' - REPO_COPY: ${{runner.temp}}/stress_undefined/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1240,18 +1800,24 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_debug + 
REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (debug, actions) + REPO_COPY=${{runner.temp}}/stress_debug/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (debug, actions)' - REPO_COPY: ${{runner.temp}}/stress_debug/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1267,22 +1833,30 @@ jobs: ############################################################################################# ############################# INTEGRATION TESTS ############################################# ############################################################################################# - IntegrationTestsAsan: - needs: [BuilderDebAsan, FunctionalStatelessTestAsan] + IntegrationTestsAsan0: + needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (asan, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1295,22 +1869,30 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - IntegrationTestsTsan: - needs: [BuilderDebTsan, FunctionalStatelessTestTsan] + IntegrationTestsAsan1: + needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_tsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1323,22 +1905,246 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - IntegrationTestsRelease: - needs: [BuilderDebRelease, FunctionalStatelessTestRelease] + IntegrationTestsAsan2: + needs: [BuilderDebAsan] 
runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan0: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan1: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan2: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + 
- name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan3: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsRelease0: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release, actions) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsRelease1: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release, actions) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (release, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_release/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1358,18 
+2164,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (ASan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_asan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (ASan, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1386,18 +2198,24 @@ jobs: needs: [BuilderDebTsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (TSan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_tsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (TSan, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_tsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1414,18 +2232,24 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (UBSan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_ubsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (UBSan, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_ubsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1442,18 +2266,24 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_msan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (MSan, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_msan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_msan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (MSan, actions)' - REPO_COPY: 
${{runner.temp}}/ast_fuzzer_msan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1470,18 +2300,24 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/ast_fuzzer_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=AST fuzzer (debug, actions) + REPO_COPY=${{runner.temp}}/ast_fuzzer_debug/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Fuzzer - env: - TEMP_PATH: ${{runner.temp}}/ast_fuzzer_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'AST fuzzer (debug, actions)' - REPO_COPY: ${{runner.temp}}/ast_fuzzer_debug/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1501,18 +2337,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (asan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_asan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (asan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1529,18 +2371,24 @@ jobs: needs: [BuilderBinRelease] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (release-clang, actions) + REPO_COPY=${{runner.temp}}/unit_tests_asan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (release-clang, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1557,18 +2405,24 @@ jobs: needs: [BuilderDebTsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (tsan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_tsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: 
${{runner.temp}}/unit_tests_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (tsan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_tsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1585,18 +2439,24 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_msan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (msan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_msan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_msan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (msan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_msan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1613,18 +2473,24 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, fuzzer-unit-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/unit_tests_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Unit tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/unit_tests_ubsan/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Unit test - env: - TEMP_PATH: ${{runner.temp}}/unit_tests_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Unit tests (msan, actions)' - REPO_COPY: ${{runner.temp}}/unit_tests_ubsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -1637,16 +2503,170 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH +############################################################################################# +#################################### PERFORMANCE TESTS ###################################### +############################################################################################# + PerformanceComparison0: + needs: [BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + PerformanceComparison1: + needs: 
[BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + PerformanceComparison2: + needs: [BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + PerformanceComparison3: + needs: [BuilderPerformance] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison (actions) + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH FinishCheck: needs: - DockerHubPush - BuilderReport - - FunctionalStatelessTestDebug + - FunctionalStatelessTestDebug0 + - FunctionalStatelessTestDebug1 + - FunctionalStatelessTestDebug2 - FunctionalStatelessTestRelease - FunctionalStatelessTestReleaseDatabaseOrdinary - - FunctionalStatelessTestAsan - - FunctionalStatelessTestTsan - - FunctionalStatelessTestMsan + - FunctionalStatelessTestAsan0 + - FunctionalStatelessTestAsan1 + - FunctionalStatelessTestTsan0 + - 
FunctionalStatelessTestTsan1 + - FunctionalStatelessTestTsan2 + - FunctionalStatelessTestMsan0 + - FunctionalStatelessTestMsan1 + - FunctionalStatelessTestMsan2 - FunctionalStatelessTestUBsan - FunctionalStatefulTestDebug - FunctionalStatefulTestRelease @@ -1660,9 +2680,19 @@ jobs: - StressTestTsan - StressTestMsan - StressTestUBsan - - IntegrationTestsAsan - - IntegrationTestsRelease - - IntegrationTestsTsan + - IntegrationTestsAsan0 + - IntegrationTestsAsan1 + - IntegrationTestsAsan2 + - IntegrationTestsRelease0 + - IntegrationTestsRelease1 + - IntegrationTestsTsan0 + - IntegrationTestsTsan1 + - IntegrationTestsTsan2 + - IntegrationTestsTsan3 + - PerformanceComparison0 + - PerformanceComparison1 + - PerformanceComparison2 + - PerformanceComparison3 - CompatibilityCheck - ASTFuzzerTestDebug - ASTFuzzerTestAsan @@ -1677,6 +2707,9 @@ jobs: - SplitBuildSmokeTest runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Finish label diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dd576b04c8c..1212bddb4a5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,9 @@ name: DocsReleaseChecks + +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + concurrency: group: master-release cancel-in-progress: true @@ -11,10 +16,15 @@ on: # yamllint disable-line rule:truthy - 'website/**' - 'benchmark/**' - 'docker/**' + - '.github/**' + workflow_dispatch: jobs: DockerHubPush: runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Images check @@ -30,20 +40,31 @@ jobs: needs: DockerHubPush runs-on: [self-hosted, func-tester] steps: + - name: Set envs + # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/docs_release + REPO_COPY=${{runner.temp}}/docs_release/ClickHouse + CLOUDFLARE_TOKEN=${{secrets.CLOUDFLARE}} + ROBOT_CLICKHOUSE_SSH_KEY<> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/compatibility_check + REPO_COPY=${{runner.temp}}/compatibility_check/ClickHouse + REPORTS_PATH=${{runner.temp}}/reports_dir + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} - name: CompatibilityCheck - env: - TEMP_PATH: ${{runner.temp}}/compatibility_check - REPO_COPY: ${{runner.temp}}/compatibility_check/ClickHouse - REPORTS_PATH: ${{runner.temp}}/reports_dir run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -54,230 +68,272 @@ jobs: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_release + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: 
${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_release' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebAsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_asan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_asan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebUBsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_ubsan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out 
repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_ubsan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebTsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_tsan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_tsan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebMsan: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_msan + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info 
about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_msan' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH BuilderDebDebug: needs: [DockerHubPush] runs-on: [self-hosted, builder] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=package_debug + EOF - name: Download changed images uses: actions/download-artifact@v2 with: name: changed_images - path: ${{ runner.temp }}/images_path + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 with: - submodules: 'recursive' + submodules: 'true' fetch-depth: 0 # otherwise we will have no info about contributors - name: Build - env: - TEMP_PATH: ${{runner.temp}}/build_check - IMAGES_PATH: ${{runner.temp}}/images_path - REPO_COPY: ${{runner.temp}}/build_check/ClickHouse - CACHES_PATH: ${{runner.temp}}/../ccaches - CHECK_NAME: 'ClickHouse build check (actions)' - BUILD_NAME: 'package_debug' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH cp -r $GITHUB_WORKSPACE $TEMP_PATH cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} uses: actions/upload-artifact@v2 with: name: ${{ env.BUILD_NAME }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json - name: Cleanup if: always() run: | docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: - sudo rm -fr $TEMP_PATH + sudo rm -fr $TEMP_PATH $CACHES_PATH ############################################################################################ ##################################### BUILD REPORTER ####################################### ############################################################################################ @@ -291,17 +347,23 @@ jobs: - BuilderDebDebug runs-on: [self-hosted, style-checker] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse build check (actions) + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Report Builder - env: - TEMP_PATH: ${{runner.temp}}/report_check - REPORTS_PATH: 
${{runner.temp}}/reports_dir - CHECK_NAME: 'ClickHouse build check (actions)' run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -320,19 +382,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (release, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -345,23 +413,31 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestAsan: + FunctionalStatelessTestAsan0: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (address, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (address, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -374,23 +450,142 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestTsan: - needs: [BuilderDebTsan] + FunctionalStatelessTestAsan1: + needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (address, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=2 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan0: + 
needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan1: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestTsan2: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/stateless_tsan/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -407,19 +602,25 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/stateless_ubsan/ClickHouse + KILL_TIMEOUT=10800 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: 
Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (ubsan, actions)' - REPO_COPY: ${{runner.temp}}/stateless_ubsan/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -432,23 +633,31 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestMsan: + FunctionalStatelessTestMsan0: needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_memory - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (memory, actions)' - REPO_COPY: ${{runner.temp}}/stateless_memory/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -461,23 +670,179 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - FunctionalStatelessTestDebug: - needs: [BuilderDebDebug] + FunctionalStatelessTestMsan1: + needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestMsan2: + needs: [BuilderDebMsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: 
actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug0: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug1: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + FunctionalStatelessTestDebug2: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateless_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateless tests (debug, actions)' - REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse - KILL_TIMEOUT: 10800 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -497,19 +862,25 @@ jobs: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: + - name: Set 
envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (release, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (release, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -526,19 +897,25 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (address, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (address, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -555,19 +932,25 @@ jobs: needs: [BuilderDebTsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (thread, actions) + REPO_COPY=${{runner.temp}}/stateful_tsan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/stateful_tsan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -584,19 +967,25 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_msan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (memory, actions) + REPO_COPY=${{runner.temp}}/stateful_msan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_msan - REPORTS_PATH: 
${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (memory, actions)' - REPO_COPY: ${{runner.temp}}/stateful_msan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -613,19 +1002,25 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_ubsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (ubsan, actions) + REPO_COPY=${{runner.temp}}/stateful_ubsan/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_ubsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (ubsan, actions)' - REPO_COPY: ${{runner.temp}}/stateful_ubsan/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -642,19 +1037,25 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateful_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateful tests (debug, actions) + REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse + KILL_TIMEOUT=3600 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Functional test - env: - TEMP_PATH: ${{runner.temp}}/stateful_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stateful tests (debug, actions)' - REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse - KILL_TIMEOUT: 3600 run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -674,18 +1075,24 @@ jobs: needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_thread + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (address, actions) + REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_thread - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (address, actions)' - REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -700,20 +1107,30 @@ jobs: sudo rm -fr $TEMP_PATH StressTestTsan: needs: [BuilderDebTsan] - runs-on: [self-hosted, stress-tester] + # func testers have 16 cores + 128 GB memory + # while stress testers have 36 cores + 72 memory + # It would be better to have something like 32 + 128, + # but such servers almost unavailable as spot instances. 
+ runs-on: [self-hosted, func-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_thread + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (thread, actions) + REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_thread - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (thread, actions)' - REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -730,18 +1147,24 @@ jobs: needs: [BuilderDebMsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (memory, actions) + REPO_COPY=${{runner.temp}}/stress_memory/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_memory - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (memory, actions)' - REPO_COPY: ${{runner.temp}}/stress_memory/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -758,18 +1181,24 @@ jobs: needs: [BuilderDebUBsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_undefined + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (undefined, actions) + REPO_COPY=${{runner.temp}}/stress_undefined/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_undefined - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (undefined, actions)' - REPO_COPY: ${{runner.temp}}/stress_undefined/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -786,18 +1215,24 @@ jobs: needs: [BuilderDebDebug] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stress_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stress test (debug, actions) + REPO_COPY=${{runner.temp}}/stress_debug/ClickHouse + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Stress test - env: - TEMP_PATH: ${{runner.temp}}/stress_debug - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Stress test (debug, actions)' - REPO_COPY: ${{runner.temp}}/stress_debug/ClickHouse 
run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -813,22 +1248,30 @@ jobs: ############################################################################################# ############################# INTEGRATION TESTS ############################################# ############################################################################################# - IntegrationTestsAsan: - needs: [BuilderDebAsan, FunctionalStatelessTestAsan] + IntegrationTestsAsan0: + needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_asan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (asan, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_asan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -841,22 +1284,30 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - IntegrationTestsTsan: - needs: [BuilderDebTsan, FunctionalStatelessTestTsan] + IntegrationTestsAsan1: + needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_tsan - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (thread, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_tsan/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -869,22 +1320,246 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH - IntegrationTestsRelease: - needs: [BuilderDebRelease, FunctionalStatelessTestRelease] + IntegrationTestsAsan2: + needs: [BuilderDebAsan] runs-on: [self-hosted, stress-tester] steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan, actions) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF - name: Download json reports uses: actions/download-artifact@v2 with: - path: ${{runner.temp}}/reports_dir + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: 
actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan0: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan1: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan2: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsTsan3: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> 
"$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (thread, actions) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsRelease0: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release, actions) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH + IntegrationTestsRelease1: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release, actions) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Integration test - env: - TEMP_PATH: ${{runner.temp}}/integration_tests_release - REPORTS_PATH: ${{runner.temp}}/reports_dir - CHECK_NAME: 'Integration tests (release, actions)' - REPO_COPY: ${{runner.temp}}/integration_tests_release/ClickHouse run: | sudo rm -fr $TEMP_PATH mkdir -p $TEMP_PATH @@ -901,11 +1576,18 @@ jobs: needs: - DockerHubPush - BuilderReport - - FunctionalStatelessTestDebug + - FunctionalStatelessTestDebug0 + - FunctionalStatelessTestDebug1 + - FunctionalStatelessTestDebug2 - FunctionalStatelessTestRelease - - FunctionalStatelessTestAsan - - FunctionalStatelessTestTsan - - FunctionalStatelessTestMsan + - FunctionalStatelessTestAsan0 + - FunctionalStatelessTestAsan1 + - FunctionalStatelessTestTsan0 + - FunctionalStatelessTestTsan1 + - FunctionalStatelessTestTsan2 + - FunctionalStatelessTestMsan0 + - FunctionalStatelessTestMsan1 + - FunctionalStatelessTestMsan2 - FunctionalStatelessTestUBsan - 
FunctionalStatefulTestDebug - FunctionalStatefulTestRelease @@ -918,12 +1600,21 @@ jobs: - StressTestTsan - StressTestMsan - StressTestUBsan - - IntegrationTestsAsan - - IntegrationTestsRelease - - IntegrationTestsTsan + - IntegrationTestsAsan0 + - IntegrationTestsAsan1 + - IntegrationTestsAsan2 + - IntegrationTestsRelease0 + - IntegrationTestsRelease1 + - IntegrationTestsTsan0 + - IntegrationTestsTsan1 + - IntegrationTestsTsan2 + - IntegrationTestsTsan3 - CompatibilityCheck runs-on: [self-hosted, style-checker] steps: + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE - name: Check out repository code uses: actions/checkout@v2 - name: Finish label diff --git a/.github/workflows/woboq.yml b/.github/workflows/woboq.yml new file mode 100644 index 00000000000..f3cd7ab6245 --- /dev/null +++ b/.github/workflows/woboq.yml @@ -0,0 +1,42 @@ +name: WoboqBuilder +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + +concurrency: + group: woboq +on: # yamllint disable-line rule:truthy + schedule: + - cron: '0 */18 * * *' + workflow_dispatch: +jobs: + # don't use dockerhub push because this image updates so rarely + WoboqCodebrowser: + runs-on: [self-hosted, style-checker] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/codebrowser + REPO_COPY=${{runner.temp}}/codebrowser/ClickHouse + IMAGES_PATH=${{runner.temp}}/images_path + EOF + - name: Clear repository + run: | + sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE + - name: Check out repository code + uses: actions/checkout@v2 + with: + submodules: 'true' + - name: Codebrowser + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci && python3 codebrowser_check.py + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH diff --git a/.gitmodules b/.gitmodules index df9c5f1a416..1af2937993b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -54,8 +54,8 @@ url = https://github.com/ClickHouse-Extras/Turbo-Base64.git [submodule "contrib/arrow"] path = contrib/arrow - url = https://github.com/ClickHouse-Extras/arrow - branch = clickhouse-arrow-2.0.0 + url = https://github.com/ClickHouse-Extras/arrow.git + branch = blessed/release-6.0.1 [submodule "contrib/thrift"] path = contrib/thrift url = https://github.com/apache/thrift.git @@ -190,8 +190,8 @@ url = https://github.com/xz-mirror/xz [submodule "contrib/abseil-cpp"] path = contrib/abseil-cpp - url = https://github.com/ClickHouse-Extras/abseil-cpp.git - branch = lts_2020_02_25 + url = https://github.com/abseil/abseil-cpp.git + branch = lts_2021_11_02 [submodule "contrib/dragonbox"] path = contrib/dragonbox url = https://github.com/ClickHouse-Extras/dragonbox.git @@ -253,3 +253,6 @@ [submodule "contrib/nlp-data"] path = contrib/nlp-data url = https://github.com/evillique/nlp-data.git +[submodule "contrib/azure"] + path = contrib/azure + url = https://github.com/ClickHouse-Extras/azure-sdk-for-cpp.git diff --git a/CHANGELOG.md b/CHANGELOG.md index f34725448f2..9027cfb117a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,181 @@ +### ClickHouse release v21.12, 2021-12-15 + +#### Backward Incompatible Change + +* *A fix for a feature that previously had unwanted behaviour.* Do not allow direct select for Kafka/RabbitMQ/FileLog. Can be enabled by setting `stream_like_engine_allow_direct_select`. 
Direct select will not be allowed even if enabled by setting, in case there is an attached materialized view. For Kafka and RabbitMQ, direct select, if allowed, will not commit messages by default. To enable commits with direct select, the user must use the storage-level setting `kafka{rabbitmq}_commit_on_select=1` (default `0`). [#31053](https://github.com/ClickHouse/ClickHouse/pull/31053) ([Kseniia Sumarokova](https://github.com/kssenii)). +* *A slight change in behaviour of a new function.* Return unquoted string in JSON_VALUE. Closes [#27965](https://github.com/ClickHouse/ClickHouse/issues/27965). [#31008](https://github.com/ClickHouse/ClickHouse/pull/31008) ([Kseniia Sumarokova](https://github.com/kssenii)). +* *Setting rename.* Add custom null representation support for TSV/CSV input formats. Fix deserializing Nullable(String) in TSV/CSV/JSONCompactStringsEachRow/JSONStringsEachRow input formats. Rename `output_format_csv_null_representation` and `output_format_tsv_null_representation` to `format_csv_null_representation` and `format_tsv_null_representation` accordingly. [#30497](https://github.com/ClickHouse/ClickHouse/pull/30497) ([Kruglov Pavel](https://github.com/Avogar)). +* *Further deprecation of already unused code.* This is relevant only for users of ClickHouse versions older than 20.6. A "leader election" mechanism is removed from `ReplicatedMergeTree`, because multiple leaders are supported since 20.6. If you are upgrading from an older version and some replica with an old version is a leader, then the server will fail to start after upgrade. Stop replicas with the old version to make the new version start. After that it will not be possible to downgrade to a version older than 20.6. [#32140](https://github.com/ClickHouse/ClickHouse/pull/32140) ([tavplubix](https://github.com/tavplubix)). + +#### New Feature + +* Implemented more of the ZooKeeper Four Letter Words commands in clickhouse-keeper: https://zookeeper.apache.org/doc/r3.4.8/zookeeperAdmin.html#sc_zkCommands. [#28981](https://github.com/ClickHouse/ClickHouse/pull/28981) ([JackyWoo](https://github.com/JackyWoo)). Now `clickhouse-keeper` is feature complete. +* Support for `Bool` data type. [#31072](https://github.com/ClickHouse/ClickHouse/pull/31072) ([kevin wan](https://github.com/MaxWk)). +* Support for `PARTITION BY` in File, URL, HDFS storages and with `INSERT INTO` table function. Closes [#30273](https://github.com/ClickHouse/ClickHouse/issues/30273). [#30690](https://github.com/ClickHouse/ClickHouse/pull/30690) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Added `CONSTRAINT ... ASSUME ...` (without checking during `INSERT`). Added query transformation to CNF (https://github.com/ClickHouse/ClickHouse/issues/11749) for more convenient optimization. Added simple query rewriting using constraints (only simple matching now, will be improved to support <,=,>... later). Added ability to replace heavy columns with light columns if it's possible. [#18787](https://github.com/ClickHouse/ClickHouse/pull/18787) ([Nikita Vasilev](https://github.com/nikvas0)). +* Basic access authentication for http/url functions. [#31648](https://github.com/ClickHouse/ClickHouse/pull/31648) ([michael1589](https://github.com/michael1589)). +* Support `INTERVAL` type in `STEP` clause for `WITH FILL` modifier. [#30927](https://github.com/ClickHouse/ClickHouse/pull/30927) ([Anton Popov](https://github.com/CurtizJ)). +* Add support for parallel reading from multiple files and support globs in `FROM INFILE` clause. 
[#30135](https://github.com/ClickHouse/ClickHouse/pull/30135) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Add support for `Identifier` table and database query parameters. Closes [#27226](https://github.com/ClickHouse/ClickHouse/issues/27226). [#28668](https://github.com/ClickHouse/ClickHouse/pull/28668) ([Nikolay Degterinsky](https://github.com/evillique)). +* *TLDR: Major improvements of completeness and consistency of text formats.* Refactor formats `TSV`, `TSVRaw`, `CSV` and `JSONCompactEachRow`, `JSONCompactStringsEachRow`, remove code duplication, add base interface for formats with `-WithNames` and `-WithNamesAndTypes` suffixes. Add formats `CSVWithNamesAndTypes`, `TSVRawWithNames`, `TSVRawWithNamesAndTypes`, `JSONCompactEachRowWithNames`, `JSONCompactStringsEachRowWithNames`, `RowBinaryWithNames`. Support parallel parsing for formats `TSVWithNamesAndTypes`, `TSVRaw(WithNames/WithNamesAndTypes)`, `CSVWithNamesAndTypes`, `JSONCompactEachRow(WithNames/WithNamesAndTypes)`, `JSONCompactStringsEachRow(WithNames/WithNamesAndTypes)`. Support columns mapping and types checking for `RowBinaryWithNamesAndTypes` format. Add setting `input_format_with_types_use_header` which specifies whether we should check that types written in the `WithNamesAndTypes` format match the table structure. Add setting `input_format_csv_empty_as_default` and use it in CSV format instead of `input_format_defaults_for_omitted_fields` (because this setting should not control `csv_empty_as_default`). Fix usage of setting `input_format_defaults_for_omitted_fields` (it was used only as `csv_empty_as_default`, but it should control calculation of default expressions for omitted fields). Fix Nullable input/output in `TSVRaw` format, make this format fully compatible with inserting into TSV. Fix inserting NULLs in `LowCardinality(Nullable)` when `input_format_null_as_default` is enabled (previously default values were inserted instead of actual NULLs). Fix strings deserialization in `JSONStringsEachRow`/`JSONCompactStringsEachRow` formats (strings were parsed just until the first '\n' or '\t'). Add ability to use `Raw` escaping rule in Template input format. Add diagnostic info for JSONCompactEachRow(WithNames/WithNamesAndTypes) input format. Fix bug with parallel parsing of `-WithNames` formats in case when setting `min_chunk_bytes_for_parallel_parsing` is less than bytes in a single row. [#30178](https://github.com/ClickHouse/ClickHouse/pull/30178) ([Kruglov Pavel](https://github.com/Avogar)). Allow to print/parse names and types of columns in `CustomSeparated` input/output format. Add formats `CustomSeparatedWithNames/WithNamesAndTypes` similar to `TSVWithNames/WithNamesAndTypes`. [#31434](https://github.com/ClickHouse/ClickHouse/pull/31434) ([Kruglov Pavel](https://github.com/Avogar)). +* Aliyun OSS Storage support. [#31286](https://github.com/ClickHouse/ClickHouse/pull/31286) ([cfcz48](https://github.com/cfcz48)). +* Expose all settings of the global thread pool in the configuration file. [#31285](https://github.com/ClickHouse/ClickHouse/pull/31285) ([Tomáš Hromada](https://github.com/gyfis)). +* Introduced window functions `exponentialTimeDecayedSum`, `exponentialTimeDecayedMax`, `exponentialTimeDecayedCount` and `exponentialTimeDecayedAvg` which are more effective than `exponentialMovingAverage` for bigger windows. Also more use-cases were covered. [#29799](https://github.com/ClickHouse/ClickHouse/pull/29799) ([Vladimir Chebotarev](https://github.com/excitoon)). 
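To make the idea behind these exponentially time-decayed window functions concrete, here is a minimal sketch of the general weighting scheme such functions rely on. This is an illustration only, not ClickHouse's implementation; the helper name `decayed_sum` and the `decay_length` parameter are invented for the example. Each observation is weighted by `exp(-age / decay_length)`, so recent points dominate and older ones fade smoothly.

```python
import math

def decayed_sum(points, now, decay_length):
    """Exponentially time-decayed sum: each (t, v) pair is weighted by
    exp(-(now - t) / decay_length), so older observations contribute less.
    Illustration only -- not the ClickHouse implementation."""
    return sum(v * math.exp(-(now - t) / decay_length) for t, v in points)

# Three equal values observed 10, 5 and 0 seconds before `now`:
print(decayed_sum([(0.0, 1.0), (5.0, 1.0), (10.0, 1.0)], now=10.0, decay_length=10.0))
# ~1.97: the newest point counts fully, the oldest only with weight e^-1.
```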
+* Add option to compress logs before writing them to a file using LZ4. Closes [#23860](https://github.com/ClickHouse/ClickHouse/issues/23860). [#29219](https://github.com/ClickHouse/ClickHouse/pull/29219) ([Nikolay Degterinsky](https://github.com/evillique)). +* Support `JOIN ON 1 = 1` that have CROSS JOIN semantic. This closes [#25578](https://github.com/ClickHouse/ClickHouse/issues/25578). [#25894](https://github.com/ClickHouse/ClickHouse/pull/25894) ([Vladimir C](https://github.com/vdimir)). +* Add Map combinator for `Map` type. - Rename old `sum-, min-, max- Map` for mapped arrays to `sum-, min-, max- MappedArrays`. [#24539](https://github.com/ClickHouse/ClickHouse/pull/24539) ([Ildus Kurbangaliev](https://github.com/ildus)). +* Make reading from HTTP retriable. Closes [#29696](https://github.com/ClickHouse/ClickHouse/issues/29696). [#29894](https://github.com/ClickHouse/ClickHouse/pull/29894) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Experimental Feature + +* `WINDOW VIEW` to enable stream processing in ClickHouse. [#8331](https://github.com/ClickHouse/ClickHouse/pull/8331) ([vxider](https://github.com/Vxider)). +* Drop support for using Ordinary databases with `MaterializedMySQL`. [#31292](https://github.com/ClickHouse/ClickHouse/pull/31292) ([Stig Bakken](https://github.com/stigsb)). +* Implement the commands BACKUP and RESTORE for the Log family. This feature is under development. [#30688](https://github.com/ClickHouse/ClickHouse/pull/30688) ([Vitaly Baranov](https://github.com/vitlibar)). + +#### Performance Improvement + +* Reduce memory usage when reading with `s3` / `url` / `hdfs` formats `Parquet`, `ORC`, `Arrow` (controlled by setting `input_format_allow_seeks`, enabled by default). Also add setting `remote_read_min_bytes_for_seek` to control seeks. Closes [#10461](https://github.com/ClickHouse/ClickHouse/issues/10461). Closes [#16857](https://github.com/ClickHouse/ClickHouse/issues/16857). [#30936](https://github.com/ClickHouse/ClickHouse/pull/30936) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add optimizations for constant conditions in JOIN ON, ref [#26928](https://github.com/ClickHouse/ClickHouse/issues/26928). [#27021](https://github.com/ClickHouse/ClickHouse/pull/27021) ([Vladimir C](https://github.com/vdimir)). +* Support parallel formatting for all text formats, except `JSONEachRowWithProgress` and `PrettyCompactMonoBlock`. [#31489](https://github.com/ClickHouse/ClickHouse/pull/31489) ([Kruglov Pavel](https://github.com/Avogar)). +* Speed up count over nullable columns. [#31806](https://github.com/ClickHouse/ClickHouse/pull/31806) ([Raúl Marín](https://github.com/Algunenano)). +* Speed up `avg` and `sumCount` aggregate functions. [#31694](https://github.com/ClickHouse/ClickHouse/pull/31694) ([Raúl Marín](https://github.com/Algunenano)). +* Improve performance of JSON and XML output formats. [#31673](https://github.com/ClickHouse/ClickHouse/pull/31673) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve performance of syncing data to block device. This closes [#31181](https://github.com/ClickHouse/ClickHouse/issues/31181). [#31229](https://github.com/ClickHouse/ClickHouse/pull/31229) ([zhanglistar](https://github.com/zhanglistar)). +* Fixing query performance issue in `LiveView` tables. Fixes [#30831](https://github.com/ClickHouse/ClickHouse/issues/30831). [#31006](https://github.com/ClickHouse/ClickHouse/pull/31006) ([vzakaznikov](https://github.com/vzakaznikov)). +* Speed up query parsing. 
[#31949](https://github.com/ClickHouse/ClickHouse/pull/31949) ([Raúl Marín](https://github.com/Algunenano)). +* Allow to split `GraphiteMergeTree` rollup rules for plain/tagged metrics (optional `rule_type` field). [#25122](https://github.com/ClickHouse/ClickHouse/pull/25122) ([Michail Safronov](https://github.com/msaf1980)). +* Remove excessive `DESC TABLE` requests for `remote()` (in case of `remote('127.1', system.one)` (i.e. identifier as the db.table instead of string) there was excessive `DESC TABLE` request). [#32019](https://github.com/ClickHouse/ClickHouse/pull/32019) ([Azat Khuzhin](https://github.com/azat)). +* Optimize function `tupleElement` to reading of subcolumn with enabled setting `optimize_functions_to_subcolumns`. [#31261](https://github.com/ClickHouse/ClickHouse/pull/31261) ([Anton Popov](https://github.com/CurtizJ)). +* Optimize function `mapContains` to reading of subcolumn `key` with enabled settings `optimize_functions_to_subcolumns`. [#31218](https://github.com/ClickHouse/ClickHouse/pull/31218) ([Anton Popov](https://github.com/CurtizJ)). +* Add settings `merge_tree_min_rows_for_concurrent_read_for_remote_filesystem` and `merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem`. [#30970](https://github.com/ClickHouse/ClickHouse/pull/30970) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Skipping mutations of different partitions in `StorageMergeTree`. [#21326](https://github.com/ClickHouse/ClickHouse/pull/21326) ([Vladimir Chebotarev](https://github.com/excitoon)). + +#### Improvement + +* Do not allow to drop a table or dictionary if some tables or dictionaries depend on it. [#30977](https://github.com/ClickHouse/ClickHouse/pull/30977) ([tavplubix](https://github.com/tavplubix)). +* Allow versioning of aggregate function states. Now we can introduce backward compatible changes in serialization format of aggregate function states. Closes [#12552](https://github.com/ClickHouse/ClickHouse/issues/12552). [#24820](https://github.com/ClickHouse/ClickHouse/pull/24820) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support PostgreSQL style `ALTER MODIFY COLUMN` syntax. [#32003](https://github.com/ClickHouse/ClickHouse/pull/32003) ([SuperDJY](https://github.com/cmsxbc)). +* Added `update_field` support for `RangeHashedDictionary`, `ComplexKeyRangeHashedDictionary`. [#32185](https://github.com/ClickHouse/ClickHouse/pull/32185) ([Maksim Kita](https://github.com/kitaisreal)). +* The `murmurHash3_128` and `sipHash128` functions now accept an arbitrary number of arguments. This closes [#28774](https://github.com/ClickHouse/ClickHouse/issues/28774). [#28965](https://github.com/ClickHouse/ClickHouse/pull/28965) ([小路](https://github.com/nicelulu)). +* Support default expression for `HDFS` storage and optimize fetching when source is column oriented. [#32256](https://github.com/ClickHouse/ClickHouse/pull/32256) ([李扬](https://github.com/taiyang-li)). +* Improve the operation name of an opentelemetry span. [#32234](https://github.com/ClickHouse/ClickHouse/pull/32234) ([Frank Chen](https://github.com/FrankChen021)). +* Use `Content-Type: application/x-ndjson` (http://ndjson.org/) for output format `JSONEachRow`. [#32223](https://github.com/ClickHouse/ClickHouse/pull/32223) ([Dmitriy Dorofeev](https://github.com/deem0n)). +* Improve skipping unknown fields with quoted escaping rule in Template/CustomSeparated formats. Previously you could skip only quoted strings, now you can skip values with any type. 
[#32204](https://github.com/ClickHouse/ClickHouse/pull/32204) ([Kruglov Pavel](https://github.com/Avogar)). +* Now `clickhouse-keeper` refuses to start or apply configuration changes when they contain duplicated IDs or endpoints. Fixes [#31339](https://github.com/ClickHouse/ClickHouse/issues/31339). [#32121](https://github.com/ClickHouse/ClickHouse/pull/32121) ([alesapin](https://github.com/alesapin)). +* Set Content-Type in HTTP packets issued from URL engine. [#32113](https://github.com/ClickHouse/ClickHouse/pull/32113) ([Frank Chen](https://github.com/FrankChen021)). +* Return Content-Type as 'application/json' for `JSONEachRow` format if `output_format_json_array_of_rows` is enabled. [#32112](https://github.com/ClickHouse/ClickHouse/pull/32112) ([Frank Chen](https://github.com/FrankChen021)). +* Allow to parse `+` before `Float32`/`Float64` values. [#32079](https://github.com/ClickHouse/ClickHouse/pull/32079) ([Kruglov Pavel](https://github.com/Avogar)). +* Allow a user configured `hdfs_replication` parameter for `DiskHDFS` and `StorageHDFS`. Closes [#32039](https://github.com/ClickHouse/ClickHouse/issues/32039). [#32049](https://github.com/ClickHouse/ClickHouse/pull/32049) ([leosunli](https://github.com/leosunli)). +* Added ClickHouse `exception` and `exception_code` fields to opentelemetry span log. [#32040](https://github.com/ClickHouse/ClickHouse/pull/32040) ([Frank Chen](https://github.com/FrankChen021)). +* Improve opentelemetry span log duration - it was is zero at the query level if there is a query exception. [#32038](https://github.com/ClickHouse/ClickHouse/pull/32038) ([Frank Chen](https://github.com/FrankChen021)). +* Fix the issue that `LowCardinality` of `Int256` cannot be created. [#31832](https://github.com/ClickHouse/ClickHouse/pull/31832) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Recreate `system.*_log` tables in case of different engine/partition_by. [#31824](https://github.com/ClickHouse/ClickHouse/pull/31824) ([Azat Khuzhin](https://github.com/azat)). +* `MaterializedMySQL`: Fix issue with table named 'table'. [#31781](https://github.com/ClickHouse/ClickHouse/pull/31781) ([Håvard Kvålen](https://github.com/havardk)). +* ClickHouse dictionary source: support predefined connections. Closes [#31705](https://github.com/ClickHouse/ClickHouse/issues/31705). [#31749](https://github.com/ClickHouse/ClickHouse/pull/31749) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Allow to use predefined connections configuration for Kafka and RabbitMQ engines (the same way as for other integration table engines). [#31691](https://github.com/ClickHouse/ClickHouse/pull/31691) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Always re-render prompt while navigating history in clickhouse-client. This will improve usability of manipulating very long queries that don't fit on screen. [#31675](https://github.com/ClickHouse/ClickHouse/pull/31675) ([alexey-milovidov](https://github.com/alexey-milovidov)) (author: Amos Bird). +* Add key bindings for navigating through history (instead of lines/history). [#31641](https://github.com/ClickHouse/ClickHouse/pull/31641) ([Azat Khuzhin](https://github.com/azat)). +* Improve the `max_execution_time` checks. Fixed some cases when timeout checks do not happen and query could run too long. [#31636](https://github.com/ClickHouse/ClickHouse/pull/31636) ([Raúl Marín](https://github.com/Algunenano)). +* Better exception message when `users.xml` cannot be loaded due to bad password hash. 
This closes [#24126](https://github.com/ClickHouse/ClickHouse/issues/24126). [#31557](https://github.com/ClickHouse/ClickHouse/pull/31557) ([Vitaly Baranov](https://github.com/vitlibar)). +* Use shard and replica name from `Replicated` database arguments when expanding macros in `ReplicatedMergeTree` arguments if these macros are not defined in config. Closes [#31471](https://github.com/ClickHouse/ClickHouse/issues/31471). [#31488](https://github.com/ClickHouse/ClickHouse/pull/31488) ([tavplubix](https://github.com/tavplubix)). +* Better analysis for `min/max/count` projection. Now, with enabled `allow_experimental_projection_optimization`, virtual `min/max/count` projection can be used together with columns from partition key. [#31474](https://github.com/ClickHouse/ClickHouse/pull/31474) ([Amos Bird](https://github.com/amosbird)). +* Add `--pager` support for `clickhouse-local`. [#31457](https://github.com/ClickHouse/ClickHouse/pull/31457) ([Azat Khuzhin](https://github.com/azat)). +* Fix waiting of the editor during interactive query edition (`waitpid()` returns -1 on `SIGWINCH` and `EDITOR` and `clickhouse-local`/`clickhouse-client` works concurrently). [#31456](https://github.com/ClickHouse/ClickHouse/pull/31456) ([Azat Khuzhin](https://github.com/azat)). +* Throw an exception if there is some garbage after field in `JSONCompactStrings(EachRow)` format. [#31455](https://github.com/ClickHouse/ClickHouse/pull/31455) ([Kruglov Pavel](https://github.com/Avogar)). +* Default value of `http_send_timeout` and `http_receive_timeout` settings changed from 1800 (30 minutes) to 180 (3 minutes). [#31450](https://github.com/ClickHouse/ClickHouse/pull/31450) ([tavplubix](https://github.com/tavplubix)). +* `MaterializedMySQL` now handles `CREATE TABLE ... LIKE ...` DDL queries. [#31410](https://github.com/ClickHouse/ClickHouse/pull/31410) ([Stig Bakken](https://github.com/stigsb)). +* Return artificial create query when executing `show create table` on system's tables. [#31391](https://github.com/ClickHouse/ClickHouse/pull/31391) ([SuperDJY](https://github.com/cmsxbc)). +* Previously progress was shown only for `numbers` table function. Now for `numbers_mt` it is also shown. [#31318](https://github.com/ClickHouse/ClickHouse/pull/31318) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Initial user's roles are used now to find row policies, see [#31080](https://github.com/ClickHouse/ClickHouse/issues/31080). [#31262](https://github.com/ClickHouse/ClickHouse/pull/31262) ([Vitaly Baranov](https://github.com/vitlibar)). +* If some obsolete setting is changed - show warning in `system.warnings`. [#31252](https://github.com/ClickHouse/ClickHouse/pull/31252) ([tavplubix](https://github.com/tavplubix)). +* Improved backoff for background cleanup tasks in `MergeTree`. Settings `merge_tree_clear_old_temporary_directories_interval_seconds` and `merge_tree_clear_old_parts_interval_seconds` moved from users settings to merge tree settings. [#31180](https://github.com/ClickHouse/ClickHouse/pull/31180) ([tavplubix](https://github.com/tavplubix)). +* Now every replica will send to client only incremental information about profile events counters. [#31155](https://github.com/ClickHouse/ClickHouse/pull/31155) ([Dmitry Novik](https://github.com/novikd)). This makes `--hardware_utilization` option in `clickhouse-client` usable. +* Enable multiline editing in clickhouse-client by default. This addresses [#31121](https://github.com/ClickHouse/ClickHouse/issues/31121) . 
[#31123](https://github.com/ClickHouse/ClickHouse/pull/31123) ([Amos Bird](https://github.com/amosbird)). +* Function name normalization for `ALTER` queries. This helps avoid metadata mismatch between creating a table with indices/projections and adding indices/projections via alter commands. This is a follow-up PR of https://github.com/ClickHouse/ClickHouse/pull/20174. Marked as an improvement as there are no bug reports and the scenario is somewhat rare. [#31095](https://github.com/ClickHouse/ClickHouse/pull/31095) ([Amos Bird](https://github.com/amosbird)). +* Support `IF EXISTS` modifier for `RENAME DATABASE`/`TABLE`/`DICTIONARY` query. If this directive is used, one will not get an error if the DATABASE/TABLE/DICTIONARY to be renamed doesn't exist. [#31081](https://github.com/ClickHouse/ClickHouse/pull/31081) ([victorgao](https://github.com/kafka1991)). +* Cancel vertical merges when partition is dropped. This is a follow-up of https://github.com/ClickHouse/ClickHouse/pull/25684 and https://github.com/ClickHouse/ClickHouse/pull/30996. [#31057](https://github.com/ClickHouse/ClickHouse/pull/31057) ([Amos Bird](https://github.com/amosbird)). +* The local session inside a ClickHouse dictionary source won't send its events to the session log anymore. This fixes a possible deadlock (tsan alert) on shutdown. Also this PR fixes flaky `test_dictionaries_dependency_xml/`. [#31013](https://github.com/ClickHouse/ClickHouse/pull/31013) ([Vitaly Baranov](https://github.com/vitlibar)). +* Less locking in ALTER command. [#31010](https://github.com/ClickHouse/ClickHouse/pull/31010) ([Amos Bird](https://github.com/amosbird)). +* Fix `--verbose` option in clickhouse-local interactive mode and allow logging into a file. [#30881](https://github.com/ClickHouse/ClickHouse/pull/30881) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Added `\l`, `\d`, `\c` commands in `clickhouse-client` like in MySQL and PostgreSQL. [#30876](https://github.com/ClickHouse/ClickHouse/pull/30876) ([Pavel Medvedev](https://github.com/pmed)). +* For clickhouse-local or clickhouse-client: if there is an `--interactive` option with `--query` or `--queries-file`, then first execute them like in non-interactive mode and then start interactive mode. [#30851](https://github.com/ClickHouse/ClickHouse/pull/30851) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix possible "The local set of parts of X doesn't look like the set of parts in ZooKeeper" error (if DROP fails during removing znodes from zookeeper). [#30826](https://github.com/ClickHouse/ClickHouse/pull/30826) ([Azat Khuzhin](https://github.com/azat)). +* Avro format works against Kafka. Setting `output_format_avro_rows_in_file` added. [#30351](https://github.com/ClickHouse/ClickHouse/pull/30351) ([Ilya Golshtein](https://github.com/ilejn)). +* Allow to specify one or any number of PostgreSQL schemas for one `MaterializedPostgreSQL` database. Closes [#28901](https://github.com/ClickHouse/ClickHouse/issues/28901). Closes [#29324](https://github.com/ClickHouse/ClickHouse/issues/29324). [#28933](https://github.com/ClickHouse/ClickHouse/pull/28933) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Replaced the default port for clickhouse-keeper internal communication from 44444 to 9234. Fixes [#30879](https://github.com/ClickHouse/ClickHouse/issues/30879). [#31799](https://github.com/ClickHouse/ClickHouse/pull/31799) ([alesapin](https://github.com/alesapin)). +* Implement function transform with Decimal arguments. 
[#31839](https://github.com/ClickHouse/ClickHouse/pull/31839) ([李帅](https://github.com/loneylee)). +* Fix abort in debug server and `DB::Exception: std::out_of_range: basic_string` error in release server in case of bad hdfs url by adding additional check of hdfs url structure. [#31042](https://github.com/ClickHouse/ClickHouse/pull/31042) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix possible assert in `hdfs` table function/engine, add test. [#31036](https://github.com/ClickHouse/ClickHouse/pull/31036) ([Kruglov Pavel](https://github.com/Avogar)). + +#### Bug Fixes + +* Fix group by / order by / limit by aliases with positional arguments enabled. Closes [#31173](https://github.com/ClickHouse/ClickHouse/issues/31173). [#31741](https://github.com/ClickHouse/ClickHouse/pull/31741) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix usage of `Buffer` table engine with type `Map`. Fixes [#30546](https://github.com/ClickHouse/ClickHouse/issues/30546). [#31742](https://github.com/ClickHouse/ClickHouse/pull/31742) ([Anton Popov](https://github.com/CurtizJ)). +* Fix reading from `MergeTree` tables with enabled `use_uncompressed_cache`. [#31826](https://github.com/ClickHouse/ClickHouse/pull/31826) ([Anton Popov](https://github.com/CurtizJ)). +* Fixed the behavior when mutations that have nothing to do are stuck (with enabled setting `empty_result_for_aggregation_by_empty_set`). [#32358](https://github.com/ClickHouse/ClickHouse/pull/32358) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix skipping columns while writing protobuf. This PR fixes [#31160](https://github.com/ClickHouse/ClickHouse/issues/31160), see the comment [#31160](https://github.com/ClickHouse/ClickHouse/issues/31160)#issuecomment-980595318. [#31988](https://github.com/ClickHouse/ClickHouse/pull/31988) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix bug when remove unneeded columns in subquery. If there is an aggregation function in query without group by, do not remove if it is unneeded. [#32289](https://github.com/ClickHouse/ClickHouse/pull/32289) ([dongyifeng](https://github.com/dyf6372)). +* Quota limit was not reached, but the limit was exceeded. This PR fixes [#31174](https://github.com/ClickHouse/ClickHouse/issues/31174). [#31337](https://github.com/ClickHouse/ClickHouse/pull/31337) ([sunny](https://github.com/sunny19930321)). +* Fix SHOW GRANTS when partial revokes are used. This PR fixes [#31138](https://github.com/ClickHouse/ClickHouse/issues/31138). [#31249](https://github.com/ClickHouse/ClickHouse/pull/31249) ([Vitaly Baranov](https://github.com/vitlibar)). +* Memory amount was incorrectly estimated when ClickHouse is run in containers with cgroup limits. [#31157](https://github.com/ClickHouse/ClickHouse/pull/31157) ([Pavel Medvedev](https://github.com/pmed)). +* Fix `ALTER ... MATERIALIZE COLUMN ...` queries in case when data type of default expression is not equal to the data type of column. [#32348](https://github.com/ClickHouse/ClickHouse/pull/32348) ([Anton Popov](https://github.com/CurtizJ)). +* Fixed crash with SIGFPE in aggregate function `avgWeighted` with `Decimal` argument. Fixes [#32053](https://github.com/ClickHouse/ClickHouse/issues/32053). [#32303](https://github.com/ClickHouse/ClickHouse/pull/32303) ([tavplubix](https://github.com/tavplubix)). +* Server might fail to start with `Cannot attach 1 tables due to cyclic dependencies` error if `Dictionary` table looks at XML-dictionary with the same name, it's fixed. 
Fixes [#31315](https://github.com/ClickHouse/ClickHouse/issues/31315). [#32288](https://github.com/ClickHouse/ClickHouse/pull/32288) ([tavplubix](https://github.com/tavplubix)). +* Fix parsing error while deserializing NaN for `Nullable(Float)` with the `Quoted` escaping rule. [#32190](https://github.com/ClickHouse/ClickHouse/pull/32190) ([Kruglov Pavel](https://github.com/Avogar)). +* XML dictionaries: identifiers, used in table create query, can be qualified to `default_database` during upgrade to a newer version. Closes [#31963](https://github.com/ClickHouse/ClickHouse/issues/31963). [#32187](https://github.com/ClickHouse/ClickHouse/pull/32187) ([Maksim Kita](https://github.com/kitaisreal)). +* Number of active replicas might be determined incorrectly when inserting with quorum if setting `replicated_can_become_leader` is disabled on some replicas. It's fixed. [#32157](https://github.com/ClickHouse/ClickHouse/pull/32157) ([tavplubix](https://github.com/tavplubix)). +* Dictionaries: fix cases when `{condition}` does not work for custom database queries. [#32117](https://github.com/ClickHouse/ClickHouse/pull/32117) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix `CAST` from `Nullable` with `cast_keep_nullable` (`PARAMETER_OUT_OF_BOUND` error before for i.e. `toUInt32OrDefault(toNullable(toUInt32(1)))`). [#32080](https://github.com/ClickHouse/ClickHouse/pull/32080) ([Azat Khuzhin](https://github.com/azat)). +* Fix CREATE TABLE of Join Storage in some obscure cases. Closes [#31680](https://github.com/ClickHouse/ClickHouse/issues/31680). [#32066](https://github.com/ClickHouse/ClickHouse/pull/32066) ([SuperDJY](https://github.com/cmsxbc)). +* Fixed `Directory ... already exists and is not empty` error when detaching part. [#32063](https://github.com/ClickHouse/ClickHouse/pull/32063) ([tavplubix](https://github.com/tavplubix)). +* `MaterializedMySQL` (experimental feature): Fix misinterpretation of `DECIMAL` data from MySQL. [#31990](https://github.com/ClickHouse/ClickHouse/pull/31990) ([Håvard Kvålen](https://github.com/havardk)). +* `FileLog` (experimental feature) engine unnecessarily created a metadata directory when table creation failed. Fix [#31962](https://github.com/ClickHouse/ClickHouse/issues/31962). [#31967](https://github.com/ClickHouse/ClickHouse/pull/31967) ([flynn](https://github.com/ucasfl)). +* Some `GET_PART` entry might hang in replication queue if part is lost on all replicas and there are no other parts in the same partition. It's fixed in cases when partition key contains only columns of integer types or `Date[Time]`. Fixes [#31485](https://github.com/ClickHouse/ClickHouse/issues/31485). [#31887](https://github.com/ClickHouse/ClickHouse/pull/31887) ([tavplubix](https://github.com/tavplubix)). +* Fix functions `empty` and `notEmpty` with arguments of `UUID` type. Fixes [#31819](https://github.com/ClickHouse/ClickHouse/issues/31819). [#31883](https://github.com/ClickHouse/ClickHouse/pull/31883) ([Anton Popov](https://github.com/CurtizJ)). +* Change configuration path from `keeper_server.session_timeout_ms` to `keeper_server.coordination_settings.session_timeout_ms` when constructing a `KeeperTCPHandler`. Same with `operation_timeout`. [#31859](https://github.com/ClickHouse/ClickHouse/pull/31859) ([JackyWoo](https://github.com/JackyWoo)). +* Fix invalid cast of Nullable type when nullable primary key is used. (Nullable primary key is a discouraged feature - please do not use). This fixes [#31075](https://github.com/ClickHouse/ClickHouse/issues/31075). 
[#31823](https://github.com/ClickHouse/ClickHouse/pull/31823) ([Amos Bird](https://github.com/amosbird)). +* Fix crash in recursive UDF in SQL. Closes [#30856](https://github.com/ClickHouse/ClickHouse/issues/30856). [#31820](https://github.com/ClickHouse/ClickHouse/pull/31820) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crash when function `dictGet` with type is used for dictionary attribute when type is `Nullable`. Fixes [#30980](https://github.com/ClickHouse/ClickHouse/issues/30980). [#31800](https://github.com/ClickHouse/ClickHouse/pull/31800) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crash with empty result of ODBC query (with some ODBC drivers). Closes [#31465](https://github.com/ClickHouse/ClickHouse/issues/31465). [#31766](https://github.com/ClickHouse/ClickHouse/pull/31766) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix disabling query profiler (In case of `query_profiler_real_time_period_ns>0`/`query_profiler_cpu_time_period_ns>0` query profiler can stayed enabled even after query finished). [#31740](https://github.com/ClickHouse/ClickHouse/pull/31740) ([Azat Khuzhin](https://github.com/azat)). +* Fixed rare segfault on concurrent `ATTACH PARTITION` queries. [#31738](https://github.com/ClickHouse/ClickHouse/pull/31738) ([tavplubix](https://github.com/tavplubix)). +* Fix race in JSONEachRowWithProgress output format when data and lines with progress are mixed in output. [#31736](https://github.com/ClickHouse/ClickHouse/pull/31736) ([Kruglov Pavel](https://github.com/Avogar)). +* Fixed `there are no such cluster here` error on execution of `ON CLUSTER` query if specified cluster name is name of `Replicated` database. [#31723](https://github.com/ClickHouse/ClickHouse/pull/31723) ([tavplubix](https://github.com/tavplubix)). +* Fix exception on some of the applications of `decrypt` function on Nullable columns. This closes [#31662](https://github.com/ClickHouse/ClickHouse/issues/31662). This closes [#31426](https://github.com/ClickHouse/ClickHouse/issues/31426). [#31707](https://github.com/ClickHouse/ClickHouse/pull/31707) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fixed function ngrams when string contains UTF-8 characters. [#31706](https://github.com/ClickHouse/ClickHouse/pull/31706) ([yandd](https://github.com/yandd)). +* Settings `input_format_allow_errors_num` and `input_format_allow_errors_ratio` did not work for parsing of domain types, such as `IPv4`, it's fixed. Fixes [#31686](https://github.com/ClickHouse/ClickHouse/issues/31686). [#31697](https://github.com/ClickHouse/ClickHouse/pull/31697) ([tavplubix](https://github.com/tavplubix)). +* Fixed null pointer exception in `MATERIALIZE COLUMN`. [#31679](https://github.com/ClickHouse/ClickHouse/pull/31679) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* `RENAME TABLE` query worked incorrectly on attempt to rename an DDL dictionary in `Ordinary` database, it's fixed. [#31638](https://github.com/ClickHouse/ClickHouse/pull/31638) ([tavplubix](https://github.com/tavplubix)). +* Implement `sparkbar` aggregate function as it was intended, see: [#26175](https://github.com/ClickHouse/ClickHouse/issues/26175)#issuecomment-960353867, [comment](https://github.com/ClickHouse/ClickHouse/issues/26175#issuecomment-961155065). [#31624](https://github.com/ClickHouse/ClickHouse/pull/31624) ([小路](https://github.com/nicelulu)). +* Fix invalid generated JSON when only column names contain invalid UTF-8 sequences. 
[#31534](https://github.com/ClickHouse/ClickHouse/pull/31534) ([Kevin Michel](https://github.com/kmichel-aiven)). +* Disable `partial_merge_join_left_table_buffer_bytes` before bug in this optimization is fixed. See [#31009](https://github.com/ClickHouse/ClickHouse/issues/31009)). Remove redundant option `partial_merge_join_optimizations`. [#31528](https://github.com/ClickHouse/ClickHouse/pull/31528) ([Vladimir C](https://github.com/vdimir)). +* Fix progress for short `INSERT SELECT` queries. [#31510](https://github.com/ClickHouse/ClickHouse/pull/31510) ([Azat Khuzhin](https://github.com/azat)). +* Fix wrong behavior with group by and positional arguments. Closes [#31280](https://github.com/ClickHouse/ClickHouse/issues/31280)#issuecomment-968696186. [#31420](https://github.com/ClickHouse/ClickHouse/pull/31420) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Resolve `nullptr` in STS credentials provider for S3. [#31409](https://github.com/ClickHouse/ClickHouse/pull/31409) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Remove `notLike` function from index analysis, because it was wrong. [#31169](https://github.com/ClickHouse/ClickHouse/pull/31169) ([sundyli](https://github.com/sundy-li)). +* Fix bug in Keeper which can lead to inability to start when some coordination logs was lost and we have more fresh snapshot than our latest log. [#31150](https://github.com/ClickHouse/ClickHouse/pull/31150) ([alesapin](https://github.com/alesapin)). +* Rewrite right distributed table in local join. solves [#25809](https://github.com/ClickHouse/ClickHouse/issues/25809). [#31105](https://github.com/ClickHouse/ClickHouse/pull/31105) ([abel-cheng](https://github.com/abel-cheng)). +* Fix `Merge` table with aliases and where (it did not work before at all). Closes [#28802](https://github.com/ClickHouse/ClickHouse/issues/28802). [#31044](https://github.com/ClickHouse/ClickHouse/pull/31044) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix JSON_VALUE/JSON_QUERY with quoted identifiers. This allows to have spaces in json path. Closes [#30971](https://github.com/ClickHouse/ClickHouse/issues/30971). [#31003](https://github.com/ClickHouse/ClickHouse/pull/31003) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Using `formatRow` function with not row-oriented formats led to segfault. Don't allow to use this function with such formats (because it doesn't make sense). [#31001](https://github.com/ClickHouse/ClickHouse/pull/31001) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix bug which broke select queries if they happened after dropping materialized view. Found in [#30691](https://github.com/ClickHouse/ClickHouse/issues/30691). [#30997](https://github.com/ClickHouse/ClickHouse/pull/30997) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Skip `max_partition_size_to_drop check` in case of ATTACH PARTITION ... FROM and MOVE PARTITION ... [#30995](https://github.com/ClickHouse/ClickHouse/pull/30995) ([Amr Alaa](https://github.com/amralaa-MSFT)). +* Fix some corner cases with `INTERSECT` and `EXCEPT` operators. Closes [#30803](https://github.com/ClickHouse/ClickHouse/issues/30803). [#30965](https://github.com/ClickHouse/ClickHouse/pull/30965) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Build/Testing/Packaging Improvement + +* Fix incorrect filtering result on non-x86 builds. This closes [#31417](https://github.com/ClickHouse/ClickHouse/issues/31417). This closes [#31524](https://github.com/ClickHouse/ClickHouse/issues/31524). 
[#31574](https://github.com/ClickHouse/ClickHouse/pull/31574) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Make ClickHouse build fully reproducible (byte identical on different machines). This closes [#22113](https://github.com/ClickHouse/ClickHouse/issues/22113). [#31899](https://github.com/ClickHouse/ClickHouse/pull/31899) ([alexey-milovidov](https://github.com/alexey-milovidov)). Remove filesystem path to the build directory from binaries to enable reproducible builds. This needed for [#22113](https://github.com/ClickHouse/ClickHouse/issues/22113). [#31838](https://github.com/ClickHouse/ClickHouse/pull/31838) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Use our own CMakeLists for `zlib-ng`, `cassandra`, `mariadb-connector-c` and `xz`, `re2`, `sentry`, `gsasl`, `arrow`, `protobuf`. This is needed for [#20151](https://github.com/ClickHouse/ClickHouse/issues/20151). Part of [#9226](https://github.com/ClickHouse/ClickHouse/issues/9226). A small step towards removal of annoying trash from the build system. [#30599](https://github.com/ClickHouse/ClickHouse/pull/30599) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Hermetic builds: use fixed version of libc and make sure that no source or binary files from the host OS are using during build. This closes [#27133](https://github.com/ClickHouse/ClickHouse/issues/27133). This closes [#21435](https://github.com/ClickHouse/ClickHouse/issues/21435). This closes [#30462](https://github.com/ClickHouse/ClickHouse/issues/30462). [#30011](https://github.com/ClickHouse/ClickHouse/pull/30011) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Adding function `getFuzzerData()` to easily fuzz particular functions. This closes [#23227](https://github.com/ClickHouse/ClickHouse/issues/23227). [#27526](https://github.com/ClickHouse/ClickHouse/pull/27526) ([Alexey Boykov](https://github.com/mathalex)). +* More correct setting up capabilities inside Docker. [#31802](https://github.com/ClickHouse/ClickHouse/pull/31802) ([Constantine Peresypkin](https://github.com/pkit)). +* Enable clang `-fstrict-vtable-pointers`, `-fwhole-program-vtables` compile options. [#20151](https://github.com/ClickHouse/ClickHouse/pull/20151) ([Maksim Kita](https://github.com/kitaisreal)). +* Avoid downloading toolchain tarballs for cross-compiling for FreeBSD. [#31672](https://github.com/ClickHouse/ClickHouse/pull/31672) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Initial support for risc-v. See development/build-cross-riscv for quirks and build command that was tested. [#31309](https://github.com/ClickHouse/ClickHouse/pull/31309) ([Vladimir Smirnov](https://github.com/Civil)). +* Support compile in arm machine with parameter "-DENABLE_TESTS=OFF". [#31007](https://github.com/ClickHouse/ClickHouse/pull/31007) ([zhanghuajie](https://github.com/zhanghuajieHIT)). + + ### ClickHouse release v21.11, 2021-11-09 #### Backward Incompatible Change diff --git a/CMakeLists.txt b/CMakeLists.txt index bc0f119e3f6..fdc9cfcd303 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -424,6 +424,11 @@ if (OS_LINUX AND NOT SANITIZE) set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-undefined") endif () +# Increase stack size on Musl. We need big stack for our recursive-descend parser. 
+if (USE_MUSL) + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,stack-size=2097152") +endif () + include(cmake/dbms_glob_sources.cmake) if (OS_LINUX OR OS_ANDROID) @@ -447,10 +452,15 @@ if (MAKE_STATIC_LIBRARIES) # It's disabled for ARM because otherwise ClickHouse cannot run on Android. set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-pie") set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -fno-pie") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-no-pie") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -no-pie -Wl,-no-pie") endif () else () set (CMAKE_POSITION_INDEPENDENT_CODE ON) + # This is required for clang on Arch linux, that uses PIE by default. + # See enable-SSP-and-PIE-by-default.patch [1]. + # + # [1]: https://github.com/archlinux/svntogit-packages/blob/6e681aa860e65ad46a1387081482eb875c2200f2/trunk/enable-SSP-and-PIE-by-default.patch + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -no-pie") endif () if (ENABLE_TESTS) @@ -508,6 +518,7 @@ include (cmake/find/hdfs3.cmake) # uses protobuf include (cmake/find/poco.cmake) include (cmake/find/curl.cmake) include (cmake/find/s3.cmake) +include (cmake/find/blob_storage.cmake) include (cmake/find/base64.cmake) include (cmake/find/parquet.cmake) include (cmake/find/simdjson.cmake) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8628d7a01fd..e5af9594811 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,13 @@ ClickHouse is an open project, and you can contribute to it in many ways. You can help with ideas, code, or documentation. We appreciate any efforts that help us to make the project better. -Thank you. +Thank you! + +## Legal Info + +When you open your first pull-request to ClickHouse repo, a bot will invite you to accept ClickHouse Individual CLA (Contributor License Agreement). It is a simple few click process. For subsequent pull-requests the bot will check if you have already signed it and won't bother you again. + +Optionally, to make contributions even more tight legally, your employer as a legal entity may want to sign a ClickHouse Corporate CLA with ClickHouse, Inc. If you're interested to do so, contact us at [legal@clickhouse.com](mailto:legal@clickhouse.com). 
## Technical Info diff --git a/PreLoad.cmake b/PreLoad.cmake index 9fba896d72e..46bf8efed31 100644 --- a/PreLoad.cmake +++ b/PreLoad.cmake @@ -27,8 +27,7 @@ execute_process(COMMAND uname -m OUTPUT_VARIABLE ARCH) if (OS MATCHES "Linux" AND NOT DEFINED CMAKE_TOOLCHAIN_FILE AND NOT DISABLE_HERMETIC_BUILD - AND ($ENV{CC} MATCHES ".*clang.*" OR CMAKE_C_COMPILER MATCHES ".*clang.*") - AND (USE_STATIC_LIBRARIES OR NOT DEFINED USE_STATIC_LIBRARIES)) + AND ($ENV{CC} MATCHES ".*clang.*" OR CMAKE_C_COMPILER MATCHES ".*clang.*")) if (ARCH MATCHES "amd64|x86_64") set (CMAKE_TOOLCHAIN_FILE "cmake/linux/toolchain-x86_64.cmake" CACHE INTERNAL "" FORCE) diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 452b483fb6a..be1a0fb2af1 100644 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -9,7 +9,3 @@ add_subdirectory (pcg-random) add_subdirectory (widechar_width) add_subdirectory (readpassphrase) add_subdirectory (bridge) - -if (USE_MYSQL) - add_subdirectory (mysqlxx) -endif () diff --git a/base/base/CMakeLists.txt b/base/base/CMakeLists.txt index 3b5d3377c28..800c0db508d 100644 --- a/base/base/CMakeLists.txt +++ b/base/base/CMakeLists.txt @@ -1,8 +1,6 @@ set (SRCS argsToConfig.cpp coverage.cpp - DateLUT.cpp - DateLUTImpl.cpp demangle.cpp getFQDNOrHostName.cpp getMemoryAmount.cpp @@ -18,14 +16,11 @@ set (SRCS sleep.cpp terminalColors.cpp errnoToString.cpp - getResource.cpp StringRef.cpp ) if (ENABLE_REPLXX) list (APPEND SRCS ReplxxLineReader.cpp) -elseif (ENABLE_READLINE) - list (APPEND SRCS ReadlineLineReader.cpp) endif () if (USE_DEBUG_HELPERS) @@ -52,28 +47,6 @@ if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) target_link_libraries(common PUBLIC -Wl,-U,_inside_main) endif() -# Allow explicit fallback to readline -if (NOT ENABLE_REPLXX AND ENABLE_READLINE) - message (STATUS "Attempt to fallback to readline explicitly") - set (READLINE_PATHS "/usr/local/opt/readline/lib") - # First try find custom lib for macos users (default lib without history support) - find_library (READLINE_LIB NAMES readline PATHS ${READLINE_PATHS} NO_DEFAULT_PATH) - if (NOT READLINE_LIB) - find_library (READLINE_LIB NAMES readline PATHS ${READLINE_PATHS}) - endif () - - set(READLINE_INCLUDE_PATHS "/usr/local/opt/readline/include") - find_path (READLINE_INCLUDE_DIR NAMES readline/readline.h PATHS ${READLINE_INCLUDE_PATHS} NO_DEFAULT_PATH) - if (NOT READLINE_INCLUDE_DIR) - find_path (READLINE_INCLUDE_DIR NAMES readline/readline.h PATHS ${READLINE_INCLUDE_PATHS}) - endif () - if (READLINE_INCLUDE_DIR AND READLINE_LIB) - target_link_libraries(common PUBLIC ${READLINE_LIB}) - target_compile_definitions(common PUBLIC USE_READLINE=1) - message (STATUS "Using readline: ${READLINE_INCLUDE_DIR} : ${READLINE_LIB}") - endif () -endif () - target_link_libraries (common PUBLIC ${CITYHASH_LIBRARIES} diff --git a/base/base/LineReader.cpp b/base/base/LineReader.cpp index 5beebb58b3b..9491f957762 100644 --- a/base/base/LineReader.cpp +++ b/base/base/LineReader.cpp @@ -10,16 +10,6 @@ #include -#ifdef OS_LINUX -/// We can detect if code is linked with one or another readline variants or open the library dynamically. 
-# include -extern "C" -{ - char * readline(const char *) __attribute__((__weak__)); - char * (*readline_ptr)(const char *) = readline; -} -#endif - #ifdef HAS_RESERVED_IDENTIFIER #pragma clang diagnostic ignored "-Wreserved-identifier" #endif @@ -152,33 +142,6 @@ LineReader::InputStatus LineReader::readOneLine(const String & prompt) { input.clear(); -#ifdef OS_LINUX - if (!readline_ptr) - { - for (const auto * name : {"libreadline.so", "libreadline.so.0", "libeditline.so", "libeditline.so.0"}) - { - void * dl_handle = dlopen(name, RTLD_LAZY); - if (dl_handle) - { - readline_ptr = reinterpret_cast(dlsym(dl_handle, "readline")); - if (readline_ptr) - { - break; - } - } - } - } - - /// Minimal support for readline - if (readline_ptr) - { - char * line_read = (*readline_ptr)(prompt.c_str()); - if (!line_read) - return ABORT; - input = line_read; - } - else -#endif { std::cout << prompt; std::getline(std::cin, input); diff --git a/base/base/ReadlineLineReader.cpp b/base/base/ReadlineLineReader.cpp deleted file mode 100644 index de444a0b1d9..00000000000 --- a/base/base/ReadlineLineReader.cpp +++ /dev/null @@ -1,187 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include - -namespace -{ - -/// Trim ending whitespace inplace -void trim(String & s) -{ - s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { return !std::isspace(ch); }).base(), s.end()); -} - -} - -static const LineReader::Suggest * suggest; - -/// Points to current word to suggest. -static LineReader::Suggest::Words::const_iterator pos; -/// Points after the last possible match. -static LineReader::Suggest::Words::const_iterator end; - -/// Set iterators to the matched range of words if any. -static void findRange(const char * prefix, size_t prefix_length) -{ - std::string prefix_str(prefix); - if (auto completions = suggest->getCompletions(prefix_str, prefix_length)) - std::tie(pos, end) = *completions; -} - -/// Iterates through matched range. -static char * nextMatch() -{ - if (pos >= end) - return nullptr; - - /// readline will free memory by itself. - char * word = strdup(pos->c_str()); - ++pos; - return word; -} - -static char * generate(const char * text, int state) -{ - if (!suggest->ready) - return nullptr; - if (state == 0) - findRange(text, strlen(text)); - - /// Do not append whitespace after word. For unknown reason, rl_completion_append_character = '\0' does not work. - rl_completion_suppress_append = 1; - - return nextMatch(); -}; - -ReadlineLineReader::ReadlineLineReader( - const Suggest & suggest_, const String & history_file_path_, bool multiline_, Patterns extenders_, Patterns delimiters_) - : LineReader(history_file_path_, multiline_, std::move(extenders_), std::move(delimiters_)) -{ - suggest = &suggest_; - - if (!history_file_path.empty()) - { - int res = read_history(history_file_path.c_str()); - if (res) - std::cerr << "Cannot read history from file " + history_file_path + ": "+ errnoToString(errno) << std::endl; - } - - /// Added '.' to the default list. Because it is used to separate database and table. - rl_basic_word_break_characters = word_break_characters; - - /// Not append whitespace after single suggestion. Because whitespace after function name is meaningless. - rl_completion_append_character = '\0'; - - rl_completion_entry_function = generate; - - /// Install Ctrl+C signal handler that will be used in interactive mode. 
- - if (rl_initialize()) - throw std::runtime_error("Cannot initialize readline"); - - auto clear_prompt_or_exit = [](int) - { - /// This is signal safe. - ssize_t res = write(STDOUT_FILENO, "\n", 1); - - /// Allow to quit client while query is in progress by pressing Ctrl+C twice. - /// (First press to Ctrl+C will try to cancel query by InterruptListener). - if (res == 1 && rl_line_buffer[0] && !RL_ISSTATE(RL_STATE_DONE)) - { - rl_replace_line("", 0); - if (rl_forced_update_display()) - _exit(0); - } - else - { - /// A little dirty, but we struggle to find better way to correctly - /// force readline to exit after returning from the signal handler. - _exit(0); - } - }; - - if (signal(SIGINT, clear_prompt_or_exit) == SIG_ERR) - throw std::runtime_error(std::string("Cannot set signal handler for readline: ") + errnoToString(errno)); - - rl_variable_bind("completion-ignore-case", "on"); - // TODO: it doesn't work - // history_write_timestamps = 1; -} - -ReadlineLineReader::~ReadlineLineReader() -{ -} - -LineReader::InputStatus ReadlineLineReader::readOneLine(const String & prompt) -{ - input.clear(); - - const char* cinput = readline(prompt.c_str()); - if (cinput == nullptr) - return (errno != EAGAIN) ? ABORT : RESET_LINE; - input = cinput; - - trim(input); - return INPUT_LINE; -} - -void ReadlineLineReader::addToHistory(const String & line) -{ - add_history(line.c_str()); - - // Flush changes to the disk - // NOTE readline builds a buffer of all the lines to write, and write them in one syscall. - // Thus there is no need to lock the history file here. - write_history(history_file_path.c_str()); -} - -#if RL_VERSION_MAJOR >= 7 - -#define BRACK_PASTE_PREF "\033[200~" -#define BRACK_PASTE_SUFF "\033[201~" - -#define BRACK_PASTE_LAST '~' -#define BRACK_PASTE_SLEN 6 - -/// This handler bypasses some unused macro/event checkings and remove trailing newlines before insertion. -static int clickhouse_rl_bracketed_paste_begin(int /* count */, int /* key */) -{ - std::string buf; - buf.reserve(128); - - RL_SETSTATE(RL_STATE_MOREINPUT); - SCOPE_EXIT(RL_UNSETSTATE(RL_STATE_MOREINPUT)); - int c; - while ((c = rl_read_key()) >= 0) - { - if (c == '\r') - c = '\n'; - buf.push_back(c); - if (buf.size() >= BRACK_PASTE_SLEN && c == BRACK_PASTE_LAST && buf.substr(buf.size() - BRACK_PASTE_SLEN) == BRACK_PASTE_SUFF) - { - buf.resize(buf.size() - BRACK_PASTE_SLEN); - break; - } - } - trim(buf); - return static_cast(rl_insert_text(buf.c_str())) == buf.size() ? 0 : 1; -} - -#endif - -void ReadlineLineReader::enableBracketedPaste() -{ -#if RL_VERSION_MAJOR >= 7 - rl_variable_bind("enable-bracketed-paste", "on"); - - /// Use our bracketed paste handler to get better user experience. See comments above. 
- rl_bind_keyseq(BRACK_PASTE_PREF, clickhouse_rl_bracketed_paste_begin); -#endif -}; diff --git a/base/base/ReadlineLineReader.h b/base/base/ReadlineLineReader.h deleted file mode 100644 index 95bd23b4634..00000000000 --- a/base/base/ReadlineLineReader.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "LineReader.h" - -#include -#include - -class ReadlineLineReader : public LineReader -{ -public: - ReadlineLineReader(const Suggest & suggest, const String & history_file_path, bool multiline, Patterns extenders_, Patterns delimiters_); - ~ReadlineLineReader() override; - - void enableBracketedPaste() override; - -private: - InputStatus readOneLine(const String & prompt) override; - void addToHistory(const String & line) override; -}; diff --git a/base/base/getPageSize.cpp b/base/base/getPageSize.cpp index 6f7e0c6e259..948fc4a7700 100644 --- a/base/base/getPageSize.cpp +++ b/base/base/getPageSize.cpp @@ -1,8 +1,11 @@ #include #include - +#include Int64 getPageSize() { - return sysconf(_SC_PAGESIZE); + Int64 page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) + abort(); + return page_size; } diff --git a/base/base/phdr_cache.cpp b/base/base/phdr_cache.cpp index 8ae10f6bf83..20a755ed7a4 100644 --- a/base/base/phdr_cache.cpp +++ b/base/base/phdr_cache.cpp @@ -123,6 +123,12 @@ bool hasPHDRCache() #else void updatePHDRCache() {} -bool hasPHDRCache() { return false; } + +#if defined(USE_MUSL) + /// With statically linked with musl, dl_iterate_phdr is immutable. + bool hasPHDRCache() { return true; } +#else + bool hasPHDRCache() { return false; } +#endif #endif diff --git a/base/harmful/harmful.c b/base/harmful/harmful.c index bfb68abbcfb..5a27cae0383 100644 --- a/base/harmful/harmful.c +++ b/base/harmful/harmful.c @@ -182,7 +182,6 @@ TRAP(vlimit) TRAP(wcsnrtombs) TRAP(wcsrtombs) TRAP(wctomb) -TRAP(wordexp) TRAP(basename) TRAP(catgets) TRAP(dbm_clearerr) @@ -195,9 +194,8 @@ TRAP(dbm_nextkey) TRAP(dbm_open) TRAP(dbm_store) TRAP(dirname) -#if !defined(SANITIZER) -TRAP(dlerror) // Used by tsan -#endif +// TRAP(dlerror) // It is not thread-safe. But it is used by dynamic linker to load some name resolution plugins. Also used by TSan. +/// Note: we should better get rid of glibc, dynamic linking and all that sort of annoying garbage altogether. TRAP(ftw) TRAP(getc_unlocked) //TRAP(getenv) // Ok at program startup @@ -245,4 +243,21 @@ TRAP(lgammaf32x) TRAP(lgammaf64) TRAP(lgammaf64x) +/// These functions are unused by ClickHouse and we should be aware if they are accidentally get used. +/// Sometimes people report that these function contain vulnerabilities (these reports are bogus for ClickHouse). +TRAP(mq_close) +TRAP(mq_getattr) +TRAP(mq_setattr) +TRAP(mq_notify) +TRAP(mq_open) +TRAP(mq_receive) +TRAP(mq_send) +TRAP(mq_unlink) +TRAP(mq_timedsend) +TRAP(mq_timedreceive) + +/// These functions are also unused by ClickHouse. +TRAP(wordexp) +TRAP(wordfree) + #endif diff --git a/base/mysqlxx/CMakeLists.txt b/base/mysqlxx/CMakeLists.txt deleted file mode 100644 index 80db50c2593..00000000000 --- a/base/mysqlxx/CMakeLists.txt +++ /dev/null @@ -1,61 +0,0 @@ -add_library (mysqlxx - Connection.cpp - Exception.cpp - Query.cpp - ResultBase.cpp - UseQueryResult.cpp - Row.cpp - Value.cpp - Pool.cpp - PoolFactory.cpp - PoolWithFailover.cpp -) - -target_include_directories (mysqlxx PUBLIC ..) 
- -if (NOT USE_INTERNAL_MYSQL_LIBRARY) - set(PLATFORM_LIBRARIES ${CMAKE_DL_LIBS}) - - if (USE_MYSQL) - target_include_directories (mysqlxx SYSTEM PRIVATE ${MYSQL_INCLUDE_DIR}) - endif () - - if (APPLE) - find_library (ICONV_LIBRARY iconv) - set (MYSQLCLIENT_LIBRARIES ${MYSQLCLIENT_LIBRARIES} ${STATIC_MYSQLCLIENT_LIB} ${ICONV_LIBRARY}) - elseif (USE_STATIC_LIBRARIES AND STATIC_MYSQLCLIENT_LIB) - set (MYSQLCLIENT_LIBRARIES ${STATIC_MYSQLCLIENT_LIB}) - endif () -endif () - -target_link_libraries (mysqlxx - PUBLIC - common - PRIVATE - ${MYSQLCLIENT_LIBRARIES} - ${ZLIB_LIBRARIES} -) - -if(OPENSSL_LIBRARIES) - target_link_libraries(mysqlxx PRIVATE ${OPENSSL_LIBRARIES}) -endif() - -target_link_libraries(mysqlxx PRIVATE ${PLATFORM_LIBRARIES}) - -if (NOT USE_INTERNAL_MYSQL_LIBRARY AND OPENSSL_INCLUDE_DIR) - target_include_directories (mysqlxx SYSTEM PRIVATE ${OPENSSL_INCLUDE_DIR}) -endif () - -target_no_warning(mysqlxx reserved-macro-identifier) - -if (NOT USE_INTERNAL_MYSQL_LIBRARY AND USE_STATIC_LIBRARIES) - message(WARNING "Statically linking with system mysql/mariadb only works " - "if mysql client libraries are built with same openssl version as " - "we are going to use now. It wouldn't work if GnuTLS is used. " - "Try -D\"USE_INTERNAL_MYSQL_LIBRARY\"=ON or -D\"ENABLE_MYSQL\"=OFF or " - "-D\"USE_STATIC_LIBRARIES\"=OFF") -endif () - -if (ENABLE_TESTS) - add_subdirectory (tests) -endif () diff --git a/cmake/find/blob_storage.cmake b/cmake/find/blob_storage.cmake new file mode 100644 index 00000000000..74a907da7db --- /dev/null +++ b/cmake/find/blob_storage.cmake @@ -0,0 +1,30 @@ +option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES}) + +option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY + "Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)" + ON) + +if (ENABLE_AZURE_BLOB_STORAGE) + set(USE_AZURE_BLOB_STORAGE 1) + set(AZURE_BLOB_STORAGE_LIBRARY azure_sdk) +endif() + +if ((NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/sdk" + OR NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/cmake-modules") + AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (WARNING "submodule contrib/azure is missing. 
to fix try run: \n git submodule update --init") + set(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY OFF) + set(USE_AZURE_BLOB_STORAGE 0) +endif () + +if (NOT USE_INTERNAL_SSL_LIBRARY AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (FATAL_ERROR "Currently Blob Storage support can be built only with internal SSL library") +endif() + +if (NOT USE_INTERNAL_CURL AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (FATAL_ERROR "Currently Blob Storage support can be built only with internal curl library") +endif() + +if (USE_AZURE_BLOB_STORAGE) + message (STATUS "Using Azure Blob Storage - ${USE_AZURE_BLOB_STORAGE}") +endif() diff --git a/cmake/find/ccache.cmake b/cmake/find/ccache.cmake index 43c2de0c921..95ec3d8a034 100644 --- a/cmake/find/ccache.cmake +++ b/cmake/find/ccache.cmake @@ -32,11 +32,6 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) if (CCACHE_VERSION VERSION_GREATER "3.2.0" OR NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") message(STATUS "Using ${CCACHE_FOUND} ${CCACHE_VERSION}") - set (CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_FOUND} ${CMAKE_CXX_COMPILER_LAUNCHER}) - set (CMAKE_C_COMPILER_LAUNCHER ${CCACHE_FOUND} ${CMAKE_C_COMPILER_LAUNCHER}) - - set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) - # debian (debhelpers) set SOURCE_DATE_EPOCH environment variable, that is # filled from the debian/changelog or current time. # @@ -49,11 +44,14 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) # - 4.0+ will ignore SOURCE_DATE_EPOCH environment variable. if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.2") message(STATUS "ccache is 4.2+ no quirks for SOURCE_DATE_EPOCH required") + set(LAUNCHER ${CCACHE_FOUND}) elseif (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0") message(STATUS "Ignore SOURCE_DATE_EPOCH for ccache") - set_property (GLOBAL PROPERTY RULE_LAUNCH_COMPILE "env -u SOURCE_DATE_EPOCH") - set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK "env -u SOURCE_DATE_EPOCH") + set(LAUNCHER env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}) endif() + + set (CMAKE_CXX_COMPILER_LAUNCHER ${LAUNCHER} ${CMAKE_CXX_COMPILER_LAUNCHER}) + set (CMAKE_C_COMPILER_LAUNCHER ${LAUNCHER} ${CMAKE_C_COMPILER_LAUNCHER}) else () message(${RECONFIGURE_MESSAGE_LEVEL} "Not using ${CCACHE_FOUND} ${CCACHE_VERSION} bug: https://bugzilla.samba.org/show_bug.cgi?id=8118") endif () diff --git a/cmake/linux/toolchain-x86_64.cmake b/cmake/linux/toolchain-x86_64.cmake index 879f35feb83..965ea024ab7 100644 --- a/cmake/linux/toolchain-x86_64.cmake +++ b/cmake/linux/toolchain-x86_64.cmake @@ -14,9 +14,12 @@ set (TOOLCHAIN_PATH "${CMAKE_CURRENT_LIST_DIR}/../../contrib/sysroot/linux-x86_6 set (CMAKE_SYSROOT "${TOOLCHAIN_PATH}/x86_64-linux-gnu/libc") -set (CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") -set (CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") -set (CMAKE_ASM_FLAGS_INIT "${CMAKE_ASM_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") +set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") +set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") +set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") +set (CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") set (HAS_PRE_1970_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) set (HAS_PRE_1970_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING 
"Output from TRY_RUN" FORCE) diff --git a/cmake/target.cmake b/cmake/target.cmake index 3c02c4313f1..4b109d165e7 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -42,6 +42,14 @@ if (CMAKE_CROSSCOMPILING) message (FATAL_ERROR "Trying to cross-compile to unsupported system: ${CMAKE_SYSTEM_NAME}!") endif () + if (USE_MUSL) + set (USE_SENTRY OFF CACHE INTERNAL "") + set (ENABLE_ODBC OFF CACHE INTERNAL "") + set (ENABLE_GRPC OFF CACHE INTERNAL "") + set (ENABLE_HDFS OFF CACHE INTERNAL "") + set (ENABLE_EMBEDDED_COMPILER OFF CACHE INTERNAL "") + endif () + # Don't know why but CXX_STANDARD doesn't work for cross-compilation set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++20") diff --git a/cmake/tools.cmake b/cmake/tools.cmake index eb3624f3b3b..69a37304f58 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -91,6 +91,9 @@ endif () if (LINKER_NAME) if (COMPILER_CLANG AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 12.0.0 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 12.0.0)) find_program (LLD_PATH NAMES ${LINKER_NAME}) + if (NOT LLD_PATH) + message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.") + endif () set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}") set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}") else () diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 173fed6ef64..b7158b22744 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -248,6 +248,10 @@ endif() # - sentry-native add_subdirectory (curl-cmake) +if (USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + add_subdirectory(azure-cmake) +endif() + if (USE_SENTRY) add_subdirectory (sentry-native-cmake) endif() diff --git a/contrib/NuRaft b/contrib/NuRaft index d10351f312c..ff100a87131 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit d10351f312c1ae1ca3fdda433693dfbef3acfece +Subproject commit ff100a8713146e1ca4b4158dd6cc4eef9af47fc3 diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index b004a8a0241..215105818df 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit b004a8a02418b83de8b686caa0b0f6e39ac2191f +Subproject commit 215105818dfde3174fe799600bb0f3cae233d0bf diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index c8cb512066a..65e4c24ff5a 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -2,6 +2,8 @@ set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp") if(NOT EXISTS "${ABSL_ROOT_DIR}/CMakeLists.txt") message(FATAL_ERROR " submodule third_party/abseil-cpp is missing. 
To fix try run: \n git submodule update --init --recursive") endif() +set(BUILD_TESTING OFF) +set(ABSL_PROPAGATE_CXX_STD ON) add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp") add_library(abseil_swiss_tables INTERFACE) diff --git a/contrib/arrow b/contrib/arrow index 078e21bad34..aa9a7a698e3 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 078e21bad344747b7656ef2d7a4f7410a0a303eb +Subproject commit aa9a7a698e33e278abe053f4634170b3b026e48e diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 231185462dc..e01b546310f 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -1,5 +1,22 @@ set (CMAKE_CXX_STANDARD 17) +set(ARROW_VERSION "6.0.1") +string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") + +set(ARROW_VERSION_MAJOR "6") +set(ARROW_VERSION_MINOR "0") +set(ARROW_VERSION_PATCH "1") + +if(ARROW_VERSION_MAJOR STREQUAL "0") + # Arrow 0.x.y => SO version is "x", full SO version is "x.y.0" + set(ARROW_SO_VERSION "${ARROW_VERSION_MINOR}") + set(ARROW_FULL_SO_VERSION "${ARROW_SO_VERSION}.${ARROW_VERSION_PATCH}.0") +else() + # Arrow 1.x.y => SO version is "10x", full SO version is "10x.y.0" + math(EXPR ARROW_SO_VERSION "${ARROW_VERSION_MAJOR} * 100 + ${ARROW_VERSION_MINOR}") + set(ARROW_FULL_SO_VERSION "${ARROW_SO_VERSION}.${ARROW_VERSION_PATCH}.0") +endif() + # === thrift set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/thrift/lib/cpp") @@ -93,6 +110,9 @@ add_subdirectory(${FLATBUFFERS_SRC_DIR} "${FLATBUFFERS_BINARY_DIR}") message(STATUS "FLATBUFFERS_LIBRARY: ${FLATBUFFERS_LIBRARY}") +# === hdfs +set(HDFS_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libhdfs3/include/hdfs/") + # arrow-cmake cmake file calling orc cmake subroutine which detects certain compiler features. # Apple Clang compiler failed to compile this code without specifying c++11 standard. # As result these compiler features detected as absent. In result it failed to compile orc itself. 
@@ -114,6 +134,7 @@ configure_file("${ORC_INCLUDE_DIR}/orc/orc-config.hh.in" "${ORC_BUILD_INCLUDE_DI configure_file("${ORC_SOURCE_SRC_DIR}/Adaptor.hh.in" "${ORC_BUILD_INCLUDE_DIR}/Adaptor.hh") +# ARROW_ORC + adapters/orc/CMakefiles set(ORC_SRCS "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc" "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter_util.cc" @@ -150,28 +171,8 @@ set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow") configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/cpp/src/arrow/util/config.h") -# arrow/cpp/src/arrow/CMakeLists.txt +# arrow/cpp/src/arrow/CMakeLists.txt (ARROW_SRCS + ARROW_COMPUTE + ARROW_IPC) set(ARROW_SRCS - "${LIBRARY_DIR}/buffer.cc" - "${LIBRARY_DIR}/builder.cc" - "${LIBRARY_DIR}/chunked_array.cc" - "${LIBRARY_DIR}/compare.cc" - "${LIBRARY_DIR}/datum.cc" - "${LIBRARY_DIR}/device.cc" - "${LIBRARY_DIR}/extension_type.cc" - "${LIBRARY_DIR}/memory_pool.cc" - "${LIBRARY_DIR}/pretty_print.cc" - "${LIBRARY_DIR}/record_batch.cc" - "${LIBRARY_DIR}/result.cc" - "${LIBRARY_DIR}/scalar.cc" - "${LIBRARY_DIR}/sparse_tensor.cc" - "${LIBRARY_DIR}/status.cc" - "${LIBRARY_DIR}/table_builder.cc" - "${LIBRARY_DIR}/table.cc" - "${LIBRARY_DIR}/tensor.cc" - "${LIBRARY_DIR}/type.cc" - "${LIBRARY_DIR}/visitor.cc" - "${LIBRARY_DIR}/array/array_base.cc" "${LIBRARY_DIR}/array/array_binary.cc" "${LIBRARY_DIR}/array/array_decimal.cc" @@ -191,25 +192,112 @@ set(ARROW_SRCS "${LIBRARY_DIR}/array/diff.cc" "${LIBRARY_DIR}/array/util.cc" "${LIBRARY_DIR}/array/validate.cc" + "${LIBRARY_DIR}/builder.cc" + "${LIBRARY_DIR}/buffer.cc" + "${LIBRARY_DIR}/chunked_array.cc" + "${LIBRARY_DIR}/compare.cc" + "${LIBRARY_DIR}/config.cc" + "${LIBRARY_DIR}/datum.cc" + "${LIBRARY_DIR}/device.cc" + "${LIBRARY_DIR}/extension_type.cc" + "${LIBRARY_DIR}/memory_pool.cc" + "${LIBRARY_DIR}/pretty_print.cc" + "${LIBRARY_DIR}/record_batch.cc" + "${LIBRARY_DIR}/result.cc" + "${LIBRARY_DIR}/scalar.cc" + "${LIBRARY_DIR}/sparse_tensor.cc" + "${LIBRARY_DIR}/status.cc" + "${LIBRARY_DIR}/table.cc" + "${LIBRARY_DIR}/table_builder.cc" + "${LIBRARY_DIR}/tensor.cc" + "${LIBRARY_DIR}/tensor/coo_converter.cc" + "${LIBRARY_DIR}/tensor/csf_converter.cc" + "${LIBRARY_DIR}/tensor/csx_converter.cc" + "${LIBRARY_DIR}/type.cc" + "${LIBRARY_DIR}/visitor.cc" + "${LIBRARY_DIR}/c/bridge.cc" + "${LIBRARY_DIR}/io/buffered.cc" + "${LIBRARY_DIR}/io/caching.cc" + "${LIBRARY_DIR}/io/compressed.cc" + "${LIBRARY_DIR}/io/file.cc" + "${LIBRARY_DIR}/io/hdfs.cc" + "${LIBRARY_DIR}/io/hdfs_internal.cc" + "${LIBRARY_DIR}/io/interfaces.cc" + "${LIBRARY_DIR}/io/memory.cc" + "${LIBRARY_DIR}/io/slow.cc" + "${LIBRARY_DIR}/io/stdio.cc" + "${LIBRARY_DIR}/io/transform.cc" + "${LIBRARY_DIR}/util/async_util.cc" + "${LIBRARY_DIR}/util/basic_decimal.cc" + "${LIBRARY_DIR}/util/bit_block_counter.cc" + "${LIBRARY_DIR}/util/bit_run_reader.cc" + "${LIBRARY_DIR}/util/bit_util.cc" + "${LIBRARY_DIR}/util/bitmap.cc" + "${LIBRARY_DIR}/util/bitmap_builders.cc" + "${LIBRARY_DIR}/util/bitmap_ops.cc" + "${LIBRARY_DIR}/util/bpacking.cc" + "${LIBRARY_DIR}/util/cancel.cc" + "${LIBRARY_DIR}/util/compression.cc" + "${LIBRARY_DIR}/util/counting_semaphore.cc" + "${LIBRARY_DIR}/util/cpu_info.cc" + "${LIBRARY_DIR}/util/decimal.cc" + "${LIBRARY_DIR}/util/delimiting.cc" + "${LIBRARY_DIR}/util/formatting.cc" + "${LIBRARY_DIR}/util/future.cc" + "${LIBRARY_DIR}/util/int_util.cc" + "${LIBRARY_DIR}/util/io_util.cc" + "${LIBRARY_DIR}/util/logging.cc" + "${LIBRARY_DIR}/util/key_value_metadata.cc" + "${LIBRARY_DIR}/util/memory.cc" + 
"${LIBRARY_DIR}/util/mutex.cc" + "${LIBRARY_DIR}/util/string.cc" + "${LIBRARY_DIR}/util/string_builder.cc" + "${LIBRARY_DIR}/util/task_group.cc" + "${LIBRARY_DIR}/util/tdigest.cc" + "${LIBRARY_DIR}/util/thread_pool.cc" + "${LIBRARY_DIR}/util/time.cc" + "${LIBRARY_DIR}/util/trie.cc" + "${LIBRARY_DIR}/util/unreachable.cc" + "${LIBRARY_DIR}/util/uri.cc" + "${LIBRARY_DIR}/util/utf8.cc" + "${LIBRARY_DIR}/util/value_parsing.cc" + "${LIBRARY_DIR}/vendored/base64.cpp" + "${LIBRARY_DIR}/vendored/datetime/tz.cpp" + + "${LIBRARY_DIR}/vendored/musl/strptime.c" + "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c" + "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c" + "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c" + "${LIBRARY_DIR}/vendored/uriparser/UriFile.c" + "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c" + "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c" + "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c" + "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c" + "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c" + "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c" + "${LIBRARY_DIR}/vendored/uriparser/UriParse.c" + "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c" + "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c" + "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c" + "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c" "${LIBRARY_DIR}/compute/api_aggregate.cc" "${LIBRARY_DIR}/compute/api_scalar.cc" "${LIBRARY_DIR}/compute/api_vector.cc" "${LIBRARY_DIR}/compute/cast.cc" "${LIBRARY_DIR}/compute/exec.cc" + "${LIBRARY_DIR}/compute/exec/aggregate_node.cc" + "${LIBRARY_DIR}/compute/exec/exec_plan.cc" + "${LIBRARY_DIR}/compute/exec/expression.cc" + "${LIBRARY_DIR}/compute/exec/filter_node.cc" + "${LIBRARY_DIR}/compute/exec/project_node.cc" + "${LIBRARY_DIR}/compute/exec/source_node.cc" + "${LIBRARY_DIR}/compute/exec/sink_node.cc" + "${LIBRARY_DIR}/compute/exec/order_by_impl.cc" "${LIBRARY_DIR}/compute/function.cc" "${LIBRARY_DIR}/compute/function_internal.cc" "${LIBRARY_DIR}/compute/kernel.cc" "${LIBRARY_DIR}/compute/registry.cc" - - "${LIBRARY_DIR}/compute/exec/exec_plan.cc" - "${LIBRARY_DIR}/compute/exec/expression.cc" - "${LIBRARY_DIR}/compute/exec/key_compare.cc" - "${LIBRARY_DIR}/compute/exec/key_encode.cc" - "${LIBRARY_DIR}/compute/exec/key_hash.cc" - "${LIBRARY_DIR}/compute/exec/key_map.cc" - "${LIBRARY_DIR}/compute/exec/util.cc" - "${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc" "${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc" "${LIBRARY_DIR}/compute/kernels/aggregate_quantile.cc" @@ -227,28 +315,31 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc" "${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc" "${LIBRARY_DIR}/compute/kernels/scalar_compare.cc" - "${LIBRARY_DIR}/compute/kernels/scalar_fill_null.cc" - "${LIBRARY_DIR}/compute/kernels/scalar_if_else.cc" "${LIBRARY_DIR}/compute/kernels/scalar_nested.cc" "${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc" "${LIBRARY_DIR}/compute/kernels/scalar_string.cc" - "${LIBRARY_DIR}/compute/kernels/scalar_temporal.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_temporal_binary.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_temporal_unary.cc" "${LIBRARY_DIR}/compute/kernels/scalar_validity.cc" + "${LIBRARY_DIR}/compute/kernels/scalar_if_else.cc" "${LIBRARY_DIR}/compute/kernels/util_internal.cc" + "${LIBRARY_DIR}/compute/kernels/vector_array_sort.cc" "${LIBRARY_DIR}/compute/kernels/vector_hash.cc" "${LIBRARY_DIR}/compute/kernels/vector_nested.cc" "${LIBRARY_DIR}/compute/kernels/vector_replace.cc" 
"${LIBRARY_DIR}/compute/kernels/vector_selection.cc" "${LIBRARY_DIR}/compute/kernels/vector_sort.cc" - - "${LIBRARY_DIR}/csv/chunker.cc" - "${LIBRARY_DIR}/csv/column_builder.cc" - "${LIBRARY_DIR}/csv/column_decoder.cc" - "${LIBRARY_DIR}/csv/converter.cc" - "${LIBRARY_DIR}/csv/options.cc" - "${LIBRARY_DIR}/csv/parser.cc" - "${LIBRARY_DIR}/csv/reader.cc" - "${LIBRARY_DIR}/csv/writer.cc" + "${LIBRARY_DIR}/compute/kernels/row_encoder.cc" + "${LIBRARY_DIR}/compute/exec/union_node.cc" + "${LIBRARY_DIR}/compute/exec/key_hash.cc" + "${LIBRARY_DIR}/compute/exec/key_map.cc" + "${LIBRARY_DIR}/compute/exec/key_compare.cc" + "${LIBRARY_DIR}/compute/exec/key_encode.cc" + "${LIBRARY_DIR}/compute/exec/util.cc" + "${LIBRARY_DIR}/compute/exec/hash_join_dict.cc" + "${LIBRARY_DIR}/compute/exec/hash_join.cc" + "${LIBRARY_DIR}/compute/exec/hash_join_node.cc" + "${LIBRARY_DIR}/compute/exec/task_util.cc" "${LIBRARY_DIR}/ipc/dictionary.cc" "${LIBRARY_DIR}/ipc/feather.cc" @@ -258,52 +349,6 @@ set(ARROW_SRCS "${LIBRARY_DIR}/ipc/reader.cc" "${LIBRARY_DIR}/ipc/writer.cc" - "${LIBRARY_DIR}/io/buffered.cc" - "${LIBRARY_DIR}/io/caching.cc" - "${LIBRARY_DIR}/io/compressed.cc" - "${LIBRARY_DIR}/io/file.cc" - "${LIBRARY_DIR}/io/interfaces.cc" - "${LIBRARY_DIR}/io/memory.cc" - "${LIBRARY_DIR}/io/slow.cc" - "${LIBRARY_DIR}/io/stdio.cc" - "${LIBRARY_DIR}/io/transform.cc" - - "${LIBRARY_DIR}/tensor/coo_converter.cc" - "${LIBRARY_DIR}/tensor/csf_converter.cc" - "${LIBRARY_DIR}/tensor/csx_converter.cc" - - "${LIBRARY_DIR}/util/basic_decimal.cc" - "${LIBRARY_DIR}/util/bit_block_counter.cc" - "${LIBRARY_DIR}/util/bit_run_reader.cc" - "${LIBRARY_DIR}/util/bit_util.cc" - "${LIBRARY_DIR}/util/bitmap_builders.cc" - "${LIBRARY_DIR}/util/bitmap_ops.cc" - "${LIBRARY_DIR}/util/bitmap.cc" - "${LIBRARY_DIR}/util/bpacking.cc" - "${LIBRARY_DIR}/util/cancel.cc" - "${LIBRARY_DIR}/util/compression.cc" - "${LIBRARY_DIR}/util/cpu_info.cc" - "${LIBRARY_DIR}/util/decimal.cc" - "${LIBRARY_DIR}/util/delimiting.cc" - "${LIBRARY_DIR}/util/formatting.cc" - "${LIBRARY_DIR}/util/future.cc" - "${LIBRARY_DIR}/util/int_util.cc" - "${LIBRARY_DIR}/util/io_util.cc" - "${LIBRARY_DIR}/util/key_value_metadata.cc" - "${LIBRARY_DIR}/util/logging.cc" - "${LIBRARY_DIR}/util/memory.cc" - "${LIBRARY_DIR}/util/mutex.cc" - "${LIBRARY_DIR}/util/string_builder.cc" - "${LIBRARY_DIR}/util/string.cc" - "${LIBRARY_DIR}/util/task_group.cc" - "${LIBRARY_DIR}/util/tdigest.cc" - "${LIBRARY_DIR}/util/thread_pool.cc" - "${LIBRARY_DIR}/util/time.cc" - "${LIBRARY_DIR}/util/trie.cc" - "${LIBRARY_DIR}/util/utf8.cc" - "${LIBRARY_DIR}/util/value_parsing.cc" - - "${LIBRARY_DIR}/vendored/base64.cpp" ${ORC_SRCS} ) @@ -373,6 +418,7 @@ target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_BUILD_INCLUDE_D target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ORC_ADDITION_SOURCE_DIR}) target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${ARROW_SRC_DIR}) target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) +target_include_directories(${ARROW_LIBRARY} PRIVATE SYSTEM ${HDFS_INCLUDE_DIR}) # === parquet @@ -446,7 +492,7 @@ set (HAVE_STRERROR_R 1) set (HAVE_SCHED_GET_PRIORITY_MAX 1) set (HAVE_SCHED_GET_PRIORITY_MIN 1) -if (OS_LINUX) +if (OS_LINUX AND NOT USE_MUSL) set (STRERROR_R_CHAR_P 1) endif () diff --git a/contrib/azure b/contrib/azure new file mode 160000 index 00000000000..ac4b763d4ca --- /dev/null +++ b/contrib/azure @@ -0,0 +1 @@ +Subproject commit ac4b763d4ca40122275f1497cbdc5451337461d9 diff --git a/contrib/azure-cmake/CMakeLists.txt 
b/contrib/azure-cmake/CMakeLists.txt new file mode 100644 index 00000000000..527503b85a2 --- /dev/null +++ b/contrib/azure-cmake/CMakeLists.txt @@ -0,0 +1,74 @@ +set(AZURE_DIR "${ClickHouse_SOURCE_DIR}/contrib/azure") +set(AZURE_SDK_LIBRARY_DIR "${AZURE_DIR}/sdk") + +file(GLOB AZURE_SDK_CORE_SRC + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/cryptography/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/http/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/http/curl/*.hpp" + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/http/curl/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/winhttp/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/io/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/private/*.hpp" +) + +file(GLOB AZURE_SDK_IDENTITY_SRC + "${AZURE_SDK_LIBRARY_DIR}/identity/azure-identity/src/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/identity/azure-identity/src/private/*.hpp" +) + +file(GLOB AZURE_SDK_STORAGE_COMMON_SRC + "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-common/src/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-common/src/private/*.cpp" +) + +file(GLOB AZURE_SDK_STORAGE_BLOBS_SRC + "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-blobs/src/*.cpp" + "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-blobs/src/private/*.hpp" +) + +file(GLOB AZURE_SDK_UNIFIED_SRC + ${AZURE_SDK_CORE_SRC} + ${AZURE_SDK_IDENTITY_SRC} + ${AZURE_SDK_STORAGE_COMMON_SRC} + ${AZURE_SDK_STORAGE_BLOBS_SRC} +) + +set(AZURE_SDK_INCLUDES + "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/inc/" + "${AZURE_SDK_LIBRARY_DIR}/identity/azure-identity/inc/" + "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-common/inc/" + "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-blobs/inc/" +) + +include("${AZURE_DIR}/cmake-modules/AzureTransportAdapters.cmake") + +add_library(azure_sdk ${AZURE_SDK_UNIFIED_SRC}) + +if (COMPILER_CLANG) + target_compile_options(azure_sdk PRIVATE + -Wno-deprecated-copy-dtor + -Wno-extra-semi + -Wno-suggest-destructor-override + -Wno-inconsistent-missing-destructor-override + -Wno-error=unknown-warning-option + ) + + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) + target_compile_options(azure_sdk PRIVATE -Wno-reserved-identifier) + endif() +endif() + +# Originally, on Windows azure-core is built with bcrypt and crypt32 by default +if (OPENSSL_FOUND) + target_link_libraries(azure_sdk PRIVATE ${OPENSSL_LIBRARIES}) +endif() + +# Originally, on Windows azure-core is built with winhttp by default +if (CURL_FOUND) + target_link_libraries(azure_sdk PRIVATE ${CURL_LIBRARY}) +endif() + +target_link_libraries(azure_sdk PRIVATE ${LIBXML2_LIBRARIES}) + +target_include_directories(azure_sdk SYSTEM PUBLIC ${AZURE_SDK_INCLUDES}) diff --git a/contrib/boost b/contrib/boost index fcb058e1459..c0807e83f28 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit fcb058e1459ac273ecfe7cdf72791cb1479115af +Subproject commit c0807e83f2824e8dd67a15b355496a9b784cdcd5 diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 057a893e926..4a21b8a0e2d 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -1,9 +1,7 @@ option (USE_INTERNAL_BOOST_LIBRARY "Use internal Boost library" ON) if (NOT USE_INTERNAL_BOOST_LIBRARY) - # 1.70 like in contrib/boost - # 1.71 on CI - set(BOOST_VERSION 1.71) + set(BOOST_VERSION 1.78) find_package(Boost ${BOOST_VERSION} COMPONENTS system @@ -66,9 +64,11 @@ if (NOT EXTERNAL_BOOST_FOUND) set (SRCS_FILESYSTEM 
"${LIBRARY_DIR}/libs/filesystem/src/codecvt_error_category.cpp" + "${LIBRARY_DIR}/libs/filesystem/src/directory.cpp" + "${LIBRARY_DIR}/libs/filesystem/src/exception.cpp" "${LIBRARY_DIR}/libs/filesystem/src/operations.cpp" - "${LIBRARY_DIR}/libs/filesystem/src/path_traits.cpp" "${LIBRARY_DIR}/libs/filesystem/src/path.cpp" + "${LIBRARY_DIR}/libs/filesystem/src/path_traits.cpp" "${LIBRARY_DIR}/libs/filesystem/src/portability.cpp" "${LIBRARY_DIR}/libs/filesystem/src/unique_path.cpp" "${LIBRARY_DIR}/libs/filesystem/src/utf8_codecvt_facet.cpp" @@ -126,24 +126,11 @@ if (NOT EXTERNAL_BOOST_FOUND) # regex set (SRCS_REGEX - "${LIBRARY_DIR}/libs/regex/src/c_regex_traits.cpp" - "${LIBRARY_DIR}/libs/regex/src/cpp_regex_traits.cpp" - "${LIBRARY_DIR}/libs/regex/src/cregex.cpp" - "${LIBRARY_DIR}/libs/regex/src/fileiter.cpp" - "${LIBRARY_DIR}/libs/regex/src/icu.cpp" - "${LIBRARY_DIR}/libs/regex/src/instances.cpp" - "${LIBRARY_DIR}/libs/regex/src/internals.hpp" "${LIBRARY_DIR}/libs/regex/src/posix_api.cpp" "${LIBRARY_DIR}/libs/regex/src/regex_debug.cpp" - "${LIBRARY_DIR}/libs/regex/src/regex_raw_buffer.cpp" - "${LIBRARY_DIR}/libs/regex/src/regex_traits_defaults.cpp" "${LIBRARY_DIR}/libs/regex/src/regex.cpp" "${LIBRARY_DIR}/libs/regex/src/static_mutex.cpp" - "${LIBRARY_DIR}/libs/regex/src/usinstances.cpp" - "${LIBRARY_DIR}/libs/regex/src/w32_regex_traits.cpp" - "${LIBRARY_DIR}/libs/regex/src/wc_regex_traits.cpp" "${LIBRARY_DIR}/libs/regex/src/wide_posix_api.cpp" - "${LIBRARY_DIR}/libs/regex/src/winstances.cpp" ) add_library (_boost_regex ${SRCS_REGEX}) @@ -166,7 +153,6 @@ if (NOT EXTERNAL_BOOST_FOUND) set (SRCS_CONTEXT "${LIBRARY_DIR}/libs/context/src/dummy.cpp" - "${LIBRARY_DIR}/libs/context/src/execution_context.cpp" "${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp" ) diff --git a/contrib/boringssl-cmake/CMakeLists.txt b/contrib/boringssl-cmake/CMakeLists.txt index 474e32f3b91..fb32df8cd79 100644 --- a/contrib/boringssl-cmake/CMakeLists.txt +++ b/contrib/boringssl-cmake/CMakeLists.txt @@ -639,6 +639,7 @@ add_library( "${BORINGSSL_SOURCE_DIR}/decrepit/ssl/ssl_decrepit.c" "${BORINGSSL_SOURCE_DIR}/decrepit/cfb/cfb.c" + "${BORINGSSL_SOURCE_DIR}/decrepit/bio/base64_bio.c" ) add_executable( diff --git a/contrib/cassandra b/contrib/cassandra index eb9b68dadbb..f4a31e92a25 160000 --- a/contrib/cassandra +++ b/contrib/cassandra @@ -1 +1 @@ -Subproject commit eb9b68dadbb4417a2c132ad4a1c2fa76e65e6fc1 +Subproject commit f4a31e92a25c34c02c7291ff97c7813bc83b0e09 diff --git a/contrib/jemalloc b/contrib/jemalloc index e6891d97461..a1404807211 160000 --- a/contrib/jemalloc +++ b/contrib/jemalloc @@ -1 +1 @@ -Subproject commit e6891d9746143bf2cf617493d880ba5a0b9a3efd +Subproject commit a1404807211b1612539f840b3dcb1bf38d1a269e diff --git a/contrib/libuv-cmake/CMakeLists.txt b/contrib/libuv-cmake/CMakeLists.txt index 4fbd0575b55..dc47b0bf496 100644 --- a/contrib/libuv-cmake/CMakeLists.txt +++ b/contrib/libuv-cmake/CMakeLists.txt @@ -1,17 +1,8 @@ # This file is a modified version of contrib/libuv/CMakeLists.txt -include(CMakeDependentOption) - set (SOURCE_DIR "${CMAKE_SOURCE_DIR}/contrib/libuv") set (BINARY_DIR "${CMAKE_BINARY_DIR}/contrib/libuv") - -if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") - list(APPEND uv_cflags -fvisibility=hidden --std=gnu89) - list(APPEND uv_cflags -Wall -Wextra -Wstrict-prototypes) - list(APPEND uv_cflags -Wno-unused-parameter) -endif() - set(uv_sources src/fs-poll.c src/idna.c @@ -76,7 +67,7 @@ endif() if(CMAKE_SYSTEM_NAME STREQUAL "Linux") list(APPEND uv_defines _GNU_SOURCE 
_POSIX_C_SOURCE=200112) - list(APPEND uv_libraries dl rt) + list(APPEND uv_libraries rt) list(APPEND uv_sources src/unix/linux-core.c src/unix/linux-inotify.c diff --git a/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h b/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h index 92d3414fdac..52f62214324 100644 --- a/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h +++ b/contrib/libxml2-cmake/linux_x86_64/include/libxml/xmlversion.h @@ -268,7 +268,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); * * Whether iconv support is available */ -#if 1 +#if 0 #define LIBXML_ICONV_ENABLED #endif diff --git a/contrib/mariadb-connector-c-cmake/CMakeLists.txt b/contrib/mariadb-connector-c-cmake/CMakeLists.txt index ea74e13b7f0..7c3f25cdf87 100644 --- a/contrib/mariadb-connector-c-cmake/CMakeLists.txt +++ b/contrib/mariadb-connector-c-cmake/CMakeLists.txt @@ -236,8 +236,7 @@ set(LIBMARIADB_SOURCES ${LIBMARIADB_SOURCES} ${CC_SOURCE_DIR}/libmariadb/mariadb add_library(mariadbclient STATIC ${LIBMARIADB_SOURCES}) target_link_libraries(mariadbclient ${SYSTEM_LIBS}) -target_include_directories(mariadbclient - PRIVATE ${CC_BINARY_DIR}/include-private - PUBLIC ${CC_BINARY_DIR}/include-public ${CC_SOURCE_DIR}/include ${CC_SOURCE_DIR}/libmariadb) +target_include_directories(mariadbclient PRIVATE ${CC_BINARY_DIR}/include-private) +target_include_directories(mariadbclient SYSTEM PUBLIC ${CC_BINARY_DIR}/include-public ${CC_SOURCE_DIR}/include ${CC_SOURCE_DIR}/libmariadb) set_target_properties(mariadbclient PROPERTIES IMPORTED_INTERFACE_LINK_LIBRARIES "${SYSTEM_LIBS}") diff --git a/contrib/poco b/contrib/poco index 258b9ba6cd2..520a90e02e3 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 258b9ba6cd245ff88e9346f75c43464c403f329d +Subproject commit 520a90e02e3e5cb90afeae1846d161dbc508a6f1 diff --git a/contrib/protobuf b/contrib/protobuf index c1c5d020260..6bb70196c53 160000 --- a/contrib/protobuf +++ b/contrib/protobuf @@ -1 +1 @@ -Subproject commit c1c5d02026059f4c3cb51aaa08e82288d3e08b89 +Subproject commit 6bb70196c5360268d9f021bb7936fb0b551724c2 diff --git a/contrib/replxx-cmake/CMakeLists.txt b/contrib/replxx-cmake/CMakeLists.txt index 07f24bae25d..222a38095cb 100644 --- a/contrib/replxx-cmake/CMakeLists.txt +++ b/contrib/replxx-cmake/CMakeLists.txt @@ -8,7 +8,7 @@ if (NOT ENABLE_REPLXX) add_library(replxx INTERFACE) target_compile_definitions(replxx INTERFACE USE_REPLXX=0) - message (STATUS "Not using replxx (Beware! Runtime fallback to readline is possible!)") + message (STATUS "Not using replxx") return() endif() diff --git a/contrib/s2geometry b/contrib/s2geometry index 38b7a290f92..471fe9dc931 160000 --- a/contrib/s2geometry +++ b/contrib/s2geometry @@ -1 +1 @@ -Subproject commit 38b7a290f927cc372218c2094602b83e35b18c05 +Subproject commit 471fe9dc931a4bb560333545186e9b5da168ac83 diff --git a/contrib/s2geometry-cmake/CMakeLists.txt b/contrib/s2geometry-cmake/CMakeLists.txt index 41d570c9afd..e2b0f20f408 100644 --- a/contrib/s2geometry-cmake/CMakeLists.txt +++ b/contrib/s2geometry-cmake/CMakeLists.txt @@ -1,8 +1,12 @@ set(S2_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/s2geometry/src") +set(ABSL_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp") +if(NOT EXISTS "${ABSL_SOURCE_DIR}/CMakeLists.txt") + message(FATAL_ERROR " submodule contrib/abseil-cpp is missing. 
To fix try run: \n git submodule update --init --recursive") +endif() + + set(S2_SRCS - "${S2_SOURCE_DIR}/s2/base/stringprintf.cc" - "${S2_SOURCE_DIR}/s2/base/strtoint.cc" "${S2_SOURCE_DIR}/s2/encoded_s2cell_id_vector.cc" "${S2_SOURCE_DIR}/s2/encoded_s2point_vector.cc" "${S2_SOURCE_DIR}/s2/encoded_s2shape_index.cc" @@ -14,11 +18,14 @@ set(S2_SRCS "${S2_SOURCE_DIR}/s2/s1chord_angle.cc" "${S2_SOURCE_DIR}/s2/s1interval.cc" "${S2_SOURCE_DIR}/s2/s2boolean_operation.cc" + "${S2_SOURCE_DIR}/s2/s2buffer_operation.cc" "${S2_SOURCE_DIR}/s2/s2builder.cc" "${S2_SOURCE_DIR}/s2/s2builder_graph.cc" "${S2_SOURCE_DIR}/s2/s2builderutil_closed_set_normalizer.cc" "${S2_SOURCE_DIR}/s2/s2builderutil_find_polygon_degeneracies.cc" + "${S2_SOURCE_DIR}/s2/s2builderutil_get_snapped_winding_delta.cc" "${S2_SOURCE_DIR}/s2/s2builderutil_lax_polygon_layer.cc" + "${S2_SOURCE_DIR}/s2/s2builderutil_lax_polyline_layer.cc" "${S2_SOURCE_DIR}/s2/s2builderutil_s2point_vector_layer.cc" "${S2_SOURCE_DIR}/s2/s2builderutil_s2polygon_layer.cc" "${S2_SOURCE_DIR}/s2/s2builderutil_s2polyline_layer.cc" @@ -44,7 +51,6 @@ set(S2_SRCS "${S2_SOURCE_DIR}/s2/s2edge_crossings.cc" "${S2_SOURCE_DIR}/s2/s2edge_distances.cc" "${S2_SOURCE_DIR}/s2/s2edge_tessellator.cc" - "${S2_SOURCE_DIR}/s2/s2error.cc" "${S2_SOURCE_DIR}/s2/s2furthest_edge_query.cc" "${S2_SOURCE_DIR}/s2/s2latlng.cc" "${S2_SOURCE_DIR}/s2/s2latlng_rect.cc" @@ -55,6 +61,7 @@ set(S2_SRCS "${S2_SOURCE_DIR}/s2/s2loop.cc" "${S2_SOURCE_DIR}/s2/s2loop_measures.cc" "${S2_SOURCE_DIR}/s2/s2measures.cc" + "${S2_SOURCE_DIR}/s2/s2memory_tracker.cc" "${S2_SOURCE_DIR}/s2/s2metrics.cc" "${S2_SOURCE_DIR}/s2/s2max_distance_targets.cc" "${S2_SOURCE_DIR}/s2/s2min_distance_targets.cc" @@ -82,28 +89,15 @@ set(S2_SRCS "${S2_SOURCE_DIR}/s2/s2shapeutil_build_polygon_boundaries.cc" "${S2_SOURCE_DIR}/s2/s2shapeutil_coding.cc" "${S2_SOURCE_DIR}/s2/s2shapeutil_contains_brute_force.cc" + "${S2_SOURCE_DIR}/s2/s2shapeutil_conversion.cc" "${S2_SOURCE_DIR}/s2/s2shapeutil_edge_iterator.cc" "${S2_SOURCE_DIR}/s2/s2shapeutil_get_reference_point.cc" "${S2_SOURCE_DIR}/s2/s2shapeutil_range_iterator.cc" "${S2_SOURCE_DIR}/s2/s2shapeutil_visit_crossing_edge_pairs.cc" "${S2_SOURCE_DIR}/s2/s2text_format.cc" "${S2_SOURCE_DIR}/s2/s2wedge_relations.cc" - "${S2_SOURCE_DIR}/s2/strings/ostringstream.cc" + "${S2_SOURCE_DIR}/s2/s2winding_operation.cc" "${S2_SOURCE_DIR}/s2/strings/serialize.cc" - # ClickHouse doesn't use strings from abseil. - # So, there is no duplicate symbols. 
- "${S2_SOURCE_DIR}/s2/third_party/absl/base/dynamic_annotations.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/base/internal/raw_logging.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/base/internal/throw_delegate.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/numeric/int128.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/ascii.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/match.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/numbers.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/str_cat.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/str_split.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/string_view.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/strip.cc" - "${S2_SOURCE_DIR}/s2/third_party/absl/strings/internal/memutil.cc" "${S2_SOURCE_DIR}/s2/util/bits/bit-interleave.cc" "${S2_SOURCE_DIR}/s2/util/bits/bits.cc" "${S2_SOURCE_DIR}/s2/util/coding/coder.cc" @@ -111,17 +105,41 @@ set(S2_SRCS "${S2_SOURCE_DIR}/s2/util/math/exactfloat/exactfloat.cc" "${S2_SOURCE_DIR}/s2/util/math/mathutil.cc" "${S2_SOURCE_DIR}/s2/util/units/length-units.cc" + ) add_library(s2 ${S2_SRCS}) - -set_property(TARGET s2 PROPERTY CXX_STANDARD 11) +set_property(TARGET s2 PROPERTY CXX_STANDARD 17) if (OPENSSL_FOUND) target_link_libraries(s2 PRIVATE ${OPENSSL_LIBRARIES}) endif() +# Copied from contrib/s2geometry/CMakeLists +target_link_libraries(s2 PRIVATE + absl::base + absl::btree + absl::config + absl::core_headers + absl::dynamic_annotations + absl::endian + absl::fixed_array + absl::flat_hash_map + absl::flat_hash_set + absl::hash + absl::inlined_vector + absl::int128 + absl::log_severity + absl::memory + absl::span + absl::str_format + absl::strings + absl::type_traits + absl::utility + ) + target_include_directories(s2 SYSTEM BEFORE PUBLIC "${S2_SOURCE_DIR}/") +target_include_directories(s2 SYSTEM PUBLIC "${ABSL_SOURCE_DIR}") if(M_LIBRARY) target_link_libraries(s2 PRIVATE ${M_LIBRARY}) diff --git a/contrib/sysroot b/contrib/sysroot index 410845187f5..bbcac834526 160000 --- a/contrib/sysroot +++ b/contrib/sysroot @@ -1 +1 @@ -Subproject commit 410845187f582c5e6692b53dddbe43efbb728734 +Subproject commit bbcac834526d90d1e764164b861be426891d1743 diff --git a/contrib/unixodbc-cmake/linux_x86_64/private/config.h b/contrib/unixodbc-cmake/linux_x86_64/private/config.h index d80a4da4665..59cee9e8565 100644 --- a/contrib/unixodbc-cmake/linux_x86_64/private/config.h +++ b/contrib/unixodbc-cmake/linux_x86_64/private/config.h @@ -202,10 +202,10 @@ #define HAVE_READDIR 1 /* Add readline support */ -#define HAVE_READLINE 1 +/* #undef HAVE_READLINE */ /* Define to 1 if you have the header file. */ -#define HAVE_READLINE_HISTORY_H 1 +/* #undef HAVE_READLINE_HISTORY_H */ /* Use the scandir lib */ /* #undef HAVE_SCANDIR */ diff --git a/debian/rules b/debian/rules index 4562d24bec4..e0ad0388de7 100755 --- a/debian/rules +++ b/debian/rules @@ -45,6 +45,10 @@ ifdef DEB_CXX ifeq ($(DEB_BUILD_GNU_TYPE),$(DEB_HOST_GNU_TYPE)) CC := $(DEB_CC) CXX := $(DEB_CXX) +else ifeq (clang,$(findstring clang,$(DEB_CXX))) +# If we crosscompile with clang, it knows what to do + CC := $(DEB_CC) + CXX := $(DEB_CXX) else CC := $(DEB_HOST_GNU_TYPE)-$(DEB_CC) CXX := $(DEB_HOST_GNU_TYPE)-$(DEB_CXX) @@ -77,10 +81,6 @@ else THREADS_COUNT = 1 endif -ifneq ($(THREADS_COUNT),) - THREADS_COUNT:=-j$(THREADS_COUNT) -endif - %: dh $@ $(DH_FLAGS) --buildsystem=cmake @@ -89,11 +89,11 @@ override_dh_auto_configure: override_dh_auto_build: # Fix for ninja. Do not add -O. 
- $(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) $(MAKE_TARGET) + $(MAKE) -j$(THREADS_COUNT) -C $(BUILDDIR) $(MAKE_TARGET) override_dh_auto_test: ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) - cd $(BUILDDIR) && ctest $(THREADS_COUNT) -V + cd $(BUILDDIR) && ctest -j$(THREADS_COUNT) -V endif override_dh_clean: @@ -120,7 +120,7 @@ override_dh_install: dh_install --list-missing --sourcedir=$(DESTDIR) override_dh_auto_install: - env DESTDIR=$(DESTDIR) $(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) install + env DESTDIR=$(DESTDIR) $(MAKE) -j$(THREADS_COUNT) -C $(BUILDDIR) install override_dh_shlibdeps: true # We depend only on libc and dh_shlibdeps gives us wrong (too strict) dependency. diff --git a/docker/images.json b/docker/images.json index a696b0597df..dc7126a3f5a 100644 --- a/docker/images.json +++ b/docker/images.json @@ -46,7 +46,6 @@ "name": "clickhouse/stateless-test", "dependent": [ "docker/test/stateful", - "docker/test/coverage", "docker/test/unit" ] }, @@ -56,10 +55,6 @@ "docker/test/stress" ] }, - "docker/test/coverage": { - "name": "clickhouse/test-coverage", - "dependent": [] - }, "docker/test/unit": { "name": "clickhouse/unit-test", "dependent": [] diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 6a6d0e7212c..8f886ea357d 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -24,41 +24,34 @@ RUN apt-get update \ && apt-key add /tmp/llvm-snapshot.gpg.key \ && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ - /etc/apt/sources.list + /etc/apt/sources.list \ + && apt-get clean # initial packages -RUN apt-get update \ - && apt-get install \ - bash \ - fakeroot \ - ccache \ - curl \ - software-properties-common \ - --yes --no-install-recommends - RUN apt-get update \ && apt-get install \ bash \ build-essential \ ccache \ + clang-${LLVM_VERSION} \ + clang-tidy-${LLVM_VERSION} \ cmake \ curl \ + fakeroot \ gdb \ git \ gperf \ - clang-${LLVM_VERSION} \ - clang-tidy-${LLVM_VERSION} \ lld-${LLVM_VERSION} \ llvm-${LLVM_VERSION} \ llvm-${LLVM_VERSION}-dev \ - libicu-dev \ - libreadline-dev \ moreutils \ ninja-build \ pigz \ rename \ + software-properties-common \ tzdata \ - --yes --no-install-recommends + --yes --no-install-recommends \ + && apt-get clean # This symlink required by gcc to find lld compiler RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld @@ -67,7 +60,7 @@ ENV CC=clang-${LLVM_VERSION} ENV CXX=clang++-${LLVM_VERSION} # libtapi is required to support .tbh format from recent MacOS SDKs -RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \ +RUN git clone --depth 1 https://github.com/tpoechtrager/apple-libtapi.git \ && cd apple-libtapi \ && INSTALLPREFIX=/cctools ./build.sh \ && ./install.sh \ @@ -75,7 +68,7 @@ RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \ && rm -rf apple-libtapi # Build and install tools for cross-linking to Darwin (x86-64) -RUN git clone https://github.com/tpoechtrager/cctools-port.git \ +RUN git clone --depth 1 https://github.com/tpoechtrager/cctools-port.git \ && cd cctools-port/cctools \ && ./configure --prefix=/cctools --with-libtapi=/cctools \ --target=x86_64-apple-darwin \ @@ -84,7 +77,7 @@ RUN git clone https://github.com/tpoechtrager/cctools-port.git \ && rm -rf cctools-port # Build and install tools for cross-linking to Darwin (aarch64) -RUN git clone https://github.com/tpoechtrager/cctools-port.git \ +RUN git clone 
--depth 1 https://github.com/tpoechtrager/cctools-port.git \ && cd cctools-port/cctools \ && ./configure --prefix=/cctools --with-libtapi=/cctools \ --target=aarch64-apple-darwin \ @@ -98,7 +91,8 @@ RUN wget -nv https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacO # NOTE: Seems like gcc-11 is too new for ubuntu20 repository RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \ && apt-get update \ - && apt-get install gcc-11 g++-11 --yes + && apt-get install gcc-11 g++-11 --yes \ + && apt-get clean COPY build.sh / diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index 873edfe4afc..89c34846efa 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -64,8 +64,14 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \ && apt-get install gcc-11 g++-11 --yes -# This symlink required by gcc to find lld compiler -RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld +# These symlinks are required: +# /usr/bin/ld.lld: by gcc to find lld compiler +# /usr/bin/aarch64-linux-gnu-obj*: for debug symbols stripping +RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld \ + && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-strip \ + && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-objcopy \ + && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objdump /usr/bin/aarch64-linux-gnu-objdump + COPY build.sh / diff --git a/docker/packager/packager b/docker/packager/packager index 9cce12be949..c042db2251d 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -29,7 +29,13 @@ def pull_image(image_name): return False def build_image(image_name, filepath): - subprocess.check_call("docker build --network=host -t {} -f {} .".format(image_name, filepath), shell=True) + context = os.path.dirname(filepath) + subprocess.check_call( + "docker build --network=host -t {} -f {} {}".format( + image_name, filepath, context + ), + shell=True, + ) def run_docker_image_with_env(image_name, output, env_variables, ch_root, ccache_dir, docker_image_version): env_part = " -e ".join(env_variables) @@ -90,6 +96,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ elif is_cross_arm: cc = compiler[:-len(ARM_SUFFIX)] cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-aarch64.cmake") + result.append("DEB_ARCH_FLAG=-aarm64") elif is_cross_freebsd: cc = compiler[:-len(FREEBSD_SUFFIX)] cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/freebsd/toolchain-x86_64.cmake") @@ -98,6 +105,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake") else: cc = compiler + result.append("DEB_ARCH_FLAG=-aamd64") cxx = cc.replace('gcc', 'g++').replace('clang', 'clang++') diff --git a/docker/server/README.md b/docker/server/README.md index c63bb980c13..5a96a63bb05 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -17,6 +17,8 @@ $ docker run -d --name some-clickhouse-server --ulimit nofile=262144:262144 clic By default ClickHouse will be accessible only via docker network. See the [networking section below](#networking). +By default, starting above server instance will be run as default user without password. 
+ ### connect to it from a native client ```bash $ docker run -it --rm --link some-clickhouse-server:clickhouse-server clickhouse/clickhouse-client --host clickhouse-server diff --git a/docker/test/codebrowser/Dockerfile b/docker/test/codebrowser/Dockerfile index 25fabca67b5..d1059b3dacc 100644 --- a/docker/test/codebrowser/Dockerfile +++ b/docker/test/codebrowser/Dockerfile @@ -6,7 +6,7 @@ FROM clickhouse/binary-builder ARG apt_archive="http://archive.ubuntu.com" RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list -RUN apt-get update && apt-get --yes --allow-unauthenticated install clang-9 libllvm9 libclang-9-dev +RUN apt-get update && apt-get --yes --allow-unauthenticated install clang-13 libllvm13 libclang-13-dev # repo versions doesn't work correctly with C++17 # also we push reports to s3, so we add index.html to subfolder urls @@ -23,12 +23,12 @@ ENV SOURCE_DIRECTORY=/repo_folder ENV BUILD_DIRECTORY=/build ENV HTML_RESULT_DIRECTORY=$BUILD_DIRECTORY/html_report ENV SHA=nosha -ENV DATA="data" +ENV DATA="https://s3.amazonaws.com/clickhouse-test-reports/codebrowser/data" CMD mkdir -p $BUILD_DIRECTORY && cd $BUILD_DIRECTORY && \ cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-13 -DCMAKE_C_COMPILER=/usr/bin/clang-13 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DENABLE_EMBEDDED_COMPILER=0 -DENABLE_S3=0 && \ mkdir -p $HTML_RESULT_DIRECTORY && \ $CODEGEN -b $BUILD_DIRECTORY -a -o $HTML_RESULT_DIRECTORY -p ClickHouse:$SOURCE_DIRECTORY:$SHA -d $DATA | ts '%Y-%m-%d %H:%M:%S' && \ cp -r $STATIC_DATA $HTML_RESULT_DIRECTORY/ &&\ - $CODEINDEX $HTML_RESULT_DIRECTORY -d $DATA | ts '%Y-%m-%d %H:%M:%S' && \ + $CODEINDEX $HTML_RESULT_DIRECTORY -d "$DATA" | ts '%Y-%m-%d %H:%M:%S' && \ mv $HTML_RESULT_DIRECTORY /test_output diff --git a/docker/test/coverage/Dockerfile b/docker/test/coverage/Dockerfile deleted file mode 100644 index ccf0bbc7c83..00000000000 --- a/docker/test/coverage/Dockerfile +++ /dev/null @@ -1,18 +0,0 @@ -# docker build -t clickhouse/test-coverage . -FROM clickhouse/stateless-test - -RUN apt-get update -y \ - && env DEBIAN_FRONTEND=noninteractive \ - apt-get install --yes --no-install-recommends \ - cmake - -COPY s3downloader /s3downloader -COPY run.sh /run.sh - -ENV DATASETS="hits visits" -ENV COVERAGE_DIR=/coverage_reports -ENV SOURCE_DIR=/build -ENV OUTPUT_DIR=/output -ENV IGNORE='.*contrib.*' - -CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/coverage/run.sh b/docker/test/coverage/run.sh deleted file mode 100755 index 807efdf1e47..00000000000 --- a/docker/test/coverage/run.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash - -kill_clickhouse () { - echo "clickhouse pids $(pgrep -u clickhouse)" | ts '%Y-%m-%d %H:%M:%S' - pkill -f "clickhouse-server" 2>/dev/null - - - for _ in {1..120} - do - if ! 
pkill -0 -f "clickhouse-server" ; then break ; fi - echo "ClickHouse still alive" | ts '%Y-%m-%d %H:%M:%S' - sleep 1 - done - - if pkill -0 -f "clickhouse-server" - then - pstree -apgT - jobs - echo "Failed to kill the ClickHouse server" | ts '%Y-%m-%d %H:%M:%S' - return 1 - fi -} - -start_clickhouse () { - LLVM_PROFILE_FILE='server_%h_%p_%m.profraw' sudo -Eu clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml & - counter=0 - until clickhouse-client --query "SELECT 1" - do - if [ "$counter" -gt 120 ] - then - echo "Cannot start clickhouse-server" - cat /var/log/clickhouse-server/stdout.log - tail -n1000 /var/log/clickhouse-server/stderr.log - tail -n1000 /var/log/clickhouse-server/clickhouse-server.log - break - fi - sleep 0.5 - counter=$((counter + 1)) - done -} - - -chmod 777 / - -dpkg -i package_folder/clickhouse-common-static_*.deb; \ - dpkg -i package_folder/clickhouse-common-static-dbg_*.deb; \ - dpkg -i package_folder/clickhouse-server_*.deb; \ - dpkg -i package_folder/clickhouse-client_*.deb; \ - dpkg -i package_folder/clickhouse-test_*.deb - -mkdir -p /var/lib/clickhouse -mkdir -p /var/log/clickhouse-server -chmod 777 -R /var/log/clickhouse-server/ - -# install test configs -/usr/share/clickhouse-test/config/install.sh - -start_clickhouse - -# shellcheck disable=SC2086 # No quotes because I want to split it into words. -if ! /s3downloader --dataset-names $DATASETS; then - echo "Cannot download datatsets" - exit 1 -fi - - -chmod 777 -R /var/lib/clickhouse - - -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "SHOW DATABASES" -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "ATTACH DATABASE datasets ENGINE = Ordinary" -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "CREATE DATABASE test" - -kill_clickhouse -start_clickhouse - -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "SHOW TABLES FROM datasets" -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "SHOW TABLES FROM test" -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-client --query "SHOW TABLES FROM test" - -LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-test -j 8 --testname --shard --zookeeper --print-time 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee /test_result.txt - -readarray -t FAILED_TESTS < <(awk '/FAIL|TIMEOUT|ERROR/ { print substr($3, 1, length($3)-1) }' "/test_result.txt") - -kill_clickhouse - -sleep 3 - -if [[ -n "${FAILED_TESTS[*]}" ]] -then - # Clean the data so that there is no interference from the previous test run. - rm -rf /var/lib/clickhouse/{{meta,}data,user_files} ||: - - start_clickhouse - - echo "Going to run again: ${FAILED_TESTS[*]}" - - LLVM_PROFILE_FILE='client_coverage_%5m.profraw' clickhouse-test --order=random --testname --shard --zookeeper "${FAILED_TESTS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a /test_result.txt -else - echo "No failed tests" -fi - -mkdir -p "$COVERAGE_DIR" -mv /*.profraw "$COVERAGE_DIR" - -mkdir -p "$SOURCE_DIR"/obj-x86_64-linux-gnu -cd "$SOURCE_DIR"/obj-x86_64-linux-gnu && CC=clang-11 CXX=clang++-11 cmake .. 
&& cd / -llvm-profdata-11 merge -sparse "${COVERAGE_DIR}"/* -o clickhouse.profdata -llvm-cov-11 export /usr/bin/clickhouse -instr-profile=clickhouse.profdata -j=16 -format=lcov -skip-functions -ignore-filename-regex "$IGNORE" > output.lcov -genhtml output.lcov --ignore-errors source --output-directory "${OUTPUT_DIR}" diff --git a/docker/test/coverage/s3downloader b/docker/test/coverage/s3downloader deleted file mode 100755 index eb3b3cd9faf..00000000000 --- a/docker/test/coverage/s3downloader +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import os -import sys -import time -import tarfile -import logging -import argparse -import requests -import tempfile - - -DEFAULT_URL = 'https://clickhouse-datasets.s3.yandex.net' - -AVAILABLE_DATASETS = { - 'hits': 'hits_v1.tar', - 'visits': 'visits_v1.tar', -} - -RETRIES_COUNT = 5 - -def _get_temp_file_name(): - return os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())) - -def build_url(base_url, dataset): - return os.path.join(base_url, dataset, 'partitions', AVAILABLE_DATASETS[dataset]) - -def dowload_with_progress(url, path): - logging.info("Downloading from %s to temp path %s", url, path) - for i in range(RETRIES_COUNT): - try: - with open(path, 'wb') as f: - response = requests.get(url, stream=True) - response.raise_for_status() - total_length = response.headers.get('content-length') - if total_length is None or int(total_length) == 0: - logging.info("No content-length, will download file without progress") - f.write(response.content) - else: - dl = 0 - total_length = int(total_length) - logging.info("Content length is %ld bytes", total_length) - for data in response.iter_content(chunk_size=4096): - dl += len(data) - f.write(data) - if sys.stdout.isatty(): - done = int(50 * dl / total_length) - percent = int(100 * float(dl) / total_length) - sys.stdout.write("\r[{}{}] {}%".format('=' * done, ' ' * (50-done), percent)) - sys.stdout.flush() - break - except Exception as ex: - sys.stdout.write("\n") - time.sleep(3) - logging.info("Exception while downloading %s, retry %s", ex, i + 1) - if os.path.exists(path): - os.remove(path) - else: - raise Exception("Cannot download dataset from {}, all retries exceeded".format(url)) - - sys.stdout.write("\n") - logging.info("Downloading finished") - -def unpack_to_clickhouse_directory(tar_path, clickhouse_path): - logging.info("Will unpack data from temp path %s to clickhouse db %s", tar_path, clickhouse_path) - with tarfile.open(tar_path, 'r') as comp_file: - comp_file.extractall(path=clickhouse_path) - logging.info("Unpack finished") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - parser = argparse.ArgumentParser( - description="Simple tool for dowloading datasets for clickhouse from S3") - - parser.add_argument('--dataset-names', required=True, nargs='+', choices=list(AVAILABLE_DATASETS.keys())) - parser.add_argument('--url-prefix', default=DEFAULT_URL) - parser.add_argument('--clickhouse-data-path', default='/var/lib/clickhouse/') - - args = parser.parse_args() - datasets = args.dataset_names - logging.info("Will fetch following datasets: %s", ', '.join(datasets)) - for dataset in datasets: - logging.info("Processing %s", dataset) - temp_archive_path = _get_temp_file_name() - try: - download_url_for_dataset = build_url(args.url_prefix, dataset) - dowload_with_progress(download_url_for_dataset, temp_archive_path) - unpack_to_clickhouse_directory(temp_archive_path, args.clickhouse_data_path) - except Exception as 
ex: - logging.info("Some exception occured %s", str(ex)) - raise - finally: - logging.info("Will remove downloaded file %s from filesystem if it exists", temp_archive_path) - if os.path.exists(temp_archive_path): - os.remove(temp_archive_path) - logging.info("Processing of %s finished", dataset) - logging.info("Fetch finished, enjoy your tables!") - - diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index cd39c0fb75d..24168cea330 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -111,19 +111,6 @@ function start_server fi echo "ClickHouse server pid '$server_pid' started and responded" - - echo " -set follow-fork-mode child -handle all noprint -handle SIGSEGV stop print -handle SIGBUS stop print -handle SIGABRT stop print -continue -thread apply all backtrace -continue -" > script.gdb - - gdb -batch -command script.gdb -p "$server_pid" & } function clone_root @@ -186,6 +173,8 @@ function clone_submodules contrib/dragonbox contrib/fast_float contrib/NuRaft + contrib/jemalloc + contrib/replxx ) git submodule sync @@ -206,6 +195,8 @@ function run_cmake "-DENABLE_THINLTO=0" "-DUSE_UNWIND=1" "-DENABLE_NURAFT=1" + "-DENABLE_JEMALLOC=1" + "-DENABLE_REPLXX=1" ) # TODO remove this? we don't use ccache anyway. An option would be to download it @@ -266,7 +257,13 @@ function run_tests start_server set +e - time clickhouse-test --hung-check -j 8 --order=random \ + local NPROC + NPROC=$(nproc) + NPROC=$((NPROC / 2)) + if [[ $NPROC == 0 ]]; then + NPROC=1 + fi + time clickhouse-test --hung-check -j "${NPROC}" --order=random \ --fast-tests-only --no-long --testname --shard --zookeeper --check-zookeeper-session \ -- "$FASTTEST_FOCUS" 2>&1 \ | ts '%Y-%m-%d %H:%M:%S' \ diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 351b4a3c541..1ebaed752a6 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -1,5 +1,5 @@ #!/bin/bash -# shellcheck disable=SC2086,SC2001,SC2046 +# shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031 set -eux set -o pipefail @@ -35,7 +35,7 @@ function clone fi git diff --name-only master HEAD | tee ci-changed-files.txt else - if [ -v COMMIT_SHA ]; then + if [ -v SHA_TO_TEST ]; then git fetch --depth 2 origin "$SHA_TO_TEST" git checkout "$SHA_TO_TEST" echo "Checked out nominal SHA $SHA_TO_TEST for master" @@ -52,9 +52,21 @@ function clone } +function wget_with_retry +{ + for _ in 1 2 3 4; do + if wget -nv -nd -c "$1";then + return 0 + else + sleep 0.5 + fi + done + return 1 +} + function download { - wget -nv -nd -c "$BINARY_URL_TO_DOWNLOAD" + wget_with_retry "$BINARY_URL_TO_DOWNLOAD" chmod +x clickhouse ln -s ./clickhouse ./clickhouse-server @@ -155,21 +167,47 @@ function fuzz kill -0 $server_pid + # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog + # and clickhouse-server can do fork-exec, for example, to run some bridge. + # Do not set nostop noprint for all signals, because some it may cause gdb to hang, + # explicitly ignore non-fatal signals that are used by server. + # Number of SIGRTMIN can be determined only in runtime. 
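+    # (For example, on a typical Linux/glibc system `kill -l SIGRTMIN` prints 34,
+    # so the generated line below becomes "handle SIG34 nostop noprint pass".)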
+ RTMIN=$(kill -l SIGRTMIN) echo " -set follow-fork-mode child -handle all noprint -handle SIGSEGV stop print -handle SIGBUS stop print -continue -thread apply all backtrace +set follow-fork-mode parent +handle SIGHUP nostop noprint pass +handle SIGINT nostop noprint pass +handle SIGQUIT nostop noprint pass +handle SIGPIPE nostop noprint pass +handle SIGTERM nostop noprint pass +handle SIGUSR1 nostop noprint pass +handle SIGUSR2 nostop noprint pass +handle SIG$RTMIN nostop noprint pass +info signals continue +backtrace full +info locals +info registers +disassemble /s +up +info locals +disassemble /s +up +info locals +disassemble /s +p \"done\" +detach +quit " > script.gdb - gdb -batch -command script.gdb -p $server_pid & + gdb -batch -command script.gdb -p $server_pid & + sleep 5 + # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) + time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: # Check connectivity after we attach gdb, because it might cause the server - # to freeze and the fuzzer will fail. - for _ in {1..60} + # to freeze and the fuzzer will fail. In debug build it can take a lot of time. + for _ in {1..180} do sleep 1 if clickhouse-client --query "select 1" @@ -189,6 +227,7 @@ continue --receive_data_timeout_ms=10000 \ --stacktrace \ --query-fuzzer-runs=1000 \ + --testmode \ --queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) \ $NEW_TESTS_OPT \ > >(tail -n 100000 > fuzzer.log) \ diff --git a/docker/test/integration/base/Dockerfile b/docker/test/integration/base/Dockerfile index add4dad0132..89c2b19236e 100644 --- a/docker/test/integration/base/Dockerfile +++ b/docker/test/integration/base/Dockerfile @@ -7,7 +7,6 @@ RUN apt-get update \ && env DEBIAN_FRONTEND=noninteractive apt-get -y install \ tzdata \ python3 \ - libreadline-dev \ libicu-dev \ bsdutils \ gdb \ diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 5695be70b9a..6a40fea7500 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -21,7 +21,6 @@ RUN apt-get update \ cgroupfs-mount \ python3-pip \ tzdata \ - libreadline-dev \ libicu-dev \ bsdutils \ curl \ @@ -73,11 +72,13 @@ RUN python3 -m pip install \ grpcio-tools \ kafka-python \ kazoo \ + lz4 \ minio \ protobuf \ psycopg2-binary==2.8.6 \ - pymongo \ + pymongo==3.11.0 \ pytest \ + pytest-order==1.0.0 \ pytest-timeout \ pytest-xdist \ pytest-repeat \ @@ -86,7 +87,8 @@ RUN python3 -m pip install \ tzlocal==2.1 \ urllib3 \ requests-kerberos \ - pyhdfs + pyhdfs \ + azure-storage-blob COPY modprobe.sh /usr/local/bin/modprobe COPY dockerd-entrypoint.sh /usr/local/bin/ diff --git a/docker/test/integration/runner/compose/docker_compose_azurite.yml b/docker/test/integration/runner/compose/docker_compose_azurite.yml new file mode 100644 index 00000000000..430ea0d9d14 --- /dev/null +++ b/docker/test/integration/runner/compose/docker_compose_azurite.yml @@ -0,0 +1,13 @@ +version: '2.3' + +services: + azurite1: + image: mcr.microsoft.com/azure-storage/azurite + ports: + - "10000:10000" + volumes: + - data1-1:/data1 + command: azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log + +volumes: + data1-1: diff --git a/docker/test/integration/runner/compose/docker_compose_mongo.yml b/docker/test/integration/runner/compose/docker_compose_mongo.yml index e794966bd08..0bdd054420a 100644 --- 
a/docker/test/integration/runner/compose/docker_compose_mongo.yml +++ b/docker/test/integration/runner/compose/docker_compose_mongo.yml @@ -1,7 +1,7 @@ version: '2.3' services: mongo1: - image: mongo:3.6 + image: mongo:5.0 restart: always environment: MONGO_INITDB_ROOT_USERNAME: root @@ -9,3 +9,9 @@ services: ports: - ${MONGO_EXTERNAL_PORT}:${MONGO_INTERNAL_PORT} command: --profile=2 --verbose + + mongo2: + image: mongo:5.0 + restart: always + ports: + - "27018:27017" diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh index 5a021036b26..ad8a8e4eb84 100755 --- a/docker/test/integration/runner/dockerd-entrypoint.sh +++ b/docker/test/integration/runner/dockerd-entrypoint.sh @@ -8,8 +8,8 @@ echo '{ "ip-forward": true, "log-level": "debug", "storage-driver": "overlay2", - "insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"], - "registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"] + "insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"], + "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"] }' | dd of=/etc/docker/daemon.json 2>/dev/null dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log & diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index c32b50a3cbe..16ac304d7fb 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -193,7 +193,7 @@ function run_tests then # Run only explicitly specified tests, if any. # shellcheck disable=SC2010 - test_files=$(ls "$test_prefix" | grep "$CHPC_TEST_GREP" | xargs -I{} -n1 readlink -f "$test_prefix/{}") + test_files=($(ls "$test_prefix" | grep "$CHPC_TEST_GREP" | xargs -I{} -n1 readlink -f "$test_prefix/{}")) elif [ "$PR_TO_TEST" -ne 0 ] \ && [ "$(wc -l < changed-test-definitions.txt)" -gt 0 ] \ && [ "$(wc -l < other-changed-files.txt)" -eq 0 ] @@ -201,10 +201,26 @@ function run_tests # If only the perf tests were changed in the PR, we will run only these # tests. The lists of changed files are prepared in entrypoint.sh because # it has the repository. - test_files=$(sed "s/tests\/performance/${test_prefix//\//\\/}/" changed-test-definitions.txt) + test_files=($(sed "s/tests\/performance/${test_prefix//\//\\/}/" changed-test-definitions.txt)) else # The default -- run all tests found in the test dir. - test_files=$(ls "$test_prefix"/*.xml) + test_files=($(ls "$test_prefix"/*.xml)) + fi + + # We split perf tests into multiple checks to make them faster + if [ -v CHPC_TEST_RUN_BY_HASH_TOTAL ]; then + # filter tests array in bash https://stackoverflow.com/a/40375567 + for index in "${!test_files[@]}"; do + # sorry for this, just calculating hash(test_name) % total_tests_group == my_test_group_num + test_hash_result=$(echo test_files[$index] | perl -ne 'use Digest::MD5 qw(md5); print unpack('Q', md5($_)) % $ENV{CHPC_TEST_RUN_BY_HASH_TOTAL} == $ENV{CHPC_TEST_RUN_BY_HASH_NUM};') + # BTW, for some reason when hash(test_name) % total_tests_group != my_test_group_num perl outputs nothing, not zero + if [ "$test_hash_result" != "1" ]; then + # deleting element from array + unset -v 'test_files[$index]' + fi + done + # to have sequential indexes... 
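+        # (After `unset` the surviving elements keep their old indexes, e.g. ([0]="a.xml" [2]="c.xml");
+        # re-assigning the array below re-packs them into sequential indexes 0..N-1 again.)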
+ test_files=("${test_files[@]}") fi # For PRs w/o changes in test definitons, test only a subset of queries, @@ -212,21 +228,26 @@ function run_tests # already set, keep those values. # # NOTE: too high CHPC_RUNS/CHPC_MAX_QUERIES may hit internal CI timeout. - if [ "$PR_TO_TEST" -ne 0 ] && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ] - then - CHPC_RUNS=${CHPC_RUNS:-7} - CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10} - else - CHPC_RUNS=${CHPC_RUNS:-13} - CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0} - fi + # NOTE: Currently we disabled complete run even for master branch + #if [ "$PR_TO_TEST" -ne 0 ] && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ] + #then + # CHPC_RUNS=${CHPC_RUNS:-7} + # CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10} + #else + # CHPC_RUNS=${CHPC_RUNS:-13} + # CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0} + #fi + + CHPC_RUNS=${CHPC_RUNS:-7} + CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10} + export CHPC_RUNS export CHPC_MAX_QUERIES # Determine which concurrent benchmarks to run. For now, the only test # we run as a concurrent benchmark is 'website'. Run it as benchmark if we # are also going to run it as a normal test. - for test in $test_files; do echo "$test"; done | sed -n '/website/p' > benchmarks-to-run.txt + for test in ${test_files[@]}; do echo "$test"; done | sed -n '/website/p' > benchmarks-to-run.txt # Delete old report files. for x in {test-times,wall-clock-times}.tsv @@ -235,8 +256,8 @@ function run_tests touch "$x" done - # Randomize test order. - test_files=$(for f in $test_files; do echo "$f"; done | sort -R) + # Randomize test order. BTW, it's not an array no more. + test_files=$(for f in ${test_files[@]}; do echo "$f"; done | sort -R) # Limit profiling time to 10 minutes, not to run for too long. profile_seconds_left=600 @@ -261,16 +282,24 @@ function run_tests # Use awk because bash doesn't support floating point arithmetic. profile_seconds=$(awk "BEGIN { print ($profile_seconds_left > 0 ? 10 : 0) }") - TIMEFORMAT=$(printf "$test_name\t%%3R\t%%3U\t%%3S\n") - # The grep is to filter out set -x output and keep only time output. - # The '2>&1 >/dev/null' redirects stderr to stdout, and discards stdout. - { \ - time "$script_dir/perf.py" --host localhost localhost --port $LEFT_SERVER_PORT $RIGHT_SERVER_PORT \ - --runs "$CHPC_RUNS" --max-queries "$CHPC_MAX_QUERIES" \ - --profile-seconds "$profile_seconds" \ - -- "$test" > "$test_name-raw.tsv" 2> "$test_name-err.log" ; \ - } 2>&1 >/dev/null | tee >(grep -v ^+ >> "wall-clock-times.tsv") \ - || echo "Test $test_name failed with error code $?" >> "$test_name-err.log" + ( + set +x + argv=( + --host localhost localhost + --port "$LEFT_SERVER_PORT" "$RIGHT_SERVER_PORT" + --runs "$CHPC_RUNS" + --max-queries "$CHPC_MAX_QUERIES" + --profile-seconds "$profile_seconds" + + "$test" + ) + TIMEFORMAT=$(printf "$test_name\t%%3R\t%%3U\t%%3S\n") + # one more subshell to suppress trace output for "set +x" + ( + time "$script_dir/perf.py" "${argv[@]}" > "$test_name-raw.tsv" 2> "$test_name-err.log" + ) 2>>wall-clock-times.tsv >/dev/null \ + || echo "Test $test_name failed with error code $?" 
>> "$test_name-err.log" + ) 2>/dev/null profile_seconds_left=$(awk -F' ' \ 'BEGIN { s = '$profile_seconds_left'; } /^profile-total/ { s -= $2 } END { print s }' \ @@ -278,8 +307,6 @@ function run_tests current_test=$((current_test + 1)) done - unset TIMEFORMAT - wait } @@ -291,7 +318,7 @@ function get_profiles_watchdog for pid in $(pgrep -f clickhouse) do - gdb -p "$pid" --batch --ex "info proc all" --ex "thread apply all bt" --ex quit &> "$pid.gdb.log" & + sudo gdb -p "$pid" --batch --ex "info proc all" --ex "thread apply all bt" --ex quit &> "$pid.gdb.log" & done wait @@ -518,7 +545,9 @@ unset IFS # all nodes. numactl --show numactl --cpunodebind=all --membind=all numactl --show -numactl --cpunodebind=all --membind=all parallel --joblog analyze/parallel-log.txt --null < analyze/commands.txt 2>> analyze/errors.log +# Use less jobs to avoid OOM. Some queries can consume 8+ GB of memory. +jobs_count=$(($(grep -c ^processor /proc/cpuinfo) / 3)) +numactl --cpunodebind=all --membind=all parallel --jobs $jobs_count --joblog analyze/parallel-log.txt --null < analyze/commands.txt 2>> analyze/errors.log clickhouse-local --query " -- Join the metric names back to the metric statistics we've calculated, and make diff --git a/docker/test/performance-comparison/download.sh b/docker/test/performance-comparison/download.sh index 49323c28700..8fa6eb5ec83 100755 --- a/docker/test/performance-comparison/download.sh +++ b/docker/test/performance-comparison/download.sh @@ -16,16 +16,28 @@ right_sha=$4 datasets=${CHPC_DATASETS-"hits1 hits10 hits100 values"} declare -A dataset_paths -dataset_paths["hits10"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" -dataset_paths["hits100"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" -dataset_paths["hits1"]="https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" -dataset_paths["values"]="https://clickhouse-datasets.s3.yandex.net/values_with_expressions/partitions/test_values.tar" +if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then + dataset_paths["hits10"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_10m_single/partitions/hits_10m_single.tar" + dataset_paths["hits100"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_100m_single/partitions/hits_100m_single.tar" + dataset_paths["hits1"]="https://clickhouse-datasets.s3.amazonaws.com/hits/partitions/hits_v1.tar" + dataset_paths["values"]="https://clickhouse-datasets.s3.amazonaws.com/values_with_expressions/partitions/test_values.tar" +else + dataset_paths["hits10"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" + dataset_paths["hits100"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" + dataset_paths["hits1"]="https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" + dataset_paths["values"]="https://clickhouse-datasets.s3.yandex.net/values_with_expressions/partitions/test_values.tar" +fi + function download { # Historically there were various paths for the performance test package. # Test all of them. 
- for path in "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/"{,clickhouse_build_check/}"performance/performance.tgz" + declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/$left_pr/$left_sha/performance/performance.tgz" + "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/clickhouse_build_check/performance/performance.tgz" + ) + + for path in "${urls_to_try[@]}" do if curl --fail --head "$path" then diff --git a/docker/test/performance-comparison/entrypoint.sh b/docker/test/performance-comparison/entrypoint.sh index d87b95b1129..3d37a6c0e92 100755 --- a/docker/test/performance-comparison/entrypoint.sh +++ b/docker/test/performance-comparison/entrypoint.sh @@ -4,6 +4,27 @@ set -ex CHPC_CHECK_START_TIMESTAMP="$(date +%s)" export CHPC_CHECK_START_TIMESTAMP +S3_URL=${S3_URL:="https://clickhouse-builds.s3.yandex.net"} + +COMMON_BUILD_PREFIX="/clickhouse_build_check" +if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then + COMMON_BUILD_PREFIX="" +fi + +# Sometimes AWS responde with DNS error and it's impossible to retry it with +# current curl version options. +function curl_with_retry +{ + for _ in 1 2 3 4; do + if curl --fail --head "$1";then + return 0 + else + sleep 0.5 + fi + done + return 1 +} + # Use the packaged repository to find the revision we will compare to. function find_reference_sha { @@ -43,9 +64,12 @@ function find_reference_sha # Historically there were various path for the performance test package, # test all of them. unset found - for path in "https://clickhouse-builds.s3.yandex.net/0/$REF_SHA/"{,clickhouse_build_check/}"performance/performance.tgz" + declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/0/$REF_SHA/performance/performance.tgz" + "https://clickhouse-builds.s3.yandex.net/0/$REF_SHA/clickhouse_build_check/performance/performance.tgz" + ) + for path in "${urls_to_try[@]}" do - if curl --fail --head "$path" + if curl_with_retry "$path" then found="$path" break @@ -65,14 +89,11 @@ chmod 777 workspace output cd workspace -# Download the package for the version we are going to test -for path in "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/"{,clickhouse_build_check/}"performance/performance.tgz" -do - if curl --fail --head "$path" - then - right_path="$path" - fi -done +# Download the package for the version we are going to test. +if curl_with_retry "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz" +then + right_path="$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz" +fi mkdir right wget -nv -nd -c "$right_path" -O- | tar -C right --strip-components=1 -zxv diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index 301c5cc7d73..61987d34299 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -45,6 +45,7 @@ parser.add_argument('--runs', type=int, default=1, help='Number of query runs pe parser.add_argument('--max-queries', type=int, default=None, help='Test no more than this number of queries, chosen at random.') parser.add_argument('--queries-to-run', nargs='*', type=int, default=None, help='Space-separated list of indexes of queries to test.') parser.add_argument('--max-query-seconds', type=int, default=15, help='For how many seconds at most a query is allowed to run. 
The script finishes with error if this time is exceeded.') +parser.add_argument('--prewarm-max-query-seconds', type=int, default=180, help='For how many seconds at most a prewarm (cold storage) query is allowed to run. The script finishes with error if this time is exceeded.') parser.add_argument('--profile-seconds', type=int, default=0, help='For how many seconds to profile a query for which the performance has changed.') parser.add_argument('--long', action='store_true', help='Do not skip the tests tagged as long.') parser.add_argument('--print-queries', action='store_true', help='Print test queries and exit.') @@ -284,7 +285,7 @@ for query_index in queries_to_run: # it makes the results unstable. res = c.execute(q, query_id = prewarm_id, settings = { - 'max_execution_time': args.max_query_seconds, + 'max_execution_time': args.prewarm_max_query_seconds, 'query_profiler_real_time_period_ns': 10000000, 'memory_profiler_step': '4Mi', }) @@ -354,11 +355,9 @@ for query_index in queries_to_run: print(f'query\t{query_index}\t{run_id}\t{conn_index}\t{elapsed}') if elapsed > args.max_query_seconds: - # Stop processing pathologically slow queries, to avoid timing out - # the entire test task. This shouldn't really happen, so we don't - # need much handling for this case and can just exit. + # Do not stop processing pathologically slow queries, + # since this may hide errors in other queries. print(f'The query no. {query_index} is taking too long to run ({elapsed} s)', file=sys.stderr) - exit(2) # Be careful with the counter, after this line it's the next iteration # already. diff --git a/docker/test/pvs/Dockerfile b/docker/test/pvs/Dockerfile index 5dc32ebcc22..c236b3a51d1 100644 --- a/docker/test/pvs/Dockerfile +++ b/docker/test/pvs/Dockerfile @@ -42,7 +42,7 @@ ENV CCACHE_DIR=/test_output/ccache CMD echo "Running PVS version $PKG_VERSION" && mkdir -p $CCACHE_DIR && cd /repo_folder && pvs-studio-analyzer credentials $LICENCE_NAME $LICENCE_KEY -o ./licence.lic \ && cmake . 
-D"ENABLE_EMBEDDED_COMPILER"=OFF -D"DISABLE_HERMETIC_BUILD"=ON -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang\+\+-13 \ && ninja re2_st clickhouse_grpc_protos \ - && pvs-studio-analyzer analyze -o pvs-studio.log -e contrib -j 4 -l ./licence.lic; \ + && pvs-studio-analyzer analyze -o pvs-studio.log -e contrib -j "$(nproc)" -l ./licence.lic; \ cp /repo_folder/pvs-studio.log /test_output; \ plog-converter -a GA:1,2 -t fullhtml -o /test_output/pvs-studio-html-report pvs-studio.log; \ plog-converter -a GA:1,2 -t tasklist -o /test_output/pvs-studio-task-report.txt pvs-studio.log diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index f8dee0f8bc9..8202a07f017 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -123,7 +123,12 @@ function run_tests() export -f run_tests timeout "$MAX_RUN_TIME" bash -c run_tests ||: -./process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv +echo "Files in current directory" +ls -la ./ +echo "Files in root directory" +ls -la / + +/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server.log ||: diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 7de8c061673..05d26924b15 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -49,7 +49,6 @@ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone ENV NUM_TRIES=1 ENV MAX_RUN_TIME=0 - # Download Minio-related binaries RUN wget 'https://dl.min.io/server/minio/release/linux-amd64/minio' \ && chmod +x ./minio \ diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 93f64fdec66..d6d9f189e89 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -96,6 +96,13 @@ function run_tests() ADDITIONAL_OPTIONS+=('8') fi + if [[ -n "$RUN_BY_HASH_NUM" ]] && [[ -n "$RUN_BY_HASH_TOTAL" ]]; then + ADDITIONAL_OPTIONS+=('--run-by-hash-num') + ADDITIONAL_OPTIONS+=("$RUN_BY_HASH_NUM") + ADDITIONAL_OPTIONS+=('--run-by-hash-total') + ADDITIONAL_OPTIONS+=("$RUN_BY_HASH_TOTAL") + fi + set +e clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ --test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ @@ -108,7 +115,12 @@ export -f run_tests timeout "$MAX_RUN_TIME" bash -c run_tests ||: -./process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv +echo "Files in current directory" +ls -la ./ +echo "Files in root directory" +ls -la / + +/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv clickhouse-client -q "system flush logs" ||: diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 6d720d02cdc..2efb62689ff 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -1,6 +1,7 @@ #!/bin/bash # shellcheck disable=SC2094 # shellcheck disable=SC2086 +# shellcheck disable=SC2024 set -x @@ -127,14 +128,35 @@ function start() counter=$((counter + 1)) done + # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog + # and clickhouse-server can do fork-exec, for example, to run some bridge. + # Do not set nostop noprint for all signals, because some it may cause gdb to hang, + # explicitly ignore non-fatal signals that are used by server. 
+ # Number of SIGRTMIN can be determined only in runtime. + RTMIN=$(kill -l SIGRTMIN) echo " -set follow-fork-mode child -handle all noprint -handle SIGSEGV stop print -handle SIGBUS stop print -handle SIGABRT stop print +set follow-fork-mode parent +handle SIGHUP nostop noprint pass +handle SIGINT nostop noprint pass +handle SIGQUIT nostop noprint pass +handle SIGPIPE nostop noprint pass +handle SIGTERM nostop noprint pass +handle SIGUSR1 nostop noprint pass +handle SIGUSR2 nostop noprint pass +handle SIG$RTMIN nostop noprint pass +info signals continue -thread apply all backtrace +backtrace full +info locals +info registers +disassemble /s +up +info locals +disassemble /s +up +info locals +disassemble /s +p \"done\" detach quit " > script.gdb @@ -142,7 +164,10 @@ quit # FIXME Hung check may work incorrectly because of attached gdb # 1. False positives are possible # 2. We cannot attach another gdb to get stacktraces if some queries hung - gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" >> /test_output/gdb.log & + gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log & + sleep 5 + # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) + time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: } configure @@ -213,6 +238,9 @@ zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log* > /dev/n zgrep -Fa "########################################" /test_output/* > /dev/null \ && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv +zgrep -Fa " received signal " /test_output/gdb.log > /dev/null \ + && echo -e 'Found signal in gdb.log\tFAIL' >> /test_output/test_results.tsv + # Put logs into /test_output/ for log_file in /var/log/clickhouse-server/clickhouse-server.log* do diff --git a/docker/test/testflows/runner/Dockerfile b/docker/test/testflows/runner/Dockerfile index 8ea3cd46973..d15f237587b 100644 --- a/docker/test/testflows/runner/Dockerfile +++ b/docker/test/testflows/runner/Dockerfile @@ -21,7 +21,6 @@ RUN apt-get update \ cgroupfs-mount \ python3-pip \ tzdata \ - libreadline-dev \ libicu-dev \ bsdutils \ curl \ diff --git a/docker/test/testflows/runner/dockerd-entrypoint.sh b/docker/test/testflows/runner/dockerd-entrypoint.sh index 8abbd9e1c8e..0e15396082a 100755 --- a/docker/test/testflows/runner/dockerd-entrypoint.sh +++ b/docker/test/testflows/runner/dockerd-entrypoint.sh @@ -5,8 +5,8 @@ echo "Configure to use Yandex dockerhub-proxy" mkdir -p /etc/docker/ cat > /etc/docker/daemon.json << EOF { - "insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"], - "registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"] + "insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"], + "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"] } EOF diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index ea32f608124..5f3245c4d60 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -106,20 +106,20 @@ Build ClickHouse. Run ClickHouse from the terminal: change directory to `program Note that all clickhouse tools (server, client, etc) are just symlinks to a single binary named `clickhouse`. You can find this binary at `programs/clickhouse`. All tools can also be invoked as `clickhouse tool` instead of `clickhouse-tool`. 
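For example, the following two commands are equivalent (the query is only an illustration):

``` bash
$ ./clickhouse client --query "SELECT 1"
$ ./clickhouse-client --query "SELECT 1"
```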
-Alternatively you can install ClickHouse package: either stable release from Yandex repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo service clickhouse-server start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`. +Alternatively you can install ClickHouse package: either stable release from ClickHouse repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo clickhouse start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`. When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary: ``` bash -$ sudo service clickhouse-server stop +$ sudo clickhouse stop $ sudo cp ./clickhouse /usr/bin/ -$ sudo service clickhouse-server start +$ sudo clickhouse start ``` Also you can stop system clickhouse-server and run your own with the same configuration but with logging to terminal: ``` bash -$ sudo service clickhouse-server stop +$ sudo clickhouse stop $ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml ``` @@ -257,9 +257,9 @@ There are five variants (Debug, ASan, TSan, MSan, UBSan). Thread Fuzzer (please don't mix up with Thread Sanitizer) is another kind of fuzzing that allows to randomize thread order of execution. It helps to find even more special cases. -## Security Audit {#security-audit} +## Security Audit -People from Yandex Security Team do some basic overview of ClickHouse capabilities from the security standpoint. +People from Yandex Security Team did some basic overview of ClickHouse capabilities from the security standpoint. ## Static Analyzers {#static-analyzers} @@ -326,15 +326,11 @@ There is automated check for flaky tests. It runs all new tests 100 times (for f ## Testflows -[Testflows](https://testflows.com/) is an enterprise-grade testing framework. It is used by Altinity for some of the tests and we run these tests in our CI. - -## Yandex Checks (only for Yandex employees) - -These checks are importing ClickHouse code into Yandex internal monorepository, so ClickHouse codebase can be used as a library by other products at Yandex (YT and YDB). Note that clickhouse-server itself is not being build from internal repo and unmodified open-source build is used for Yandex applications. +[Testflows](https://testflows.com/) is an enterprise-grade open-source testing framework, which is used to test a subset of ClickHouse. ## Test Automation {#test-automation} -We run tests with Yandex internal CI and job automation system named “Sandbox”. +We run tests with [GitHub Actions](https://github.com/features/actions). Build jobs and tests are run in Sandbox on per commit basis. Resulting packages and test results are published in GitHub and can be downloaded by direct links. Artifacts are stored for several months. When you send a pull request on GitHub, we tag it as “can be tested” and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you. diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index 5aa969daf88..cdc904f1e94 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -17,6 +17,7 @@ ClickHouse server works as MySQL replica. 
It reads binlog and performs DDL and D ``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] ENGINE = MaterializedMySQL('host:port', ['database' | database], 'user', 'password') [SETTINGS ...] +[TABLE OVERRIDE table1 (...), TABLE OVERRIDE table2 (...)] ``` **Engine Parameters** @@ -82,6 +83,7 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree]( | VARCHAR, VAR_STRING | [String](../../sql-reference/data-types/string.md) | | BLOB | [String](../../sql-reference/data-types/string.md) | | BINARY | [FixedString](../../sql-reference/data-types/fixedstring.md) | +| BIT | [UInt64](../../sql-reference/data-types/int-uint.md) | [Nullable](../../sql-reference/data-types/nullable.md) is supported. @@ -109,15 +111,19 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ - MySQL `DELETE` query is converted into `INSERT` with `_sign=-1`. -- MySQL `UPDATE` query is converted into `INSERT` with `_sign=-1` and `INSERT` with `_sign=1`. +- MySQL `UPDATE` query is converted into `INSERT` with `_sign=-1` and `INSERT` with `_sign=1` if the primary key has been changed, or + `INSERT` with `_sign=1` if not. ### Selecting from MaterializedMySQL Tables {#select} `SELECT` query from `MaterializedMySQL` tables has some specifics: -- If `_version` is not specified in the `SELECT` query, [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier is used. So only rows with `MAX(_version)` are selected. +- If `_version` is not specified in the `SELECT` query, the + [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier is used, so only rows with + `MAX(_version)` are returned for each primary key value. -- If `_sign` is not specified in the `SELECT` query, `WHERE _sign=1` is used by default. So the deleted rows are not included into the result set. +- If `_sign` is not specified in the `SELECT` query, `WHERE _sign=1` is used by default. So the deleted rows are not + included into the result set. - The result includes columns comments in case they exist in MySQL database tables. @@ -125,15 +131,95 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ MySQL `PRIMARY KEY` and `INDEX` clauses are converted into `ORDER BY` tuples in ClickHouse tables. -ClickHouse has only one physical order, which is determined by `ORDER BY` clause. To create a new physical order, use [materialized views](../../sql-reference/statements/create/view.md#materialized). +ClickHouse has only one physical order, which is determined by `ORDER BY` clause. To create a new physical order, use +[materialized views](../../sql-reference/statements/create/view.md#materialized). **Notes** - Rows with `_sign=-1` are not deleted physically from the tables. -- Cascade `UPDATE/DELETE` queries are not supported by the `MaterializedMySQL` engine. +- Cascade `UPDATE/DELETE` queries are not supported by the `MaterializedMySQL` engine, as they are not visible in the + MySQL binlog. - Replication can be easily broken. - Manual operations on database and tables are forbidden. -- `MaterializedMySQL` is influenced by [optimize_on_insert](../../operations/settings/settings.md#optimize-on-insert) setting. The data is merged in the corresponding table in the `MaterializedMySQL` database when a table in the MySQL server changes. +- `MaterializedMySQL` is affected by the [optimize_on_insert](../../operations/settings/settings.md#optimize-on-insert) + setting. 
Data is merged in the corresponding table in the `MaterializedMySQL` database when a table in the MySQL + server changes. + +### Table Overrides {#table-overrides} + +Table overrides can be used to customize the ClickHouse DDL queries, allowing you to make schema optimizations for your +application. This is especially useful for controlling partitioning, which is important for the overall performance of +MaterializedMySQL. + +These are the schema conversion manipulations you can do with table overrides for MaterializedMySQL: + + * Modify column type. Must be compatible with the original type, or replication will fail. For example, + you can modify a UInt32 column to UInt64, but you can not modify a String column to Array(String). + * Modify [column TTL](../table-engines/mergetree-family/mergetree/#mergetree-column-ttl). + * Modify [column compression codec](../../sql-reference/statements/create/table/#codecs). + * Add [ALIAS columns](../../sql-reference/statements/create/table/#alias). + * Add [skipping indexes](../table-engines/mergetree-family/mergetree/#table_engine-mergetree-data_skipping-indexes) + * Add [projections](../table-engines/mergetree-family/mergetree/#projections). Note that projection optimizations are + disabled when using `SELECT ... FINAL` (which MaterializedMySQL does by default), so their utility is limited here. + `INDEX ... TYPE hypothesis` as [described in the v21.12 blog post]](https://clickhouse.com/blog/en/2021/clickhouse-v21.12-released/) + may be more useful in this case. + * Modify [PARTITION BY](../table-engines/mergetree-family/custom-partitioning-key/) + * Modify [ORDER BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * Modify [PRIMARY KEY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * Add [SAMPLE BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * Add [table TTL](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + +```sql +CREATE DATABASE db_name ENGINE = MaterializedMySQL(...) +[SETTINGS ...] +[TABLE OVERRIDE table_name ( + [COLUMNS ( + [col_name [datatype] [ALIAS expr] [CODEC(...)] [TTL expr], ...] + [INDEX index_name expr TYPE indextype[(...)] GRANULARITY val, ...] + [PROJECTION projection_name (SELECT [GROUP BY] [ORDER BY]), ...] + )] + [ORDER BY expr] + [PRIMARY KEY expr] + [PARTITION BY expr] + [SAMPLE BY expr] + [TTL expr] +), ...] +``` + +Example: + +```sql +CREATE DATABASE db_name ENGINE = MaterializedMySQL(...) +TABLE OVERRIDE table1 ( + COLUMNS ( + userid UUID, + category LowCardinality(String), + timestamp DateTime CODEC(Delta, Default) + ) + PARTITION BY toYear(timestamp) +), +TABLE OVERRIDE table2 ( + COLUMNS ( + client_ip String TTL created + INTERVAL 72 HOUR + ) + SAMPLE BY ip_hash +) +``` + +The `COLUMNS` list is sparse; existing columns are modified as specified, extra ALIAS columns are added. It is not +possible to add ordinary or MATERIALIZED columns. Modified columns with a different type must be assignable from the +original type. There is currently no validation of this or similar issues when the `CREATE DATABASE` query executes, so +extra care needs to be taken. + +You may specify overrides for tables that do not exist yet. + +!!! warning "Warning" + It is easy to break replication with table overrides if not used with care. 
For example: + + * If an ALIAS column is added with a table override, and a column with the same name is later added to the source + MySQL table, the converted ALTER TABLE query in ClickHouse will fail and replication stops. + * It is currently possible to add overrides that reference nullable columns where not-nullable are required, such as in + `ORDER BY` or `PARTITION BY`. This will cause CREATE TABLE queries that will fail, also causing replication to stop. ## Examples of Use {#examples-of-use} @@ -150,11 +236,9 @@ mysql> SELECT * FROM test; ``` ```text -+---+------+------+ -| a | b | c | -+---+------+------+ -| 2 | 222 | Wow! | -+---+------+------+ +┌─a─┬───b─┬─c────┐ +│ 2 │ 222 │ Wow! │ +└───┴─────┴──────┘ ``` Database in ClickHouse, exchanging data with the MySQL server: diff --git a/docs/en/engines/database-engines/materialized-postgresql.md b/docs/en/engines/database-engines/materialized-postgresql.md index d2c4dbf1f3c..4dea156f32e 100644 --- a/docs/en/engines/database-engines/materialized-postgresql.md +++ b/docs/en/engines/database-engines/materialized-postgresql.md @@ -5,15 +5,15 @@ toc_title: MaterializedPostgreSQL # [experimental] MaterializedPostgreSQL {#materialize-postgresql} -Creates ClickHouse database with an initial data dump of PostgreSQL database tables and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL database tables in the remote PostgreSQL database. +Creates a ClickHouse database with tables from PostgreSQL database. Firstly, database with engine `MaterializedPostgreSQL` creates a snapshot of PostgreSQL database and loads required tables. Required tables can include any subset of tables from any subset of schemas from specified database. Along with the snapshot database engine acquires LSN and once initial dump of tables is performed - it starts pulling updates from WAL. After database is created, newly added tables to PostgreSQL database are not automatically added to replication. They have to be added manually with `ATTACH TABLE db.table` query. -ClickHouse server works as PostgreSQL replica. It reads WAL and performs DML queries. DDL is not replicated, but can be handled (described below). +Replication is implemented with PostgreSQL Logical Replication Protocol, which does not allow to replicate DDL, but allows to know whether replication breaking changes happened (column type changes, adding/removing columns). Such changes are detected and according tables stop receiving updates. Such tables can be automatically reloaded in the background in case required setting is turned on. Safest way for now is to use `ATTACH`/ `DETACH` queries to reload table completely. If DDL does not break replication (for example, renaming a column) table will still receive updates (insertion is done by position). ## Creating a Database {#creating-a-database} ``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] -ENGINE = MaterializedPostgreSQL('host:port', ['database' | database], 'user', 'password') [SETTINGS ...] +ENGINE = MaterializedPostgreSQL('host:port', 'database', 'user', 'password') [SETTINGS ...] ``` **Engine Parameters** @@ -23,51 +23,39 @@ ENGINE = MaterializedPostgreSQL('host:port', ['database' | database], 'user', 'p - `user` — PostgreSQL user. - `password` — User password. 
+## Example of Use {#example-of-use} + +``` sql +CREATE DATABASE postgresql; +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password'); + +SHOW TABLES FROM postgres_db; + +┌─name───┐ +│ table1 │ +└────────┘ + +SELECT * FROM postgresql_db.postgres_table; +``` + ## Dynamically adding new tables to replication {#dynamically-adding-table-to-replication} +After `MaterializedPostgreSQL` database is created, it does not automatically detect new tables in according PostgreSQL database. Such tables can be added manually: + ``` sql ATTACH TABLE postgres_database.new_table; ``` -When specifying a specific list of tables in the database using the setting [materialized_postgresql_tables_list](../../operations/settings/settings.md#materialized-postgresql-tables-list), it will be updated to the current state, taking into account the tables which were added by the `ATTACH TABLE` query. +Warning: before version 21.13 adding table to replication left unremoved temprorary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in clickhouse version before 21.13, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. Issue is fixed in 21.13. ## Dynamically removing tables from replication {#dynamically-removing-table-from-replication} +It is possible to remove specific tables from replication: + ``` sql DETACH TABLE postgres_database.table_to_remove; ``` -## Settings {#settings} - -- [materialized_postgresql_tables_list](../../operations/settings/settings.md#materialized-postgresql-tables-list) - -- [materialized_postgresql_schema](../../operations/settings/settings.md#materialized-postgresql-schema) - -- [materialized_postgresql_schema_list](../../operations/settings/settings.md#materialized-postgresql-schema-list) - -- [materialized_postgresql_allow_automatic_update](../../operations/settings/settings.md#materialized-postgresql-allow-automatic-update) - -- [materialized_postgresql_max_block_size](../../operations/settings/settings.md#materialized-postgresql-max-block-size) - -- [materialized_postgresql_replication_slot](../../operations/settings/settings.md#materialized-postgresql-replication-slot) - -- [materialized_postgresql_snapshot](../../operations/settings/settings.md#materialized-postgresql-snapshot) - -``` sql -CREATE DATABASE database1 -ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') -SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; - -SELECT * FROM database1.table1; -``` - -The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. - -``` sql -ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; -``` - - ## PostgreSQL schema {#schema} PostgreSQL [schema](https://www.postgresql.org/docs/9.1/ddl-schemas.html) can be configured in 3 ways (starting from version 21.12). @@ -150,13 +138,63 @@ WHERE oid = 'postgres_table'::regclass; !!! warning "Warning" Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. -## Example of Use {#example-of-use} +## Settings {#settings} + +1. 
materialized_postgresql_tables_list {#materialized-postgresql-tables-list} + +Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. + +Default value: empty list — means whole PostgreSQL database will be replicated. + +2. materialized_postgresql_schema {#materialized-postgresql-schema} + +Default value: empty string. (Default schema is used) + +3. materialized_postgresql_schema_list {#materialized-postgresql-schema-list} + +Default value: empty list. (Default schema is used) + +4. materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update} + +Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. + +Possible values: + +- 0 — The table is not automatically updated in the background, when schema changes are detected. +- 1 — The table is automatically updated in the background, when schema changes are detected. + +Default value: `0`. + +5. materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size} + +Sets the number of rows collected in memory before flushing data into PostgreSQL database table. + +Possible values: + +- Positive integer. + +Default value: `65536`. + +6. materialized_postgresql_replication_slot {#materialized-postgresql-replication-slot} + +A user-created replication slot. Must be used together with `materialized_postgresql_snapshot`. + +7. materialized_postgresql_snapshot {#materialized-postgresql-snapshot} + +A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with `materialized_postgresql_replication_slot`. ``` sql -CREATE DATABASE postgresql_db -ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password'); +CREATE DATABASE database1 +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') +SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; -SELECT * FROM postgresql_db.postgres_table; +SELECT * FROM database1.table1; +``` + +The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. + +``` sql +ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; ``` ## Notes {#notes} @@ -165,11 +203,11 @@ SELECT * FROM postgresql_db.postgres_table; Logical Replication Slots which exist on the primary are not available on standby replicas. So if there is a failover, new primary (the old physical standby) won’t be aware of any slots which were existing with old primary. This will lead to a broken replication from PostgreSQL. 
-A solution to this is to manage replication slots yourself and define a permanent replication slot (some information can be found [here](https://patroni.readthedocs.io/en/latest/SETTINGS.html)). You'll need to pass slot name via [materialized_postgresql_replication_slot](../../operations/settings/settings.md#materialized-postgresql-replication-slot) setting, and it has to be exported with `EXPORT SNAPSHOT` option. The snapshot identifier needs to be passed via [materialized_postgresql_snapshot](../../operations/settings/settings.md#materialized-postgresql-snapshot) setting. +A solution to this is to manage replication slots yourself and define a permanent replication slot (some information can be found [here](https://patroni.readthedocs.io/en/latest/SETTINGS.html)). You'll need to pass slot name via `materialized_postgresql_replication_slot` setting, and it has to be exported with `EXPORT SNAPSHOT` option. The snapshot identifier needs to be passed via `materialized_postgresql_snapshot` setting. Please note that this should be used only if it is actually needed. If there is no real need for that or full understanding why, then it is better to allow the table engine to create and manage its own replication slot. -**Example (from [@bchrobot](https://github.com/bchrobot))** +**Example (from [@bchrobot](https://github.com/bchrobot))** 1. Configure replication slot in PostgreSQL. @@ -214,3 +252,23 @@ SETTINGS ```bash kubectl exec acid-demo-cluster-0 -c postgres -- su postgres -c 'patronictl failover --candidate acid-demo-cluster-1 --force' ``` + +### Required permissions + +1. [CREATE PUBLICATION](https://postgrespro.ru/docs/postgresql/14/sql-createpublication) -- create query privilege. + +2. [CREATE_REPLICATION_SLOT](https://postgrespro.ru/docs/postgrespro/10/protocol-replication#PROTOCOL-REPLICATION-CREATE-SLOT) -- replication privelege. + +3. [pg_drop_replication_slot](https://postgrespro.ru/docs/postgrespro/9.5/functions-admin#functions-replication) -- replication privilege or superuser. + +4. [DROP PUBLICATION](https://postgrespro.ru/docs/postgresql/10/sql-droppublication) -- owner of publication (`username` in MaterializedPostgreSQL engine itself). + +It is possible to avoid executing `2` and `3` commands and having those permissions. Use settings `materialized_postgresql_replication_slot` and `materialized_postgresql_snapshot`. But with much care. + +Access to tables: + +1. pg_publication + +2. pg_replication_slots + +3. pg_publication_tables diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 0fcf7a63dd8..0d6d90f9d31 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -5,8 +5,7 @@ toc_title: HDFS # HDFS {#table_engines-hdfs} -This engine provides integration with [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) via ClickHouse. This engine is similar -to the [File](../../../engines/table-engines/special/file.md#table_engines-file) and [URL](../../../engines/table-engines/special/url.md#table_engines-url) engines, but provides Hadoop-specific features. +This engine provides integration with the [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) via ClickHouse. 
This engine is similar to the [File](../../../engines/table-engines/special/file.md#table_engines-file) and [URL](../../../engines/table-engines/special/url.md#table_engines-url) engines, but provides Hadoop-specific features. ## Usage {#usage} @@ -14,12 +13,13 @@ to the [File](../../../engines/table-engines/special/file.md#table_engines-file) ENGINE = HDFS(URI, format) ``` -The `URI` parameter is the whole file URI in HDFS. -The `format` parameter specifies one of the available file formats. To perform +**Engine Parameters** + +- `URI` - whole file URI in HDFS. The path part of `URI` may contain globs. In this case the table would be readonly. +- `format` - specifies one of the available file formats. To perform `SELECT` queries, the format must be supported for input, and to perform `INSERT` queries – for output. The available formats are listed in the [Formats](../../../interfaces/formats.md#formats) section. -The path part of `URI` may contain globs. In this case the table would be readonly. **Example:** @@ -71,12 +71,12 @@ Constructions with `{}` are similar to the [remote](../../../sql-reference/table 1. Suppose we have several files in TSV format with the following URIs on HDFS: -- 'hdfs://hdfs1:9000/some_dir/some_file_1' -- 'hdfs://hdfs1:9000/some_dir/some_file_2' -- 'hdfs://hdfs1:9000/some_dir/some_file_3' -- 'hdfs://hdfs1:9000/another_dir/some_file_1' -- 'hdfs://hdfs1:9000/another_dir/some_file_2' -- 'hdfs://hdfs1:9000/another_dir/some_file_3' + - 'hdfs://hdfs1:9000/some_dir/some_file_1' + - 'hdfs://hdfs1:9000/some_dir/some_file_2' + - 'hdfs://hdfs1:9000/some_dir/some_file_3' + - 'hdfs://hdfs1:9000/another_dir/some_file_1' + - 'hdfs://hdfs1:9000/another_dir/some_file_2' + - 'hdfs://hdfs1:9000/another_dir/some_file_3' 1. There are several ways to make a table consisting of all six files: @@ -132,6 +132,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us | **parameter** | **default value** | +| - | - | | rpc\_client\_connect\_tcpnodelay | true | | dfs\_client\_read\_shortcircuit | true | | output\_replace-datanode-on-failure | true | @@ -181,25 +182,26 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us #### ClickHouse extras {#clickhouse-extras} | **parameter** | **default value** | +| - | - | |hadoop\_kerberos\_keytab | "" | |hadoop\_kerberos\_principal | "" | |hadoop\_kerberos\_kinit\_command | kinit | |libhdfs3\_conf | "" | ### Limitations {#limitations} - * hadoop\_security\_kerberos\_ticket\_cache\_path and libhdfs3\_conf can be global only, not user specific +* `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific ## Kerberos support {#kerberos-support} -If hadoop\_security\_authentication parameter has value 'kerberos', ClickHouse authentifies via Kerberos facility. -Parameters [here](#clickhouse-extras) and hadoop\_security\_kerberos\_ticket\_cache\_path may be of help. +If the `hadoop_security_authentication` parameter has the value `kerberos`, ClickHouse authenticates via Kerberos. +Parameters are [here](#clickhouse-extras) and `hadoop_security_kerberos_ticket_cache_path` may be of help. Note that due to libhdfs3 limitations only old-fashioned approach is supported, -datanode communications are not secured by SASL (HADOOP\_SECURE\_DN\_USER is a reliable indicator of such -security approach). Use tests/integration/test\_storage\_kerberized\_hdfs/hdfs_configs/bootstrap.sh for reference. 
+datanode communications are not secured by SASL (`HADOOP_SECURE_DN_USER` is a reliable indicator of such +security approach). Use `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` for reference. -If hadoop\_kerberos\_keytab, hadoop\_kerberos\_principal or hadoop\_kerberos\_kinit\_command is specified, kinit will be invoked. hadoop\_kerberos\_keytab and hadoop\_kerberos\_principal are mandatory in this case. kinit tool and krb5 configuration files are required. +If `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` or `hadoop_kerberos_kinit_command` is specified, `kinit` will be invoked. `hadoop_kerberos_keytab` and `hadoop_kerberos_principal` are mandatory in this case. `kinit` tool and krb5 configuration files are required. -## HDFS Namenode HA support{#namenode-ha} +## HDFS Namenode HA support {#namenode-ha} libhdfs3 support HDFS namenode HA. diff --git a/docs/en/engines/table-engines/integrations/materialized-postgresql.md b/docs/en/engines/table-engines/integrations/materialized-postgresql.md index d02a11257c2..fa349e49af5 100644 --- a/docs/en/engines/table-engines/integrations/materialized-postgresql.md +++ b/docs/en/engines/table-engines/integrations/materialized-postgresql.md @@ -7,7 +7,7 @@ toc_title: MaterializedPostgreSQL Creates ClickHouse table with an initial data dump of PostgreSQL table and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL table in the remote PostgreSQL database. -If more than one table is required, it is highly recommended to use the [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) database engine instead of the table engine and use the [materialized_postgresql_tables_list](../../../operations/settings/settings.md#materialized-postgresql-tables-list) setting, which specifies the tables to be replicated. It will be much better in terms of CPU, fewer connections and fewer replication slots inside the remote PostgreSQL database. +If more than one table is required, it is highly recommended to use the [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) database engine instead of the table engine and use the `materialized_postgresql_tables_list` setting, which specifies the tables to be replicated (will also be possible to add database `schema`). It will be much better in terms of CPU, fewer connections and fewer replication slots inside the remote PostgreSQL database. ## Creating a Table {#creating-a-table} @@ -38,7 +38,7 @@ PRIMARY KEY key; - `_version` — Transaction counter. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). - `_sign` — Deletion mark. Type: [Int8](../../../sql-reference/data-types/int-uint.md). Possible values: - - `1` — Row is not deleted, + - `1` — Row is not deleted, - `-1` — Row is deleted. These columns do not need to be added when a table is created. They are always accessible in `SELECT` query. diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 0bdb54e0c16..789759ec521 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -36,6 +36,31 @@ The table structure can differ from the original PostgreSQL table structure: - `schema` — Non-default table schema. Optional. - `on conflict ...` — example: `ON CONFLICT DO NOTHING`. Optional. Note: adding this option will make insertion less efficient. 
+or via config (since version 21.11):
+
+```
+<named_collections>
+    <postgres1>
+        <host>localhost</host>
+        <port>5432</port>
+        <user>postgres</user>
+        <password>mysecretpassword</password>
+    </postgres1>
+    <postgres2>
+        <host>localhost</host>
+        <port>5432</port>
+        <user>postgres</user>
+        <password>mysecretpassword</password>
+        <table>table1</table>
+    </postgres2>
+</named_collections>
+``` + +Some parameters can be overriden by key value arguments: +``` sql +SELECT * FROM postgresql(postgres1, schema='schema1', table='table1'); +``` + ## Implementation Details {#implementation-details} `SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index ebb42461204..78c144ac76f 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -37,6 +37,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] [rabbitmq_skip_broken_messages = N,] [rabbitmq_max_block_size = N,] [rabbitmq_flush_interval_ms = N] + [rabbitmq_queue_settings_list = 'x-dead-letter-exchange=my-dlx,x-max-length=10,x-overflow=reject-publish'] ``` Required parameters: @@ -59,6 +60,7 @@ Optional parameters: - `rabbitmq_skip_broken_messages` – RabbitMQ message parser tolerance to schema-incompatible messages per block. Default: `0`. If `rabbitmq_skip_broken_messages = N` then the engine skips *N* RabbitMQ messages that cannot be parsed (a message equals a row of data). - `rabbitmq_max_block_size` - `rabbitmq_flush_interval_ms` +- `rabbitmq_queue_settings_list` - allows to set RabbitMQ settings when creating a queue. Available settings: `x-max-length`, `x-max-length-bytes`, `x-message-ttl`, `x-expires`, `x-priority`, `x-max-priority`, `x-overflow`, `x-dead-letter-exchange`, `x-queue-type`. The `durable` setting is enabled automatically for the queue. SSL connection: diff --git a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md index 2711c76aeb6..5ac2105e9fd 100644 --- a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -66,9 +66,9 @@ WHERE table = 'visits' └───────────┴────────────────┴────────┘ ``` -The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](#alter_manipulations-with-partitions) queries. +The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md) queries. -The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](#alter_attach-partition) query. +The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query. 
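+For example (a sketch reusing the `visits` table and the `201901` partition shown above), a whole partition can be detached and attached back using the value from the `partition` column:
+
+``` sql
+ALTER TABLE visits DETACH PARTITION 201901;
+ALTER TABLE visits ATTACH PARTITION 201901;
+```
+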
Let’s break down the name of the first part: `201901_1_3_1`: diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index 708dab6fb7d..faa1026b919 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -8,24 +8,43 @@ toc_title: Distributed Tables with Distributed engine do not store any data of their own, but allow distributed query processing on multiple servers. Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any. -The Distributed engine accepts parameters: +## Creating a Table {#distributed-creating-a-table} -- the cluster name in the server’s config file +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = Distributed(cluster, database, table[, sharding_key[, policy_name]]) +[SETTINGS name=value, ...] +``` -- the name of a remote database +### From a Table {#distributed-from-a-table} +When the `Distributed` table is pointing to a table on the current server you can adopt that table's schema: -- the name of a remote table +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2 ENGINE = Distributed(cluster, database, table[, sharding_key[, policy_name]]) [SETTINGS name=value, ...] +``` -- (optionally) sharding key +**Distributed Parameters** -- (optionally) policy name, it will be used to store temporary files for async send +- `cluster` - the cluster name in the server’s config file - See also: +- `database` - the name of a remote database - - [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting - - [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) for the examples +- `table` - the name of a remote table -Also, it accepts the following settings: +- `sharding_key` - (optionally) sharding key + +- `policy_name` - (optionally) policy name, it will be used to store temporary files for async send + +**See Also** + + - [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting + - [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) for the examples + +**Distributed Settings** - `fsync_after_insert` - do the `fsync` for the file data after asynchronous insert to Distributed. Guarantees that the OS flushed the whole inserted data to a file **on the initiator node** disk. @@ -59,24 +78,25 @@ Also, it accepts the following settings: - [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting - `bytes_to_throw_insert` handled before `bytes_to_delay_insert`, so you should not set it to the value less then `bytes_to_delay_insert` -Example: +**Example** ``` sql -Distributed(logs, default, hits[, sharding_key[, policy_name]]) +CREATE TABLE hits_all AS hits +ENGINE = Distributed(logs, default, hits[, sharding_key[, policy_name]]) SETTINGS fsync_after_insert=0, fsync_directories=0; ``` -Data will be read from all servers in the `logs` cluster, from the default.hits table located on every server in the cluster. +Data will be read from all servers in the `logs` cluster, from the `default.hits` table located on every server in the cluster. 
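+A couple of illustrative queries against such a table (assuming, hypothetically, that `default.hits` contains a `UserID` column):
+
+``` sql
+SELECT count() FROM hits_all;
+
+SELECT UserID, count() AS c
+FROM hits_all
+GROUP BY UserID
+ORDER BY c DESC
+LIMIT 10;
+```
+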
Data is not only read but is partially processed on the remote servers (to the extent that this is possible). -For example, for a query with GROUP BY, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated. +For example, for a query with `GROUP BY`, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated. -Instead of the database name, you can use a constant expression that returns a string. For example: currentDatabase(). +Instead of the database name, you can use a constant expression that returns a string. For example: `currentDatabase()`. -logs – The cluster name in the server’s config file. +## Clusters {#distributed-clusters} -Clusters are set like this: +Clusters are configured in the [server configuration file](../../../operations/configuration-files.md): ``` xml @@ -132,12 +152,13 @@ Replicas are duplicating servers (in order to read all the data, you can access Cluster names must not contain dots. The parameters `host`, `port`, and optionally `user`, `password`, `secure`, `compression` are specified for each server: + - `host` – The address of the remote server. You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server does not start. If you change the DNS record, restart the server. -- `port` – The TCP port for messenger activity (`tcp_port` in the config, usually set to 9000). Do not confuse it with http_port. -- `user` – Name of the user for connecting to a remote server. Default value: default. This user must have access to connect to the specified server. Access is configured in the users.xml file. For more information, see the section [Access rights](../../../operations/access-rights.md). +- `port` – The TCP port for messenger activity (`tcp_port` in the config, usually set to 9000). Not to be confused with `http_port`. +- `user` – Name of the user for connecting to a remote server. Default value is the `default` user. This user must have access to connect to the specified server. Access is configured in the `users.xml` file. For more information, see the section [Access rights](../../../operations/access-rights.md). - `password` – The password for connecting to a remote server (not masked). Default value: empty string. -- `secure` - Use ssl for connection, usually you also should define `port` = 9440. Server should listen on `9440` and have correct certificates. -- `compression` - Use data compression. Default value: true. +- `secure` - Whether to use a secure SSL/TLS connection. Usually also requires specifying the port (the default secure port is `9440`). The server should listen on `9440` and be configured with correct certificates. +- `compression` - Use data compression. Default value: `true`. When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) – see the [load_balancing](../../../operations/settings/settings.md#settings-load_balancing) setting. If the connection with the server is not established, there will be an attempt to connect with a short timeout. 
If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times. @@ -149,40 +170,42 @@ You can specify as many clusters as you wish in the configuration. To view your clusters, use the `system.clusters` table. -The Distributed engine allows working with a cluster like a local server. However, the cluster is inextensible: you must write its configuration in the server config file (even better, for all the cluster’s servers). +The `Distributed` engine allows working with a cluster like a local server. However, the cluster's configuration cannot be specified dynamically, it has to be configured in the server config file. Usually, all servers in a cluster will have the same cluster config (though this is not required). Clusters from the config file are updated on the fly, without restarting the server. -The Distributed engine requires writing clusters to the config file. Clusters from the config file are updated on the fly, without restarting the server. If you need to send a query to an unknown set of shards and replicas each time, you do not need to create a Distributed table – use the `remote` table function instead. See the section [Table functions](../../../sql-reference/table-functions/index.md). +If you need to send a query to an unknown set of shards and replicas each time, you do not need to create a `Distributed` table – use the `remote` table function instead. See the section [Table functions](../../../sql-reference/table-functions/index.md). + +## Writing data {#distributed-writing-data} There are two methods for writing data to a cluster: -First, you can define which servers to write which data to and perform the write directly on each shard. In other words, perform INSERT in the tables that the distributed table “looks at”. This is the most flexible solution as you can use any sharding scheme, which could be non-trivial due to the requirements of the subject area. This is also the most optimal solution since data can be written to different shards completely independently. +First, you can define which servers to write which data to and perform the write directly on each shard. In other words, perform direct `INSERT` statements on the remote tables in the cluster that the `Distributed` table is pointing to. This is the most flexible solution as you can use any sharding scheme, even one that is non-trivial due to the requirements of the subject area. This is also the most optimal solution since data can be written to different shards completely independently. -Second, you can perform INSERT in a Distributed table. In this case, the table will distribute the inserted data across the servers itself. In order to write to a Distributed table, it must have a sharding key set (the last parameter). In addition, if there is only one shard, the write operation works without specifying the sharding key, since it does not mean anything in this case. +Second, you can perform `INSERT` statements on a `Distributed` table. In this case, the table will distribute the inserted data across the servers itself. In order to write to a `Distributed` table, it must have the `sharding_key` parameter configured (except if there is only one shard). -Each shard can have a weight defined in the config file. By default, the weight is equal to one. Data is distributed across shards in the amount proportional to the shard weight. 
For example, if there are two shards and the first has a weight of 9 while the second has a weight of 10, the first will be sent 9 / 19 parts of the rows, and the second will be sent 10 / 19. +Each shard can have a `` defined in the config file. By default, the weight is `1`. Data is distributed across shards in the amount proportional to the shard weight. All shard weights are summed up, then each shard's weight is divided by the total to determine each shard's proportion. For example, if there are two shards and the first has a weight of 1 while the second has a weight of 2, the first will be sent one third (1 / 3) of inserted rows and the second will be sent two thirds (2 / 3). -Each shard can have the `internal_replication` parameter defined in the config file. +Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this if the tables underlying the `Distributed` table are replicated tables (e.g. any of the `Replicated*MergeTree` table engines). One of the table replicas will receive the write and it will be replicated to the other replicas automatically. -If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this alternative if the Distributed table “looks at” replicated tables. In other words, if the table where data will be written is going to replicate them itself. - -If it is set to `false` (the default), data is written to all replicas. In essence, this means that the Distributed table replicates data itself. This is worse than using replicated tables, because the consistency of replicas is not checked, and over time they will contain slightly different data. +If `internal_replication` is set to `false` (the default), data is written to all replicas. In this case, the `Distributed` table replicates data itself. This is worse than using replicated tables because the consistency of replicas is not checked and, over time, they will contain slightly different data. To select the shard that a row of data is sent to, the sharding expression is analyzed, and its remainder is taken from dividing it by the total weight of the shards. The row is sent to the shard that corresponds to the half-interval of the remainders from `prev_weights` to `prev_weights + weight`, where `prev_weights` is the total weight of the shards with the smallest number, and `weight` is the weight of this shard. For example, if there are two shards, and the first has a weight of 9 while the second has a weight of 10, the row will be sent to the first shard for the remainders from the range \[0, 9), and to the second for the remainders from the range \[9, 19). -The sharding expression can be any expression from constants and table columns that returns an integer. For example, you can use the expression `rand()` for random distribution of data, or `UserID` for distribution by the remainder from dividing the user’s ID (then the data of a single user will reside on a single shard, which simplifies running IN and JOIN by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function: intHash64(UserID). +The sharding expression can be any expression from constants and table columns that returns an integer. 
For example, you can use the expression `rand()` for random distribution of data, or `UserID` for distribution by the remainder from dividing the user’s ID (then the data of a single user will reside on a single shard, which simplifies running `IN` and `JOIN` by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function e.g. `intHash64(UserID)`. -A simple remainder from the division is a limited solution for sharding and isn’t always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more). In the latter case, use the sharding scheme required by the subject area, rather than using entries in Distributed tables. - -SELECT queries are sent to all the shards and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you do not have to transfer old data into it. Instead, you can write new data to it by using a heavier weight – the data will be distributed slightly unevenly, but queries will work correctly and efficiently. +A simple remainder from the division is a limited solution for sharding and isn’t always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more). In the latter case, use the sharding scheme required by the subject area rather than using entries in `Distributed` tables. You should be concerned about the sharding scheme in the following cases: -- Queries are used that require joining data (IN or JOIN) by a specific key. If data is sharded by this key, you can use local IN or JOIN instead of GLOBAL IN or GLOBAL JOIN, which is much more efficient. -- A large number of servers is used (hundreds or more) with a large number of small queries (queries of individual clients - websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we’ve done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. Distributed tables are created for each layer, and a single shared distributed table is created for global queries. +- Queries are used that require joining data (`IN` or `JOIN`) by a specific key. If data is sharded by this key, you can use local `IN` or `JOIN` instead of `GLOBAL IN` or `GLOBAL JOIN`, which is much more efficient. +- A large number of servers is used (hundreds or more) with a large number of small queries, for example, queries for data of individual clients (e.g. websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we’ve done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. `Distributed` tables are created for each layer, and a single shared distributed table is created for global queries. Data is written asynchronously. 
When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The periodicity for sending data is managed by the [distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`. The number of threads performing background tasks can be set by [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size) setting. -If the server ceased to exist or had a rough restart (for example, after a device failure) after an INSERT to a Distributed table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the `broken` subdirectory and no longer used. +If the server ceased to exist or had a rough restart (for example, due to a hardware failure) after an `INSERT` to a `Distributed` table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the `broken` subdirectory and no longer used. + +## Reading data {#distributed-reading-data} + +When querying a `Distributed` table, `SELECT` queries are sent to all shards and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you do not have to transfer old data into it. Instead, you can write new data to it by using a heavier weight – the data will be distributed slightly unevenly, but queries will work correctly and efficiently. When the `max_parallel_replicas` option is enabled, query processing is parallelized across all replicas within a single shard. For more information, see the section [max_parallel_replicas](../../../operations/settings/settings.md#settings-max_parallel_replicas). diff --git a/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 100644 index 00000000000..731dc9dface --- /dev/null +++ b/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1,15 @@ +--- +title: How do I contribute code to ClickHouse? +toc_hidden: true +toc_priority: 120 +--- + +# How do I contribute code to ClickHouse? {#how-do-i-contribute-code-to-clickhouse} + +ClickHouse is an open-source project [developed on GitHub](https://github.com/ClickHouse/ClickHouse). + +As customary, contribution instructions are published in [CONTRIBUTING.md](https://github.com/ClickHouse/ClickHouse/blob/master/CONTRIBUTING.md) file in the root of the source code repository. 
+ +If you want to suggest a substantial change to ClickHouse, consider [opening a GitHub issue](https://github.com/ClickHouse/ClickHouse/issues/new/choose) explaining what you want to do, to discuss it with maintainers and community first. [Examples of such RFC issues](https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aissue+is%3Aopen+rfc). + +If your contributions are security related, please check out [our security policy](https://github.com/ClickHouse/ClickHouse/security/policy/) too. diff --git a/docs/en/faq/general/index.md b/docs/en/faq/general/index.md index cd2368be1cf..51fff9a53ae 100644 --- a/docs/en/faq/general/index.md +++ b/docs/en/faq/general/index.md @@ -17,6 +17,7 @@ Questions: - [What is OLAP?](../../faq/general/olap.md) - [What is a columnar database?](../../faq/general/columnar-database.md) - [Why not use something like MapReduce?](../../faq/general/mapreduce.md) +- [How do I contribute code to ClickHouse?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md) !!! info "Don’t see what you were looking for?" Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. diff --git a/docs/en/faq/operations/index.md b/docs/en/faq/operations/index.md index c0a6d85b66d..81aec18b9cf 100644 --- a/docs/en/faq/operations/index.md +++ b/docs/en/faq/operations/index.md @@ -11,6 +11,7 @@ Questions: - [Which ClickHouse version to use in production?](../../faq/operations/production.md) - [Is it possible to delete old records from a ClickHouse table?](../../faq/operations/delete-old-data.md) +- [Does ClickHouse support multi-region replication?](../../faq/operations/multi-region-replication.md) !!! info "Don’t see what you were looking for?" Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. diff --git a/docs/en/faq/operations/multi-region-replication.md b/docs/en/faq/operations/multi-region-replication.md new file mode 100644 index 00000000000..7d78737544a --- /dev/null +++ b/docs/en/faq/operations/multi-region-replication.md @@ -0,0 +1,13 @@ +--- +title: Does ClickHouse support multi-region replication? +toc_hidden: true +toc_priority: 30 +--- + +# Does ClickHouse support multi-region replication? {#does-clickhouse-support-multi-region-replication} + +The short answer is "yes". However, we recommend keeping latency between all regions/datacenters in two-digit range, otherwise write performance will suffer as it goes through distributed consensus protocol. For example, replication between US coasts will likely work fine, but between the US and Europe won't. + +Configuration-wise there's no difference compared to single-region replication, simply use hosts that are located in different locations for replicas. + +For more information, see [full article on data replication](../../engines/table-engines/mergetree-family/replication.md). 
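+As a rough sketch of the point above (the cluster name, ZooKeeper path and `{shard}`/`{replica}` macros are illustrative and depend on your configuration), the table definition is identical whether the replicas live in one region or several — replica placement is determined by the server configuration, not by the table DDL:
+
+``` sql
+CREATE TABLE events ON CLUSTER my_cluster
+(
+    event_date Date,
+    event_id UInt64
+)
+ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}')
+ORDER BY (event_date, event_id);
+```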
diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 4a97ab6589d..70a1b8349ff 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -142,6 +142,12 @@ On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sourc To start the server as a daemon, run: +``` bash +$ sudo clickhouse start +``` + +There are also another ways to run ClickHouse: + ``` bash $ sudo service clickhouse-server start ``` @@ -152,6 +158,12 @@ If you do not have `service` command, run as $ sudo /etc/init.d/clickhouse-server start ``` +If you have `systemctl` command, run as + +``` bash +$ sudo systemctl start clickhouse-server.service +``` + See the logs in the `/var/log/clickhouse-server/` directory. If the server does not start, check the configurations in the file `/etc/clickhouse-server/config.xml`. diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 245a0c8fe89..f266d0e6354 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -204,7 +204,7 @@ When parsing with this format, tabs or linefeeds are not allowed in each field. This format is also available under the name `TSVRawWithNames`. -## TabSeparatedWithNamesAndTypes {#tabseparatedrawwithnamesandtypes} +## TabSeparatedRawWithNamesAndTypes {#tabseparatedrawwithnamesandtypes} Differs from `TabSeparatedWithNamesAndTypes` format in that the rows are written without escaping. When parsing with this format, tabs or linefeeds are not allowed in each field. diff --git a/docs/en/interfaces/grpc.md b/docs/en/interfaces/grpc.md new file mode 100644 index 00000000000..b30715082ec --- /dev/null +++ b/docs/en/interfaces/grpc.md @@ -0,0 +1,99 @@ +--- +toc_priority: 19 +toc_title: gRPC Interface +--- + +# gRPC Interface {#grpc-interface} + +## Introduction {#grpc-interface-introduction} + +ClickHouse supports [gRPC](https://grpc.io/) interface. It is an open source remote procedure call system that uses HTTP/2 and [Protocol Buffers](https://en.wikipedia.org/wiki/Protocol_Buffers). The implementation of gRPC in ClickHouse supports: + +- SSL; +- authentication; +- sessions; +- compression; +- parallel queries through the same channel; +- cancellation of queries; +- getting progress and logs; +- external tables. + +The specification of the interface is described in [clickhouse_grpc.proto](https://github.com/ClickHouse/ClickHouse/blob/master/src/Server/grpc_protos/clickhouse_grpc.proto). + +## gRPC Configuration {#grpc-interface-configuration} + +To use the gRPC interface set `grpc_port` in the main [server configuration](../operations/configuration-files.md). Other configuration options see in the following example: + +```xml +9100 + + false + + + /path/to/ssl_cert_file + /path/to/ssl_key_file + + + false + + + /path/to/ssl_ca_cert_file + + + deflate + + + medium + + + -1 + -1 + + + false + +``` + +## Built-in Client {#grpc-client} + +You can write a client in any of the programming languages supported by gRPC using the provided [specification](https://github.com/ClickHouse/ClickHouse/blob/master/src/Server/grpc_protos/clickhouse_grpc.proto). +Or you can use a built-in Python client. It is placed in [utils/grpc-client/clickhouse-grpc-client.py](https://github.com/ClickHouse/ClickHouse/blob/master/utils/grpc-client/clickhouse-grpc-client.py) in the repository. The built-in client requires [grpcio and grpcio-tools](https://grpc.io/docs/languages/python/quickstart) Python modules. 
+ +The client supports the following arguments: + +- `--help` – Shows a help message and exits. +- `--host HOST, -h HOST` – A server name. Default value: `localhost`. You can use IPv4 or IPv6 addresses also. +- `--port PORT` – A port to connect to. This port should be enabled in the ClickHouse server configuration (see `grpc_port`). Default value: `9100`. +- `--user USER_NAME, -u USER_NAME` – A user name. Default value: `default`. +- `--password PASSWORD` – A password. Default value: empty string. +- `--query QUERY, -q QUERY` – A query to process when using non-interactive mode. +- `--database DATABASE, -d DATABASE` – A default database. If not specified, the current database set in the server settings is used (`default` by default). +- `--format OUTPUT_FORMAT, -f OUTPUT_FORMAT` – A result output [format](formats.md). Default value for interactive mode: `PrettyCompact`. +- `--debug` – Enables showing debug information. + +To run the client in an interactive mode call it without `--query` argument. + +In a batch mode query data can be passed via `stdin`. + +**Client Usage Example** + +In the following example a table is created and loaded with data from a CSV file. Then the content of the table is queried. + +``` bash +./clickhouse-grpc-client.py -q "CREATE TABLE grpc_example_table (id UInt32, text String) ENGINE = MergeTree() ORDER BY id;" +echo "0,Input data for" > a.txt ; echo "1,gRPC protocol example" >> a.txt +cat a.txt | ./clickhouse-grpc-client.py -q "INSERT INTO grpc_example_table FORMAT CSV" + +./clickhouse-grpc-client.py --format PrettyCompact -q "SELECT * FROM grpc_example_table;" +``` + +Result: + +``` text +┌─id─┬─text──────────────────┐ +│ 0 │ Input data for │ +│ 1 │ gRPC protocol example │ +└────┴───────────────────────┘ +``` diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index a2f0944de47..38e729fde0b 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -424,7 +424,10 @@ Next are the configuration methods for different `type`. `query` value is a predefined query of `predefined_query_handler`, which is executed by ClickHouse when an HTTP request is matched and the result of the query is returned. It is a must configuration. -The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_alter_threads` settings, then queries the system table to check whether these settings were set successfully. +The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` settings, then queries the system table to check whether these settings were set successfully. + +!!! note "Warning" + To keep the default `handlers` such as` query`, `play`,` ping`, use the `` rule. Example: @@ -443,13 +446,14 @@ Example: SELECT name, value FROM system.settings WHERE name = {name_2:String} + ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! 
note "caution" @@ -461,7 +465,7 @@ In `dynamic_query_handler`, the query is written in the form of param of the HTT ClickHouse extracts and executes the value corresponding to the `query_param_name` value in the URL of the HTTP request. The default value of `query_param_name` is `/query` . It is an optional configuration. If there is no definition in the configuration file, the param is not passed in. -To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_alter_threads` and `queries` whether the settings were set successfully. +To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` and `queries` whether the settings were set successfully. Example: @@ -475,13 +479,14 @@ Example: query_param + ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} @@ -505,6 +510,7 @@ Return a message. Say Hi! + ``` diff --git a/docs/en/interfaces/index.md b/docs/en/interfaces/index.md index 10f15ae47d6..7b73cec22a0 100644 --- a/docs/en/interfaces/index.md +++ b/docs/en/interfaces/index.md @@ -6,10 +6,11 @@ toc_title: Introduction # Interfaces {#interfaces} -ClickHouse provides two network interfaces (both can be optionally wrapped in TLS for additional security): +ClickHouse provides three network interfaces (they can be optionally wrapped in TLS for additional security): - [HTTP](http.md), which is documented and easy to use directly. - [Native TCP](../interfaces/tcp.md), which has less overhead. +- [gRPC](grpc.md). In most cases it is recommended to use appropriate tool or library instead of interacting with those directly. Officially supported by Yandex are the following: @@ -24,4 +25,3 @@ There are also a wide range of third-party libraries for working with ClickHouse - [Integrations](../interfaces/third-party/integrations.md) - [Visual interfaces](../interfaces/third-party/gui.md) -[Original article](https://clickhouse.com/docs/en/interfaces/) diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md index 342b1c9a496..a116c8e2222 100644 --- a/docs/en/interfaces/third-party/client-libraries.md +++ b/docs/en/interfaces/third-party/client-libraries.md @@ -6,7 +6,7 @@ toc_title: Client Libraries # Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers} !!! warning "Disclaimer" - Yandex does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. + ClickHouse Inc does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. 
- Python - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index 593019bfb2e..393974c60c4 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -220,4 +220,24 @@ SeekTable is [free](https://www.seektable.com/help/cloud-pricing) for personal/i [Chadmin](https://github.com/bun4uk/chadmin) is a simple UI where you can visualize your currently running queries on your ClickHouse cluster and info about them and kill them if you want. +### TABLUM.IO {#tablum_io} + +[TABLUM.IO](https://tablum.io/) — an online query and analytics tool for ETL and visualization. It allows connecting to ClickHouse, query data via a versatile SQL console as well as to load data from static files and 3rd party services. TABLUM.IO can visualize data results as charts and tables. + +Features: +- ETL: data loading from popular databases, local and remote files, API invocations. +- Versatile SQL console with syntax highlight and visual query builder. +- Data visualization as charts and tables. +- Data materialization and sub-queries. +- Data reporting to Slack, Telegram or email. +- Data pipelining via proprietary API. +- Data export in JSON, CSV, SQL, HTML formats. +- Web-based interface. + +TABLUM.IO can be run as a self-hosted solution (as a docker image) or in the cloud. +License: [commercial](https://tablum.io/pricing) product with 3-month free period. + +Try it out for free [in the cloud](https://tablum.io/try). +Learn more about the product at [TABLUM.IO](https://tablum.io/) + [Original article](https://clickhouse.com/docs/en/interfaces/third-party/gui/) diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index 325cd1ff825..87c5a6f7aec 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -87,7 +87,7 @@ toc_title: Adopters | Kontur | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | | Kuaishou | Video | — | — | — | [ClickHouse Meetup, October 2018](https://clickhouse.com/blog/en/2018/clickhouse-community-meetup-in-beijing-on-october-28-2018/) | | KGK Global | Vehicle monitoring | — | — | — | [Press release, June 2021](https://zoom.cnews.ru/news/item/530921) | -| Lawrence Berkeley National Laboratory | Research | Traffic analysis | 1 server | 11.8 TiB | [Slides in English, April 2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) | +| Lawrence Berkeley National Laboratory | Research | Traffic analysis | 5 servers | 55 TiB | [Slides in English, April 2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) | | LifeStreet | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | | Mail.ru Cloud Solutions | Cloud services | Main product | — | — | [Article in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | | MAXILECT | Ad Tech, Blockchain, ML, AI | — | — | — | [Job advertisement, 2021](https://www.linkedin.com/feed/update/urn:li:activity:6780842017229430784/) | @@ -178,5 +178,9 @@ toc_title: Adopters | Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | | ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) | | ДомКлик | Real Estate | — | — | — | 
[Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) | +| Futurra Group | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) | +| UseTech | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) | +| Lookforsale | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) | +| R-Vision | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) | [Original article](https://clickhouse.com/docs/en/introduction/adopters/) diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index eb4673be18a..350ca835187 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -3,20 +3,25 @@ toc_priority: 66 toc_title: ClickHouse Keeper --- -# [pre-production] ClickHouse Keeper +# [pre-production] ClickHouse Keeper {#clickHouse-keeper} ClickHouse server uses [ZooKeeper](https://zookeeper.apache.org/) coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is an alternative coordination system compatible with ZooKeeper. !!! warning "Warning" This feature is currently in the pre-production stage. We test it in our CI and on small internal installations. -## Implementation details +## Implementation details {#implementation-details} ZooKeeper is one of the first well-known open-source coordination systems. It's implemented in Java, has quite a simple and powerful data model. ZooKeeper's coordination algorithm called ZAB (ZooKeeper Atomic Broadcast) doesn't provide linearizability guarantees for reads, because each ZooKeeper node serves reads locally. Unlike ZooKeeper ClickHouse Keeper is written in C++ and uses [RAFT algorithm](https://raft.github.io/) [implementation](https://github.com/eBay/NuRaft). This algorithm allows to have linearizability for reads and writes, has several open-source implementations in different languages. By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but `clickhouse-keeper-converter` tool allows to convert ZooKeeper data to ClickHouse Keeper snapshot. Interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper so mixed ZooKeeper / ClickHouse Keeper cluster is impossible. -## Configuration +ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64. + +!!! info "Note" + External integrations are not supported. + +## Configuration {#configuration} ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is ``. 
Keeper configuration has the following parameters: @@ -97,7 +102,7 @@ Examples of configuration for quorum with three nodes can be found in [integrati ``` -## How to run +## How to run {#how-to-run} ClickHouse Keeper is bundled into the ClickHouse server package, just add configuration of `` and start ClickHouse server as always. If you want to run standalone ClickHouse Keeper you can start it in a similar way with: @@ -105,26 +110,27 @@ ClickHouse Keeper is bundled into the ClickHouse server package, just add config clickhouse-keeper --config /etc/your_path_to_config/config.xml --daemon ``` -## Four Letter Word Commands +## Four Letter Word Commands {#four-letter-word-commands} ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. + ``` echo mntr | nc localhost 9181 ``` Bellow is the detailed 4lw commands: -- ruok : Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information. +- `ruok`: Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information. ``` imok ``` -- mntr : Outputs a list of variables that could be used for monitoring the health of the cluster. +- `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster. ``` zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 @@ -146,12 +152,11 @@ zk_followers 0 zk_synced_followers 0 ``` -- srvr : Lists full details for the server. +- `srvr`: Lists full details for the server. ``` ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 Latency min/avg/max: 0/0/0 - Received: 2 Sent : 2 Connections: 1 @@ -161,16 +166,14 @@ Mode: leader Node count: 4 ``` -- stat : Lists brief details for the server and connected clients. +- `stat`: Lists brief details for the server and connected clients. ``` ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 Clients: 192.168.1.1:52852(recved=0,sent=0) 192.168.1.1:52042(recved=24,sent=48) - Latency min/avg/max: 0/0/0 - Received: 4 Sent : 4 Connections: 1 @@ -178,16 +181,15 @@ Outstanding: 0 Zxid: 36 Mode: leader Node count: 4 - ``` -- srst : Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`. +- `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`. ``` Server stats reset. 
``` -- conf : Print details about serving configuration. +- `conf`: Print details about serving configuration. ``` server_id=1 @@ -220,20 +222,20 @@ compress_snapshots_with_zstd_format=true configuration_change_tries_count=20 ``` -- cons : List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc... +- `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc... ``` 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) ``` -- crst : Reset connection/session statistics for all connections. +- `crst`: Reset connection/session statistics for all connections. ``` Connection stats reset. ``` -- envi : Print details about serving environment +- `envi`: Print details about serving environment ``` Environment: @@ -250,41 +252,41 @@ user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ ``` -- dirs : Shows the total size of snapshot and log files in bytes +- `dirs`: Shows the total size of snapshot and log files in bytes ``` snapshot_dir_size: 0 log_dir_size: 3875 ``` -- isro: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode. +- `isro`: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode. ``` rw ``` -- wchs : Lists brief information on watches for the server. +- `wchs`: Lists brief information on watches for the server. ``` 1 connections watching 1 paths Total watches:1 ``` -- wchc : Lists detailed information on watches for the server, by session. This outputs a list of sessions(connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully. +- `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully. ``` 0x0000000000000001 /clickhouse/task_queue/ddl ``` -- wchp : Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully. +- `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (i. e. impact server performance), use it carefully. ``` /clickhouse/task_queue/ddl 0x0000000000000001 ``` -- dump : Lists the outstanding sessions and ephemeral nodes. This only works on the leader. +- `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader. 
``` Sessions dump (2): @@ -295,7 +297,7 @@ Sessions with Ephemerals (1): /clickhouse/task_queue/ddl ``` -## [experimental] Migration from ZooKeeper +## [experimental] Migration from ZooKeeper {#migration-from-zookeeper} Seamlessly migration from ZooKeeper to ClickHouse Keeper is impossible you have to stop your ZooKeeper cluster, convert data and start ClickHouse Keeper. `clickhouse-keeper-converter` tool allows converting ZooKeeper logs and snapshots to ClickHouse Keeper snapshot. It works only with ZooKeeper > 3.4. Steps for migration: diff --git a/docs/en/operations/external-authenticators/kerberos.md b/docs/en/operations/external-authenticators/kerberos.md index 2e2a88dc7a8..da84c1f6a89 100644 --- a/docs/en/operations/external-authenticators/kerberos.md +++ b/docs/en/operations/external-authenticators/kerberos.md @@ -14,11 +14,11 @@ To enable Kerberos, one should include `kerberos` section in `config.xml`. This #### Parameters: - `principal` - canonical service principal name that will be acquired and used when accepting security contexts. - - This parameter is optional, if omitted, the default principal will be used. + - This parameter is optional, if omitted, the default principal will be used. - `realm` - a realm, that will be used to restrict authentication to only those requests whose initiator's realm matches it. - - This parameter is optional, if omitted, no additional filtering by realm will be applied. + - This parameter is optional, if omitted, no additional filtering by realm will be applied. Example (goes into `config.xml`): @@ -75,7 +75,7 @@ In order to enable Kerberos authentication for the user, specify `kerberos` sect Parameters: - `realm` - a realm that will be used to restrict authentication to only those requests whose initiator's realm matches it. - - This parameter is optional, if omitted, no additional filtering by realm will be applied. + - This parameter is optional, if omitted, no additional filtering by realm will be applied. Example (goes into `users.xml`): diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 717ab4e14b7..bd164fa59f9 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -435,26 +435,58 @@ Similar to `interserver_http_host`, except that this hostname can be used by oth ## interserver_http_credentials {#server-settings-interserver-http-credentials} -The username and password used to authenticate during [replication](../../engines/table-engines/mergetree-family/replication.md) with the Replicated\* engines. These credentials are used only for communication between replicas and are unrelated to credentials for ClickHouse clients. The server is checking these credentials for connecting replicas and use the same credentials when connecting to other replicas. So, these credentials should be set the same for all replicas in a cluster. -By default, the authentication is not used. +A username and a password used to connect to other servers during [replication](../../engines/table-engines/mergetree-family/replication.md). Also the server authenticates other replicas using these credentials. So, `interserver_http_credentials` must be the same for all replicas in a cluster. + +By default, if `interserver_http_credentials` section is omitted, authentication is not used during replication. !!! 
note "Note" - These credentials are common for replication through `HTTP` and `HTTPS`. + `interserver_http_credentials` settings do not relate to a ClickHouse client credentials [configuration](../../interfaces/cli.md#configuration_files). -This section contains the following parameters: +!!! note "Note" + These credentials are common for replication via `HTTP` and `HTTPS`. -- `user` — username. -- `password` — password. +The section contains the following parameters: -**Example** +- `user` — Username. +- `password` — Password. +- `allow_empty` — If `true`, then other replicas are allowed to connect without authentication even if credentials are set. If `false`, then connections without authentication are refused. Default value: `false`. +- `old` — Contains old `user` and `password` used during credential rotation. Several `old` sections can be specified. + +**Credentials Rotation** + +ClickHouse supports dynamic interserver credentials rotation without stopping all replicas at the same time to update their configuration. Credentials can be changed in several steps. + +To enable authentication, set `interserver_http_credentials.allow_empty` to `true` and add credentials. This allows connections with authentication and without it. + +``` xml + + admin + 111 + true + +``` + +After configuring all replicas set `allow_empty` to `false` or remove this setting. It makes authentication with new credentials mandatory. + +To change existing credentials, move the username and the password to `interserver_http_credentials.old` section and update `user` and `password` with new values. At this point the server uses new credentials to connect to other replicas and accepts connections with either new or old credentials. ``` xml admin 222 + + admin + 111 + + + temp + 000 + ``` +When new credentials are applied to all replicas, old credentials may be removed. + ## keep_alive_timeout {#keep-alive-timeout} The number of seconds that ClickHouse waits for incoming requests before closing the connection. Defaults to 10 seconds. @@ -505,7 +537,7 @@ Keys: - `level` – Logging level. Acceptable values: `trace`, `debug`, `information`, `warning`, `error`. - `log` – The log file. Contains all the entries according to `level`. - `errorlog` – Error log file. -- `size` – Size of the file. Applies to `log`and`errorlog`. Once the file reaches `size`, ClickHouse archives and renames it, and creates a new log file in its place. +- `size` – Size of the file. Applies to `log` and `errorlog`. Once the file reaches `size`, ClickHouse archives and renames it, and creates a new log file in its place. - `count` – The number of archived log files that ClickHouse stores. **Example** @@ -640,7 +672,8 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa ## max_concurrent_queries {#max-concurrent-queries} -The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). +The maximum number of simultaneously processed queries related to MergeTree table. 
+Queries may be limited by other settings: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). !!! info "Note" These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. @@ -656,6 +689,42 @@ Possible values: 100 ``` +## max_concurrent_insert_queries {#max-concurrent-insert-queries} + +The maximum number of simultaneously processed insert queries. + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — Disabled. + +**Example** + +``` xml +100 +``` + +## max_concurrent_select_queries {#max-concurrent-select-queries} + +The maximum number of simultaneously processed select queries. + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — Disabled. + +**Example** + +``` xml +100 +``` + ## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} The maximum number of simultaneously processed queries related to MergeTree table per user. @@ -750,9 +819,13 @@ The value 0 means that you can delete all tables without any restrictions. ## max_thread_pool_size {#max-thread-pool-size} -The maximum number of threads in the Global Thread pool. +ClickHouse uses threads from the Global Thread pool to process queries. If there is no idle thread to process a query, then a new thread is created in the pool. `max_thread_pool_size` limits the maximum number of threads in the pool. -Default value: 10000. +Possible values: + +- Positive integer. + +Default value: `10000`. **Example** @@ -762,9 +835,13 @@ Default value: 10000. ## max_thread_pool_free_size {#max-thread-pool-free-size} -The number of threads that are always held in the Global Thread pool. +If the number of **idle** threads in the Global Thread pool is greater than `max_thread_pool_free_size`, then ClickHouse releases resources occupied by some threads and the pool size is decreased. Threads can be created again if necessary. -Default value: 1000. +Possible values: + +- Positive integer. + +Default value: `1000`. **Example** @@ -774,9 +851,13 @@ Default value: 1000. ## thread_pool_queue_size {#thread-pool-queue-size} -The limit to the number of jobs that can be scheduled on the Global Thread pool. Increasing queue size leads to larger memory usage. It is recommended to keep this value equal to the `max_thread_pool_size`. +The maximum number of jobs that can be scheduled on the Global Thread pool. Increasing queue size leads to larger memory usage. It is recommended to keep this value equal to [max_thread_pool_size](#max-thread-pool-size). -Default value: 10000. +Possible values: + +- Positive integer. + +Default value: `10000`. **Example** @@ -1443,7 +1524,7 @@ You can also define sections `memory` — means storing information only in memo To add an LDAP server as a remote user directory of users that are not defined locally, define a single `ldap` section with a following parameters: - `server` — one of LDAP server names defined in `ldap_servers` config section. 
This parameter is mandatory and cannot be empty. -- `roles` — section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. If no roles are specified, user will not be able to perform any actions after authentication. If any of the listed roles is not defined locally at the time of authentication, the authenthication attept will fail as if the provided password was incorrect. +- `roles` — section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. If no roles are specified, user will not be able to perform any actions after authentication. If any of the listed roles is not defined locally at the time of authentication, the authentication attempt will fail as if the provided password was incorrect. **Example** @@ -1507,3 +1588,4 @@ Possible values: - Positive integer. Default value: `10000`. + diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index 0fd1e54955c..af75d130ed3 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -356,3 +356,24 @@ Possible values: - 1 — Parts are detached. Default value: `0`. + +## merge_tree_clear_old_temporary_directories_interval_seconds {#setting-merge-tree-clear-old-temporary-directories-interval-seconds} + +Sets the interval in seconds for ClickHouse to execute the cleanup of old temporary directories. + +Possible values: + +- Any positive integer. + +Default value: `60` seconds. + +## merge_tree_clear_old_parts_interval_seconds {#setting-merge-tree-clear-old-parts-interval-seconds} + +Sets the interval in seconds for ClickHouse to execute the cleanup of old parts, WALs, and mutations. + +Possible values: + +- Any positive integer. + +Default value: `1` second. + diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index fa4cc41e8ff..510047f4353 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -817,9 +817,19 @@ If the number of rows to be read from a file of a [MergeTree](../../engines/tabl Possible values: -- Any positive integer. +- Positive integer. -Default value: 163840. +Default value: `163840`. + +## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem} + +The minimum number of lines to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. + +Default value: `163840`. ## merge_tree_min_bytes_for_concurrent_read {#setting-merge-tree-min-bytes-for-concurrent-read} @@ -827,9 +837,19 @@ If the number of bytes to read from one file of a [MergeTree](../../engines/tabl Possible value: -- Any positive integer. +- Positive integer. -Default value: 251658240. +Default value: `251658240`. + +## merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem {#merge-tree-min-bytes-for-concurrent-read-for-remote-filesystem} + +The minimum number of bytes to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. + +Default value: `251658240`. 
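Both remote-filesystem thresholds described above are ordinary query-level settings, so they can also be tuned per session. A minimal sketch (the values shown are simply the documented defaults):

``` sql
-- Adjust the parallel-read thresholds for reads from a remote filesystem.
-- The values below are just the documented defaults.
SET merge_tree_min_rows_for_concurrent_read_for_remote_filesystem = 163840;
SET merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem = 251658240;
```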
## merge_tree_min_rows_for_seek {#setting-merge-tree-min-rows-for-seek} @@ -885,26 +905,6 @@ Possible values: Default value: 2013265920. -## merge_tree_clear_old_temporary_directories_interval_seconds {#setting-merge-tree-clear-old-temporary-directories-interval-seconds} - -Sets the interval in seconds for ClickHouse to execute the cleanup of old temporary directories. - -Possible values: - -- Any positive integer. - -Default value: `60` seconds. - -## merge_tree_clear_old_parts_interval_seconds {#setting-merge-tree-clear-old-parts-interval-seconds} - -Sets the interval in seconds for ClickHouse to execute the cleanup of old parts, WALs, and mutations. - -Possible values: - -- Any positive integer. - -Default value: `1` second. - ## min_bytes_to_use_direct_io {#settings-min-bytes-to-use-direct-io} The minimum data volume required for using direct I/O access to the storage disk. @@ -1489,7 +1489,7 @@ Possible values: Default value: `1`. -**See Also** +**See Also** - [min_count_to_compile_aggregate_expression](#min_count_to_compile_aggregate_expression) @@ -1707,18 +1707,17 @@ Quorum writes `INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written. -All the replicas in the quorum are consistent, i.e., they contain data from all previous `INSERT` queries. The `INSERT` sequence is linearized. +When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#settings-select_sequential_consistency). -When reading the data written from the `insert_quorum`, you can use the [select_sequential_consistency](#settings-select_sequential_consistency) option. - -ClickHouse generates an exception +ClickHouse generates an exception: - If the number of available replicas at the time of the query is less than the `insert_quorum`. -- At an attempt to write data when the previous block has not yet been inserted in the `insert_quorum` of replicas. This situation may occur if the user tries to perform an `INSERT` before the previous one with the `insert_quorum` is completed. +- When `insert_quorum_parallel` is disabled and an attempt to write data is made when the previous block has not yet been inserted in `insert_quorum` of replicas. This situation may occur if the user tries to perform another `INSERT` query to the same table before the previous one with `insert_quorum` is completed. See also: - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) - [select_sequential_consistency](#settings-select_sequential_consistency) ## insert_quorum_timeout {#settings-insert_quorum_timeout} @@ -1730,11 +1729,29 @@ Default value: 600 000 milliseconds (ten minutes). 
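To illustrate how these quorum settings fit together, here is a sketch (not part of the original page) that assumes a hypothetical `ReplicatedMergeTree` table `test.quorum_table` with columns `id UInt64, s String` and at least two replicas; the timeout shown is the default:

``` sql
SET insert_quorum = 2;                -- the INSERT succeeds only after 2 replicas confirm the write
SET insert_quorum_timeout = 600000;   -- default: ten minutes, in milliseconds
INSERT INTO test.quorum_table VALUES (1, 'a');
```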
See also: - [insert_quorum](#settings-insert_quorum) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) +- [select_sequential_consistency](#settings-select_sequential_consistency) + +## insert_quorum_parallel {#settings-insert_quorum_parallel} + +Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +See also: + +- [insert_quorum](#settings-insert_quorum) +- [insert_quorum_timeout](#settings-insert_quorum_timeout) - [select_sequential_consistency](#settings-select_sequential_consistency) ## select_sequential_consistency {#settings-select_sequential_consistency} -Enables or disables sequential consistency for `SELECT` queries: +Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default). Possible values: @@ -1747,10 +1764,13 @@ Usage When sequential consistency is enabled, ClickHouse allows the client to execute the `SELECT` query only for those replicas that contain data from all previous `INSERT` queries executed with `insert_quorum`. If the client refers to a partial replica, ClickHouse will generate an exception. The SELECT query will not include data that has not yet been written to the quorum of replicas. +When `insert_quorum_parallel` is enabled (the default), then `select_sequential_consistency` does not work. This is because parallel `INSERT` queries can be written to different sets of quorum replicas so there is no guarantee a single replica will have received all writes. + See also: - [insert_quorum](#settings-insert_quorum) - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) ## insert_deduplicate {#settings-insert-deduplicate} @@ -2095,7 +2115,7 @@ Possible values: - 0 — Optimization disabled. - 1 — Optimization enabled. - + Default value: `1`. See also: @@ -3134,6 +3154,12 @@ Possible values: Default value: `0`. +!!! warning "Warning" + Nullable primary key usually indicates bad design. It is forbidden in almost all main stream DBMS. The feature is mainly for [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) and is not heavily tested. Use with care. + +!!! warning "Warning" + Do not enable this feature in version `<= 21.8`. It's not properly implemented and may lead to server crash. + ## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty} Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility. @@ -3682,49 +3708,6 @@ Possible values: Default value: `0`. -## materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size} - -Sets the number of rows collected in memory before flushing data into PostgreSQL database table. - -Possible values: - -- Positive integer. - -Default value: `65536`. - -## materialized_postgresql_tables_list {#materialized-postgresql-tables-list} - -Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. 
- -Default value: empty list — means whole PostgreSQL database will be replicated. - -## materialized_postgresql_schema {#materialized-postgresql-schema} - -Default value: empty string. (Default schema is used) - -## materialized_postgresql_schema_list {#materialized-postgresql-schema-list} - -Default value: empty list. (Default schema is used) - -## materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update} - -Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. - -Possible values: - -- 0 — The table is not automatically updated in the background, when schema changes are detected. -- 1 — The table is automatically updated in the background, when schema changes are detected. - -Default value: `0`. - -## materialized_postgresql_replication_slot {#materialized-postgresql-replication-slot} - -A user-created replication slot. Must be used together with [materialized_postgresql_snapshot](#materialized-postgresql-snapshot). - -## materialized_postgresql_snapshot {#materialized-postgresql-snapshot} - -A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with [materialized_postgresql_replication_slot](#materialized-postgresql-replication-slot). - ## allow_experimental_projection_optimization {#allow-experimental-projection-optimization} Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md#projections) optimization when processing `SELECT` queries. @@ -3993,8 +3976,8 @@ If [wait_for_async_insert](#wait-for-async-insert) is enabled, every client will Possible values: -- 0 — Insertions are made synchronously, one after another. -- 1 — Multiple asynchronous insertions enabled. +- 0 — Insertions are made synchronously, one after another. +- 1 — Multiple asynchronous insertions enabled. Default value: `0`. @@ -4066,7 +4049,7 @@ Default value: `0`. ## alter_partition_verbose_result {#alter-partition-verbose-result} -Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. +Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition). 
Possible values: diff --git a/docs/en/operations/system-tables/columns.md b/docs/en/operations/system-tables/columns.md index 5ba38ab3e67..55e4a8284a0 100644 --- a/docs/en/operations/system-tables/columns.md +++ b/docs/en/operations/system-tables/columns.md @@ -6,7 +6,7 @@ You can use this table to get information similar to the [DESCRIBE TABLE](../../ Columns from [temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.columns` only in those session where they have been created. They are shown with the empty `database` field. -Columns: +The `system.columns` table contains the following columns (the column type is shown in brackets): - `database` ([String](../../sql-reference/data-types/string.md)) — Database name. - `table` ([String](../../sql-reference/data-types/string.md)) — Table name. @@ -86,21 +86,4 @@ numeric_scale: ᴺᵁᴸᴸ datetime_precision: ᴺᵁᴸᴸ ``` -The `system.columns` table contains the following columns (the column type is shown in brackets): - -- `database` (String) — Database name. -- `table` (String) — Table name. -- `name` (String) — Column name. -- `type` (String) — Column type. -- `default_kind` (String) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`) for the default value, or an empty string if it is not defined. -- `default_expression` (String) — Expression for the default value, or an empty string if it is not defined. -- `data_compressed_bytes` (UInt64) — The size of compressed data, in bytes. -- `data_uncompressed_bytes` (UInt64) — The size of decompressed data, in bytes. -- `marks_bytes` (UInt64) — The size of marks, in bytes. -- `comment` (String) — Comment on the column, or an empty string if it is not defined. -- `is_in_partition_key` (UInt8) — Flag that indicates whether the column is in the partition expression. -- `is_in_sorting_key` (UInt8) — Flag that indicates whether the column is in the sorting key expression. -- `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. -- `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. - [Original article](https://clickhouse.com/docs/en/operations/system-tables/columns) diff --git a/docs/en/operations/system-tables/metrics.md b/docs/en/operations/system-tables/metrics.md index 551c63d1aa3..21e5923e3a0 100644 --- a/docs/en/operations/system-tables/metrics.md +++ b/docs/en/operations/system-tables/metrics.md @@ -35,7 +35,7 @@ SELECT * FROM system.metrics LIMIT 10 - [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. - [system.events](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred. -- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. +- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` and `system.events`. - [Monitoring](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. 
[Original article](https://clickhouse.com/docs/en/operations/system-tables/metrics) diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index 6866c4db491..477d3b52965 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -34,7 +34,7 @@ Use `perf top` to watch the time spent in the kernel for memory management. Permanent huge pages also do not need to be allocated. !!! warning "Attention" - If your system has less than 16 GB of RAM you may experience various memory exceptions because default settings does not match this amount of RAM. Recommended amount of RAM is 32 GB or more. You can use ClickHouse in system with small amount of RAM, even with 2 GB of RAM, but it requires an additional tuning and able to process small ingestion rate. + If your system has less than 16 GB of RAM, you may experience various memory exceptions because default settings do not match this amount of memory. The recommended amount of RAM is 32 GB or more. You can use ClickHouse in a system with a small amount of RAM, even with 2 GB of RAM, but it requires additional tuning and can ingest at a low rate. ## Storage Subsystem {#storage-subsystem} diff --git a/docs/en/operations/utilities/odbc-bridge.md b/docs/en/operations/utilities/odbc-bridge.md index 70b413c9c1f..e5967085c49 100644 --- a/docs/en/operations/utilities/odbc-bridge.md +++ b/docs/en/operations/utilities/odbc-bridge.md @@ -26,7 +26,7 @@ Query is send in post body. Response is returned in RowBinary format. ```bash $ clickhouse-odbc-bridge --http-port 9018 --daemon -$ curl -d "query=SELECT PageID, ImpID, AdType FROM Keys ORDER BY PageID, ImpID" --data-urlencode "connection_string=DSN=ClickHouse;DATABASE=stat" --data-urlencode "columns=columns format version: 1 +$ curl -d "query=SELECT PageID, ImpID, AdType FROM Keys ORDER BY PageID, ImpID" --data-urlencode "connection_string=DSN=ClickHouse;DATABASE=stat" --data-urlencode "sample_block=columns format version: 1 3 columns: \`PageID\` String \`ImpID\` String diff --git a/docs/en/sql-reference/aggregate-functions/reference/sparkbar.md b/docs/en/sql-reference/aggregate-functions/reference/sparkbar.md new file mode 100644 index 00000000000..47c696129c7 --- /dev/null +++ b/docs/en/sql-reference/aggregate-functions/reference/sparkbar.md @@ -0,0 +1,64 @@ +--- +toc_priority: 311 +toc_title: sparkbar +--- + +# sparkbar {#sparkbar} + +The function plots a frequency histogram for values `x` and the repetition rate `y` of these values over the interval `[min_x, max_x]`. + + +If no interval is specified, then the minimum `x` is used as the interval start, and the maximum `x` — as the interval end. + +**Syntax** + +``` sql +sparkbar(width[, min_x, max_x])(x, y) +``` + +**Parameters** + +- `width` — The number of segments. Type: [Integer](../../../sql-reference/data-types/int-uint.md). +- `min_x` — The interval start. Optional parameter. +- `max_x` — The interval end. Optional parameter. + +**Arguments** + +- `x` — The field with values. +- `y` — The field with the frequency of values. + +**Returned value** + +- The frequency histogram. 
+ +**Example** + +Query: + +``` sql +CREATE TABLE spark_bar_data (`cnt` UInt64,`event_date` Date) ENGINE = MergeTree ORDER BY event_date SETTINGS index_granularity = 8192; + +INSERT INTO spark_bar_data VALUES(1,'2020-01-01'),(4,'2020-01-02'),(5,'2020-01-03'),(2,'2020-01-04'),(3,'2020-01-05'),(7,'2020-01-06'),(6,'2020-01-07'),(8,'2020-01-08'),(2,'2020-01-11'); + +SELECT sparkbar(9)(event_date,cnt) FROM spark_bar_data; + +SELECT sparkbar(9,toDate('2020-01-01'),toDate('2020-01-10'))(event_date,cnt) FROM spark_bar_data; +``` + +Result: + +``` text + +┌─sparkbar(9)(event_date, cnt)─┐ +│ │ +│ ▁▅▄▃██▅ ▁ │ +│ │ +└──────────────────────────────┘ + +┌─sparkbar(9, toDate('2020-01-01'), toDate('2020-01-10'))(event_date, cnt)─┐ +│ │ +│▁▄▄▂▅▇█▁ │ +│ │ +└──────────────────────────────────────────────────────────────────────────┘ +``` + diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniq.md b/docs/en/sql-reference/aggregate-functions/reference/uniq.md index 598af24c0de..33bfe72548b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniq.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniq.md @@ -24,9 +24,7 @@ Function: - Calculates a hash for all parameters in the aggregate, then uses it in calculations. -- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. - - This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. +- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. - Provides the result deterministically (it does not depend on the query processing order). diff --git a/docs/en/sql-reference/data-types/aggregatefunction.md b/docs/en/sql-reference/data-types/aggregatefunction.md index 81945eeece6..e483a20eed9 100644 --- a/docs/en/sql-reference/data-types/aggregatefunction.md +++ b/docs/en/sql-reference/data-types/aggregatefunction.md @@ -11,9 +11,7 @@ Aggregate functions can have an implementation-defined intermediate state that c **Parameters** -- Name of the aggregate function. - - If the function is parametric, specify its parameters too. +- Name of the aggregate function. If the function is parametric, specify its parameters too. - Types of the aggregate function arguments. 
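A column declaration that combines the two points above might look like the following sketch (the table name and engine choice are illustrative; `quantiles(0.5, 0.9)` shows a parametric aggregate function, and the `anyIf` column shows several argument types):

``` sql
CREATE TABLE agg_states
(
    visitors AggregateFunction(uniq, UInt64),
    any_flagged AggregateFunction(anyIf, String, UInt8),
    latency_quantiles AggregateFunction(quantiles(0.5, 0.9), UInt64)
)
ENGINE = AggregatingMergeTree
ORDER BY tuple();
```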
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md index 5fedd5cf8ad..b49f384367d 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md @@ -41,7 +41,7 @@ Example of a polygon dictionary configuration: ``` -Tne corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query): +The corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query): ``` sql CREATE DICTIONARY polygon_dict_name ( key Array(Array(Array(Array(Float64)))), diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index 095f059513c..e606a19af6f 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -122,7 +122,12 @@ Setting fields: - `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). - `format` — The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported. +- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_read_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. +- `execute_direct` - Executable source file will be searched inside `user_scripts` folder and executed directly. Arguments are passed using spaces. Example: `test_script arg_1 arg_2`. Default value is false. Optional parameter. +- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. @@ -150,10 +155,14 @@ Setting fields: - `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). - `format` — The file format. All the formats described in “[Formats](../../../interfaces/formats.md#formats)” are supported. -- `pool_size` — Size of pool. If 0 is specified as `pool_size` then there is no pool size restrictions. -- `command_termination_timeout` — Executable pool script should contain main read-write loop. 
After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. +- `pool_size` — Size of pool. If 0 is specified as `pool_size` then there is no pool size restrictions. Default value is `16`. +- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. - `max_command_execution_time` — Maximum executable script command execution time for processing block of data. Specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_read_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. Optional parameter. +- `execute_direct` - Executable source file will be searched inside `user_scripts` folder and executed directly. Additional arguments can be specified. Example: `test_script arg_1 arg_2`. Default value is false. Optional parameter. +- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index e86e6b37998..037078ba223 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -73,16 +73,19 @@ User defined function configurations are searched relative to the path specified A function configuration contains the following settings: - `name` - a function name. -- `command` - a command or a script to execute. +- `command` - script name to execute or command if `execute_direct` is false. - `argument` - argument description with the `type` of an argument. Each argument is described in a separate setting. - `format` - a [format](../../interfaces/formats.md) in which arguments are passed to the command. - `return_type` - the type of a returned value. - `type` - an executable type. If `type` is set to `executable` then single command is started. If it is set to `executable_pool` then a pool of commands is created. - `max_command_execution_time` - maximum execution time in seconds for processing block of data. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. -- `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. +- `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. 
After that time `SIGTERM` is sent to the process executing the command. Optional. Default value is `10`. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_read_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `pool_size` - the size of a command pool. Optional. Default value is `16`. -- `lifetime` - the reload interval of a function in seconds. If it is set to `0` then the function is not reloaded. - `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. +- `execute_direct` - Executable source file will be searched inside `user_scripts` folder and executed directly. Additional arguments can be specified. Example: `test_script arg_1 arg_2`. Default value is true. Optional parameter. +- `lifetime` - the reload interval of a function in seconds. If it is set to `0` then the function is not reloaded. Default value is `0`. Optional parameter. The command must read arguments from `STDIN` and must output the result to `STDOUT`. The command must process arguments iteratively. That is after processing a chunk of arguments it must wait for the next chunk. @@ -102,7 +105,6 @@ Creating `test_function` using XML configuration: TabSeparated cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y FROM table" - 0 ``` diff --git a/docs/en/sql-reference/functions/window-view-functions.md b/docs/en/sql-reference/functions/time-window-functions.md similarity index 69% rename from docs/en/sql-reference/functions/window-view-functions.md rename to docs/en/sql-reference/functions/time-window-functions.md index 5684e93bd88..2ea44a6e585 100644 --- a/docs/en/sql-reference/functions/window-view-functions.md +++ b/docs/en/sql-reference/functions/time-window-functions.md @@ -1,15 +1,15 @@ --- toc_priority: 68 -toc_title: Window View +toc_title: Time Window --- -# Window View Functions {#window-view-functions} +# Time Window Functions {#time-window-functions} -Window functions indicate the lower and upper window bound of records in WindowView. The functions for working with WindowView are listed below. +Time window functions return the inclusive lower and exclusive upper bound of the corresponding window. The functions for working with WindowView are listed below: -## tumble {#window-view-functions-tumble} +## tumble {#time-window-functions-tumble} -A tumbling time window assigns records to non-overlapping, continuous windows with a fixed duration (interval). +A tumbling time window assigns records to non-overlapping, continuous windows with a fixed duration (`interval`). ``` sql tumble(time_attr, interval [, timezone]) @@ -22,7 +22,7 @@ tumble(time_attr, interval [, timezone]) **Returned values** -- The lower and upper bound of the tumble window. +- The inclusive lower and exclusive upper bound of the corresponding tumbling window. Type: `Tuple(DateTime, DateTime)` @@ -42,7 +42,7 @@ Result: └───────────────────────────────────────────────┘ ``` -## hop {#window-view-functions-hop} +## hop {#time-window-functions-hop} A hopping time window has a fixed duration (`window_interval`) and hops by a specified hop interval (`hop_interval`). If the `hop_interval` is smaller than the `window_interval`, hopping windows are overlapping. Thus, records can be assigned to multiple windows. 
@@ -59,9 +59,7 @@ hop(time_attr, hop_interval, window_interval [, timezone]) **Returned values** -- The lower and upper bound of the hop window. Since hop windows are - overlapped, the function only returns the bound of the **first** window when - hop function is used **without** `WINDOW VIEW`. +- The inclusive lower and exclusive upper bound of the corresponding hopping window. Since one record can be assigned to multiple hop windows, the function only returns the bound of the **first** window when hop function is used **without** `WINDOW VIEW`. Type: `Tuple(DateTime, DateTime)` @@ -81,33 +79,33 @@ Result: └───────────────────────────────────────────────────────────┘ ``` -## tumbleStart {#window-view-functions-tumblestart} +## tumbleStart {#time-window-functions-tumblestart} -Indicate the lower bound of a tumble function. +Returns the inclusive lower bound of the corresponding tumbling window. ``` sql tumbleStart(time_attr, interval [, timezone]); ``` -## tumbleEnd {#window-view-functions-tumbleend} +## tumbleEnd {#time-window-functions-tumbleend} -Indicate the upper bound of a tumble function. +Returns the exclusive upper bound of the corresponding tumbling window. ``` sql tumbleEnd(time_attr, interval [, timezone]); ``` -## hopStart {#window-view-functions-hopstart} +## hopStart {#time-window-functions-hopstart} -Indicate the lower bound of a hop function. +Returns the inclusive lower bound of the corresponding hopping window. ``` sql hopStart(time_attr, hop_interval, window_interval [, timezone]); ``` -## hopEnd {#window-view-functions-hopend} +## hopEnd {#time-window-functions-hopend} -Indicate the upper bound of a hop function. +Returns the exclusive upper bound of the corresponding hopping window. ``` sql hopEnd(time_attr, hop_interval, window_interval [, timezone]); diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 96cd8f5d607..c7ebc83c496 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -9,11 +9,12 @@ The following operations with [projections](../../../engines/table-engines/merge - `ALTER TABLE [db].name ADD PROJECTION name ( SELECT [GROUP BY] [ORDER BY] )` - Adds projection description to tables metadata. -- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. +- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). - `ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. +- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). + The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only change metadata or remove files. 
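For instance, adding and later dropping a projection might look like this sketch (it assumes a hypothetical table `visits` with a `user_id` column; as noted above, the `DROP` is executed as a mutation):

``` sql
ALTER TABLE visits ADD PROJECTION p_by_user
(
    SELECT user_id, count()
    GROUP BY user_id
);

ALTER TABLE visits DROP PROJECTION p_by_user;
```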
diff --git a/docs/en/sql-reference/statements/create/role.md b/docs/en/sql-reference/statements/create/role.md index 4723613aeef..e0e58f7a0f6 100644 --- a/docs/en/sql-reference/statements/create/role.md +++ b/docs/en/sql-reference/statements/create/role.md @@ -31,7 +31,7 @@ CREATE ROLE accountant; GRANT SELECT ON db.* TO accountant; ``` -This sequence of queries creates the role `accountant` that has the privilege of reading data from the `accounting` database. +This sequence of queries creates the role `accountant` that has the privilege of reading data from the `db` database. Assigning the role to the user `mira`: diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index d64642704f5..c3e54545549 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -22,7 +22,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ) ENGINE = engine ``` -Creates a table named `name` in the `db` database or the current database if `db` is not set, with the structure specified in brackets and the `engine` engine. +Creates a table named `table_name` in the `db` database or the current database if `db` is not set, with the structure specified in brackets and the `engine` engine. The structure of the table is a list of column descriptions, secondary indexes and constraints . If [primary key](#primary-key) is supported by the engine, it will be indicated as parameter for the table engine. A column description is `name type` in the simplest case. Example: `RegionID UInt32`. diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index aa6b82360e0..f7d3a6d697a 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -251,22 +251,22 @@ Most common uses of live view tables include: Enable usage of window views and `WATCH` query using [allow_experimental_window_view](../../../operations/settings/settings.md#allow-experimental-window-view) setting. Input the command `set allow_experimental_window_view = 1`. ``` sql -CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [ENGINE = engine] [WATERMARK = strategy] [ALLOWED_LATENESS = interval_function] AS SELECT ... GROUP BY window_view_function +CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [ENGINE = engine] [WATERMARK = strategy] [ALLOWED_LATENESS = interval_function] AS SELECT ... GROUP BY time_window_function ``` -Window view can aggregate data by time window and output the results when the window is ready to fire. It stores the partial aggregation results in an inner(or specified) table and can push the processing result to a specified table or push notifications using the WATCH query. +Window view can aggregate data by time window and output the results when the window is ready to fire. It stores the partial aggregation results in an inner(or specified) table to reduce latency and can push the processing result to a specified table or push notifications using the WATCH query. Creating a window view is similar to creating `MATERIALIZED VIEW`. Window view needs an inner storage engine to store intermediate data. The inner storage will use `AggregatingMergeTree` as the default engine. 
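As a sketch of the `TO` form mentioned above (assuming a source table `data` with columns `number` and `timestamp`, an existing destination table `dst` with matching columns, and `allow_experimental_window_view = 1`):

``` sql
CREATE WINDOW VIEW wv TO dst AS
SELECT count(number) AS cnt, tumbleStart(w_id) AS w_start
FROM data
GROUP BY tumble(timestamp, INTERVAL '10' SECOND) AS w_id;
```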
-### Window View Functions {#window-view-windowviewfunctions} +### Time Window Functions {#window-view-timewindowfunctions} -[Window view functions](../../functions/window-view-functions.md) are used to indicate the lower and upper window bound of records. The window view needs to be used with a window view function. +[Time window functions](../../functions/time-window-functions.md) are used to get the lower and upper window bound of records. The window view needs to be used with a time window function. ### TIME ATTRIBUTES {#window-view-timeattributes} Window view supports **processing time** and **event time** process. -**Processing time** allows window view to produce results based on the local machine's time and is used by default. It is the most straightforward notion of time but does not provide determinism. The processing time attribute can be defined by setting the `time_attr` of the window view function to a table column or using the function `now()`. The following query creates a window view with processing time. +**Processing time** allows window view to produce results based on the local machine's time and is used by default. It is the most straightforward notion of time but does not provide determinism. The processing time attribute can be defined by setting the `time_attr` of the time window function to a table column or using the function `now()`. The following query creates a window view with processing time. ``` sql CREATE WINDOW VIEW wv AS SELECT count(number), tumbleStart(w_id) as w_start from date GROUP BY tumble(now(), INTERVAL '5' SECOND) as w_id @@ -274,13 +274,13 @@ CREATE WINDOW VIEW wv AS SELECT count(number), tumbleStart(w_id) as w_start from **Event time** is the time that each individual event occurred on its producing device. This time is typically embedded within the records when it is generated. Event time processing allows for consistent results even in case of out-of-order events or late events. Window view supports event time processing by using `WATERMARK` syntax. -Window view provides three watermark strategies. +Window view provides three watermark strategies: * `STRICTLY_ASCENDING`: Emits a watermark of the maximum observed timestamp so far. Rows that have a timestamp smaller to the max timestamp are not late. * `ASCENDING`: Emits a watermark of the maximum observed timestamp so far minus 1. Rows that have a timestamp equal and smaller to the max timestamp are not late. * `BOUNDED`: WATERMARK=INTERVAL. Emits watermarks, which are the maximum observed timestamp minus the specified delay. -The following queries are examples of creating a window view with `WATERMARK`. +The following queries are examples of creating a window view with `WATERMARK`: ``` sql CREATE WINDOW VIEW wv WATERMARK=STRICTLY_ASCENDING AS SELECT count(number) FROM date GROUP BY tumble(timestamp, INTERVAL '5' SECOND); @@ -296,15 +296,18 @@ CREATE WINDOW VIEW test.wv TO test.dst WATERMARK=ASCENDING ALLOWED_LATENESS=INTE Note that elements emitted by a late firing should be treated as updated results of a previous computation. Instead of firing at the end of windows, the window view will fire immediately when the late event arrives. Thus, it will result in multiple outputs for the same window. Users need to take these duplicated results into account or deduplicate them. 
-### Monitoring New Windows{#window-view-monitoring} +### Monitoring New Windows {#window-view-monitoring} -Window view supports the `WATCH` query to constantly append the processing results to the console or use `TO` syntax to output the results to a table. +Window view supports the [WATCH](../../../sql-reference/statements/watch.md) query to monitor changes, or the `TO` syntax to output the results to a table. ``` sql -WATCH [db.]name [LIMIT n] +WATCH [db.]window_view +[EVENTS] +[LIMIT n] +[FORMAT format] ``` -`WATCH` query acts similar as in `LIVE VIEW`. A `LIMIT` can be specified to set the number of updates to receive before terminating the query. +The `WATCH` query acts similarly to the way it does for `LIVE VIEW`. A `LIMIT` can be specified to set the number of updates to receive before terminating the query. The `EVENTS` clause can be used to obtain a short form of the `WATCH` query, where instead of the query result you will just get the latest query watermark. ### Settings {#window-view-settings} diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index f04952746a6..2b1262f7d3c 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -21,7 +21,7 @@ GRANT [ON CLUSTER cluster_name] privilege[(column_name [,...])] [,...] ON {db.ta - `user` — ClickHouse user account. The `WITH GRANT OPTION` clause grants `user` or `role` with permission to execute the `GRANT` query. Users can grant privileges of the same scope they have and less. -The `WITH REPLACE OPTION` clause replace old privileges by new privileges for the `user` or `role`, if not specified it is append privileges. +The `WITH REPLACE OPTION` clause replaces old privileges with new privileges for the `user` or `role`; if it is not specified, privileges are appended. ## Assigning Role Syntax {#assign-role-syntax} @@ -33,7 +33,7 @@ GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_US - `user` — ClickHouse user account. The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option-privilege) privilege to `user` or `role`. -The `WITH REPLACE OPTION` clause replace old roles by new role for the `user` or `role`, if not specified it is append roles. +The `WITH REPLACE OPTION` clause replaces old roles with the new role for the `user` or `role`; if it is not specified, roles are appended. ## Usage {#grant-usage} diff --git a/docs/en/sql-reference/statements/select/group-by.md b/docs/en/sql-reference/statements/select/group-by.md index 26dd51d806d..969a39ce51f 100644 --- a/docs/en/sql-reference/statements/select/group-by.md +++ b/docs/en/sql-reference/statements/select/group-by.md @@ -206,6 +206,9 @@ This extra row is only produced in `JSON*`, `TabSeparated*`, and `Pretty*` forma - In `Pretty*` formats, the row is output as a separate table after the main result. - In the other formats it is not available. +!!! note "Note" + `totals` is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`. + `WITH TOTALS` can be run in different ways when [HAVING](../../../sql-reference/statements/select/having.md) is present. The behavior depends on the `totals_mode` setting.
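As an illustration of the note above (a sketch assuming a hypothetical table `test.visits` with a `domain` column), `totals` appears in the output of the `SELECT`, but would not be written by an `INSERT INTO ... SELECT` wrapping the same query:

``` sql
SELECT domain, count() AS hits
FROM test.visits
GROUP BY domain WITH TOTALS
FORMAT JSON;
```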
### Configuring Totals Processing {#configuring-totals-processing} diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index aa61348d2a0..3d302be561a 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -55,13 +55,13 @@ The behavior of ClickHouse server for `ANY JOIN` operations depends on the [any_ - [join_on_disk_max_files_to_merge](../../../operations/settings/settings.md#join_on_disk_max_files_to_merge) - [any_join_distinct_right_table_keys](../../../operations/settings/settings.md#any_join_distinct_right_table_keys) -## ON Section Conditions {on-section-conditions} +## ON Section Conditions {#on-section-conditions} + +An `ON` section can contain several conditions combined using the `AND` and `OR` operators. Conditions specifying join keys must refer both left and right tables and must use the equality operator. Other conditions may use other logical operators but they must refer either the left or the right table of a query. -An `ON` section can contain several conditions combined using the `AND` operator. Conditions specifying join keys must refer both left and right tables and must use the equality operator. Other conditions may use other logical operators but they must refer either the left or the right table of a query. Rows are joined if the whole complex condition is met. If the conditions are not met, still rows may be included in the result depending on the `JOIN` type. Note that if the same conditions are placed in a `WHERE` section and they are not met, then rows are always filtered out from the result. -!!! note "Note" - The `OR` operator inside an `ON` section is not supported yet. +The `OR` operator inside the `ON` clause works using the hash join algorithm — for each `OR` argument with join keys for `JOIN`, a separate hash table is created, so memory consumption and query execution time grow linearly with an increase in the number of expressions `OR` of the `ON` clause. !!! note "Note" If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far. @@ -109,7 +109,47 @@ Result: │ B │ Text B │ 15 │ └──────┴────────┴────────┘ ``` +Query with `INNER` type of a join and condition with `OR`: +``` sql +CREATE TABLE t1 (`a` Int64, `b` Int64) ENGINE = MergeTree() ORDER BY a; + +CREATE TABLE t2 (`key` Int32, `val` Int64) ENGINE = MergeTree() ORDER BY key; + +INSERT INTO t1 SELECT number as a, -a as b from numbers(5); + +INSERT INTO t2 SELECT if(number % 2 == 0, toInt64(number), -number) as key, number as val from numbers(5); + +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key; +``` + +Result: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 1 │ -1 │ 1 │ +│ 2 │ -2 │ 2 │ +│ 3 │ -3 │ 3 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` + +Query with `INNER` type of a join and conditions with `OR` and `AND`: + +``` sql +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key AND t2.val > 3; +``` + +Result: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 2 │ -2 │ 2 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` ## ASOF JOIN Usage {#asof-join-usage} `ASOF JOIN` is useful when you need to join records that have no exact match. 
diff --git a/docs/en/sql-reference/table-functions/hdfsCluster.md b/docs/en/sql-reference/table-functions/hdfsCluster.md new file mode 100644 index 00000000000..6183fe83c38 --- /dev/null +++ b/docs/en/sql-reference/table-functions/hdfsCluster.md @@ -0,0 +1,58 @@ +--- +toc_priority: 55 +toc_title: hdfsCluster +--- + +# hdfsCluster Table Function {#hdfsCluster-table-function} + +Allows processing files from HDFS in parallel from many nodes in a specified cluster. On the initiator it creates a connection to all nodes in the cluster, expands the asterisks in the HDFS file path, and dispatches each file dynamically. On the worker node it asks the initiator about the next task to process and processes it. This is repeated until all tasks are finished. + +**Syntax** + +``` sql +hdfsCluster(cluster_name, URI, format, structure) +``` + +**Arguments** + +- `cluster_name` — Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers. +- `URI` — URI to a file or a bunch of files. Supports the following wildcards in readonly mode: `*`, `?`, `{'abc','def'}` and `{N..M}` where `N`, `M` — numbers, `abc`, `def` — strings. For more information see [Wildcards In Path](../../engines/table-engines/integrations/s3.md#wildcards-in-path). +- `format` — The [format](../../interfaces/formats.md#formats) of the file. +- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. + +**Returned value** + +A table with the specified structure for reading data in the specified file. + +**Examples** + +1. Suppose that we have a ClickHouse cluster named `cluster_simple`, and several files with the following URIs on HDFS: + +- ‘hdfs://hdfs1:9000/some_dir/some_file_1’ +- ‘hdfs://hdfs1:9000/some_dir/some_file_2’ +- ‘hdfs://hdfs1:9000/some_dir/some_file_3’ +- ‘hdfs://hdfs1:9000/another_dir/some_file_1’ +- ‘hdfs://hdfs1:9000/another_dir/some_file_2’ +- ‘hdfs://hdfs1:9000/another_dir/some_file_3’ + +2. Query the number of rows in these files: + +``` sql +SELECT count(*) +FROM hdfsCluster('cluster_simple', 'hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +``` + +3. Query the number of rows in all files of these two directories: + +``` sql +SELECT count(*) +FROM hdfsCluster('cluster_simple', 'hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') +``` + +!!! warning "Warning" + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. 
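For example, a per-digit workaround for zero-padded names such as `some_file_01 … some_file_03` could look like the following sketch (the cluster and path follow the examples above):

``` sql
SELECT count(*)
FROM hdfsCluster('cluster_simple', 'hdfs://hdfs1:9000/some_dir/some_file_{0..0}{1..3}', 'TSV', 'name String, value UInt32')
```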
+ +**See Also** + +- [HDFS engine](../../engines/table-engines/integrations/hdfs.md) +- [HDFS table function](../../sql-reference/table-functions/hdfs.md) diff --git a/docs/ja/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/ja/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 120000 index 00000000000..5ac9a615386 --- /dev/null +++ b/docs/ja/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1 @@ +../../../en/faq/general/how-do-i-contribute-code-to-clickhouse.md \ No newline at end of file diff --git a/docs/ja/faq/operations/multi-region-replication.md b/docs/ja/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/ja/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/ja/getting-started/example-datasets/ontime.md b/docs/ja/getting-started/example-datasets/ontime.md index 2a951af6026..33314faa53d 100644 --- a/docs/ja/getting-started/example-datasets/ontime.md +++ b/docs/ja/getting-started/example-datasets/ontime.md @@ -15,13 +15,7 @@ toc_title: OnTime データのダウンロード: ``` bash -for s in `seq 1987 2018` -do -for m in `seq 1 12` -do -wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip -done -done +wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2021}_{1..12}.zip ``` (https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh より) @@ -40,7 +34,7 @@ CREATE TABLE `ontime` `Reporting_Airline` String, `DOT_ID_Reporting_Airline` Int32, `IATA_CODE_Reporting_Airline` String, - `Tail_Number` Int32, + `Tail_Number` String, `Flight_Number_Reporting_Airline` String, `OriginAirportID` Int32, `OriginAirportSeqID` Int32, diff --git a/docs/ja/interfaces/http.md b/docs/ja/interfaces/http.md index 4ac9cd9e472..210e3f46d24 100644 --- a/docs/ja/interfaces/http.md +++ b/docs/ja/interfaces/http.md @@ -397,7 +397,7 @@ $ curl -v 'http://localhost:8123/predefined_query' `` 値は以下の定義済みクエリです `` これは、Http要求が一致し、クエリの結果が返されたときにClickHouseによって実行されます。 これは必須構成です。 -次の例では、次の値を定義します `max_threads` と `max_alter_threads` 設定、そしてクエリのテーブルから設定設定します。 +次の例では、次の値を定義します `max_threads` と `max_final_threads` 設定、そしてクエリのテーブルから設定設定します。 例: @@ -420,9 +420,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "注意" @@ -434,7 +434,7 @@ max_alter_threads 2 クリックハウスは、 `` HTTP要求のurlの値。 のデフォルト値 `` は `/query` . 
これはオプションの構成です。 設定ファイルに定義がない場合、paramは渡されません。 -この機能を試すために、この例ではmax_threadsとmax_alter_threadsの値を定義し、設定が正常に設定されたかどうかを照会します。 +この機能を試すために、この例ではmax_threadsとmax_final_threadsの値を定義し、設定が正常に設定されたかどうかを照会します。 例: @@ -452,9 +452,9 @@ max_alter_threads 2 ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ## 静的 {#static} diff --git a/docs/ru/development/developer-instruction.md b/docs/ru/development/developer-instruction.md index 8466c709ad1..6e30e0052e5 100644 --- a/docs/ru/development/developer-instruction.md +++ b/docs/ru/development/developer-instruction.md @@ -40,7 +40,7 @@ ClickHouse не работает и не собирается на 32-битны Выполните в терминале: - git clone git@github.com:ClickHouse/ClickHouse.git + git clone git@github.com:your_github_username/ClickHouse.git cd ClickHouse Замените первое вхождение слова `ClickHouse` в команде для git на имя вашего аккаунта на GitHub. diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 5949cc8a0d7..78a82955cd2 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -5,7 +5,7 @@ toc_title: HDFS # HDFS {#table_engines-hdfs} -Управляет данными в HDFS. Данный движок похож на движки [File](../special/file.md#table_engines-file) и [URL](../special/url.md#table_engines-url). +Этот движок обеспечивает интеграцию с экосистемой [Apache Hadoop](https://ru.wikipedia.org/wiki/Hadoop), позволяя управлять данными в HDFS посредством ClickHouse. Данный движок похож на движки [File](../special/file.md#table_engines-file) и [URL](../special/url.md#table_engines-url), но предоставляет возможности, характерные для Hadoop. ## Использование движка {#usage} @@ -13,9 +13,11 @@ toc_title: HDFS ENGINE = HDFS(URI, format) ``` -В параметр `URI` нужно передавать полный URI файла в HDFS. +**Параметры движка** + +В параметр `URI` нужно передавать полный URI файла в HDFS. Часть URI с путем файла может содержать шаблоны. В этом случае таблица может использоваться только для чтения. Параметр `format` должен быть таким, который ClickHouse может использовать и в запросах `INSERT`, и в запросах `SELECT`. Полный список поддерживаемых форматов смотрите в разделе [Форматы](../../../interfaces/formats.md#formats). -Часть URI с путем файла может содержать шаблоны. В этом случае таблица может использоваться только для чтения. + **Пример:** @@ -67,12 +69,12 @@ SELECT * FROM hdfs_engine_table LIMIT 2 1. 
Предположим, у нас есть несколько файлов со следующими URI в HDFS: -- 'hdfs://hdfs1:9000/some_dir/some_file_1' -- 'hdfs://hdfs1:9000/some_dir/some_file_2' -- 'hdfs://hdfs1:9000/some_dir/some_file_3' -- 'hdfs://hdfs1:9000/another_dir/some_file_1' -- 'hdfs://hdfs1:9000/another_dir/some_file_2' -- 'hdfs://hdfs1:9000/another_dir/some_file_3' + - 'hdfs://hdfs1:9000/some_dir/some_file_1' + - 'hdfs://hdfs1:9000/some_dir/some_file_2' + - 'hdfs://hdfs1:9000/some_dir/some_file_3' + - 'hdfs://hdfs1:9000/another_dir/some_file_1' + - 'hdfs://hdfs1:9000/another_dir/some_file_2' + - 'hdfs://hdfs1:9000/another_dir/some_file_3' 1. Есть несколько возможностей создать таблицу, состояющую из этих шести файлов: @@ -128,6 +130,7 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 | **параметр** | **по умолчанию** | +| - | - | | rpc\_client\_connect\_tcpnodelay | true | | dfs\_client\_read\_shortcircuit | true | | output\_replace-datanode-on-failure | true | @@ -177,22 +180,23 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 #### Расширенные параметры для ClickHouse {#clickhouse-extras} | **параметр** | **по умолчанию** | +| - | - | |hadoop\_kerberos\_keytab | "" | |hadoop\_kerberos\_principal | "" | |hadoop\_kerberos\_kinit\_command | kinit | ### Ограничения {#limitations} - * hadoop\_security\_kerberos\_ticket\_cache\_path могут быть определены только на глобальном уровне + * `hadoop_security_kerberos_ticket_cache_path` и `libhdfs3_conf` могут быть определены только на глобальном, а не на пользовательском уровне ## Поддержка Kerberos {#kerberos-support} -Если hadoop\_security\_authentication параметр имеет значение 'kerberos', ClickHouse аутентифицируется с помощью Kerberos. -[Расширенные параметры](#clickhouse-extras) и hadoop\_security\_kerberos\_ticket\_cache\_path помогают сделать это. +Если параметр `hadoop_security_authentication` имеет значение `kerberos`, ClickHouse аутентифицируется с помощью Kerberos. +[Расширенные параметры](#clickhouse-extras) и `hadoop_security_kerberos_ticket_cache_path` помогают сделать это. Обратите внимание что из-за ограничений libhdfs3 поддерживается только устаревший метод аутентификации, -коммуникация с узлами данных не защищена SASL (HADOOP\_SECURE\_DN\_USER надежный показатель такого -подхода к безопасности). Используйте tests/integration/test\_storage\_kerberized\_hdfs/hdfs_configs/bootstrap.sh для примера настроек. +коммуникация с узлами данных не защищена SASL (`HADOOP_SECURE_DN_USER` надежный показатель такого +подхода к безопасности). Используйте `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` для примера настроек. -Если hadoop\_kerberos\_keytab, hadoop\_kerberos\_principal или hadoop\_kerberos\_kinit\_command указаны в настройках, kinit будет вызван. hadoop\_kerberos\_keytab и hadoop\_kerberos\_principal обязательны в этом случае. Необходимо также будет установить kinit и файлы конфигурации krb5. +Если `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` или `hadoop_kerberos_kinit_command` указаны в настройках, `kinit` будет вызван. `hadoop_kerberos_keytab` и `hadoop_kerberos_principal` обязательны в этом случае. Необходимо также будет установить `kinit` и файлы конфигурации krb5. 
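Для наглядности ниже приведен условный фрагмент секции `hdfs` в конфигурационном файле сервера; путь к keytab и principal указаны лишь для примера и являются предположениями:

``` xml
<hdfs>
    <!-- условный пример: замените keytab и principal на свои значения -->
    <hadoop_security_authentication>kerberos</hadoop_security_authentication>
    <hadoop_kerberos_keytab>/etc/clickhouse-server/hdfs.keytab</hadoop_kerberos_keytab>
    <hadoop_kerberos_principal>clickhouse@EXAMPLE.COM</hadoop_kerberos_principal>
</hdfs>
```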
## Виртуальные столбцы {#virtual-columns} diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index 19e2850dd51..7ea3d124ab3 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -191,5 +191,5 @@ ClickHouse может поддерживать учетные данные Kerbe **Смотрите также** - [Виртуальные столбцы](index.md#table_engines-virtual_columns) -- [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) +- [background_message_broker_schedule_pool_size](../../../operations/settings/settings.md#background_message_broker_schedule_pool_size) diff --git a/docs/ru/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/ru/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 120000 index 00000000000..5ac9a615386 --- /dev/null +++ b/docs/ru/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1 @@ +../../../en/faq/general/how-do-i-contribute-code-to-clickhouse.md \ No newline at end of file diff --git a/docs/ru/faq/operations/multi-region-replication.md b/docs/ru/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/ru/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/ru/getting-started/example-datasets/ontime.md b/docs/ru/getting-started/example-datasets/ontime.md index e1d47a5a9e7..2ee4315c76f 100644 --- a/docs/ru/getting-started/example-datasets/ontime.md +++ b/docs/ru/getting-started/example-datasets/ontime.md @@ -15,13 +15,7 @@ toc_title: OnTime Скачивание данных (из `https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh`): ``` bash -for s in `seq 1987 2018` -do -for m in `seq 1 12` -do -wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip -done -done +wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2021}_{1..12}.zip ``` Создание таблицы: @@ -38,7 +32,7 @@ CREATE TABLE `ontime` `Reporting_Airline` String, `DOT_ID_Reporting_Airline` Int32, `IATA_CODE_Reporting_Airline` String, - `Tail_Number` Int32, + `Tail_Number` String, `Flight_Number_Reporting_Airline` String, `OriginAirportID` Int32, `OriginAirportSeqID` Int32, diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index bbb66b70371..e2ca1a86284 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -114,7 +114,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe Параметры в конфигурационных файлах переопределяют значения по умолчанию. -### Параметры командной строки {#parametry-komandnoi-stroki} +### Параметры командной строки {#command-line-options} - `--host, -h` — имя сервера, по умолчанию — ‘localhost’. Вы можете использовать как имя, так и IPv4 или IPv6 адрес. - `--port` — порт для подключения, по умолчанию — 9000. Обратите внимание: для HTTP-интерфейса и нативного интерфейса используются разные порты. @@ -136,7 +136,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe Начиная с версии 20.5, в `clickhouse-client` есть автоматическая подсветка синтаксиса (включена всегда). 
-### Конфигурационные файлы {#konfiguratsionnye-faily} +### Конфигурационные файлы {#configuration_files} `clickhouse—client` использует первый существующий файл из: diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 79d760271f5..a384776e519 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -129,6 +129,9 @@ world Каждый элемент структуры типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) представляется как отдельный массив. +Входящие параметры типа "перечисление" (`ENUM`) могут передаваться в виде значений или порядковых номеров. Сначала переданное значение будет сопоставляться с элементами перечисления. Если совпадение не будет найдено и при этом переданное значение является числом, оно будет трактоваться как порядковый номер в перечислении. +Если входящие параметры типа `ENUM` содержат только порядковые номера, рекомендуется включить настройку [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) для ускорения парсинга. + Например: ``` sql @@ -362,6 +365,9 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR Если установлена настройка [input_format_defaults_for_omitted_fields = 1](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) и тип столбца не `Nullable(T)`, то пустые значения без кавычек заменяются значениями по умолчанию для типа данных столбца. +Входящие параметры типа "перечисление" (`ENUM`) могут передаваться в виде значений или порядковых номеров. Сначала переданное значение будет сопоставляться с элементами перечисления. Если совпадение не будет найдено и при этом переданное значение является числом, оно будет трактоваться как порядковый номер в перечислении. +Если входящие параметры типа `ENUM` содержат только порядковые номера, рекомендуется включить настройку [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) для ускорения парсинга. + Формат CSV поддерживает вывод totals и extremes аналогично `TabSeparated`. ## CSVWithNames {#csvwithnames} @@ -693,7 +699,7 @@ CREATE TABLE IF NOT EXISTS example_table - Если `input_format_defaults_for_omitted_fields = 1`, то значение по умолчанию для `x` равно `0`, а значение по умолчанию `a` равно `x * 2`. !!! note "Предупреждение" - Если `input_format_defaults_for_omitted_fields = 1`, то при обработке запросов ClickHouse потребляет больше вычислительных ресурсов, чем если `input_format_defaults_for_omitted_fields = 0`. + При добавлении данных с помощью `input_format_defaults_for_omitted_fields = 1`, ClickHouse потребляет больше вычислительных ресурсов по сравнению с `input_format_defaults_for_omitted_fields = 0`. ### Выборка данных {#vyborka-dannykh} diff --git a/docs/ru/interfaces/grpc.md b/docs/ru/interfaces/grpc.md new file mode 100644 index 00000000000..89032c9372c --- /dev/null +++ b/docs/ru/interfaces/grpc.md @@ -0,0 +1,99 @@ +--- +toc_priority: 18 +toc_title: gRPC интерфейс +--- + +# Интерфейс gRPC {#grpc-interface} + +## Введение {#grpc-interface-introduction} + +ClickHouse поддерживает интерфейс [gRPC](https://grpc.io/). Это система удаленного вызова процедур с открытым исходным кодом, которая использует HTTP/2 и [Protocol Buffers](https://ru.wikipedia.org/wiki/Protocol_Buffers). 
В реализации gRPC в ClickHouse поддерживаются: + +- SSL; +- аутентификация; +- сессии; +- сжатие; +- параллельные запросы, выполняемые через один канал; +- отмена запросов; +- получение прогресса операций и логов; +- внешние таблицы. + +Спецификация интерфейса содержится в [clickhouse_grpc.proto](https://github.com/ClickHouse/ClickHouse/blob/master/src/Server/grpc_protos/clickhouse_grpc.proto). + +## Конфигурация gRPC {#grpc-interface-configuration} + +Чтобы сделать доступным интерфейс gRPC, нужно задать порт с помощью настройки `grpc_port` в [конфигурации сервера](../operations/configuration-files.md). Другие настройки приведены в примере: + +```xml +9100 + + false + + + /path/to/ssl_cert_file + /path/to/ssl_key_file + + + false + + + /path/to/ssl_ca_cert_file + + + deflate + + + medium + + + -1 + -1 + + + false + +``` + +## Встроенный клиент {#grpc-client} + +Можно написать клиент на любом языке программирования, который поддерживается gRPC, с использованием [спецификации](https://github.com/ClickHouse/ClickHouse/blob/master/src/Server/grpc_protos/clickhouse_grpc.proto). +Также можно воспользоваться встроенным Python клиентом. Он расположен в [utils/grpc-client/clickhouse-grpc-client.py](https://github.com/ClickHouse/ClickHouse/blob/master/utils/grpc-client/clickhouse-grpc-client.py) в репозитории. Для работы встроенного клиента требуются Python модули [grpcio и grpcio-tools](https://grpc.io/docs/languages/python/quickstart). + +Клиент поддерживает аргументы: + +- `--help` – вывести справку и завершить работу. +- `--host HOST, -h HOST` – имя сервера. Значение по умолчанию: `localhost`. Можно задать адрес IPv4 или IPv6. +- `--port PORT` – номер порта. Этот порт должен быть задан в конфигурации сервера ClickHouse настройкой `grpc_port`. Значение по умолчанию: `9100`. +- `--user USER_NAME, -u USER_NAME` – имя пользователя. Значение по умолчанию: `default`. +- `--password PASSWORD` – пароль. Значение по умолчанию: пустая строка. +- `--query QUERY, -q QUERY` – запрос, который выполнится, когда используется неинтерактивный режим работы. +- `--database DATABASE, -d DATABASE` – база данных по умолчанию. Если не указана, то будет использована база данных, заданная в настройках сервера (по умолчанию `default`). +- `--format OUTPUT_FORMAT, -f OUTPUT_FORMAT` – [формат](formats.md) вывода результата. Значение по умолчанию для интерактивного режима: `PrettyCompact`. +- `--debug` – вывод отладочной информации. + +Чтобы запустить клиент в интерактивном режиме, не указывайте аргумент `--query`. + +В неинтерактивном режиме данные запроса можно передать через `stdin`. + +**Пример использования клиента** + +В примере создается таблица, и в нее загружаются данные из CSV файла. Затем выводится содержимое таблицы. 
+ +``` bash +./clickhouse-grpc-client.py -q "CREATE TABLE grpc_example_table (id UInt32, text String) ENGINE = MergeTree() ORDER BY id;" +echo "0,Input data for" > a.txt ; echo "1,gRPC protocol example" >> a.txt +cat a.txt | ./clickhouse-grpc-client.py -q "INSERT INTO grpc_example_table FORMAT CSV" + +./clickhouse-grpc-client.py --format PrettyCompact -q "SELECT * FROM grpc_example_table;" +``` + +Результат: + +``` text +┌─id─┬─text──────────────────┐ +│ 0 │ Input data for │ +│ 1 │ gRPC protocol example │ +└────┴───────────────────────┘ +``` diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index 6d94a43ff15..8687201e1c9 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -422,7 +422,10 @@ $ curl -v 'http://localhost:8123/predefined_query' Значение `query` — это предопределенный запрос `predefined_query_handler`, который выполняется ClickHouse при совпадении HTTP-запроса и возврате результата запроса. Это обязательная настройка. -В следующем примере определяются настройки [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_alter_threads`, а затем запрашивается системная таблица, чтобы проверить, были ли эти параметры успешно установлены. +В следующем примере определяются настройки [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_final_threads`, а затем запрашивается системная таблица, чтобы проверить, были ли эти параметры успешно установлены. + +!!! note "Предупреждение" + Чтобы сохранить стандартные `handlers` такие как `query`, `play`, `ping`, используйте правило ``. Пример: @@ -441,13 +444,14 @@ $ curl -v 'http://localhost:8123/predefined_query' SELECT name, value FROM system.settings WHERE name = {name_2:String} + ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "Предупреждение" @@ -459,7 +463,7 @@ max_alter_threads 2 ClickHouse извлекает и выполняет значение, соответствующее значению `query_param_name` URL-адресе HTTP-запроса. Значение по умолчанию `query_param_name` — это `/query` . Это необязательная настройка. Если в файле конфигурации нет определения, параметр не передается. -Чтобы поэкспериментировать с этой функциональностью, в примере определяются значения [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_alter_threads` и запрашивается, успешно ли были установлены настройки. +Чтобы поэкспериментировать с этой функциональностью, в примере определяются значения [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_final_threads` и запрашивается, успешно ли были установлены настройки. 
Пример: @@ -473,13 +477,14 @@ ClickHouse извлекает и выполняет значение, соотв query_param + ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} @@ -503,6 +508,7 @@ max_alter_threads 2 Say Hi! + ``` diff --git a/docs/ru/interfaces/index.md b/docs/ru/interfaces/index.md index 12e8853823e..b23a402e0b7 100644 --- a/docs/ru/interfaces/index.md +++ b/docs/ru/interfaces/index.md @@ -6,12 +6,13 @@ toc_title: "Введение" # Интерфейсы {#interfaces} -ClickHouse предоставляет два сетевых интерфейса (оба могут быть дополнительно обернуты в TLS для дополнительной безопасности): +ClickHouse предоставляет три сетевых интерфейса (они могут быть обернуты в TLS для дополнительной безопасности): - [HTTP](http.md), который задокументирован и прост для использования напрямую; -- [Native TCP](tcp.md), который имеет меньше накладных расходов. +- [Native TCP](tcp.md), который имеет меньше накладных расходов; +- [gRPC](grpc.md). -В большинстве случаев рекомендуется использовать подходящий инструмент или библиотеку, а не напрямую взаимодействовать с ClickHouse по сути. Официально поддерживаемые Яндексом: +В большинстве случаев рекомендуется использовать подходящий инструмент или библиотеку, а не напрямую взаимодействовать с ClickHouse. Официально поддерживаемые Яндексом: - [Консольный клиент](cli.md); - [JDBC-драйвер](jdbc.md); diff --git a/docs/ru/interfaces/third-party/gui.md b/docs/ru/interfaces/third-party/gui.md index b80aaf7948e..201e92f994e 100644 --- a/docs/ru/interfaces/third-party/gui.md +++ b/docs/ru/interfaces/third-party/gui.md @@ -227,4 +227,25 @@ SeekTable [бесплатен](https://www.seektable.com/help/cloud-pricing) д [Chadmin](https://github.com/bun4uk/chadmin) — простой графический интерфейс для визуализации запущенных запросов на вашем кластере ClickHouse. Он отображает информацию о запросах и дает возможность их завершать. +### TABLUM.IO {#tablum_io} + +[TABLUM.IO](https://tablum.io/) — онлайн инструмент для загрузки и визуализации данных. Позволяет подключаться к БД ClickHouse, работать с базами и таблицами через многофункциональную SQL консоль, загружать данные из таблиц, объединять их с данными из других источников (файлов, сторонних сервисов) и визуализировать результаты в виде таблиц и графиков. + +Основные возможности: +- Многофункциональный ETL: загрузка данных из популярных баз данных, локальных и удаленных файлов, загрузка результатов вызова REST API. +- Универсальная SQL консоль с подсветкой синтаксиса и визуальным генератором SQL запросов. +- Визуализация загруженных данных в виде графиков и таблиц. +- Материализация данных и подзапросы к загруженным данным. +- Отправка результатов визуализации в Slack, Telegram или на email. +- Организация потоков данных (data pipeline) через собственный API. +- Экспорт данных в форматах JSON, CSV, SQL, HTML. +- Веб-интерфейс. + +Поддерживается установка TABLUM.IO на собственный сервер (в виде Docker образа) или работа с сервисом в облаке. 
+Лицензия: [коммерческий](https://tablum.io/pricing) продукт с периодом бесплатного тестирования 3 месяца. + +Протестировать TABLUM.IO без разворачивания на собственном сервере можно [здесь](https://tablum.io/try). +Подробно о продукте смотрите на [TABLUM.IO](https://tablum.io/) + + [Original article](https://clickhouse.com/docs/en/interfaces/third-party/gui/) diff --git a/docs/ru/operations/clickhouse-keeper.md b/docs/ru/operations/clickhouse-keeper.md index 14d95ebae68..2f3f3c0f63c 100644 --- a/docs/ru/operations/clickhouse-keeper.md +++ b/docs/ru/operations/clickhouse-keeper.md @@ -3,25 +3,30 @@ toc_priority: 66 toc_title: ClickHouse Keeper --- -# [пре-продакшн] ClickHouse Keeper +# [пре-продакшн] ClickHouse Keeper {#clickHouse-keeper} Сервер ClickHouse использует сервис координации [ZooKeeper](https://zookeeper.apache.org/) для [репликации](../engines/table-engines/mergetree-family/replication.md) данных и выполнения [распределенных DDL запросов](../sql-reference/distributed-ddl.md). ClickHouse Keeper — это альтернативный сервис координации, совместимый с ZooKeeper. !!! warning "Предупреждение" ClickHouse Keeper находится в стадии пре-продакшн и тестируется в CI ClickHouse и на нескольких внутренних инсталляциях. -## Детали реализации +## Детали реализации {#implementation-details} ZooKeeper — один из первых широко известных сервисов координации с открытым исходным кодом. Он реализован на языке программирования Java, имеет достаточно простую и мощную модель данных. Алгоритм координации Zookeeper называется ZAB (ZooKeeper Atomic Broadcast). Он не гарантирует линеаризуемость операций чтения, поскольку каждый узел ZooKeeper обслуживает чтения локально. В отличие от ZooKeeper, ClickHouse Keeper реализован на C++ и использует алгоритм [RAFT](https://raft.github.io/), [реализация](https://github.com/eBay/NuRaft). Этот алгоритм позволяет достичь линеаризуемости чтения и записи, имеет несколько реализаций с открытым исходным кодом на разных языках. По умолчанию ClickHouse Keeper предоставляет те же гарантии, что и ZooKeeper (линеаризуемость записей, последовательная согласованность чтений). У него есть совместимый клиент-серверный протокол, поэтому любой стандартный клиент ZooKeeper может использоваться для взаимодействия с ClickHouse Keeper. Снэпшоты и журналы имеют несовместимый с ZooKeeper формат, однако можно конвертировать данные Zookeeper в снэпшот ClickHouse Keeper с помощью `clickhouse-keeper-converter`. Межсерверный протокол ClickHouse Keeper также несовместим с ZooKeeper, поэтому создание смешанного кластера ZooKeeper / ClickHouse Keeper невозможно. -## Конфигурация +Система управления доступом (ACL) ClickHouse Keeper реализована так же, как в [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper поддерживает тот же набор разрешений и идентичные схемы: `world`, `auth`, `digest`, `host` и `ip`. Digest для аутентификации использует пару значений `username:password`. Пароль кодируется в Base64. + +!!! info "Примечание" + Внешние интеграции не поддерживаются. + +## Конфигурация {#configuration} ClickHouse Keeper может использоваться как равноценная замена ZooKeeper или как внутренняя часть сервера ClickHouse, но в обоих случаях конфигурация представлена файлом `.xml`. Главный тег конфигурации ClickHouse Keeper — это ``. Параметры конфигурации: - `tcp_port` — порт для подключения клиента (по умолчанию для ZooKeeper: `2181`). -- `tcp_port_secure` — зашифрованный порт для подключения клиента. 
+- `tcp_port_secure` — зашифрованный порт для SSL-соединения между клиентом и сервером сервиса. - `server_id` — уникальный идентификатор сервера, каждый участник кластера должен иметь уникальный номер (1, 2, 3 и т. д.). - `log_storage_path` — путь к журналам координации, лучше хранить их на незанятом устройстве (актуально и для ZooKeeper). - `snapshot_storage_path` — путь к снэпшотам координации. @@ -49,8 +54,13 @@ ClickHouse Keeper может использоваться как равноце - `auto_forwarding` — разрешить пересылку запросов на запись от последователей лидеру (по умолчанию: true). - `shutdown_timeout` — время ожидания завершения внутренних подключений и выключения, в миллисекундах (по умолчанию: 5000). - `startup_timeout` — время отключения сервера, если он не подключается к другим участникам кворума, в миллисекундах (по умолчанию: 30000). +- `four_letter_word_white_list` — список разрешенных 4-х буквенных команд (по умолчанию: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro"). -Конфигурация кворума находится в `.` и содержит описание серверов. Единственный параметр для всего кворума — `secure`, который включает зашифрованное соединение для связи между участниками кворума. Параметры для каждого ``: +Конфигурация кворума находится в `.` и содержит описание серверов. + +Единственный параметр для всего кворума — `secure`, который включает зашифрованное соединение для связи между участниками кворума. Параметру можно задать значение `true`, если для внутренней коммуникации между узлами требуется SSL-соединение, в ином случае не указывайте ничего. + +Параметры для каждого ``: - `id` — идентификатор сервера в кворуме. - `hostname` — имя хоста, на котором размещен сервер. @@ -92,7 +102,7 @@ ClickHouse Keeper может использоваться как равноце ``` -## Как запустить +## Как запустить {#how-to-run} ClickHouse Keeper входит в пакет `clickhouse-server`, просто добавьте кофигурацию `` и запустите сервер ClickHouse как обычно. Если вы хотите запустить ClickHouse Keeper автономно, сделайте это аналогичным способом: @@ -100,7 +110,195 @@ ClickHouse Keeper входит в пакет `clickhouse-server`, просто clickhouse-keeper --config /etc/your_path_to_config/config.xml --daemon ``` -## [экспериментально] Переход с ZooKeeper +## 4-х буквенные команды {#four-letter-word-commands} + +ClickHouse Keeper также поддерживает 4-х буквенные команды, почти такие же, как у Zookeeper. Каждая команда состоит из 4-х символов, например, `mntr`, `stat` и т. д. Несколько интересных команд: `stat` предоставляет общую информацию о сервере и подключенных клиентах, а `srvr` и `cons` предоставляют расширенные сведения о сервере и подключениях соответственно. + +У 4-х буквенных команд есть параметр для настройки разрешенного списка `four_letter_word_white_list`, который имеет значение по умолчанию "conf,cons,crst,envi,ruok,srst,srvr,stat, wchc,wchs,dirs,mntr,isro". + +Вы можете отправлять команды в ClickHouse Keeper через telnet или nc на порт для клиента. + +``` +echo mntr | nc localhost 9181 +``` + +Ниже приведен подробный список 4-х буквенных команд: + +- `ruok`: Проверяет, что сервер запущен без ошибок. В этом случае сервер ответит `imok`. В противном случае он не ответит. Ответ `imok` не обязательно означает, что сервер присоединился к кворуму, а указывает, что процесс сервера активен и привязан к указанному клиентскому порту. Используйте команду `stat` для получения подробной информации о состоянии кворума и клиентском подключении. 
+ +``` +imok +``` + +- `mntr`: Выводит список переменных, которые используются для мониторинга работоспособности кластера. + +``` +zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +zk_avg_latency 0 +zk_max_latency 0 +zk_min_latency 0 +zk_packets_received 68 +zk_packets_sent 68 +zk_num_alive_connections 1 +zk_outstanding_requests 0 +zk_server_state leader +zk_znode_count 4 +zk_watch_count 1 +zk_ephemerals_count 0 +zk_approximate_data_size 723 +zk_open_file_descriptor_count 310 +zk_max_file_descriptor_count 10240 +zk_followers 0 +zk_synced_followers 0 +``` + +- `srvr`: Выводит информацию о сервере: его версию, роль участника кворума и т.п. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Latency min/avg/max: 0/0/0 +Received: 2 +Sent : 2 +Connections: 1 +Outstanding: 0 +Zxid: 34 +Mode: leader +Node count: 4 +``` + +- `stat`: Выводит краткие сведения о сервере и подключенных клиентах. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Clients: + 192.168.1.1:52852(recved=0,sent=0) + 192.168.1.1:52042(recved=24,sent=48) +Latency min/avg/max: 0/0/0 +Received: 4 +Sent : 4 +Connections: 1 +Outstanding: 0 +Zxid: 36 +Mode: leader +Node count: 4 +``` + +- `srst`: Сбрасывает статистику сервера. Команда влияет на результат вывода `srvr`, `mntr` и `stat`. + +``` +Server stats reset. +``` + +- `conf`: Выводит подробную информацию о серверной конфигурации. + +``` +server_id=1 +tcp_port=2181 +four_letter_word_white_list=* +log_storage_path=./coordination/logs +snapshot_storage_path=./coordination/snapshots +max_requests_batch_size=100 +session_timeout_ms=30000 +operation_timeout_ms=10000 +dead_session_check_period_ms=500 +heart_beat_interval_ms=500 +election_timeout_lower_bound_ms=1000 +election_timeout_upper_bound_ms=2000 +reserved_log_items=1000000000000000 +snapshot_distance=10000 +auto_forwarding=true +shutdown_timeout=5000 +startup_timeout=240000 +raft_logs_level=information +snapshots_to_keep=3 +rotate_log_storage_interval=100000 +stale_log_gap=10000 +fresh_log_gap=200 +max_requests_batch_size=100 +quorum_reads=false +force_sync=false +compress_logs=true +compress_snapshots_with_zstd_format=true +configuration_change_tries_count=20 +``` + +- `cons`: Выводит полную информацию о подключениях/сессиях для всех клиентов, подключенных к этому серверу. Включает информацию о количестве принятых/отправленных пакетов, идентификаторе сессии, задержках операций, последней выполненной операции и т. д. + +``` + 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) + 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) +``` + +- `crst`: Сбрасывает статистику подключений/сессий для всех подключений. + +``` +Connection stats reset. +``` + +- `envi`: Выводит подробную информацию о серверном окружении. 
+ +``` +Environment: +clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +host.name=ZBMAC-C02D4054M.local +os.name=Darwin +os.arch=x86_64 +os.version=19.6.0 +cpu.count=12 +user.name=root +user.home=/Users/JackyWoo/ +user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ +user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ +``` + + +- `dirs`: Показывает общий размер файлов снэпшотов и журналов в байтах. + +``` +snapshot_dir_size: 0 +log_dir_size: 3875 +``` + +- `isro`: Проверяет, что сервер работает в режиме только для чтения. Сервер ответит `ro`, если он находится в режиме только для чтения, или `rw`, если нет. + +``` +rw +``` + +- `wchs`: Показывает краткую информацию о количестве отслеживаемых путей (watches) на сервере. + +``` +1 connections watching 1 paths +Total watches:1 +``` + +- `wchc`: Показывает подробную информацию об отслеживаемых путях (watches) на сервере в разбивке по сессиям. При этом выводится список сессий (подключений) с соответствующими отслеживаемыми путями. Обратите внимание, что в зависимости от количества отслеживаемых путей эта операция может быть дорогостоящей (т. е. повлиять на производительность сервера), используйте ее осторожно. + +``` +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + +- `wchp`: Показывает подробную информацию об отслеживаемых путях (watches) на сервере в разбивке по пути. При этом выводится список путей (узлов) с соответствующими сессиями. Обратите внимание, что в зависимости от количества отселживаемых путей (watches) эта операция может быть дорогостоящей (т. е. повлиять на производительность сервера), используйте ее осторожно. + +``` +/clickhouse/task_queue/ddl + 0x0000000000000001 +``` + +- `dump`: Выводит список незавершенных сеансов и эфемерных узлов. Команда работает только на лидере. + +``` +Sessions dump (2): +0x0000000000000001 +0x0000000000000002 +Sessions with Ephemerals (1): +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + + +## [экспериментально] Переход с ZooKeeper {#migration-from-zookeeper} Плавный переход с ZooKeeper на ClickHouse Keeper невозможен, необходимо остановить кластер ZooKeeper, преобразовать данные и запустить ClickHouse Keeper. Утилита `clickhouse-keeper-converter` конвертирует журналы и снэпшоты ZooKeeper в снэпшот ClickHouse Keeper. Работа утилиты проверена только для версий ZooKeeper выше 3.4. Для миграции необходимо выполнить следующие шаги: diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 1b0c7fc5897..4a2da778a06 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -52,7 +52,7 @@ ClickHouse перезагружает встроенные словари с з ClickHouse проверяет условия для `min_part_size` и `min_part_size_ratio` и выполнит те блоки `case`, для которых условия совпали. - Если кусок данных совпадает с условиями, ClickHouse использует указанные метод сжатия. -- Если кусок данных совпадает с несколькими блоками `case`, ClickHouse использует перый совпавший блок условий. +- Если кусок данных совпадает с несколькими блоками `case`, ClickHouse использует первый совпавший блок условий. Если ни один `` не подходит, то ClickHouse применит алгоритм сжатия `lz4`. 
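Для иллюстрации — условный набросок такой секции (пороговые значения и метод сжатия выбраны произвольно):

``` xml
<compression>
    <case>
        <!-- условие: кусок не меньше 10 ГБ и занимает не меньше 1% от размера таблицы -->
        <min_part_size>10000000000</min_part_size>
        <min_part_size_ratio>0.01</min_part_size_ratio>
        <!-- метод сжатия для кусков, подпадающих под условие -->
        <method>zstd</method>
    </case>
</compression>
```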
@@ -436,26 +436,58 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ## interserver_http_credentials {#server-settings-interserver-http-credentials} -Имя пользователя и пароль, использующиеся для аутентификации при [репликации](../../operations/server-configuration-parameters/settings.md) движками Replicated\*. Это имя пользователя и пароль используются только для взаимодействия между репликами кластера и никак не связаны с аутентификацией клиентов ClickHouse. Сервер проверяет совпадение имени и пароля для соединяющихся с ним реплик, а также использует это же имя и пароль для соединения с другими репликами. Соответственно, эти имя и пароль должны быть прописаны одинаковыми для всех реплик кластера. -По умолчанию аутентификация не используется. +Имя пользователя и пароль, использующиеся для подключения к другим серверам при [репликации](../../engines/table-engines/mergetree-family/replication.md) движками Replicated\*. Сервер использует эти же учетные данные при аутентификации других реплик. Поэтому настройки `interserver_http_credentials` должны быть заданы одинаковыми для всех реплик кластера. + +По умолчанию, если секция `interserver_http_credentials` не задана в конфигурации, аутентификация при репликации не используется. !!! note "Примечание" - Эти учетные данные являются общими для обмена данными по протоколам `HTTP` и `HTTPS`. + Настройки `interserver_http_credentials` не относятся к [конфигурации](../../interfaces/cli.md#configuration_files) учетных данных клиента ClickHouse. + +!!! note "Примечание" + Учетные данные в `interserver_http_credentials` являются общими для репликации по `HTTP` и `HTTPS`. Раздел содержит следующие параметры: - `user` — имя пользователя. - `password` — пароль. +- `allow_empty` — если `true`, то другие реплики могут подключаться без аутентификации, даже если учетные данные заданы. Если `false`, то подключение без аутентификации не допускается. Значение по умолчанию: `false`. +- `old` — секция содержит старые значения `user` и `password`, которые используются в процессе изменения учетных данных. Можно указывать несколько секций `old`. -**Пример конфигурации** +**Изменение учетных данных** + +ClickHouse поддерживает динамическое изменение учетных данных. При этом не требуется одновременно останавливать все реплики, чтобы обновить конфигурацию. Изменение учетных данных выполняется за несколько шагов. + +Чтобы включить аутентификацию, установите `interserver_http_credentials.allow_empty` в значение `true` и задайте учетные данные. С такой конфигурацией разрешены подключения как с аутентификацией, так и без нее. + +``` xml + + admin + 111 + true + +``` + +После конфигурации всех реплик установите `allow_empty` в значение `false` или удалите эту настройку. Это сделает аутентификацию с новыми учетными данными обязательной. + +Чтобы изменить учетные данные, перенесите имя пользователя и пароль в секцию `interserver_http_credentials.old` и укажите новые значения для `user` и `password`. Сервер будет использовать новые учетные данные при подключении к другим репликам и при этом будет разрешать подключения как с новыми, так и со старыми учетными данными. ``` xml admin 222 + + admin + 111 + + + temp + 000 + ``` +Когда новые учетные данные обновятся на всех репликах, старые учетные данные можно удалить из конфигурации. + ## keep_alive_timeout {#keep-alive-timeout} Время в секундах, в течение которого ClickHouse ожидает входящих запросов прежде чем закрыть соединение. Значение по умолчанию: 10 секунд. 
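Условный пример настройки (значение приведено для иллюстрации):

``` xml
<keep_alive_timeout>10</keep_alive_timeout>
```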
@@ -554,13 +586,13 @@ ClickHouse проверяет условия для `min_part_size` и `min_part Ключи: - `enabled` – Булевый флаг чтобы включить функциональность, по умолчанию `false`. Установите `true` чтобы разрешить отправку отчетов о сбоях. -- `endpoint` – Вы можете переопределить URL на который будут отсылаться отчеты об ошибках и использовать собственную инсталяцию Sentry. Используйте URL синтаксис [Sentry DSN](https://docs.sentry.io/error-reporting/quickstart/?platform=native#configure-the-sdk). +- `endpoint` – Вы можете переопределить URL на который будут отсылаться отчеты об ошибках и использовать собственную инсталляцию Sentry. Используйте URL синтаксис [Sentry DSN](https://docs.sentry.io/error-reporting/quickstart/?platform=native#configure-the-sdk). - `anonymize` - Запретить отсылку имени хоста сервера в отчете о сбое. - `http_proxy` - Настройка HTTP proxy для отсылки отчетов о сбоях. - `debug` - Настроить клиентскую библиотеку Sentry в debug режим. - `tmp_path` - Путь в файловой системе для временного хранения состояния отчетов о сбоях перед отправкой на сервер Sentry. -**Рекомендованые настройки** +**Рекомендованные настройки** ``` xml @@ -751,9 +783,13 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ## max_thread_pool_size {#max-thread-pool-size} -Максимальное количество потоков в глобальном пуле потоков. +ClickHouse использует потоки из глобального пула потоков для обработки запросов. Если в пуле нет свободных потоков, то в нем создается еще один. Параметр `max_thread_pool_size` ограничивает максимальное количество потоков в пуле. -Значение по умолчанию: 10000. +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `10000`. **Пример** @@ -761,6 +797,38 @@ ClickHouse проверяет условия для `min_part_size` и `min_part 12000 ``` +## max_thread_pool_free_size {#max-thread-pool-free-size} + +Если в глобальном пуле потоков количество **свободных** потоков больше, чем задано параметром `max_thread_pool_free_size`, то ClickHouse освобождает ресурсы, занятые некоторыми потоками. В таком случае размер пула уменьшается. При необходимости потоки будут созданы заново. + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `1000`. + +**Пример** + +``` xml +1200 +``` + +## thread_pool_queue_size {#thread-pool-queue-size} + +Максимальное количество задач, которые запланированы для выполнения в глобальном пуле потоков. При увеличении этого параметра возрастает использование памяти. Рекомендуется, чтобы значение этого параметра совпадало со значением параметра [max_thread_pool_size](#max-thread-pool-size). + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `10000`. + +**Пример** + +``` xml +12000 +``` + ## merge_tree {#server_configuration_parameters-merge_tree} Тонкая настройка таблиц семейства [MergeTree](../../operations/server-configuration-parameters/settings.md). @@ -1011,7 +1079,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part Если таблица не существует, то ClickHouse создаст её. Если структура журнала запросов изменилась при обновлении сервера ClickHouse, то таблица со старой структурой переименовывается, а новая таблица создается автоматически. 
-**Example** +**Пример** ``` xml @@ -1075,9 +1143,8 @@ Parameters: ## query_masking_rules {#query-masking-rules} -Правила основанные на регурялных выражениях, которые будут применены для всех запросов а также для всех сообщений перед сохранением их в лог на сервере, -`system.query_log`, `system.text_log`, `system.processes` таблицы, а также в логах отсылаемых клиенту. Это позволяет предотвратить -утечку конфиденциальных данных из SQL запросов (такие как имена, электронные письма, личные идентификаторы или номера кредитных карт) в логи. +Правила, основанные на регулярных выражениях, которые будут применены для всех запросов, а также для всех сообщений перед сохранением их в лог на сервере, +`system.query_log`, `system.text_log`, `system.processes` таблицы, а также в логах, отсылаемых клиенту. Это позволяет предотвратить утечку конфиденциальных данных из SQL запросов (такие как имена, электронные письма, личные идентификаторы или номера кредитных карт) в логи. **Пример** @@ -1096,7 +1163,7 @@ Parameters: - `regexp` - совместимое с RE2 регулярное выражение (обязательное) - `replace` - строка замены для конфиденциальных данных (опционально, по умолчанию - шесть звездочек) -Правила маскировки применяются ко всему запросу (для предотвращения утечки конфиденциальных данных из неправильно оформленных / не интерпритируемых запросов). +Правила маскировки применяются ко всему запросу (для предотвращения утечки конфиденциальных данных из неправильно оформленных / не интерпретируемых запросов). `system.events` таблица содержит счетчик `QueryMaskingRulesMatch` который считает общее кол-во совпадений правил маскировки. @@ -1418,7 +1485,7 @@ ClickHouse использует ZooKeeper для хранения метадан Также вы можете добавить секции `memory` — означает хранение информации только в памяти, без записи на диск, и `ldap` — означает хранения информации на [LDAP-сервере](https://en.wikipedia.org/wiki/Lightweight_Directory_Access_Protocol). Чтобы добавить LDAP-сервер в качестве удаленного каталога пользователей, которые не определены локально, определите один раздел `ldap` со следующими параметрами: -- `server` — имя одного из LDAP-серверов, определенных в секции `ldap_servers` конфигурациионного файла. Этот параметр явялется необязательным и может быть пустым. +- `server` — имя одного из LDAP-серверов, определенных в секции `ldap_servers` конфигурационного файла. Этот параметр является необязательным и может быть пустым. - `roles` — раздел со списком локально определенных ролей, которые будут назначены каждому пользователю, полученному с LDAP-сервера. Если роли не заданы, пользователь не сможет выполнять никаких действий после аутентификации. Если какая-либо из перечисленных ролей не определена локально во время проверки подлинности, попытка проверки подлинности завершится неудачей, как если бы предоставленный пароль был неверным. **Пример** diff --git a/docs/ru/operations/settings/merge-tree-settings.md b/docs/ru/operations/settings/merge-tree-settings.md index 31cc229c6aa..e30539498b3 100644 --- a/docs/ru/operations/settings/merge-tree-settings.md +++ b/docs/ru/operations/settings/merge-tree-settings.md @@ -355,3 +355,23 @@ Eсли суммарное число активных кусков во все - 1 — куски данных открепляются. Значение по умолчанию: `0`. + +## merge_tree_clear_old_temporary_directories_interval_seconds {#setting-merge-tree-clear-old-temporary-directories-interval-seconds} + +Задает интервал в секундах для удаления старых временных каталогов на сервере ClickHouse. + +Возможные значения: + +- Положительное целое число. 
+ +Значение по умолчанию: `60` секунд. + +## merge_tree_clear_old_parts_interval_seconds {#setting-merge-tree-clear-old-parts-interval-seconds} + +Задает интервал в секундах для удаления старых кусков данных, журналов предзаписи (WAL) и мутаций на сервере ClickHouse. + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `1` секунда. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index f9717b0fb27..affa90d9840 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -391,12 +391,14 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), ( ## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number} -Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата TSV. +Включает или отключает парсинг значений перечислений как порядковых номеров. + +Если режим включен, то во входящих данных в формате `TCV` значения перечисления (тип `ENUM`) всегда трактуются как порядковые номера, а не как элементы перечисления. Эту настройку рекомендуется включать для оптимизации парсинга, если данные типа `ENUM` содержат только порядковые номера, а не сами элементы перечисления. Возможные значения: -- 0 — парсинг значений перечисления как значений. -- 1 — парсинг значений перечисления как идентификаторов перечисления. +- 0 — входящие значения типа `ENUM` сначала сопоставляются с элементами перечисления, а если совпадений не найдено, то трактуются как порядковые номера. +- 1 — входящие значения типа `ENUM` сразу трактуются как порядковые номера. Значение по умолчанию: 0. @@ -410,10 +412,39 @@ CREATE TABLE table_with_enum_column_for_tsv_insert (Id Int32,Value Enum('first' При включенной настройке `input_format_tsv_enum_as_number`: +Запрос: + ```sql SET input_format_tsv_enum_as_number = 1; INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; -INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 1; +SELECT * FROM table_with_enum_column_for_tsv_insert; +``` + +Результат: + +```text +┌──Id─┬─Value──┐ +│ 102 │ second │ +└─────┴────────┘ +``` + +Запрос: + +```sql +SET input_format_tsv_enum_as_number = 1; +INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first'; +``` + +сгенерирует исключение. + +При отключенной настройке `input_format_tsv_enum_as_number`: + +Запрос: + +```sql +SET input_format_tsv_enum_as_number = 0; +INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; +INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first'; SELECT * FROM table_with_enum_column_for_tsv_insert; ``` @@ -428,15 +459,6 @@ SELECT * FROM table_with_enum_column_for_tsv_insert; └─────┴────────┘ ``` -При отключенной настройке `input_format_tsv_enum_as_number` запрос `INSERT`: - -```sql -SET input_format_tsv_enum_as_number = 0; -INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; -``` - -сгенерирует исключение. - ## input_format_null_as_default {#settings-input-format-null-as-default} Включает или отключает инициализацию [значениями по умолчанию](../../sql-reference/statements/create/table.md#create-default-values) ячеек с [NULL](../../sql-reference/syntax.md#null-literal), если тип данных столбца не позволяет [хранить NULL](../../sql-reference/data-types/nullable.md#data_type-nullable). @@ -739,9 +761,20 @@ ClickHouse может парсить только базовый формат `Y Возможные значения: -- Любое положительное целое число. +- Положительное целое число. 
-Значение по умолчанию: 163840. +Значение по умолчанию: `163840`. + + +## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem} + +Минимальное количество строк для чтения из одного файла, прежде чем движок [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) может выполнять параллельное чтение из удаленной файловой системы. + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `163840`. ## merge_tree_min_bytes_for_concurrent_read {#setting-merge-tree-min-bytes-for-concurrent-read} @@ -751,7 +784,17 @@ ClickHouse может парсить только базовый формат `Y - Положительное целое число. -Значение по умолчанию: 251658240. +Значение по умолчанию: `251658240`. + +## merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem {#merge-tree-min-bytes-for-concurrent-read-for-remote-filesystem} + +Минимальное количество байтов для чтения из одного файла, прежде чем движок [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) может выполнять параллельное чтение из удаленной файловой системы. + +Возможное значение: + +- Положительное целое число. + +Значение по умолчанию: `251658240`. ## merge_tree_min_rows_for_seek {#setting-merge-tree-min-rows-for-seek} @@ -807,26 +850,6 @@ ClickHouse может парсить только базовый формат `Y Значение по умолчанию: 2013265920. -## merge_tree_clear_old_temporary_directories_interval_seconds {#setting-merge-tree-clear-old-temporary-directories-interval-seconds} - -Задает интервал в секундах для удаления старых временных каталогов на сервере ClickHouse. - -Возможные значения: - -- Положительное целое число. - -Значение по умолчанию: `60` секунд. - -## merge_tree_clear_old_parts_interval_seconds {#setting-merge-tree-clear-old-parts-interval-seconds} - -Задает интервал в секундах для удаления старых кусков данных, журналов предзаписи (WAL) и мутаций на сервере ClickHouse . - -Возможные значения: - -- Положительное целое число. - -Значение по умолчанию: `1` секунда. - ## min_bytes_to_use_direct_io {#settings-min-bytes-to-use-direct-io} Минимальный объём данных, необходимый для прямого (небуферизованного) чтения/записи (direct I/O) на диск. @@ -1531,12 +1554,13 @@ SELECT area/period FROM account_orders FORMAT JSON; ## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number} -Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата CSV. +Включает или отключает парсинг значений перечислений как порядковых номеров. +Если режим включен, то во входящих данных в формате `CSV` значения перечисления (тип `ENUM`) всегда трактуются как порядковые номера, а не как элементы перечисления. Эту настройку рекомендуется включать для оптимизации парсинга, если данные типа `ENUM` содержат только порядковые номера, а не сами элементы перечисления. Возможные значения: -- 0 — парсинг значений перечисления как значений. -- 1 — парсинг значений перечисления как идентификаторов перечисления. +- 0 — входящие значения типа `ENUM` сначала сопоставляются с элементами перечисления, а если совпадений не найдено, то трактуются как порядковые номера. +- 1 — входящие значения типа `ENUM` сразу трактуются как порядковые номера. Значение по умолчанию: 0. 
@@ -1550,10 +1574,11 @@ CREATE TABLE table_with_enum_column_for_csv_insert (Id Int32,Value Enum('first' При включенной настройке `input_format_csv_enum_as_number`: +Запрос: + ```sql SET input_format_csv_enum_as_number = 1; INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2; -SELECT * FROM table_with_enum_column_for_csv_insert; ``` Результат: @@ -1564,15 +1589,37 @@ SELECT * FROM table_with_enum_column_for_csv_insert; └─────┴────────┘ ``` -При отключенной настройке `input_format_csv_enum_as_number` запрос `INSERT`: +Запрос: ```sql -SET input_format_csv_enum_as_number = 0; -INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2; +SET input_format_csv_enum_as_number = 1; +INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first' ``` сгенерирует исключение. +При отключенной настройке `input_format_csv_enum_as_number`: + +Запрос: + +```sql +SET input_format_csv_enum_as_number = 0; +INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2 +INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first' +SELECT * FROM table_with_enum_column_for_csv_insert; +``` + +Результат: + +```text +┌──Id─┬─Value──┐ +│ 102 │ second │ +└─────┴────────┘ +┌──Id─┬─Value─┐ +│ 103 │ first │ +└─────┴───────┘ +``` + ## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line} Использовать в качестве разделителя строк для CSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль). @@ -1594,18 +1641,19 @@ INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2; `INSERT` завершается успешно только в том случае, когда ClickHouse смог без ошибки записать данные в `insert_quorum` реплик за время `insert_quorum_timeout`. Если по любой причине количество реплик с успешной записью не достигнет `insert_quorum`, то запись считается не состоявшейся и ClickHouse удалит вставленный блок из всех реплик, куда уже успел записать данные. -Все реплики в кворуме консистентны, т.е. содержат данные всех более ранних запросов `INSERT`. Последовательность `INSERT` линеаризуется. +Когда `insert_quorum_parallel` выключена, все реплики кворума консистентны, то есть содержат данные всех предыдущих запросов `INSERT` (последовательность `INSERT` линеаризуется). При чтении с диска данных, записанных с помощью `insert_quorum` и при выключенной `insert_quorum_parallel`, можно включить последовательную консистентность для запросов `SELECT` с помощью [select_sequential_consistency](#settings-select_sequential_consistency). -При чтении данных, записанных с `insert_quorum` можно использовать настройку [select_sequential_consistency](#settings-select_sequential_consistency). - -ClickHouse генерирует исключение +ClickHouse генерирует исключение: - Если количество доступных реплик на момент запроса меньше `insert_quorum`. - При попытке записать данные в момент, когда предыдущий блок ещё не вставлен в `insert_quorum` реплик. Эта ситуация может возникнуть, если пользователь вызвал `INSERT` прежде, чем завершился предыдущий с `insert_quorum`. +- При выключенной `insert_quorum_parallel` и при попытке записать данные в момент, когда предыдущий блок еще не вставлен в `insert_quorum` реплик (несколько параллельных `INSERT`-запросов). Эта ситуация может возникнуть при попытке пользователя выполнить очередной запрос `INSERT` к той же таблице, прежде чем завершится предыдущий с `insert_quorum`. + См. 
также: - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) - [select_sequential_consistency](#settings-select_sequential_consistency) ## insert_quorum_timeout {#settings-insert_quorum_timeout} @@ -1617,11 +1665,29 @@ ClickHouse генерирует исключение См. также: - [insert_quorum](#settings-insert_quorum) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) +- [select_sequential_consistency](#settings-select_sequential_consistency) + +## insert_quorum_parallel {#settings-insert_quorum_parallel} + +Включает и выключает параллелизм для кворумных вставок (`INSERT`-запросы). Когда опция включена, возможно выполнять несколько кворумных `INSERT`-запросов одновременно, при этом запросы не дожидаются окончания друг друга . Когда опция выключена, одновременные записи с кворумом в одну и ту же таблицу будут отклонены (будет выполнена только одна из них). + +Возможные значения: + +- 0 — Выключена. +- 1 — Включена. + +Значение по умолчанию: 1. + +См. также: + +- [insert_quorum](#settings-insert_quorum) +- [insert_quorum_timeout](#settings-insert_quorum_timeout) - [select_sequential_consistency](#settings-select_sequential_consistency) ## select_sequential_consistency {#settings-select_sequential_consistency} -Включает или выключает последовательную консистентность для запросов `SELECT`. +Включает или выключает последовательную консистентность для запросов `SELECT`. Необходимо, чтобы `insert_quorum_parallel` была выключена (по умолчанию включена), а опция `insert_quorum` включена. Возможные значения: @@ -1634,10 +1700,13 @@ ClickHouse генерирует исключение Когда последовательная консистентность включена, то ClickHouse позволит клиенту выполнить запрос `SELECT` только к тем репликам, которые содержат данные всех предыдущих запросов `INSERT`, выполненных с `insert_quorum`. Если клиент обратится к неполной реплике, то ClickHouse сгенерирует исключение. В запросе SELECT не будут участвовать данные, которые ещё не были записаны на кворум реплик. +Если `insert_quorum_parallel` включена (по умолчанию это так), тогда `select_sequential_consistency` не будет работать. Причина в том, что параллельные запросы `INSERT` можно записать в разные наборы реплик кворума, поэтому нет гарантии того, что в отдельно взятую реплику будут сделаны все записи. + См. также: - [insert_quorum](#settings-insert_quorum) - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) ## insert_deduplicate {#settings-insert-deduplicate} diff --git a/docs/ru/sql-reference/aggregate-functions/reference/sparkbar.md b/docs/ru/sql-reference/aggregate-functions/reference/sparkbar.md new file mode 100644 index 00000000000..b66d710744e --- /dev/null +++ b/docs/ru/sql-reference/aggregate-functions/reference/sparkbar.md @@ -0,0 +1,66 @@ +--- +toc_priority: 311 +toc_title: sparkbar +--- + +# sparkbar {#sparkbar} + +Функция строит гистограмму частот по заданным значениям `x` и частоте повторения этих значений `y` на интервале `[min_x, max_x]`. + +Если интервал для построения не указан, то в качестве нижней границы интервала будет взято минимальное значение `x`, а в качестве верхней границы — максимальное значение `x`. + + +**Синтаксис** + +``` sql +sparkbar(width[, min_x, max_x])(x, y) +``` + +**Параметры** + +- `width` — Количество столбцов гистограммы. Тип: [Integer](../../../sql-reference/data-types/int-uint.md). + +- `min_x` — Начало интервала. Необязательный параметр. +- `max_x` — Конец интервала. 
Необязательный параметр. + +**Аргументы** + +- `x` — Поле со значениями. +- `y` — Поле с частотой повторения значений. + + +**Возвращаемые значения** + +- Гистограмма частот. + +**Пример** + +Запрос: + +``` sql +CREATE TABLE spark_bar_data (`cnt` UInt64,`event_date` Date) ENGINE = MergeTree ORDER BY event_date SETTINGS index_granularity = 8192; + +INSERT INTO spark_bar_data VALUES(1,'2020-01-01'),(4,'2020-01-02'),(5,'2020-01-03'),(2,'2020-01-04'),(3,'2020-01-05'),(7,'2020-01-06'),(6,'2020-01-07'),(8,'2020-01-08'),(2,'2020-01-11'); + +SELECT sparkbar(9)(event_date,cnt) FROM spark_bar_data; + +SELECT sparkbar(9,toDate('2020-01-01'),toDate('2020-01-10'))(event_date,cnt) FROM spark_bar_data; +``` + +Результат: + +``` text + +┌─sparkbar(9)(event_date, cnt)─┐ +│ │ +│ ▁▅▄▃██▅ ▁ │ +│ │ +└──────────────────────────────┘ + +┌─sparkbar(9, toDate('2020-01-01'), toDate('2020-01-10'))(event_date, cnt)─┐ +│ │ +│▁▄▄▂▅▇█▁ │ +│ │ +└──────────────────────────────────────────────────────────────────────────┘ +``` + diff --git a/docs/ru/sql-reference/statements/grant.md b/docs/ru/sql-reference/statements/grant.md index c970d4d24f3..06432193f9f 100644 --- a/docs/ru/sql-reference/statements/grant.md +++ b/docs/ru/sql-reference/statements/grant.md @@ -21,7 +21,7 @@ GRANT [ON CLUSTER cluster_name] privilege[(column_name [,...])] [,...] ON {db.ta - `user` — Пользователь ClickHouse. `WITH GRANT OPTION` разрешает пользователю или роли выполнять запрос `GRANT`. Пользователь может выдавать только те привилегии, которые есть у него, той же или меньшей области действий. -`WITH REPLACE OPTION` заменяет все старые привилегии новыми привилегиями для `user` или `role`, Если не указано, добавьте новые привилегии для старых. +`WITH REPLACE OPTION` заменяет все старые привилегии новыми привилегиями для `user` или `role`, если не указано, добавляет новые привилегии. ## Синтаксис назначения ролей {#assign-role-syntax} @@ -34,7 +34,7 @@ GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_US - `user` — Пользователь ClickHouse. `WITH ADMIN OPTION` присваивает привилегию [ADMIN OPTION](#admin-option-privilege) пользователю или роли. -`WITH REPLACE OPTION` заменяет все старые роли новыми ролями для пользователя `user` или `role`, Если не указано, добавьте новые роли в старые. +`WITH REPLACE OPTION` заменяет все старые роли новыми ролями для пользователя `user` или `role`, если не указано, добавляет новые новые роли. ## Использование {#grant-usage} diff --git a/docs/ru/sql-reference/statements/select/group-by.md b/docs/ru/sql-reference/statements/select/group-by.md index 8bc1b765ad3..27a9d67cded 100644 --- a/docs/ru/sql-reference/statements/select/group-by.md +++ b/docs/ru/sql-reference/statements/select/group-by.md @@ -203,6 +203,9 @@ SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE; - В `Pretty*` форматах, строка выводится в виде отдельной таблицы после основного результата. - В других форматах она не доступна. +!!! note "Примечание" + totals выводится только в результатах запросов `SELECT`, и не вывоводится в `INSERT INTO ... SELECT`. + При использовании секции [HAVING](having.md) поведение `WITH TOTALS` контролируется настройкой `totals_mode`. 
### Настройка обработки итогов {#configuring-totals-processing} diff --git a/docs/ru/sql-reference/statements/select/join.md b/docs/ru/sql-reference/statements/select/join.md index 9f6d38a024f..bb9b7445083 100644 --- a/docs/ru/sql-reference/statements/select/join.md +++ b/docs/ru/sql-reference/statements/select/join.md @@ -55,13 +55,13 @@ FROM - [join_on_disk_max_files_to_merge](../../../operations/settings/settings.md#join_on_disk_max_files_to_merge) - [any_join_distinct_right_table_keys](../../../operations/settings/settings.md#any_join_distinct_right_table_keys) -## Условия в секции ON {on-section-conditions} +## Условия в секции ON {#on-section-conditions} + +Секция `ON` может содержать несколько условий, связанных операторами `AND` и `OR`. Условия, задающие ключи соединения, должны содержать столбцы левой и правой таблицы и должны использовать оператор равенства. Прочие условия могут использовать другие логические операторы, но в отдельном условии могут использоваться столбцы либо только левой, либо только правой таблицы. -Секция `ON` может содержать несколько условий, связанных оператором `AND`. Условия, задающие ключи соединения, должны содержать столбцы левой и правой таблицы и должны использовать оператор равенства. Прочие условия могут использовать другие логические операторы, но в отдельном условии могут использоваться столбцы либо только левой, либо только правой таблицы. Строки объединяются только тогда, когда всё составное условие выполнено. Если оно не выполнено, то строки могут попасть в результат в зависимости от типа `JOIN`. Обратите внимание, что если то же самое условие поместить в секцию `WHERE`, то строки, для которых оно не выполняется, никогда не попаду в результат. -!!! note "Примечание" - Оператор `OR` внутри секции `ON` пока не поддерживается. +Оператор `OR` внутри секции `ON` работает, используя алгоритм хеш-соединения — на каждый агрумент `OR` с ключами соединений для `JOIN` создается отдельная хеш-таблица, поэтому потребление памяти и время выполнения запроса растет линейно при увеличении количества выражений `OR` секции `ON`. !!! note "Примечание" Если в условии использованы столбцы из разных таблиц, то пока поддерживается только оператор равенства (`=`). @@ -110,6 +110,47 @@ SELECT name, text, scores FROM table_1 INNER JOIN table_2 └──────┴────────┴────────┘ ``` +Запрос с типом соединения `INNER` и условием с оператором `OR`: + +``` sql +CREATE TABLE t1 (`a` Int64, `b` Int64) ENGINE = MergeTree() ORDER BY a; + +CREATE TABLE t2 (`key` Int32, `val` Int64) ENGINE = MergeTree() ORDER BY key; + +INSERT INTO t1 SELECT number as a, -a as b from numbers(5); + +INSERT INTO t2 SELECT if(number % 2 == 0, toInt64(number), -number) as key, number as val from numbers(5); + +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key; +``` + +Результат: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 1 │ -1 │ 1 │ +│ 2 │ -2 │ 2 │ +│ 3 │ -3 │ 3 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` + +Запрос с типом соединения `INNER` и условиями с операторами `OR` и `AND`: + +``` sql +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key AND t2.val > 3; +``` + +Результат: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 2 │ -2 │ 2 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` ## Использование ASOF JOIN {#asof-join-usage} `ASOF JOIN` применим в том случае, когда необходимо объединять записи, которые не имеют точного совпадения. 
diff --git a/docs/ru/sql-reference/statements/select/order-by.md b/docs/ru/sql-reference/statements/select/order-by.md index 190a46dacc9..3f52b260423 100644 --- a/docs/ru/sql-reference/statements/select/order-by.md +++ b/docs/ru/sql-reference/statements/select/order-by.md @@ -271,7 +271,7 @@ SELECT * FROM collate_test ORDER BY s ASC COLLATE 'en'; ## Модификатор ORDER BY expr WITH FILL {#orderby-with-fill} -Этот модификатор также может быть скобинирован с модификатором [LIMIT ... WITH TIES](../../../sql-reference/statements/select/limit.md#limit-with-ties) +Этот модификатор также может быть скомбинирован с модификатором [LIMIT ... WITH TIES](../../../sql-reference/statements/select/limit.md#limit-with-ties) Модификатор `WITH FILL` может быть установлен после `ORDER BY expr` с опциональными параметрами `FROM expr`, `TO expr` и `STEP expr`. Все пропущенные значения для колонки `expr` будут заполнены значениями, соответствующими предполагаемой последовательности значений колонки, другие колонки будут заполнены значениями по умолчанию. diff --git a/docs/zh/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/zh/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 120000 index 00000000000..5ac9a615386 --- /dev/null +++ b/docs/zh/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1 @@ +../../../en/faq/general/how-do-i-contribute-code-to-clickhouse.md \ No newline at end of file diff --git a/docs/zh/faq/operations/multi-region-replication.md b/docs/zh/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/zh/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/zh/getting-started/example-datasets/ontime.md b/docs/zh/getting-started/example-datasets/ontime.md index 03a9a8c4278..907f63634cc 100644 --- a/docs/zh/getting-started/example-datasets/ontime.md +++ b/docs/zh/getting-started/example-datasets/ontime.md @@ -15,17 +15,9 @@ toc_title: OnTime 下载数据: ``` bash -for s in `seq 1987 2018` -do -for m in `seq 1 12` -do -wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip -done -done +wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2021}_{1..12}.zip ``` -(参考 https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) - 创建表结构: ``` sql @@ -40,7 +32,7 @@ CREATE TABLE `ontime` `Reporting_Airline` String, `DOT_ID_Reporting_Airline` Int32, `IATA_CODE_Reporting_Airline` String, - `Tail_Number` Int32, + `Tail_Number` String, `Flight_Number_Reporting_Airline` String, `OriginAirportID` Int32, `OriginAirportSeqID` Int32, diff --git a/docs/zh/interfaces/http.md b/docs/zh/interfaces/http.md index cdce4f2f2e7..738b0365f46 100644 --- a/docs/zh/interfaces/http.md +++ b/docs/zh/interfaces/http.md @@ -407,7 +407,7 @@ $ curl -v 'http://localhost:8123/predefined_query' `query` 是一个预定义的`predefined_query_handler`查询,它由ClickHouse在匹配HTTP请求并返回查询结果时执行。这是一个必须的配置。 -以下是定义的[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_alter_threads`设置, 然后查询系统表以检查这些设置是否设置成功。 +以下是定义的[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_final_threads`设置, 然后查询系统表以检查这些设置是否设置成功。 示例: @@ -430,9 +430,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 
'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "警告" @@ -444,7 +444,7 @@ max_alter_threads 2 ClickHouse提取并执行与HTTP请求URL中的`query_param_name`值对应的值。`query_param_name`的默认值是`/query`。这是一个可选的配置。如果配置文件中没有定义,则不会传入参数。 -为了试验这个功能,示例定义了[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_alter_threads`,`queries`设置是否成功的值。 +为了试验这个功能,示例定义了[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_final_threads`,`queries`设置是否成功的值。 示例: @@ -462,9 +462,9 @@ ClickHouse提取并执行与HTTP请求URL中的`query_param_name`值对应的值 ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} diff --git a/docs/zh/operations/system-tables/columns.md b/docs/zh/operations/system-tables/columns.md index 9a90561a07b..6d4299a9056 100644 --- a/docs/zh/operations/system-tables/columns.md +++ b/docs/zh/operations/system-tables/columns.md @@ -1,29 +1,89 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.columns {#system-columns} -# 系统。列 {#system-columns} +此系统表包含所有表中列的信息。 -包含有关所有表中列的信息。 +你可以使用这个表来获得类似于 [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table) 查询的信息,但是可以同时获得多个表的信息。 -您可以使用此表获取类似于以下内容的信息 [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table) 查询,但对于多个表一次。 +[临时表](../../sql-reference/statements/create/table.md#temporary-tables)中的列只在创建它们的会话中的 `system.columns` 中才可见,并且它们的 `database` 字段显示为空。 -该 `system.columns` 表包含以下列(列类型显示在括号中): +`system.columns` 表包含以下列 (括号中显示的是列类型): -- `database` (String) — Database name. -- `table` (String) — Table name. -- `name` (String) — Column name. -- `type` (String) — Column type. -- `default_kind` (String) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`)为默认值,如果没有定义,则为空字符串。 -- `default_expression` (String) — Expression for the default value, or an empty string if it is not defined. -- `data_compressed_bytes` (UInt64) — The size of compressed data, in bytes. -- `data_uncompressed_bytes` (UInt64) — The size of decompressed data, in bytes. -- `marks_bytes` (UInt64) — The size of marks, in bytes. -- `comment` (String) — Comment on the column, or an empty string if it is not defined. -- `is_in_partition_key` (UInt8) — Flag that indicates whether the column is in the partition expression. -- `is_in_sorting_key` (UInt8) — Flag that indicates whether the column is in the sorting key expression. -- `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. -- `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. 
+- `database` ([String](../../sql-reference/data-types/string.md)) — 数据库名称。 +- `table` ([String](../../sql-reference/data-types/string.md)) — 表名。 +- `name` ([String](../../sql-reference/data-types/string.md)) — 列名。 +- `type` ([String](../../sql-reference/data-types/string.md)) — 列类型。 +- `position` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 列在表中的顺序位置,从1开始。 +- `default_kind` ([String](../../sql-reference/data-types/string.md)) — 默认值的表达式类型(`DEFAULT`, `MATERIALIZED`, `ALIAS`) ,如果没有定义,则为空字符串。 +- `default_expression` ([String](../../sql-reference/data-types/string.md)) — 默认值的表达式,如果未定义则为空字符串。 +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 压缩数据的大小,以字节为单位。 +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 解压后的数据的大小,以字节为单位。 +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 标记的大小,以字节为单位。 +- `comment` ([String](../../sql-reference/data-types/string.md)) — 列注释,如果没有定义,则为空字符串。 +- `is_in_partition_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在分区表达式中的标志。 +- `is_in_sorting_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在排序键表达式中的标志。 +- `is_in_primary_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在主键表达式中的标志。 +- `is_in_sampling_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在采样键表达式中的标志。 +- `compression_codec` ([String](../../sql-reference/data-types/string.md)) — 压缩编码的名称。 +- `character_octet_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 二进制数据、字符数据或文本数据和图像的最大长度(以字节为单位)。在 ClickHouse 中只对 `FixedString` 数据类型有意义。否则,将返回 `NULL` 值。 +- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 近似数字型数据、精确数字型数据、整数型数据或货币数据的精度。在 ClickHouse 中,对于整数类型是比特率(bitness),对于 `Decimal` 类型是十进制精度。否则,将返回 `NULL` 值。 +- `numeric_precision_radix` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 数字系统的基数是近似数字型数据、精确数字型数据、整数型数据或货币数据的精度。在 ClickHouse 中,对于整数类型是2,对于 `Decimal` 类型是10。否则,将返回 `NULL` 值。 +- `numeric_scale` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 近似数字型数据、精确数字型数据、整数型数据或货币数据的比例。在 ClickHouse 中只对 `Decimal` 类型有意义。否则,将返回 `NULL` 值。 +- `datetime_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — `DateTime64` 数据类型的小数精度。对于其他数据类型,将返回 `NULL` 值。 + +**示例** + +```sql +SELECT * FROM system.columns LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: INFORMATION_SCHEMA +table: COLUMNS +name: table_catalog +type: String +position: 1 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +character_octet_length: ᴺᵁᴸᴸ +numeric_precision: ᴺᵁᴸᴸ +numeric_precision_radix: ᴺᵁᴸᴸ +numeric_scale: ᴺᵁᴸᴸ +datetime_precision: ᴺᵁᴸᴸ + +Row 2: +────── +database: INFORMATION_SCHEMA +table: COLUMNS +name: table_schema +type: String +position: 2 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +character_octet_length: ᴺᵁᴸᴸ +numeric_precision: ᴺᵁᴸᴸ +numeric_precision_radix: ᴺᵁᴸᴸ 
+numeric_scale: ᴺᵁᴸᴸ +datetime_precision: ᴺᵁᴸᴸ +``` [原文](https://clickhouse.com/docs/zh/operations/system-tables/columns) diff --git a/docs/zh/operations/system-tables/contributors.md b/docs/zh/operations/system-tables/contributors.md index e9374a7dc9c..fd876da3594 100644 --- a/docs/zh/operations/system-tables/contributors.md +++ b/docs/zh/operations/system-tables/contributors.md @@ -1,15 +1,10 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.contributors {#system-contributors} -# 系统。贡献者 {#system-contributors} - -包含有关贡献者的信息。 该顺序在查询执行时是随机的。 +此系统表包含有关贡献者的信息。排列顺序是在查询执行时随机生成的。 列: -- `name` (String) — Contributor (author) name from git log. +- `name` (String) — git 日志中的贡献者 (作者) 名字。 **示例** @@ -32,7 +27,7 @@ SELECT * FROM system.contributors LIMIT 10 └──────────────────┘ ``` -要在表中找出自己,请使用查询: +要在表中找到你自己,请这样查询: ``` sql SELECT * FROM system.contributors WHERE name = 'Olga Khvostikova' @@ -43,3 +38,5 @@ SELECT * FROM system.contributors WHERE name = 'Olga Khvostikova' │ Olga Khvostikova │ └──────────────────┘ ``` + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/contributors) diff --git a/docs/zh/operations/system-tables/databases.md b/docs/zh/operations/system-tables/databases.md index 134b8ebc7ab..3fadb02446d 100644 --- a/docs/zh/operations/system-tables/databases.md +++ b/docs/zh/operations/system-tables/databases.md @@ -1,12 +1,39 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.databases {#system-databases} -# 系统。数据库 {#system-databases} +包含当前用户可用的数据库的相关信息。 -此表包含一个名为"字符串"的列 ‘name’ – the name of a database. +列: -服务器知道的每个数据库在表中都有相应的条目。 +- `name` ([String](../../sql-reference/data-types/string.md)) — 数据库的名称。 +- `engine` ([String](../../sql-reference/data-types/string.md)) — [数据库的引擎](../../engines/database-engines/index.md)。 +- `data_path` ([String](../../sql-reference/data-types/string.md)) — 数据的路径。 +- `metadata_path` ([String](../../sql-reference/data-types/enum.md)) — 元数据的路径。 +- `uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — 数据库的 UUID。 +- `comment` ([String](../../sql-reference/data-types/enum.md)) — 数据库的注释。 -该系统表用于实现 `SHOW DATABASES` 查询。 +这个系统表的 `name` 列被用于实现 `SHOW DATABASES` 查询。 + +**示例** + +创建一个数据库。 + +``` sql +CREATE DATABASE test; +``` + +查询此用户所有可用的数据库。 + +``` sql +SELECT * FROM system.databases; +``` + +``` text +┌─name───────────────┬─engine─┬─data_path──────────────────┬─metadata_path───────────────────────────────────────────────────────┬─uuid─────────────────────────────────┬─comment─┐ +│ INFORMATION_SCHEMA │ Memory │ /var/lib/clickhouse/ │ │ 00000000-0000-0000-0000-000000000000 │ │ +│ default │ Atomic │ /var/lib/clickhouse/store/ │ /var/lib/clickhouse/store/d31/d317b4bd-3595-4386-81ee-c2334694128a/ │ 24363899-31d7-42a0-a436-389931d752a0 │ │ +│ information_schema │ Memory │ /var/lib/clickhouse/ │ │ 00000000-0000-0000-0000-000000000000 │ │ +│ system │ Atomic │ /var/lib/clickhouse/store/ │ /var/lib/clickhouse/store/1d1/1d1c869d-e465-4b1b-a51f-be033436ebf9/ │ 03e9f3d1-cc88-4a49-83e9-f3d1cc881a49 │ │ +└────────────────────┴────────┴────────────────────────────┴─────────────────────────────────────────────────────────────────────┴──────────────────────────────────────┴─────────┘ +``` + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/databases) diff --git a/docs/zh/operations/system-tables/detached_parts.md b/docs/zh/operations/system-tables/detached_parts.md index ba35444c551..efcbb61d37e 100644 --- 
a/docs/zh/operations/system-tables/detached_parts.md +++ b/docs/zh/operations/system-tables/detached_parts.md @@ -1,14 +1,11 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.detached_parts {#system_tables-detached_parts} -# 系统。detached_parts {#system_tables-detached_parts} +包含关于 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 表的分离分区的信息。`reason` 列详细说明了该分区被分离的原因。 -包含有关分离部分的信息 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 桌子 该 `reason` 列指定分离部件的原因。 +对于用户分离的分区,原因是空的。你可以通过 [ALTER TABLE ATTACH PARTITION\|PART](../../sql-reference/statements/alter/partition.md#alter_attach-partition) 命令添加这些分区。 -对于用户分离的部件,原因是空的。 这些部件可以附加 [ALTER TABLE ATTACH PARTITION\|PART](../../sql-reference/statements/alter.md#alter_attach-partition) 指挥部 +关于其他列的描述,请参见 [system.parts](../../operations/system-tables/parts.md#system_tables-parts)。 -有关其他列的说明,请参阅 [系统。零件](../../operations/system-tables/parts.md#system_tables-parts). +如果分区名称无效,一些列的值可能是`NULL`。你可以通过[ALTER TABLE DROP DETACHED PART](../../sql-reference/statements/alter/partition.md#alter_drop-detached)来删除这些分区。 -如果部件名称无效,某些列的值可能为 `NULL`. 这些部分可以删除 [ALTER TABLE DROP DETACHED PART](../../sql-reference/statements/alter.md#alter_drop-detached). +[原文](https://clickhouse.com/docs/zh/operations/system-tables/detached_parts) diff --git a/docs/zh/operations/system-tables/disks.md b/docs/zh/operations/system-tables/disks.md index 8cd24d24550..2a6dcc9ae45 100644 --- a/docs/zh/operations/system-tables/disks.md +++ b/docs/zh/operations/system-tables/disks.md @@ -1,31 +1,27 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.disks {#system_tables-disks} -# 系统。磁盘 {#system_tables-disks} - -包含有关在定义的磁盘信息 [服务器配置](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure). +包含在 [服务器配置](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure) 中定义的磁盘信息. 列: -- `name` ([字符串](../../sql-reference/data-types/string.md)) — Name of a disk in the server configuration. -- `path` ([字符串](../../sql-reference/data-types/string.md)) — Path to the mount point in the file system. -- `free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Free space on disk in bytes. -- `total_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Disk volume in bytes. -- `keep_free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Amount of disk space that should stay free on disk in bytes. Defined in the `keep_free_space_bytes` 磁盘配置参数。 +- `name` ([字符串](../../sql-reference/data-types/string.md)) — 服务器配置中的磁盘名称. +- `path` ([字符串](../../sql-reference/data-types/string.md)) — 文件系统中挂载点的路径. +- `free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 磁盘上的可用空间,以字节为单位. +- `total_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 磁盘容量,以字节为单位。 +- `keep_free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 在磁盘上应保持空闲的磁盘空间的数量,以字节为单位。在磁盘配置的 `keep_free_space_bytes` 参数中定义。 -## 系统。storage_policies {#system_tables-storage_policies} +**示例** -包含有关存储策略和卷中定义的信息 [服务器配置](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure). 
+```sql +:) SELECT * FROM system.disks; +``` -列: +```text +┌─name────┬─path─────────────────┬───free_space─┬──total_space─┬─keep_free_space─┐ +│ default │ /var/lib/clickhouse/ │ 276392587264 │ 490652508160 │ 0 │ +└─────────┴──────────────────────┴──────────────┴──────────────┴─────────────────┘ -- `policy_name` ([字符串](../../sql-reference/data-types/string.md)) — Name of the storage policy. -- `volume_name` ([字符串](../../sql-reference/data-types/string.md)) — Volume name defined in the storage policy. -- `volume_priority` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Volume order number in the configuration. -- `disks` ([数组(字符串)](../../sql-reference/data-types/array.md)) — Disk names, defined in the storage policy. -- `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit). -- `move_factor` ([Float64](../../sql-reference/data-types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of configuration parameter, ClickHouse start to move data to the next volume in order. +1 rows in set. Elapsed: 0.001 sec. +``` -如果存储策略包含多个卷,则每个卷的信息将存储在表的单独行中。 +[原文](https://clickhouse.com/docs/zh/operations/system-tables/disks) diff --git a/docs/zh/operations/system-tables/merge_tree_settings.md b/docs/zh/operations/system-tables/merge_tree_settings.md index a6ad6f78f8e..48d9a7dd9af 100644 --- a/docs/zh/operations/system-tables/merge_tree_settings.md +++ b/docs/zh/operations/system-tables/merge_tree_settings.md @@ -1,16 +1,55 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.merge_tree_settings {#system-merge_tree_settings} -# 系统。merge_tree_settings {#system-merge_tree_settings} - -包含有关以下设置的信息 `MergeTree` 桌子 +包含 `MergeTree` 表的设置 (Setting) 信息。 列: -- `name` (String) — Setting name. -- `value` (String) — Setting value. -- `description` (String) — Setting description. -- `type` (String) — Setting type (implementation specific string value). -- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed. +- `name` (String) — 设置名称。 +- `value` (String) — 设置的值。 +- `description` (String) — 设置描述。 +- `type` (String) — 设置类型 (执行特定的字符串值)。 +- `changed` (UInt8) — 该设置是否在配置中明确定义或是明确改变。 + + +**示例** +```sql +:) SELECT * FROM system.merge_tree_settings LIMIT 4 FORMAT Vertical; +``` + +```text +Row 1: +────── +name: index_granularity +value: 8192 +changed: 0 +description: How many rows correspond to one primary key value. +type: SettingUInt64 + +Row 2: +────── +name: min_bytes_for_wide_part +value: 0 +changed: 0 +description: Minimal uncompressed size in bytes to create part in wide format instead of compact +type: SettingUInt64 + +Row 3: +────── +name: min_rows_for_wide_part +value: 0 +changed: 0 +description: Minimal number of rows to create part in wide format instead of compact +type: SettingUInt64 + +Row 4: +────── +name: merge_max_block_size +value: 8192 +changed: 0 +description: How many rows in blocks should be formed for merge operations. +type: SettingUInt64 + +4 rows in set. Elapsed: 0.001 sec. 
+``` + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/merge_tree_settings) diff --git a/docs/zh/operations/system-tables/metrics.md b/docs/zh/operations/system-tables/metrics.md index 34b7fa35681..5b5b4615f82 100644 --- a/docs/zh/operations/system-tables/metrics.md +++ b/docs/zh/operations/system-tables/metrics.md @@ -1,19 +1,14 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.metrics {#system_tables-metrics} -# 系统。指标 {#system_tables-metrics} - -包含可以立即计算或具有当前值的指标。 例如,同时处理的查询的数量或当前副本的延迟。 此表始终是最新的。 +此系统表包含可以即时计算或具有当前值的指标。例如,同时处理的查询数量或当前的复制延迟。这个表始终是最新的。 列: -- `metric` ([字符串](../../sql-reference/data-types/string.md)) — Metric name. -- `value` ([Int64](../../sql-reference/data-types/int-uint.md)) — Metric value. -- `description` ([字符串](../../sql-reference/data-types/string.md)) — Metric description. +- `metric` ([字符串](../../sql-reference/data-types/string.md)) — 指标名称. +- `value` ([Int64](../../sql-reference/data-types/int-uint.md)) — 指标的值. +- `description` ([字符串](../../sql-reference/data-types/string.md)) — 指标的描述. -支持的指标列表,您可以在 [src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CurrentMetrics.cpp) ClickHouse的源文件。 +对于支持的指标列表,您可以查看 [src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CurrentMetrics.cpp) ClickHouse 的源文件。 **示例** @@ -38,7 +33,7 @@ SELECT * FROM system.metrics LIMIT 10 **另请参阅** -- [系统。asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [系统。活动](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred. -- [系统。metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. -- [监测](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. +- [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — 包含周期性的计算指标。 +- [system.events](../../operations/system-tables/events.md#system_tables-events) — 包含发生的一些事件。 +- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — 包含`system.metrics`表和`system.events`表的历史指标值。 +- [监控](../../operations/monitoring.md) — ClickHouse 监控的基本概念。 diff --git a/docs/zh/operations/system-tables/numbers.md b/docs/zh/operations/system-tables/numbers.md index c42c87053ca..fd67baa01a5 100644 --- a/docs/zh/operations/system-tables/numbers.md +++ b/docs/zh/operations/system-tables/numbers.md @@ -1,12 +1,32 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.numbers {#system-numbers} -# 系统。数字 {#system-numbers} +这个表有一个名为 `number` 的 UInt64 列,包含了几乎所有从 0 开始的自然数。 -此表包含一个名为UInt64的列 `number` 它包含几乎所有从零开始的自然数。 +你可以用这个表进行测试,或者如果你需要进行暴力搜索。 -您可以使用此表进行测试,或者如果您需要进行暴力搜索。 +从该表的读取是不并行的。 -从此表中读取的内容不是并行的。 +**示例** + +```sql +:) SELECT * FROM system.numbers LIMIT 10; +``` + +```text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ + +10 rows in set. Elapsed: 0.001 sec. 
+``` + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/numbers) diff --git a/docs/zh/operations/system-tables/one.md b/docs/zh/operations/system-tables/one.md index a8dc64c18c7..79d2c0199d8 100644 --- a/docs/zh/operations/system-tables/one.md +++ b/docs/zh/operations/system-tables/one.md @@ -1,12 +1,23 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.one {#system-one} -# 系统。一 {#system-one} +此表包含一行只有一个值为 0 的 `dummy` UInt8 列的数据。 -此表包含一行,其中包含一行 `dummy` UInt8列包含值0。 +如果 `SELECT` 查询没有指定 `FROM` 子句,就会使用这个表来查询。 -如果使用此表 `SELECT` 查询不指定 `FROM` 条款 +这个表类似于其他数据库管理系统(DMBS)中的 `DUAL` 表。 -这类似于 `DUAL` 表在其他Dbms中找到。 +**示例** + +```sql +:) SELECT * FROM system.one LIMIT 10; +``` + +```text +┌─dummy─┐ +│ 0 │ +└───────┘ + +1 rows in set. Elapsed: 0.001 sec. +``` + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/one) diff --git a/docs/zh/operations/system-tables/parts.md b/docs/zh/operations/system-tables/parts.md index e924ee27df3..dc98288305f 100644 --- a/docs/zh/operations/system-tables/parts.md +++ b/docs/zh/operations/system-tables/parts.md @@ -1,85 +1,167 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.parts {#system_tables-parts} -# 系统。零件 {#system_tables-parts} +此系统表包含 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 表分区的相关信息。 -包含有关的部分信息 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 桌子 - -每行描述一个数据部分。 +每一行描述一个数据分区。 列: -- `partition` (String) – The partition name. To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter.md#query_language_queries_alter) 查询。 +- `partition` ([String](../../sql-reference/data-types/string.md)) – 分区名称。请参阅 [ALTER](../../sql-reference/statements/alter/index.md#query_language_queries_alter) 查询的说明,来了解什么是分区。 格式: - `YYYYMM` 用于按月自动分区。 - - `any_string` 手动分区时。 + - `any_string` 手动分区时,是其他格式的字符串。 -- `name` (`String`) – Name of the data part. +- `name` ([String](../../sql-reference/data-types/string.md)) – 数据分区的名称。 -- `active` (`UInt8`) – Flag that indicates whether the data part is active. If a data part is active, it's used in a table. Otherwise, it's deleted. Inactive data parts remain after merging. +- `part_type` ([String](../../sql-reference/data-types/string.md)) — 数据分区的存储格式。 -- `marks` (`UInt64`) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` 通过索引粒度(通常为8192)(此提示不适用于自适应粒度)。 + 可能的值: -- `rows` (`UInt64`) – The number of rows. + - `Wide` — 每一列在文件系统中的一个单独文件中存储。 + - `Compact` — 所有列在文件系统中的一个文件中存储。 -- `bytes_on_disk` (`UInt64`) – Total size of all the data part files in bytes. + 数据存储格式由 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 表的 `min_bytes_for_wide_part` 和 `min_rows_for_wide_part` 控制。 -- `data_compressed_bytes` (`UInt64`) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + - `active` ([UInt8](../../sql-reference/data-types/int-uint.md)) – 指示数据分区是否处于活动状态的标志。如果数据分区处于活动状态,则此数据正在被表使用。反之,则不活跃(deleted)。合并后仍会保留非活跃的数据分区。 -- `data_uncompressed_bytes` (`UInt64`) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. +- `marks` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 标记数。要获得数据分区中的大致行数:使用`marks`(标记数)乘以索引粒度(通常为 8192)。不适用于自适应颗粒度。 -- `marks_bytes` (`UInt64`) – The size of the file with marks. 
+- `rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 行数. -- `modification_time` (`DateTime`) – The time the directory with the data part was modified. This usually corresponds to the time of data part creation.\| +- `bytes_on_disk` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据总大小(以字节为单位)。 -- `remove_time` (`DateTime`) – The time when the data part became inactive. +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中压缩数据的总大小。不包括所有辅助文件(例如,带有标记的文件)。 -- `refcount` (`UInt32`) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中未压缩数据的总大小。不包括所有辅助文件(例如,带有标记的文件)。 -- `min_date` (`Date`) – The minimum value of the date key in the data part. +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 带有标记的文件的大小。 -- `max_date` (`Date`) – The maximum value of the date key in the data part. +- `secondary_indices_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中二级索引的压缩数据总大小。所有的辅助文件(例如,带有标记的文件)都不包括在内。 -- `min_time` (`DateTime`) – The minimum value of the date and time key in the data part. +- `secondary_indices_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中二级索引的未压缩数据的总大小。所有的辅助文件(例如,带有标记的文件)都不包括在内。 -- `max_time`(`DateTime`) – The maximum value of the date and time key in the data part. +- `secondary_indices_marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 带标记的二级索引的文件大小。 -- `partition_id` (`String`) – ID of the partition. +- `modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – 包含数据分区的目录被修改的时间。这通常对应于数据部分创建的时间。 -- `min_block_number` (`UInt64`) – The minimum number of data parts that make up the current part after merging. +- `remove_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – 数据分区变为非活动状态的时间。 -- `max_block_number` (`UInt64`) – The maximum number of data parts that make up the current part after merging. +- `refcount` ([UInt32](../../sql-reference/data-types/int-uint.md)) – 使用数据部分的位置数。大于 2 的值表示数据部分用于查询或是用于合并。 -- `level` (`UInt32`) – Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. +- `min_date` ([Date](../../sql-reference/data-types/date.md)) – 数据部分中日期键的最小值。 -- `data_version` (`UInt64`) – Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). +- `max_date` ([Date](../../sql-reference/data-types/date.md)) – 数据部分中日期键的最大值。 -- `primary_key_bytes_in_memory` (`UInt64`) – The amount of memory (in bytes) used by primary key values. +- `min_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – 数据部分中日期和时间键的最小值。 -- `primary_key_bytes_in_memory_allocated` (`UInt64`) – The amount of memory (in bytes) reserved for primary key values. +- `max_time`([DateTime](../../sql-reference/data-types/datetime.md)) – 数据部分中日期和时间键的最大值。 -- `is_frozen` (`UInt8`) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup doesn't exist. For more details, see [FREEZE PARTITION](../../sql-reference/statements/alter.md#alter_freeze-partition) +- `partition_id` ([String](../../sql-reference/data-types/string.md)) – 分区的 ID。 -- `database` (`String`) – Name of the database. 
+- `min_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 合并后构成当前部分的最小数据部分数量。 -- `table` (`String`) – Name of the table. +- `max_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 合并后构成当前部分的最大数据部分数量。 -- `engine` (`String`) – Name of the table engine without parameters. +- `level` ([UInt32](../../sql-reference/data-types/int-uint.md)) – 合并树的深度。值为 0 表示该分区是通过插入创建的,而不是通过合并创建的。 -- `path` (`String`) – Absolute path to the folder with data part files. +- `data_version` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 用于确定应将哪些订正(mutations)应用于数据部分(版本高于 `data_version` 的订正(mutations))的数字。 -- `disk` (`String`) – Name of a disk that stores the data part. +- `primary_key_bytes_in_memory` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 主键值使用的内存量(以字节为单位)。 -- `hash_of_all_files` (`String`) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) 的压缩文件。 +- `primary_key_bytes_in_memory_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 为主键值保留的内存量(以字节为单位)。 -- `hash_of_uncompressed_files` (`String`) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) 未压缩的文件(带标记的文件,索引文件等。). +- `is_frozen` ([UInt8](../../sql-reference/data-types/int-uint.md)) – 显示分区数据备份存在的标志。1,备份存在。0,备份不存在。更多细节,见 [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition)。 -- `uncompressed_hash_of_compressed_files` (`String`) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) 压缩文件中的数据,就好像它们是未压缩的。 +- `database` ([String](../../sql-reference/data-types/string.md)) – 数据库的名称。 -- `bytes` (`UInt64`) – Alias for `bytes_on_disk`. +- `table` ([String](../../sql-reference/data-types/string.md)) – 表的名称。 -- `marks_size` (`UInt64`) – Alias for `marks_bytes`. +- `engine` ([String](../../sql-reference/data-types/string.md)) – 不带参数的表引擎名称。 + +- `path` ([String](../../sql-reference/data-types/string.md)) – 包含数据部分文件的文件夹的绝对路径。 + +- `disk` ([String](../../sql-reference/data-types/string.md)) – 存储数据部分的磁盘的名称。 + +- `hash_of_all_files` ([String](../../sql-reference/data-types/string.md)) – 压缩文件的 [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128)。 + +- `hash_of_uncompressed_files` ([String](../../sql-reference/data-types/string.md)) – 未压缩文件(带有标记的文件、索引文件等)的 [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128)。 + +- `uncompressed_hash_of_compressed_files` ([String](../../sql-reference/data-types/string.md)) – 压缩文件中的数据(没有压缩时)的 [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128)。 + +- `delete_ttl_info_min` ([DateTime](../../sql-reference/data-types/datetime.md)) — [TTL DELETE 规则](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的日期和时间键的最小值。 + +- `delete_ttl_info_max` ([DateTime](../../sql-reference/data-types/datetime.md)) — [TTL DELETE 规则](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的日期和时间键的最大值。 + +- `move_ttl_info.expression` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — 表达式的数组。 每个表达式定义一个 [TTL MOVE 规则](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + + !!! 
note "警告" + 保留 `move_ttl_info.expression` 数组主要是为了向后兼容,现在检查 `TTL MOVE` 规则最简单的方法是使用 `move_ttl_info.min` 和 `move_ttl_info.max` 字段。 + +- `move_ttl_info.min` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — 日期值和时间值的数组。数组中的每个元素都描述了一个 [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的最小键值。 + +- `move_ttl_info.max` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — 日期值和时间值的数组。数组中的每个元素都描述了一个 [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的最大键值。 + +- `bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – `bytes_on_disk`的别名。 + +- `marks_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) – `marks_bytes`的别名。 + +**示例** + +``` sql +SELECT * FROM system.parts LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +partition: tuple() +name: all_1_4_1_6 +part_type: Wide +active: 1 +marks: 2 +rows: 6 +bytes_on_disk: 310 +data_compressed_bytes: 157 +data_uncompressed_bytes: 91 +secondary_indices_compressed_bytes: 58 +secondary_indices_uncompressed_bytes: 6 +secondary_indices_marks_bytes: 48 +marks_bytes: 144 +modification_time: 2020-06-18 13:01:49 +remove_time: 1970-01-01 00:00:00 +refcount: 1 +min_date: 1970-01-01 +max_date: 1970-01-01 +min_time: 1970-01-01 00:00:00 +max_time: 1970-01-01 00:00:00 +partition_id: all +min_block_number: 1 +max_block_number: 4 +level: 1 +data_version: 6 +primary_key_bytes_in_memory: 8 +primary_key_bytes_in_memory_allocated: 64 +is_frozen: 0 +database: default +table: months +engine: MergeTree +disk_name: default +path: /var/lib/clickhouse/data/default/months/all_1_4_1_6/ +hash_of_all_files: 2d0657a16d9430824d35e327fcbd87bf +hash_of_uncompressed_files: 84950cc30ba867c77a408ae21332ba29 +uncompressed_hash_of_compressed_files: 1ad78f1c6843bbfb99a2c931abe7df7d +delete_ttl_info_min: 1970-01-01 00:00:00 +delete_ttl_info_max: 1970-01-01 00:00:00 +move_ttl_info.expression: [] +move_ttl_info.min: [] +move_ttl_info.max: [] +``` + +**另请参阅** + +- [MergeTree(合并树)家族](../../engines/table-engines/mergetree-family/mergetree.md) +- [列和表的 TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/parts) diff --git a/docs/zh/operations/system-tables/settings.md b/docs/zh/operations/system-tables/settings.md index c717c8c9562..144eb0179c4 100644 --- a/docs/zh/operations/system-tables/settings.md +++ b/docs/zh/operations/system-tables/settings.md @@ -1,27 +1,22 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.settings {#system-tables-system-settings} -# 系统。设置 {#system-tables-system-settings} - -包含有关当前用户的会话设置的信息。 +包含当前用户会话设置的相关信息。 列: -- `name` ([字符串](../../sql-reference/data-types/string.md)) — Setting name. -- `value` ([字符串](../../sql-reference/data-types/string.md)) — Setting value. -- `changed` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether a setting is changed from its default value. -- `description` ([字符串](../../sql-reference/data-types/string.md)) — Short setting description. -- `min` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — Minimum value of the setting, if any is set via [制约因素](../../operations/settings/constraints-on-settings.md#constraints-on-settings). 
如果设置没有最小值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). -- `max` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — Maximum value of the setting, if any is set via [制约因素](../../operations/settings/constraints-on-settings.md#constraints-on-settings). 如果设置没有最大值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). -- `readonly` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the current user can change the setting: - - `0` — Current user can change the setting. - - `1` — Current user can't change the setting. +- `name` ([字符串](../../sql-reference/data-types/string.md)) — 设置名称。 +- `value` ([字符串](../../sql-reference/data-types/string.md)) — 设置的值。 +- `changed` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 显示该设置是否从其默认值修改。 +- `description` ([字符串](../../sql-reference/data-types/string.md)) — 该设置的简要描述。 +- `min` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — 该设置的最小值,如果有最小值,则是通过[约束](../../operations/settings/constraints-on-settings.md#constraints-on-settings)设置的。如果该设置没有最小值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). +- `max` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — 该设置的最大值, 如果有最大值,则是通过[约束](../../operations/settings/constraints-on-settings.md#constraints-on-settings)设置的。如果该设置没有最大值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). +- `readonly` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 当前用户是否可以修改该设置: + - `0` — 当前用户可以修改此设置. + - `1` — 当前用户不能修改此设置. **示例** -下面的示例演示如何获取有关名称包含的设置的信息 `min_i`. +下面的例子显示了如何获得设置名称中包含`min_i`的设置信息。 ``` sql SELECT * @@ -37,10 +32,10 @@ WHERE name LIKE '%min_i%' └─────────────────────────────────────────────┴───────────┴─────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────┴──────┴──────────┘ ``` -使用 `WHERE changed` 可以是有用的,例如,当你想检查: +比如,当你想要检查以下情况时,使用 `WHERE changed` 会很有用: -- 配置文件中的设置是否正确加载并正在使用。 -- 在当前会话中更改的设置。 +- 配置文件中的设置是否正确加载,并正在使用。 +- 在当前会话中更改过的设置。 @@ -52,4 +47,6 @@ SELECT * FROM system.settings WHERE changed AND name='load_balancing' - [设置](../../operations/settings/index.md#session-settings-intro) - [查询权限](../../operations/settings/permissions-for-queries.md#settings_readonly) -- [对设置的限制](../../operations/settings/constraints-on-settings.md) +- [对设置的约束](../../operations/settings/constraints-on-settings.md) + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/settings) diff --git a/docs/zh/operations/system-tables/storage_policies.md b/docs/zh/operations/system-tables/storage_policies.md index 550af6b2b27..e2531649493 100644 --- a/docs/zh/operations/system-tables/storage_policies.md +++ b/docs/zh/operations/system-tables/storage_policies.md @@ -1,19 +1,17 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.storage_policies {#system_tables-storage_policies} -# 系统。storage_policies {#system_tables-storage_policies} - -包含有关存储策略和卷中定义的信息 [服务器配置](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure). +包含有关 [服务器配置](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure) 中定义的存储策略和卷信息。 列: -- `policy_name` ([字符串](../../sql-reference/data-types/string.md)) — Name of the storage policy. 
-- `volume_name` ([字符串](../../sql-reference/data-types/string.md)) — Volume name defined in the storage policy. -- `volume_priority` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Volume order number in the configuration. -- `disks` ([数组(字符串)](../../sql-reference/data-types/array.md)) — Disk names, defined in the storage policy. -- `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit). -- `move_factor` ([Float64](../../sql-reference/data-types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of configuration parameter, ClickHouse start to move data to the next volume in order. +- `policy_name` ([String](../../sql-reference/data-types/string.md)) — 存储策略的名称。 +- `volume_name` ([String](../../sql-reference/data-types/string.md)) — 存储策略中定义的卷名称。 +- `volume_priority` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 配置中的卷顺序号,数据根据这个优先级填充卷,比如插入和合并期间的数据将被写入优先级较低的卷 (还需考虑其他规则: TTL, `max_data_part_size`, `move_factor`)。 +- `disks` ([Array(String)](../../sql-reference/data-types/array.md)) — 存储策略中定义的磁盘名。 +- `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 可以存储在卷磁盘上数据部分的最大大小 (0 - 不限制)。 +- `move_factor` ([Float64](../../sql-reference/data-types/float.md)) — 磁盘空闲的比率。当比率超过配置的值,ClickHouse 将把数据向下一个卷移动。 +- `prefer_not_to_merge` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 设置中 `prefer_not_to_merge` 的值. 当这个设置启用时,不允许在此卷上合并数据。这将允许控制 ClickHouse 如何与运行速度较慢的磁盘一起工作。 -如果存储策略包含多个卷,则每个卷的信息将存储在表的单独行中。 +如果存储策略包含多个卷,则每个卷的信息将在表中作为单独一行存储。 + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/storage_policies) diff --git a/docs/zh/operations/system-tables/tables.md b/docs/zh/operations/system-tables/tables.md index 6a719a92ca0..03ea9f93d26 100644 --- a/docs/zh/operations/system-tables/tables.md +++ b/docs/zh/operations/system-tables/tables.md @@ -1,58 +1,128 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.tables {#system-tables} -# 系统。表 {#system-tables} +包含服务器知道的每个表的元数据。 [分离的](../../sql-reference/statements/detach.md)表不在 `system.tables` 显示。 -包含服务器知道的每个表的元数据。 分离的表不显示在 `system.tables`。 +[临时表](../../sql-reference/statements/create/table.md#temporary-tables)只在创建它们的会话中的 `system.tables` 中才可见。它们的数据库字段显示为空,并且 `is_temporary` 标志显示为开启。 -此表包含以下列(列类型显示在括号中): +此表包含以下列 (列类型显示在括号中): -- `database` (String) — 表所在的数据库表名。 +- `database` ([String](../../sql-reference/data-types/string.md)) — 表所在的数据库名。 -- `name` (String) — 表名。 +- `name` ([String](../../sql-reference/data-types/string.md)) — 表名。 -- `engine` (String) — 表引擎名 (不包含参数)。 +- `engine` ([String](../../sql-reference/data-types/string.md)) — 表引擎名 (不包含参数)。 -- `is_temporary` (UInt8)-指示表是否是临时的标志。 +- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) - 指示表是否是临时的标志。 -- `data_path` (String)-文件系统中表数据的路径。 +- `data_path` ([String](../../sql-reference/data-types/string.md)) - 表数据在文件系统中的路径。 -- `metadata_path` (String)-文件系统中表元数据的路径。 +- `metadata_path` ([String](../../sql-reference/data-types/string.md)) - 表元数据在文件系统中的路径。 -- `metadata_modification_time` (DateTime)-表元数据的最新修改时间。 +- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - 表元数据的最新修改时间。 -- `dependencies_database` (数组(字符串))-数据库依赖关系。 +- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - 数据库依赖关系。 -- `dependencies_table` (数组(字符串))-表依赖关系 
([MaterializedView](../../engines/table-engines/special/materializedview.md) 基于当前表的表)。 +- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - 表依赖关系 (基于当前表的 [物化视图](../../engines/table-engines/special/materializedview.md) 表) 。 -- `create_table_query` (String)-用于创建表的SQL语句。 +- `create_table_query` ([String](../../sql-reference/data-types/string.md)) - 用于创建表的 SQL 语句。 -- `engine_full` (String)-表引擎的参数。 +- `engine_full` ([String](../../sql-reference/data-types/string.md)) - 表引擎的参数。 -- `partition_key` (String)-表中指定的分区键表达式。 +- `as_select` ([String](../../sql-reference/data-types/string.md)) - 视图的 `SELECT` 语句。 -- `sorting_key` (String)-表中指定的排序键表达式。 +- `partition_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的分区键表达式。 -- `primary_key` (String)-表中指定的主键表达式。 +- `sorting_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的排序键表达式。 -- `sampling_key` (String)-表中指定的采样键表达式。 +- `primary_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的主键表达式。 -- `storage_policy` (字符串)-存储策略: +- `sampling_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的采样键表达式。 + +- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - 存储策略: - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) - - [分布](../../engines/table-engines/special/distributed.md#distributed) + - [Distributed](../../engines/table-engines/special/distributed.md#distributed) -- `total_rows` (Nullable(UInt64))-总行数,如果可以快速确定表中的确切行数,否则行数为`Null`(包括底层 `Buffer` 表)。 +- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 总行数,如果无法快速确定表中的确切行数,则行数返回为 `NULL` (包括底层 `Buffer` 表) 。 -- `total_bytes` (Nullable(UInt64))-总字节数,如果可以快速确定存储表的确切字节数,否则字节数为`Null` (即**不** 包括任何底层存储)。 +- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 总字节数,如果无法快速确定存储表的确切字节数,则字节数返回为 `NULL` ( **不** 包括任何底层存储) 。 - - 如果表将数据存在磁盘上,返回实际使用的磁盘空间(压缩后)。 + - 如果表将数据存在磁盘上,返回实际使用的磁盘空间 (压缩后) 。 - 如果表在内存中存储数据,返回在内存中使用的近似字节数。 -- `lifetime_rows` (Nullbale(UInt64))-服务启动后插入的总行数(只针对`Buffer`表)。 +- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 服务启动后插入的总行数(只针对 `Buffer` 表) 。 + + +- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 服务启动后插入的总字节数(只针对 `Buffer` 表) 。 + + +- `comment` ([String](../../sql-reference/data-types/string.md)) - 表的注释。 + +- `has_own_data` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 标志,表示表本身是否在磁盘上存储数据,或者访问其他来源。 `system.tables` 表被用于 `SHOW TABLES` 的查询实现中。 +**示例** + +```sql +SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: base +name: t1 +uuid: 81b1c20a-b7c6-4116-a2ce-7583fb6b6736 +engine: MergeTree +is_temporary: 0 +data_paths: ['/var/lib/clickhouse/store/81b/81b1c20a-b7c6-4116-a2ce-7583fb6b6736/'] +metadata_path: /var/lib/clickhouse/store/461/461cf698-fd0b-406d-8c01-5d8fd5748a91/t1.sql +metadata_modification_time: 2021-01-25 19:14:32 +dependencies_database: [] +dependencies_table: [] +create_table_query: CREATE TABLE base.t1 (`n` UInt64) ENGINE = MergeTree ORDER BY n SETTINGS index_granularity = 8192 +engine_full: MergeTree ORDER BY n SETTINGS index_granularity = 8192 +as_select: SELECT database AS table_catalog +partition_key: +sorting_key: n +primary_key: 
n +sampling_key: +storage_policy: default +total_rows: 1 +total_bytes: 99 +lifetime_rows: ᴺᵁᴸᴸ +lifetime_bytes: ᴺᵁᴸᴸ +comment: +has_own_data: 0 + +Row 2: +────── +database: default +name: 53r93yleapyears +uuid: 00000000-0000-0000-0000-000000000000 +engine: MergeTree +is_temporary: 0 +data_paths: ['/var/lib/clickhouse/data/default/53r93yleapyears/'] +metadata_path: /var/lib/clickhouse/metadata/default/53r93yleapyears.sql +metadata_modification_time: 2020-09-23 09:05:36 +dependencies_database: [] +dependencies_table: [] +create_table_query: CREATE TABLE default.`53r93yleapyears` (`id` Int8, `febdays` Int8) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192 +engine_full: MergeTree ORDER BY id SETTINGS index_granularity = 8192 +as_select: SELECT name AS catalog_name +partition_key: +sorting_key: id +primary_key: id +sampling_key: +storage_policy: default +total_rows: 2 +total_bytes: 155 +lifetime_rows: ᴺᵁᴸᴸ +lifetime_bytes: ᴺᵁᴸᴸ +comment: +has_own_data: 0 +``` + + [原文](https://clickhouse.com/docs/zh/operations/system-tables/tables) diff --git a/docs/zh/sql-reference/functions/time-window-functions.md b/docs/zh/sql-reference/functions/time-window-functions.md new file mode 100644 index 00000000000..ab28a47ad55 --- /dev/null +++ b/docs/zh/sql-reference/functions/time-window-functions.md @@ -0,0 +1,112 @@ +--- +toc_priority: 68 +toc_title: 时间窗口 +--- + +# 时间窗口函数 {#time-window-han-shu} + +时间窗口函数用于获取窗口的起始(包含边界)和结束时间(不包含边界)。系统支持的时间窗口函数如下: + +## tumble {#time-window-functions-tumble} + +tumble窗口是连续的、不重叠的固定大小(`interval`)时间窗口。 + +``` sql +tumble(time_attr, interval [, timezone]) +``` + +**参数** +- `time_attr` - [DateTime](../../sql-reference/data-types/datetime.md)类型的时间数据。 +- `interval` - [Interval](../../sql-reference/data-types/special-data-types/interval.md)类型的窗口大小。 +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) 类型的时区(可选参数). 
+ +**返回值** + +- tumble窗口的开始(包含边界)和结束时间(不包含边界) + +类型: `Tuple(DateTime, DateTime)` + +**示例** + +查询: + +``` sql +SELECT tumble(now(), toIntervalDay('1')) +``` + +结果: + +``` text +┌─tumble(now(), toIntervalDay('1'))─────────────┐ +│ ['2020-01-01 00:00:00','2020-01-02 00:00:00'] │ +└───────────────────────────────────────────────┘ +``` + +## hop {#time-window-functions-hop} + +hop窗口是一个固定大小(`window_interval`)的时间窗口,并按照一个固定的滑动间隔(`hop_interval`)滑动。当滑动间隔小于窗口大小时,滑动窗口间存在重叠,此时一个数据可能存在于多个窗口。 + +``` sql +hop(time_attr, hop_interval, window_interval [, timezone]) +``` + +**参数** + +- `time_attr` - [DateTime](../../sql-reference/data-types/datetime.md)类型的时间数据。 +- `hop_interval` - [Interval](../../sql-reference/data-types/special-data-types/interval.md)类型的滑动间隔,需要大于0。 +- `window_interval` - [Interval](../../sql-reference/data-types/special-data-types/interval.md)类型的窗口大小,需要大于0。 +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) 类型的时区(可选参数)。 + +**返回值** + +- hop窗口的开始(包含边界)和结束时间(不包含边界)。由于一个数据可能存在于多个窗口,脱离window view单独调用该函数时只返回第一个窗口数据。 + +类型: `Tuple(DateTime, DateTime)` + +**示例** + +查询: + +``` sql +SELECT hop(now(), INTERVAL '1' SECOND, INTERVAL '2' SECOND) +``` + +结果: + +``` text +┌─hop(now(), toIntervalSecond('1'), toIntervalSecond('2'))──┐ +│ ('2020-01-14 16:58:22','2020-01-14 16:58:24') │ +└───────────────────────────────────────────────────────────┘ +``` + +## tumbleStart {#time-window-functions-tumblestart} + +返回tumble窗口的开始时间(包含边界)。 + +``` sql +tumbleStart(time_attr, interval [, timezone]); +``` + +## tumbleEnd {#time-window-functions-tumbleend} + +返回tumble窗口的结束时间(不包含边界)。 + +``` sql +tumbleEnd(time_attr, interval [, timezone]); +``` + +## hopStart {#time-window-functions-hopstart} + +返回hop窗口的开始时间(包含边界)。 + +``` sql +hopStart(time_attr, hop_interval, window_interval [, timezone]); +``` + +## hopEnd {#time-window-functions-hopend} + +返回hop窗口的结束时间(不包含边界)。 + +``` sql +hopEnd(time_attr, hop_interval, window_interval [, timezone]); +``` \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/create/view.md b/docs/zh/sql-reference/statements/create/view.md index da69860f068..506f1717b03 100644 --- a/docs/zh/sql-reference/statements/create/view.md +++ b/docs/zh/sql-reference/statements/create/view.md @@ -5,7 +5,7 @@ toc_title: VIEW # CREATE VIEW {#create-view} -创建一个新视图。 有两种类型的视图:普通视图和物化视图。 +创建一个新视图。 有两种类型的视图:普通视图,物化视图,Live视图和Window视图。 ## Normal {#normal} @@ -241,3 +241,120 @@ Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table defa - 使用定期刷新从系统表中查看指标。 [原始文章](https://clickhouse.com/docs/en/sql-reference/statements/create/view/) + +## Window View [Experimental] {#window-view} + +!!! important "重要" + 这是一项试验性功能,可能会在未来版本中以向后不兼容的方式进行更改。 + 通过[allow_experimental_window_view](../../../operations/settings/settings.md#allow-experimental-window-view)启用window view以及`WATCH`语句。输入命令 + `set allow_experimental_window_view = 1`。 + +``` sql +CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [ENGINE = engine] [WATERMARK = strategy] [ALLOWED_LATENESS = interval_function] AS SELECT ... 
GROUP BY time_window_function +``` + +Window view可以通过时间窗口聚合数据,并在满足窗口触发条件时自动触发对应窗口计算。其通过将计算状态保存降低处理延迟,支持将处理结果输出至目标表或通过`WATCH`语句输出至终端。 + +创建window view的方式和创建物化视图类似。Window view使用默认为`AggregatingMergeTree`的内部存储引擎存储计算中间状态。 + +### 时间窗口函数 {#window-view-shi-jian-chuang-kou-han-shu} + +[时间窗口函数](../../functions/time-window-functions.md)用于获取窗口的起始和结束时间。Window view需要和时间窗口函数配合使用。 + +### 时间属性 {#window-view-shi-jian-shu-xing} + +Window view 支持**处理时间**和**事件时间**两种时间类型。 + +**处理时间**为默认时间类型,该模式下window view使用本地机器时间计算窗口数据。“处理时间”时间类型计算简单,但具有不确定性。该模式下时间可以为时间窗口函数的第一个参数`time_attr`,或通过函数`now()`使用当前机器时间。下面的例子展示了使用“处理时间”创建window view的例子。 + +``` sql +CREATE WINDOW VIEW wv AS SELECT count(number), tumbleStart(w_id) as w_start from date GROUP BY tumble(now(), INTERVAL '5' SECOND) as w_id +``` + +**事件时间** 是事件真实发生的时间,该时间往往在事件发生时便嵌入数据记录。事件时间处理提供较高的确定性,可以处理乱序数据以及迟到数据。Window view通过水位线(`WATERMARK`)启用事件时间处理。 + +Window view提供如下三种水位线策略: + +* `STRICTLY_ASCENDING`: 提交观测到的最大时间作为水位线,小于最大观测时间的数据不算迟到。 +* `ASCENDING`: 提交观测到的最大时间减1作为水位线。小于或等于最大观测时间的数据不算迟到。 +* `BOUNDED`: WATERMARK=INTERVAL. 提交最大观测时间减去固定间隔(`INTERVAL`)做为水位线。 + +以下为使用`WATERMARK`创建window view的示例: + +``` sql +CREATE WINDOW VIEW wv WATERMARK=STRICTLY_ASCENDING AS SELECT count(number) FROM date GROUP BY tumble(timestamp, INTERVAL '5' SECOND); +CREATE WINDOW VIEW wv WATERMARK=ASCENDING AS SELECT count(number) FROM date GROUP BY tumble(timestamp, INTERVAL '5' SECOND); +CREATE WINDOW VIEW wv WATERMARK=INTERVAL '3' SECOND AS SELECT count(number) FROM date GROUP BY tumble(timestamp, INTERVAL '5' SECOND); +``` + +通常,窗口会在水位线到达时触发,水位线到达之后的数据会被丢弃。Window view可以通过设置`ALLOWED_LATENESS=INTERVAL`来开启迟到消息处理。示例如下: + +``` sql +CREATE WINDOW VIEW test.wv TO test.dst WATERMARK=ASCENDING ALLOWED_LATENESS=INTERVAL '2' SECOND AS SELECT count(a) AS count, tumbleEnd(wid) AS w_end FROM test.mt GROUP BY tumble(timestamp, INTERVAL '5' SECOND) AS wid; +``` + +需要注意的是,迟到消息需要更新之前的处理结果。与在窗口结束时触发不同,迟到消息到达时window view会立即触发计算。因此,会导致同一个窗口输出多次计算结果。用户需要注意这种情况,并消除重复结果。 + +### 新窗口监控 {#window-view-xin-chuang-kou-jian-kong} + +Window view可以通过`WATCH`语句将处理结果推送至终端,或通过`TO`语句将结果推送至数据表。 + +``` sql +WATCH [db.]name [LIMIT n] +``` + +`WATCH`语句和`LIVE VIEW`中的类似。支持设置`LIMIT`参数,输出消息数目达到`LIMIT`限制时结束查询。 + +### 设置 {#window-view-she-zhi} + +- `window_view_clean_interval`: window view清除过期数据间隔(单位为秒)。系统会定期清除过期数据,尚未触发的窗口数据不会被清除。 +- `window_view_heartbeat_interval`: 用于判断watch查询活跃的心跳时间间隔。 + +### 示例 {#window-view-shi-li} + +假设我们需要每10秒统计一次`data`表中的点击日志,且`data`表的结构如下: + +``` sql +CREATE TABLE data ( `id` UInt64, `timestamp` DateTime) ENGINE = Memory; +``` + +首先,使用10秒大小的tumble函数创建window view。 + +``` sql +CREATE WINDOW VIEW wv as select count(id), tumbleStart(w_id) as window_start from data group by tumble(timestamp, INTERVAL '10' SECOND) as w_id +``` + +随后,我们使用`WATCH`语句获取计算结果。 + +``` sql +WATCH wv +``` + +当日志插入表`data`时, + +``` sql +INSERT INTO data VALUES(1,now()) +``` + +`WATCH`语句会输出如下结果: + +``` text +┌─count(id)─┬────────window_start─┐ +│ 1 │ 2020-01-14 16:56:40 │ +└───────────┴─────────────────────┘ +``` + +或者,我们可以通过`TO`关键字将处理结果输出至另一张表。 + +``` sql +CREATE WINDOW VIEW wv TO dst AS SELECT count(id), tumbleStart(w_id) as window_start FROM data GROUP BY tumble(timestamp, INTERVAL '10' SECOND) as w_id +``` + +ClickHouse测试中提供了更多的示例(以`*window_view*`命名)。 + +### Window View 使用场景 {#window-view-shi-yong-chang-jing} + +Window view 在以下场景有用: + +* **监控**: 以时间维度聚合及处理数据,并将处理结果输出至目标表。用户可通过目标表获取并操作计算结果。 +* **分析**: 以时间维度进行数据分析. 
当数据源非常庞大时,window view可以减少重复全表查询的计算量。 diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 383b9bb5e52..e01677aaac6 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -20,9 +20,7 @@ #include #include -#if !defined(ARCADIA_BUILD) -# include -#endif +#include #include #include #include @@ -705,6 +703,12 @@ bool Client::processWithFuzzing(const String & full_query) throw; } + if (!orig_ast) + { + // Can't continue after a parsing error + return true; + } + // `USE db` should not be executed // since this will break every query after `DROP db` if (orig_ast->as()) @@ -712,12 +716,6 @@ bool Client::processWithFuzzing(const String & full_query) return true; } - if (!orig_ast) - { - // Can't continue after a parsing error - return true; - } - // Don't repeat: // - INSERT -- Because the tables may grow too big. // - CREATE -- Because first we run the unmodified query, it will succeed, diff --git a/programs/copier/ClusterCopier.cpp b/programs/copier/ClusterCopier.cpp index 536bb37199d..4d491a06795 100644 --- a/programs/copier/ClusterCopier.cpp +++ b/programs/copier/ClusterCopier.cpp @@ -46,7 +46,7 @@ void ClusterCopier::init() reloadTaskDescription(); task_cluster->loadTasks(*task_cluster_current_config); - getContext()->setClustersConfig(task_cluster_current_config, task_cluster->clusters_prefix); + getContext()->setClustersConfig(task_cluster_current_config, false, task_cluster->clusters_prefix); /// Set up shards and their priority task_cluster->random_engine.seed(task_cluster->random_device()); diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index a017355cfdd..706e273e2b4 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 4dadef911d7..d144b4d332e 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -22,10 +23,8 @@ #include #include -#if !defined(ARCADIA_BUILD) -# include "config_core.h" -# include "Common/config_version.h" -#endif +#include "config_core.h" +#include "Common/config_version.h" #if USE_SSL # include @@ -381,11 +380,11 @@ int Keeper::main(const std::vector & /*args*/) socket.setReceiveTimeout(settings.receive_timeout); socket.setSendTimeout(settings.send_timeout); servers->emplace_back( + listen_host, port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, false), server_pool, socket, new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections to Keeper (tcp): {}", address.toString()); + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, false), server_pool, socket)); }); const char * secure_port_name = "keeper_server.tcp_port_secure"; @@ -397,10 +396,11 @@ int Keeper::main(const std::vector & /*args*/) socket.setReceiveTimeout(settings.receive_timeout); socket.setSendTimeout(settings.send_timeout); servers->emplace_back( + listen_host, secure_port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, true), server_pool, socket, new Poco::Net::TCPServerParams)); - LOG_INFO(log, "Listening for connections to Keeper with secure protocol (tcp_secure): {}", address.toString()); + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, true), server_pool, socket)); #else 
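            /// In builds without NetSSL the secure Keeper listener cannot be created at all;
            /// UNUSED(port) only silences the unused-parameter warning before the exception below is thrown.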
UNUSED(port); throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", @@ -410,7 +410,10 @@ int Keeper::main(const std::vector & /*args*/) } for (auto & server : *servers) + { server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } zkutil::EventPtr unused_event = std::make_shared(); zkutil::ZooKeeperNodeCache unused_cache([] { return nullptr; }); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 1f27072f142..aa4747636c9 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -313,11 +313,11 @@ void LocalServer::cleanup() std::string LocalServer::getInitialCreateTableQuery() { - if (!config().has("table-structure")) + if (!config().has("table-structure") && !config().has("table-file")) return {}; auto table_name = backQuoteIfNeed(config().getString("table-name", "table")); - auto table_structure = config().getString("table-structure"); + auto table_structure = config().getString("table-structure", "auto"); auto data_format = backQuoteIfNeed(config().getString("table-data-format", "TSV")); String table_file; @@ -332,7 +332,12 @@ std::string LocalServer::getInitialCreateTableQuery() table_file = quoteString(config().getString("table-file")); } - return fmt::format("CREATE TABLE {} ({}) ENGINE = File({}, {});", + if (table_structure == "auto") + table_structure = ""; + else + table_structure = "(" + table_structure + ")"; + + return fmt::format("CREATE TABLE {} {} ENGINE = File({}, {});", table_name, table_structure, data_format, table_file); } @@ -388,12 +393,6 @@ void LocalServer::setupUsers() } -String LocalServer::getQueryTextPrefix() -{ - return getInitialCreateTableQuery(); -} - - void LocalServer::connect() { connection_parameters = ConnectionParameters(config()); @@ -428,7 +427,7 @@ try #else is_interactive = stdin_is_a_tty && (config().hasOption("interactive") - || (!config().has("query") && !config().has("table-structure") && queries_files.empty())); + || (!config().has("query") && !config().has("table-structure") && queries_files.empty() && !config().has("table-file"))); #endif if (!is_interactive) { @@ -463,6 +462,10 @@ try } #endif + String initial_query = getInitialCreateTableQuery(); + if (!initial_query.empty()) + processQueryText(initial_query); + if (is_interactive && !delayed_interactive) { runInteractive(); @@ -729,7 +732,6 @@ void LocalServer::printHelpMessage([[maybe_unused]] const OptionsDescription & o void LocalServer::addOptions(OptionsDescription & options_description) { options_description.main_description->add_options() - ("database,d", po::value(), "database") ("table,N", po::value(), "name of the initial table") /// If structure argument is omitted then initial query is not generated @@ -795,9 +797,9 @@ void LocalServer::processOptions(const OptionsDescription &, const CommandLineOp int mainEntryClickHouseLocal(int argc, char ** argv) { - DB::LocalServer app; try { + DB::LocalServer app; app.init(argc, argv); return app.run(); } diff --git a/programs/local/LocalServer.h b/programs/local/LocalServer.h index ce0df06c86a..06e3746eacd 100644 --- a/programs/local/LocalServer.h +++ b/programs/local/LocalServer.h @@ -37,7 +37,6 @@ protected: void processError(const String & query) const override; String getName() const override { return "local"; } - String getQueryTextPrefix() override; void printHelpMessage(const OptionsDescription & options_description) override; void addOptions(OptionsDescription & options_description) 
override; diff --git a/programs/main.cpp b/programs/main.cpp index cd416f57982..2cdda075ca7 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -13,9 +13,7 @@ #include #include /// pair -#if !defined(ARCADIA_BUILD) -# include "config_tools.h" -#endif +#include "config_tools.h" #include #include diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index caccc726923..947e7ab1768 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/programs/odbc-bridge/ODBCBlockOutputStream.cpp b/programs/odbc-bridge/ODBCBlockOutputStream.cpp index 710614130c3..1c28da2a072 100644 --- a/programs/odbc-bridge/ODBCBlockOutputStream.cpp +++ b/programs/odbc-bridge/ODBCBlockOutputStream.cpp @@ -1,13 +1,8 @@ #include "ODBCBlockOutputStream.h" -#include #include -#include -#include -#include -#include "getIdentifierQuote.h" -#include -#include +#include +#include #include #include @@ -45,7 +40,7 @@ void ODBCSink::consume(Chunk chunk) std::string query = getInsertQuery(db_name, table_name, block.getColumnsWithTypeAndName(), quoting) + values_buf.str(); execute(connection_holder, - [&](nanodbc::connection & connection) { execute(connection, query); }); + [&](nanodbc::connection & connection) { execute(connection, query); }); } } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 1af458a3b49..ca1f3957f7f 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ #include "MetricsTransmitter.h" #include #include +#include #include #include #include @@ -85,10 +87,8 @@ #include #include -#if !defined(ARCADIA_BUILD) -# include "config_core.h" -# include "Common/config_version.h" -#endif +#include "config_core.h" +#include "Common/config_version.h" #if defined(OS_LINUX) # include @@ -99,7 +99,7 @@ #endif #if USE_SSL -# if USE_INTERNAL_SSL_LIBRARY && !defined(ARCADIA_BUILD) +# if USE_INTERNAL_SSL_LIBRARY # include # endif # include @@ -132,6 +132,11 @@ namespace CurrentMetrics extern const Metric MaxPushedDDLEntryID; } +namespace ProfileEvents +{ + extern const Event MainConfigLoads; +} + namespace fs = std::filesystem; #if USE_JEMALLOC @@ -349,16 +354,53 @@ Poco::Net::SocketAddress Server::socketBindListen(Poco::Net::ServerSocket & sock return address; } -void Server::createServer(const std::string & listen_host, const char * port_name, bool listen_try, CreateServerFunc && func) const +std::vector getListenHosts(const Poco::Util::AbstractConfiguration & config) +{ + auto listen_hosts = DB::getMultipleValuesFromConfig(config, "", "listen_host"); + if (listen_hosts.empty()) + { + listen_hosts.emplace_back("::1"); + listen_hosts.emplace_back("127.0.0.1"); + } + return listen_hosts; +} + +bool getListenTry(const Poco::Util::AbstractConfiguration & config) +{ + bool listen_try = config.getBool("listen_try", false); + if (!listen_try) + listen_try = DB::getMultipleValuesFromConfig(config, "", "listen_host").empty(); + return listen_try; +} + + +void Server::createServer( + Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + bool listen_try, + bool start_server, + std::vector & servers, + CreateServerFunc && func) const { /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. 
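    /// An empty value for the port entry (as opposed to an absent one) is also treated as "not configured",
    /// which allows a listener to be disabled at runtime: clear the port in the configuration and
    /// updateServers() will stop the corresponding server on the next config reload.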
- if (!config().has(port_name)) + if (config.getString(port_name, "").empty()) return; - auto port = config().getInt(port_name); + /// If we already have an active server for this listen_host/port_name, don't create it again + for (const auto & server : servers) + if (!server.isStopping() && server.getListenHost() == listen_host && server.getPortName() == port_name) + return; + + auto port = config.getInt(port_name); try { - func(port); + servers.push_back(func(port)); + if (start_server) + { + servers.back().start(); + LOG_INFO(&logger(), "Listening for {}", servers.back().getDescription()); + } global_context->registerServerPort(port_name, port); } catch (const Poco::Exception &) @@ -520,6 +562,27 @@ if (ThreadFuzzer::instance().isEffective()) config().getUInt("thread_pool_queue_size", 10000) ); + Poco::ThreadPool server_pool(3, config().getUInt("max_connections", 1024)); + std::mutex servers_lock; + std::vector servers; + std::vector servers_to_start_before_tables; + /// This object will periodically calculate some metrics. + AsynchronousMetrics async_metrics( + global_context, config().getUInt("asynchronous_metrics_update_period_s", 1), + [&]() -> std::vector + { + std::vector metrics; + metrics.reserve(servers_to_start_before_tables.size()); + for (const auto & server : servers_to_start_before_tables) + metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); + + std::lock_guard lock(servers_lock); + for (const auto & server : servers) + metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); + return metrics; + } + ); + ConnectionCollector::init(global_context, config().getUInt("max_threads_for_connection_collector", 10)); bool has_zookeeper = config().has("zookeeper"); @@ -847,7 +910,7 @@ if (ThreadFuzzer::instance().isEffective()) // in a lot of places. For now, disable updating log configuration without server restart. 
//setTextLog(global_context->getTextLog()); updateLevels(*config, logger()); - global_context->setClustersConfig(config); + global_context->setClustersConfig(config, has_zookeeper); global_context->setMacros(std::make_unique(*config, "macros", log)); global_context->setExternalAuthenticatorsConfig(*config); @@ -865,6 +928,12 @@ if (ThreadFuzzer::instance().isEffective()) if (config->has("max_concurrent_queries")) global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0)); + if (config->has("max_concurrent_insert_queries")) + global_context->getProcessList().setMaxInsertQueriesAmount(config->getInt("max_concurrent_insert_queries", 0)); + + if (config->has("max_concurrent_select_queries")) + global_context->getProcessList().setMaxSelectQueriesAmount(config->getInt("max_concurrent_select_queries", 0)); + if (config->has("keeper_server")) global_context->updateKeeperConfiguration(*config); @@ -876,12 +945,17 @@ if (ThreadFuzzer::instance().isEffective()) global_context->reloadZooKeeperIfChanged(config); global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config); + + std::lock_guard lock(servers_lock); + updateServers(*config, server_pool, async_metrics, servers); } global_context->updateStorageConfiguration(*config); global_context->updateInterserverCredentials(*config); CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, "encryption_codecs"); + + ProfileEvents::increment(ProfileEvents::MainConfigLoads); }, /* already_loaded = */ false); /// Reload it right now (initial loading) @@ -993,24 +1067,8 @@ if (ThreadFuzzer::instance().isEffective()) /// try set up encryption. There are some errors in config, error will be printed and server wouldn't start. CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs"); - Poco::Timespan keep_alive_timeout(config().getUInt("keep_alive_timeout", 10), 0); - - Poco::ThreadPool server_pool(3, config().getUInt("max_connections", 1024)); - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(settings.http_receive_timeout); - http_params->setKeepAliveTimeout(keep_alive_timeout); - - auto servers_to_start_before_tables = std::make_shared>(); - - std::vector listen_hosts = DB::getMultipleValuesFromConfig(config(), "", "listen_host"); - - bool listen_try = config().getBool("listen_try", false); - if (listen_hosts.empty()) - { - listen_hosts.emplace_back("::1"); - listen_hosts.emplace_back("127.0.0.1"); - listen_try = true; - } + const auto listen_hosts = getListenHosts(config()); + const auto listen_try = getListenTry(config()); if (config().has("keeper_server")) { @@ -1033,39 +1091,46 @@ if (ThreadFuzzer::instance().isEffective()) { /// TCP Keeper const char * port_name = "keeper_server.tcp_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers_to_start_before_tables->emplace_back( - port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, false), server_pool, socket, new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections to Keeper (tcp): {}", address.toString()); - }); + createServer( + config(), listen_host, port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket 
socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, false), server_pool, socket)); + }); const char * secure_port_name = "keeper_server.tcp_port_secure"; - createServer(listen_host, secure_port_name, listen_try, [&](UInt16 port) - { + createServer( + config(), listen_host, secure_port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { #if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers_to_start_before_tables->emplace_back( - secure_port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, true), server_pool, socket, new Poco::Net::TCPServerParams)); - LOG_INFO(log, "Listening for connections to Keeper with secure protocol (tcp_secure): {}", address.toString()); + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + secure_port_name, + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, true), server_pool, socket)); #else - UNUSED(port); - throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; #endif - }); + }); } #else throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); @@ -1073,14 +1138,19 @@ if (ThreadFuzzer::instance().isEffective()) } - for (auto & server : *servers_to_start_before_tables) + for (auto & server : servers_to_start_before_tables) + { server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } SCOPE_EXIT({ /// Stop reloading of the main config. This must be done before `global_context->shutdown()` because /// otherwise the reloading may pass a changed config to some destroyed parts of ContextSharedPart. main_config_reloader.reset(); + async_metrics.stop(); + /** Ask to cancel background jobs all table engines, * and also query_log. 
* It is important to do early, not in destructor of Context, because @@ -1092,11 +1162,11 @@ if (ThreadFuzzer::instance().isEffective()) LOG_DEBUG(log, "Shut down storages."); - if (!servers_to_start_before_tables->empty()) + if (!servers_to_start_before_tables.empty()) { LOG_DEBUG(log, "Waiting for current connections to servers for tables to finish."); int current_connections = 0; - for (auto & server : *servers_to_start_before_tables) + for (auto & server : servers_to_start_before_tables) { server.stop(); current_connections += server.currentConnections(); @@ -1108,7 +1178,7 @@ if (ThreadFuzzer::instance().isEffective()) LOG_INFO(log, "Closed all listening sockets."); if (current_connections > 0) - current_connections = waitServersToFinish(*servers_to_start_before_tables, config().getInt("shutdown_wait_unfinished", 5)); + current_connections = waitServersToFinish(servers_to_start_before_tables, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) LOG_INFO(log, "Closed connections to servers for tables. But {} remain. Probably some tables of other users cannot finish their connections after context shutdown.", current_connections); @@ -1262,223 +1332,18 @@ if (ThreadFuzzer::instance().isEffective()) LOG_INFO(log, "TaskStats is not implemented for this OS. IO accounting will be disabled."); #endif - auto servers = std::make_shared>(); { - /// This object will periodically calculate some metrics. - AsynchronousMetrics async_metrics( - global_context, config().getUInt("asynchronous_metrics_update_period_s", 1), servers_to_start_before_tables, servers); attachSystemTablesAsync(global_context, *DatabaseCatalog::instance().getSystemDatabase(), async_metrics); - for (const auto & listen_host : listen_hosts) { - /// HTTP - const char * port_name = "http_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - - servers->emplace_back( - port_name, - std::make_unique( - context(), createHandlerFactory(*this, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params)); - - LOG_INFO(log, "Listening for http://{}", address.toString()); - }); - - /// HTTPS - port_name = "https_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), createHandlerFactory(*this, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params)); - - LOG_INFO(log, "Listening for https://{}", address.toString()); -#else - UNUSED(port); - throw Exception{"HTTPS protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - }); - - /// TCP - port_name = "tcp_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ false), 
- server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections with native protocol (tcp): {}", address.toString()); - }); - - /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt - port_name = "tcp_with_proxy_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ true), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections with native protocol (tcp) with PROXY: {}", address.toString()); - }); - - /// TCP with SSL - port_name = "tcp_port_secure"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new TCPHandlerFactory(*this, /* secure */ true, /* proxy protocol */ false), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - LOG_INFO(log, "Listening for connections with secure native protocol (tcp_secure): {}", address.toString()); -#else - UNUSED(port); - throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - }); - - /// Interserver IO HTTP - port_name = "interserver_http_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), - createHandlerFactory(*this, async_metrics, "InterserverIOHTTPHandler-factory"), - server_pool, - socket, - http_params)); - - LOG_INFO(log, "Listening for replica communication (interserver): http://{}", address.toString()); - }); - - port_name = "interserver_https_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), - createHandlerFactory(*this, async_metrics, "InterserverIOHTTPSHandler-factory"), - server_pool, - socket, - http_params)); - - LOG_INFO(log, "Listening for secure replica communication (interserver): https://{}", address.toString()); -#else - UNUSED(port); - throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - }); - - port_name = "mysql_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan()); - 
socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new MySQLHandlerFactory(*this), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for MySQL compatibility protocol: {}", address.toString()); - }); - - port_name = "postgresql_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan()); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new PostgreSQLHandlerFactory(*this), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for PostgreSQL compatibility protocol: " + address.toString()); - }); - -#if USE_GRPC - port_name = "grpc_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::SocketAddress server_address(listen_host, port); - servers->emplace_back(port_name, std::make_unique(*this, makeSocketAddress(listen_host, port, log))); - LOG_INFO(log, "Listening for gRPC protocol: " + server_address.toString()); - }); -#endif - - /// Prometheus (if defined and not setup yet with http_port) - port_name = "prometheus.port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), - createHandlerFactory(*this, async_metrics, "PrometheusHandler-factory"), - server_pool, - socket, - http_params)); - - LOG_INFO(log, "Listening for Prometheus: http://{}", address.toString()); - }); + std::lock_guard lock(servers_lock); + createServers(config(), listen_hosts, listen_try, server_pool, async_metrics, servers); + if (servers.empty()) + throw Exception( + "No servers started (add valid listen_host and 'tcp_port' or 'http_port' to configuration file.)", + ErrorCodes::NO_ELEMENTS_IN_CONFIG); } - if (servers->empty()) - throw Exception("No servers started (add valid listen_host and 'tcp_port' or 'http_port' to configuration file.)", - ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - /// Must be done after initialization of `servers`, because async_metrics will access `servers` variable from its thread. async_metrics.start(); { @@ -1557,9 +1422,24 @@ if (ThreadFuzzer::instance().isEffective()) &CurrentMetrics::MaxDDLEntryID, &CurrentMetrics::MaxPushedDDLEntryID)); } - for (auto & server : *servers) - server.start(); - LOG_INFO(log, "Ready for connections."); + { + std::lock_guard lock(servers_lock); + for (auto & server : servers) + { + server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } + LOG_INFO(log, "Ready for connections."); + } + + try + { + global_context->startClusterDiscovery(); + } + catch (...) 
+ { + tryLogCurrentException(log, "Caught exception while starting cluster discovery"); + } SCOPE_EXIT_SAFE({ LOG_DEBUG(log, "Received termination signal."); @@ -1568,10 +1448,13 @@ if (ThreadFuzzer::instance().isEffective()) is_cancelled = true; int current_connections = 0; - for (auto & server : *servers) { - server.stop(); - current_connections += server.currentConnections(); + std::lock_guard lock(servers_lock); + for (auto & server : servers) + { + server.stop(); + current_connections += server.currentConnections(); + } } if (current_connections) @@ -1584,7 +1467,7 @@ if (ThreadFuzzer::instance().isEffective()) global_context->getProcessList().killAllQueries(); if (current_connections) - current_connections = waitServersToFinish(*servers, config().getInt("shutdown_wait_unfinished", 5)); + current_connections = waitServersToFinish(servers, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) LOG_INFO(log, "Closed connections. But {} remain." @@ -1620,4 +1503,273 @@ if (ThreadFuzzer::instance().isEffective()) return Application::EXIT_OK; } + +void Server::createServers( + Poco::Util::AbstractConfiguration & config, + const std::vector & listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers) +{ + const Settings & settings = global_context->getSettingsRef(); + + Poco::Timespan keep_alive_timeout(config.getUInt("keep_alive_timeout", 10), 0); + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(settings.http_receive_timeout); + http_params->setKeepAliveTimeout(keep_alive_timeout); + + for (const auto & listen_host : listen_hosts) + { + /// HTTP + const char * port_name = "http_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + + return ProtocolServerAdapter( + listen_host, + port_name, + "http://" + address.toString(), + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params)); + }); + + /// HTTPS + port_name = "https_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "https://" + address.toString(), + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params)); +#else + UNUSED(port); + throw Exception{"HTTPS protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }); + + /// TCP + port_name = "tcp_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + 
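            /// The adapter bundles the listen host, the port name and a human-readable description with the
            /// underlying Poco server, so that start() can log "Listening for {}" and updateServers() can match
            /// this listener against a reloaded configuration by host and port.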
return ProtocolServerAdapter( + listen_host, + port_name, + "native protocol (tcp): " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ false), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }); + + /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt + port_name = "tcp_with_proxy_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "native protocol (tcp) with PROXY: " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ true), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }); + + /// TCP with SSL + port_name = "tcp_port_secure"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "secure native protocol (tcp_secure): " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ true, /* proxy protocol */ false), + server_pool, + socket, + new Poco::Net::TCPServerParams)); +#else + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }); + + /// Interserver IO HTTP + port_name = "interserver_http_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "replica communication (interserver): http://" + address.toString(), + std::make_unique( + context(), + createHandlerFactory(*this, async_metrics, "InterserverIOHTTPHandler-factory"), + server_pool, + socket, + http_params)); + }); + + port_name = "interserver_https_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "secure replica communication (interserver): https://" + address.toString(), + std::make_unique( + context(), + createHandlerFactory(*this, async_metrics, "InterserverIOHTTPSHandler-factory"), + server_pool, + socket, + http_params)); +#else + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }); + + port_name = 
"mysql_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "MySQL compatibility protocol: " + address.toString(), + std::make_unique(new MySQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams)); + }); + + port_name = "postgresql_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "PostgreSQL compatibility protocol: " + address.toString(), + std::make_unique(new PostgreSQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams)); + }); + +#if USE_GRPC + port_name = "grpc_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::SocketAddress server_address(listen_host, port); + return ProtocolServerAdapter( + listen_host, + port_name, + "gRPC protocol: " + server_address.toString(), + std::make_unique(*this, makeSocketAddress(listen_host, port, &logger()))); + }); +#endif + + /// Prometheus (if defined and not setup yet with http_port) + port_name = "prometheus.port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); + }); + } + +} + +void Server::updateServers( + Poco::Util::AbstractConfiguration & config, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers) +{ + Poco::Logger * log = &logger(); + /// Gracefully shutdown servers when their port is removed from config + const auto listen_hosts = getListenHosts(config); + const auto listen_try = getListenTry(config); + + for (auto & server : servers) + if (!server.isStopping()) + { + bool has_host = std::find(listen_hosts.begin(), listen_hosts.end(), server.getListenHost()) != listen_hosts.end(); + bool has_port = !config.getString(server.getPortName(), "").empty(); + if (!has_host || !has_port || config.getInt(server.getPortName()) != server.portNumber()) + { + server.stop(); + LOG_INFO(log, "Stopped listening for {}", server.getDescription()); + } + } + + createServers(config, listen_hosts, listen_try, server_pool, async_metrics, servers, /* start_servers: */ true); + + /// Remove servers once all their connections are closed + while (std::any_of(servers.begin(), servers.end(), [](const auto & server) { return server.isStopping(); })) + { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::erase_if(servers, [&log](auto & 
server) + { + if (!server.isStopping()) + return false; + auto is_finished = server.currentConnections() == 0; + if (is_finished) + LOG_DEBUG(log, "Server finished: {}", server.getDescription()); + else + LOG_TRACE(log, "Waiting server to finish: {}", server.getDescription()); + return is_finished; + }); + } +} + } diff --git a/programs/server/Server.h b/programs/server/Server.h index 45e5fccd51d..b4f2ea3bb79 100644 --- a/programs/server/Server.h +++ b/programs/server/Server.h @@ -24,6 +24,8 @@ namespace Poco namespace DB { +class AsynchronousMetrics; +class ProtocolServerAdapter; class Server : public BaseDaemon, public IServer { @@ -67,8 +69,30 @@ private: ContextMutablePtr global_context; Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const; - using CreateServerFunc = std::function; - void createServer(const std::string & listen_host, const char * port_name, bool listen_try, CreateServerFunc && func) const; + using CreateServerFunc = std::function; + void createServer( + Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + bool listen_try, + bool start_server, + std::vector & servers, + CreateServerFunc && func) const; + + void createServers( + Poco::Util::AbstractConfiguration & config, + const std::vector & listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers = false); + + void updateServers( + Poco::Util::AbstractConfiguration & config, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers); }; } diff --git a/programs/server/config.xml b/programs/server/config.xml index 9a2a6d7729f..d88773a3fc4 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -152,6 +152,7 @@ This setting could be used to switch replication to another network interface (the server may be connected to multiple networks via multiple addresses) --> + @@ -177,6 +178,7 @@ --> + @@ -293,6 +295,10 @@ 10000 + + + 0.9 diff --git a/release b/release index 6e6970d7b00..3eb5591fe2c 100755 --- a/release +++ b/release @@ -87,7 +87,7 @@ if [ -z "$NO_BUILD" ] ; then # Build (only binary packages). debuild --preserve-env -e PATH \ -e DEB_CC=$DEB_CC -e DEB_CXX=$DEB_CXX -e CMAKE_FLAGS="$CMAKE_FLAGS" \ - -b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS} + -b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS} ${DEB_ARCH_FLAG} fi if [ -n "$MAKE_RPM" ]; then diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index da1fd94239a..9cad53e667b 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -30,6 +30,7 @@ namespace ErrorCodes { extern const int UNKNOWN_ELEMENT_IN_CONFIG; extern const int UNKNOWN_SETTING; + extern const int AUTHENTICATION_FAILED; } @@ -401,9 +402,20 @@ void AccessControl::addStoragesFromMainConfig( } -UUID AccessControl::login(const Credentials & credentials, const Poco::Net::IPAddress & address) const +UUID AccessControl::authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const { - return MultipleAccessStorage::login(credentials, address, *external_authenticators); + try + { + return MultipleAccessStorage::authenticate(credentials, address, *external_authenticators); + } + catch (...) 
+ { + tryLogCurrentException(getLogger(), "from: " + address.toString() + ", user: " + credentials.getUserName() + ": Authentication failed"); + + /// We use the same message for all authentication failures because we don't want to give away any unnecessary information for security reasons, + /// only the log will show the exact reason. + throw Exception(credentials.getUserName() + ": Authentication failed: password is incorrect or there is no user with such name", ErrorCodes::AUTHENTICATION_FAILED); + } } void AccessControl::setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config) diff --git a/src/Access/AccessControl.h b/src/Access/AccessControl.h index d891432266e..77709313d3e 100644 --- a/src/Access/AccessControl.h +++ b/src/Access/AccessControl.h @@ -113,7 +113,7 @@ public: bool isSettingNameAllowed(const std::string_view & name) const; void checkSettingNameIsAllowed(const std::string_view & name) const; - UUID login(const Credentials & credentials, const Poco::Net::IPAddress & address) const; + UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const; void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config); std::shared_ptr getContextAccess( diff --git a/src/Access/Authentication.cpp b/src/Access/Authentication.cpp index 794c0a0d5d5..6bc9aeec4c2 100644 --- a/src/Access/Authentication.cpp +++ b/src/Access/Authentication.cpp @@ -54,7 +54,7 @@ namespace const Poco::SHA1Engine::Digest & digest = engine.digest(); Poco::SHA1Engine::Digest calculated_password_sha1(sha1_size); - for (size_t i = 0; i < sha1_size; i++) + for (size_t i = 0; i < sha1_size; ++i) calculated_password_sha1[i] = scrambled_password[i] ^ digest[i]; auto calculated_password_double_sha1 = Util::encodeSHA1(calculated_password_sha1); diff --git a/src/Access/DiskAccessStorage.cpp b/src/Access/DiskAccessStorage.cpp index 1428e546d34..5215139b50c 100644 --- a/src/Access/DiskAccessStorage.cpp +++ b/src/Access/DiskAccessStorage.cpp @@ -426,19 +426,24 @@ std::vector DiskAccessStorage::findAllImpl(AccessEntityType type) const return res; } -bool DiskAccessStorage::existsImpl(const UUID & id) const +bool DiskAccessStorage::exists(const UUID & id) const { std::lock_guard lock{mutex}; return entries_by_id.count(id); } -AccessEntityPtr DiskAccessStorage::readImpl(const UUID & id) const +AccessEntityPtr DiskAccessStorage::readImpl(const UUID & id, bool throw_if_not_exists) const { std::lock_guard lock{mutex}; auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return nullptr; + } const auto & entry = it->second; if (!entry.entity) @@ -447,43 +452,56 @@ AccessEntityPtr DiskAccessStorage::readImpl(const UUID & id) const } -String DiskAccessStorage::readNameImpl(const UUID & id) const +std::optional DiskAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const { std::lock_guard lock{mutex}; auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); - return String{it->second.name}; + { + if (throw_if_not_exists) + throwNotFound(id); + else + return std::nullopt; + } + return it->second.name; } -bool DiskAccessStorage::canInsertImpl(const AccessEntityPtr &) const -{ - return !readonly; -} - - -UUID DiskAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists) +std::optional DiskAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool 
throw_if_exists) { Notifications notifications; SCOPE_EXIT({ notify(notifications); }); UUID id = generateRandomID(); std::lock_guard lock{mutex}; - insertNoLock(id, new_entity, replace_if_exists, notifications); - return id; + if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists, notifications)) + return id; + + return std::nullopt; } -void DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, Notifications & notifications) +bool DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists, Notifications & notifications) { const String & name = new_entity->getName(); AccessEntityType type = new_entity->getType(); + /// Check that we can insert. if (readonly) throwReadonlyCannotInsert(type, name); - /// Check that we can insert. + auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; + auto it_by_name = entries_by_name.find(name); + bool name_collision = (it_by_name != entries_by_name.end()); + + if (name_collision && !replace_if_exists) + { + if (throw_if_exists) + throwNameCollisionCannotInsert(type, name); + else + return false; + } + auto it_by_id = entries_by_id.find(id); if (it_by_id != entries_by_id.end()) { @@ -491,18 +509,11 @@ void DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & ne throwIDCollisionCannotInsert(id, type, name, existing_entry.entity->getType(), existing_entry.entity->getName()); } - auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; - auto it_by_name = entries_by_name.find(name); - bool name_collision = (it_by_name != entries_by_name.end()); - - if (name_collision && !replace_if_exists) - throwNameCollisionCannotInsert(type, name); - scheduleWriteLists(type); writeAccessEntityToDisk(id, *new_entity); if (name_collision && replace_if_exists) - removeNoLock(it_by_name->second->id, notifications); + removeNoLock(it_by_name->second->id, /* throw_if_not_exists = */ false, notifications); /// Do insertion. 
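    /// Register the entry in entries_by_id and in the per-type name index, then queue change
    /// notifications for subscribers; returning true here lets insertImpl() report the generated UUID to the caller.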
auto & entry = entries_by_id[id]; @@ -512,24 +523,30 @@ void DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & ne entry.entity = new_entity; entries_by_name[entry.name] = &entry; prepareNotifications(id, entry, false, notifications); + return true; } -void DiskAccessStorage::removeImpl(const UUID & id) +bool DiskAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exists) { Notifications notifications; SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - removeNoLock(id, notifications); + return removeNoLock(id, throw_if_not_exists, notifications); } -void DiskAccessStorage::removeNoLock(const UUID & id, Notifications & notifications) +bool DiskAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return false; + } Entry & entry = it->second; AccessEntityType type = entry.type; @@ -545,28 +562,35 @@ void DiskAccessStorage::removeNoLock(const UUID & id, Notifications & notificati auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; entries_by_name.erase(entry.name); entries_by_id.erase(it); + return true; } -void DiskAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func) +bool DiskAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { Notifications notifications; SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - updateNoLock(id, update_func, notifications); + return updateNoLock(id, update_func, throw_if_not_exists, notifications); } -void DiskAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, Notifications & notifications) +bool DiskAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return false; + } Entry & entry = it->second; if (readonly) throwReadonlyCannotUpdate(entry.type, entry.name); + if (!entry.entity) entry.entity = readAccessEntityFromDisk(id); auto old_entity = entry.entity; @@ -576,7 +600,7 @@ void DiskAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_ throwBadCast(id, new_entity->getType(), new_entity->getName(), old_entity->getType()); if (*new_entity == *old_entity) - return; + return true; const String & new_name = new_entity->getName(); const String & old_name = old_entity->getName(); @@ -602,6 +626,7 @@ void DiskAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_ } prepareNotifications(id, entry, false, notifications); + return true; } @@ -675,7 +700,7 @@ scope_guard DiskAccessStorage::subscribeForChangesImpl(AccessEntityType type, co }; } -bool DiskAccessStorage::hasSubscriptionImpl(const UUID & id) const +bool DiskAccessStorage::hasSubscription(const UUID & id) const { std::lock_guard lock{mutex}; auto it = entries_by_id.find(id); @@ -687,7 +712,7 @@ bool DiskAccessStorage::hasSubscriptionImpl(const UUID & id) const return false; } -bool DiskAccessStorage::hasSubscriptionImpl(AccessEntityType type) const +bool DiskAccessStorage::hasSubscription(AccessEntityType type) const { std::lock_guard lock{mutex}; const auto & handlers = handlers_by_type[static_cast(type)]; diff --git a/src/Access/DiskAccessStorage.h 
b/src/Access/DiskAccessStorage.h index 853a18590f0..20390dabfa0 100644 --- a/src/Access/DiskAccessStorage.h +++ b/src/Access/DiskAccessStorage.h @@ -24,22 +24,22 @@ public: bool isPathEqual(const String & directory_path_) const; void setReadOnly(bool readonly_) { readonly = readonly_; } - bool isReadOnly() const { return readonly; } + bool isReadOnly() const override { return readonly; } + + bool exists(const UUID & id) const override; + bool hasSubscription(const UUID & id) const override; + bool hasSubscription(AccessEntityType type) const override; private: std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; - bool existsImpl(const UUID & id) const override; - AccessEntityPtr readImpl(const UUID & id) const override; - String readNameImpl(const UUID & id) const override; - bool canInsertImpl(const AccessEntityPtr & entity) const override; - UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) override; - void removeImpl(const UUID & id) override; - void updateImpl(const UUID & id, const UpdateFunc & update_func) override; + AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; + bool removeImpl(const UUID & id, bool throw_if_not_exists) override; + bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - bool hasSubscriptionImpl(const UUID & id) const override; - bool hasSubscriptionImpl(AccessEntityType type) const override; void clear(); bool readLists(); @@ -50,9 +50,9 @@ private: void listsWritingThreadFunc(); void stopListsWritingThread(); - void insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, Notifications & notifications); - void removeNoLock(const UUID & id, Notifications & notifications); - void updateNoLock(const UUID & id, const UpdateFunc & update_func, Notifications & notifications); + bool insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists, Notifications & notifications); + bool removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications); + bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications); AccessEntityPtr readAccessEntityFromDisk(const UUID & id) const; void writeAccessEntityToDisk(const UUID & id, const IAccessEntity & entity) const; diff --git a/src/Access/EnabledQuota.cpp b/src/Access/EnabledQuota.cpp index f0c6004bd77..78dd3c7022a 100644 --- a/src/Access/EnabledQuota.cpp +++ b/src/Access/EnabledQuota.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,7 @@ namespace ErrorCodes extern const int QUOTA_EXPIRED; } + struct EnabledQuota::Impl { [[noreturn]] static void throwQuotaExceed( @@ -35,54 +37,6 @@ struct EnabledQuota::Impl } - /// Returns the end of the current interval. 
If the passed `current_time` is greater than that end, - /// the function automatically recalculates the interval's end by adding the interval's duration - /// one or more times until the interval's end is greater than `current_time`. - /// If that recalculation occurs the function also resets amounts of resources used and sets the variable - /// `counters_were_reset`. - static std::chrono::system_clock::time_point getEndOfInterval( - const Interval & interval, std::chrono::system_clock::time_point current_time, bool & counters_were_reset) - { - auto & end_of_interval = interval.end_of_interval; - auto end_loaded = end_of_interval.load(); - auto end = std::chrono::system_clock::time_point{end_loaded}; - if (current_time < end) - { - counters_were_reset = false; - return end; - } - - bool need_reset_counters = false; - - do - { - /// Calculate the end of the next interval: - /// | X | - /// end current_time next_end = end + duration * n - /// where n is an integer number, n >= 1. - const auto duration = interval.duration; - UInt64 n = static_cast((current_time - end + duration) / duration); - end = end + duration * n; - if (end_of_interval.compare_exchange_strong(end_loaded, end.time_since_epoch())) - { - /// We reset counters only if the interval's end has been calculated before. - /// If it hasn't we just calculate the interval's end for the first time and don't reset counters yet. - need_reset_counters = (end_loaded.count() != 0); - break; - } - end = std::chrono::system_clock::time_point{end_loaded}; - } - while (current_time >= end); - - if (need_reset_counters) - { - boost::range::fill(interval.used, 0); - counters_were_reset = true; - } - return end; - } - - static void used( const String & user_name, const Intervals & intervals, @@ -91,24 +45,22 @@ struct EnabledQuota::Impl std::chrono::system_clock::time_point current_time, bool check_exceeded) { + auto quota_type_i = static_cast(quota_type); for (const auto & interval : intervals.intervals) { - auto quota_type_i = static_cast(quota_type); QuotaValue used = (interval.used[quota_type_i] += value); QuotaValue max = interval.max[quota_type_i]; if (!max) continue; + if (used > max) { bool counters_were_reset = false; - auto end_of_interval = getEndOfInterval(interval, current_time, counters_were_reset); + auto end_of_interval = interval.getEndOfInterval(current_time, counters_were_reset); if (counters_were_reset) - { used = (interval.used[quota_type_i] += value); - if ((used > max) && check_exceeded) - throwQuotaExceed(user_name, intervals.quota_name, quota_type, used, max, interval.duration, end_of_interval); - } - else if (check_exceeded) + + if (check_exceeded && (used > max)) throwQuotaExceed(user_name, intervals.quota_name, quota_type, used, max, interval.duration, end_of_interval); } } @@ -127,10 +79,11 @@ struct EnabledQuota::Impl QuotaValue max = interval.max[quota_type_i]; if (!max) continue; + if (used > max) { bool counters_were_reset = false; - std::chrono::system_clock::time_point end_of_interval = getEndOfInterval(interval, current_time, counters_were_reset); + auto end_of_interval = interval.getEndOfInterval(current_time, counters_were_reset); if (!counters_were_reset) throwQuotaExceed(user_name, intervals.quota_name, quota_type, used, max, interval.duration, end_of_interval); } @@ -145,17 +98,32 @@ struct EnabledQuota::Impl for (auto quota_type : collections::range(QuotaType::MAX)) checkExceeded(user_name, intervals, quota_type, current_time); } + + static std::chrono::system_clock::duration 
randomDuration(std::chrono::seconds max) + { + auto count = std::chrono::duration_cast(max).count(); + std::uniform_int_distribution distribution{0, count - 1}; + return std::chrono::system_clock::duration(distribution(thread_local_rng)); + } }; -EnabledQuota::Interval::Interval() +EnabledQuota::Interval::Interval(std::chrono::seconds duration_, bool randomize_interval_, std::chrono::system_clock::time_point current_time_) + : duration(duration_) , randomize_interval(randomize_interval_) { + std::chrono::system_clock::time_point initial_end{}; + if (randomize_interval_) + initial_end += Impl::randomDuration(duration_); + end_of_interval = initial_end.time_since_epoch(); + for (auto quota_type : collections::range(QuotaType::MAX)) { auto quota_type_i = static_cast(quota_type); used[quota_type_i].store(0); max[quota_type_i] = 0; } + + getEndOfInterval(current_time_); /// Force updating the end of the interval for the first time. } @@ -177,6 +145,55 @@ EnabledQuota::Interval & EnabledQuota::Interval::operator =(const Interval & src } +/// Returns the end of the current interval. If the passed `current_time` is greater than that end, +/// the function automatically recalculates the interval's end by adding the interval's duration +/// one or more times until the interval's end is greater than `current_time`. +/// If that recalculation occurs the function also resets amounts of resources used and sets the variable +/// `counters_were_reset`. +std::chrono::system_clock::time_point EnabledQuota::Interval::getEndOfInterval(std::chrono::system_clock::time_point current_time) const +{ + bool counters_were_reset; + return getEndOfInterval(current_time, counters_were_reset); +} + +std::chrono::system_clock::time_point EnabledQuota::Interval::getEndOfInterval(std::chrono::system_clock::time_point current_time, bool & counters_were_reset) const +{ + auto end_loaded = end_of_interval.load(); + auto end = std::chrono::system_clock::time_point{end_loaded}; + if (current_time < end) + { + counters_were_reset = false; + return end; + } + + bool need_reset_counters = false; + + do + { + /// Calculate the end of the next interval: + /// | X | + /// end current_time next_end = end + duration * n + /// where n is an integer number, n >= 1. 
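// [Editorial sketch, not part of the patch] The arithmetic described in the comment
// above, isolated into a standalone helper: push the interval end forward by a whole
// number of durations until it passes current_time (assuming current_time >= end).
// Names are hypothetical; the real code below performs the same computation inside a
// compare_exchange_strong loop on the atomic end_of_interval.
#include <chrono>

std::chrono::system_clock::time_point advanceEndPast(
    std::chrono::system_clock::time_point end,
    std::chrono::system_clock::duration duration,
    std::chrono::system_clock::time_point current_time)
{
    // Integer division of two durations yields a plain count; adding one extra
    // duration before dividing guarantees n >= 1 and end + duration * n > current_time.
    const auto n = (current_time - end + duration) / duration;
    return end + duration * n;
}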
+ UInt64 n = static_cast((current_time - end + duration) / duration); + end = end + duration * n; + if (end_of_interval.compare_exchange_strong(end_loaded, end.time_since_epoch())) + { + need_reset_counters = true; + break; + } + end = std::chrono::system_clock::time_point{end_loaded}; + } + while (current_time >= end); + + if (need_reset_counters) + { + boost::range::fill(used, 0); + counters_were_reset = true; + } + return end; +} + + std::optional EnabledQuota::Intervals::getUsage(std::chrono::system_clock::time_point current_time) const { if (!quota_id) @@ -192,8 +209,7 @@ std::optional EnabledQuota::Intervals::getUsage(std::chrono::system_ auto & out = usage.intervals.back(); out.duration = in.duration; out.randomize_interval = in.randomize_interval; - bool counters_were_reset = false; - out.end_of_interval = Impl::getEndOfInterval(in, current_time, counters_were_reset); + out.end_of_interval = in.getEndOfInterval(current_time); for (auto quota_type : collections::range(QuotaType::MAX)) { auto quota_type_i = static_cast(quota_type); diff --git a/src/Access/EnabledQuota.h b/src/Access/EnabledQuota.h index 097afe861d2..88362c9193f 100644 --- a/src/Access/EnabledQuota.h +++ b/src/Access/EnabledQuota.h @@ -73,9 +73,13 @@ private: bool randomize_interval = false; mutable std::atomic end_of_interval; - Interval(); + Interval(std::chrono::seconds duration_, bool randomize_interval_, std::chrono::system_clock::time_point current_time_); + Interval(const Interval & src) { *this = src; } Interval & operator =(const Interval & src); + + std::chrono::system_clock::time_point getEndOfInterval(std::chrono::system_clock::time_point current_time) const; + std::chrono::system_clock::time_point getEndOfInterval(std::chrono::system_clock::time_point current_time, bool & counters_were_reset) const; }; struct Intervals diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index 51c2525d923..0a7fc630d97 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -21,8 +21,8 @@ namespace ErrorCodes extern const int ACCESS_STORAGE_READONLY; extern const int WRONG_PASSWORD; extern const int IP_ADDRESS_NOT_ALLOWED; - extern const int AUTHENTICATION_FAILED; extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; } @@ -32,101 +32,6 @@ namespace { return "ID(" + toString(id) + ")"; } - - String formatTypeWithNameOrID(const IAccessStorage & storage, const UUID & id) - { - auto entity = storage.tryRead(id); - if (entity) - return entity->formatTypeWithName(); - return outputID(id); - } - - - template - bool tryCall(const Func & function) - { - try - { - function(); - return true; - } - catch (...) 
- { - return false; - } - } - - - class ErrorsTracker - { - public: - explicit ErrorsTracker(size_t count_) { succeed.reserve(count_); } - - template - bool tryCall(const Func & func) - { - try - { - func(); - } - catch (Exception & e) - { - if (!exception) - exception.emplace(e); - succeed.push_back(false); - return false; - } - catch (Poco::Exception & e) - { - if (!exception) - exception.emplace(Exception::CreateFromPocoTag{}, e); - succeed.push_back(false); - return false; - } - catch (std::exception & e) - { - if (!exception) - exception.emplace(Exception::CreateFromSTDTag{}, e); - succeed.push_back(false); - return false; - } - succeed.push_back(true); - return true; - } - - bool errors() const { return exception.has_value(); } - - void showErrors(const char * format, Fn auto && get_name_function) - { - if (!exception) - return; - - Strings succeeded_names_list; - Strings failed_names_list; - for (size_t i = 0; i != succeed.size(); ++i) - { - String name = get_name_function(i); - if (succeed[i]) - succeeded_names_list.emplace_back(name); - else - failed_names_list.emplace_back(name); - } - String succeeded_names = boost::algorithm::join(succeeded_names_list, ", "); - String failed_names = boost::algorithm::join(failed_names_list, ", "); - if (succeeded_names.empty()) - succeeded_names = "none"; - - String error_message = format; - boost::replace_all(error_message, "{succeeded_names}", succeeded_names); - boost::replace_all(error_message, "{failed_names}", failed_names); - exception->addMessage(error_message); - exception->rethrow(); - } - - private: - std::vector succeed; - std::optional exception; - }; } @@ -175,228 +80,332 @@ std::vector IAccessStorage::getIDs(AccessEntityType type, const Strings & } -bool IAccessStorage::exists(const UUID & id) const -{ - return existsImpl(id); -} - - -AccessEntityPtr IAccessStorage::tryReadBase(const UUID & id) const -{ - AccessEntityPtr entity; - auto func = [&] { entity = readImpl(id); }; - if (!tryCall(func)) - return nullptr; - return entity; -} - - String IAccessStorage::readName(const UUID & id) const { - return readNameImpl(id); + return *readNameImpl(id, /* throw_if_not_exists = */ true); } -Strings IAccessStorage::readNames(const std::vector & ids) const +std::optional IAccessStorage::readName(const UUID & id, bool throw_if_not_exists) const { - Strings res; - res.reserve(ids.size()); - for (const auto & id : ids) - res.emplace_back(readName(id)); - return res; + return readNameImpl(id, throw_if_not_exists); } -std::optional IAccessStorage::tryReadName(const UUID & id) const -{ - String name; - auto func = [&] { name = readNameImpl(id); }; - if (!tryCall(func)) - return {}; - return name; -} - - -Strings IAccessStorage::tryReadNames(const std::vector & ids) const +Strings IAccessStorage::readNames(const std::vector & ids, bool throw_if_not_exists) const { Strings res; res.reserve(ids.size()); for (const auto & id : ids) { - if (auto name = tryReadName(id)) + if (auto name = readNameImpl(id, throw_if_not_exists)) res.emplace_back(std::move(name).value()); } return res; } -UUID IAccessStorage::insert(const AccessEntityPtr & entity) +std::optional IAccessStorage::tryReadName(const UUID & id) const { - return insertImpl(entity, false); + return readName(id, /* throw_if_not_exists = */ false); } -std::vector IAccessStorage::insert(const std::vector & multiple_entities) +Strings IAccessStorage::tryReadNames(const std::vector & ids) const { - ErrorsTracker tracker(multiple_entities.size()); + return readNames(ids, /* throw_if_not_exists = */ 
false); +} - std::vector ids; - for (const auto & entity : multiple_entities) + +std::optional IAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const +{ + if (auto entity = read(id, throw_if_not_exists)) + return entity->getName(); + return std::nullopt; +} + + +UUID IAccessStorage::insert(const AccessEntityPtr & entity) +{ + return *insert(entity, /* replace_if_exists = */ false, /* throw_if_exists = */ true); +} + + +std::optional IAccessStorage::insert(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) +{ + return insertImpl(entity, replace_if_exists, throw_if_exists); +} + + +std::vector IAccessStorage::insert(const std::vector & multiple_entities, bool replace_if_exists, bool throw_if_exists) +{ + if (multiple_entities.empty()) + return {}; + + if (multiple_entities.size() == 1) { - UUID id; - auto func = [&] { id = insertImpl(entity, /* replace_if_exists = */ false); }; - if (tracker.tryCall(func)) - ids.push_back(id); + if (auto id = insert(multiple_entities[0], replace_if_exists, throw_if_exists)) + return {*id}; + return {}; } - if (tracker.errors()) + std::vector successfully_inserted; + try { - auto get_name_function = [&](size_t i) { return multiple_entities[i]->formatTypeWithName(); }; - tracker.showErrors("Couldn't insert {failed_names}. Successfully inserted: {succeeded_names}", get_name_function); + std::vector ids; + for (const auto & entity : multiple_entities) + { + if (auto id = insertImpl(entity, replace_if_exists, throw_if_exists)) + { + successfully_inserted.push_back(entity); + ids.push_back(*id); + } + } + return ids; + } + catch (Exception & e) + { + /// Try to add more information to the error message. + if (!successfully_inserted.empty()) + { + String successfully_inserted_str; + for (const auto & entity : successfully_inserted) + { + if (!successfully_inserted_str.empty()) + successfully_inserted_str += ", "; + successfully_inserted_str += entity->formatTypeWithName(); + } + e.addMessage("After successfully inserting {}/{}: {}", successfully_inserted.size(), multiple_entities.size(), successfully_inserted_str); + } + e.rethrow(); + __builtin_unreachable(); } - - return ids; } std::optional IAccessStorage::tryInsert(const AccessEntityPtr & entity) { - UUID id; - auto func = [&] { id = insertImpl(entity, /* replace_if_exists = */ false); }; - if (!tryCall(func)) - return {}; - return id; + return insert(entity, /* replace_if_exists = */ false, /* throw_if_exists = */ false); } std::vector IAccessStorage::tryInsert(const std::vector & multiple_entities) { - std::vector ids; - for (const auto & entity : multiple_entities) - { - UUID id; - auto func = [&] { id = insertImpl(entity, /* replace_if_exists = */ false); }; - if (tryCall(func)) - ids.push_back(id); - } - return ids; + return insert(multiple_entities, /* replace_if_exists = */ false, /* throw_if_exists = */ false); } UUID IAccessStorage::insertOrReplace(const AccessEntityPtr & entity) { - return insertImpl(entity, /* replace_if_exists = */ true); + return *insert(entity, /* replace_if_exists = */ true, /* throw_if_exists = */ false); } std::vector IAccessStorage::insertOrReplace(const std::vector & multiple_entities) { - ErrorsTracker tracker(multiple_entities.size()); - - std::vector ids; - for (const auto & entity : multiple_entities) - { - UUID id; - auto func = [&] { id = insertImpl(entity, /* replace_if_exists = */ true); }; - if (tracker.tryCall(func)) - ids.push_back(id); - } - - if (tracker.errors()) - { - auto get_name_function = [&](size_t i) { 
return multiple_entities[i]->formatTypeWithName(); }; - tracker.showErrors("Couldn't insert {failed_names}. Successfully inserted: {succeeded_names}", get_name_function); - } - - return ids; + return insert(multiple_entities, /* replace_if_exists = */ true, /* throw_if_exists = */ false); } -void IAccessStorage::remove(const UUID & id) +std::optional IAccessStorage::insertImpl(const AccessEntityPtr & entity, bool, bool) { - removeImpl(id); + if (isReadOnly()) + throwReadonlyCannotInsert(entity->getType(), entity->getName()); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "insertImpl() is not implemented in {}", getStorageType()); } -void IAccessStorage::remove(const std::vector & ids) +bool IAccessStorage::remove(const UUID & id, bool throw_if_not_exists) { - ErrorsTracker tracker(ids.size()); + return removeImpl(id, throw_if_not_exists); +} - for (const auto & id : ids) + +std::vector IAccessStorage::remove(const std::vector & ids, bool throw_if_not_exists) +{ + if (ids.empty()) + return {}; + if (ids.size() == 1) + return remove(ids[0], throw_if_not_exists) ? ids : std::vector{}; + + Strings removed_names; + try { - auto func = [&] { removeImpl(id); }; - tracker.tryCall(func); + std::vector removed_ids; + std::vector readonly_ids; + + /// First we call remove() for non-readonly entities. + for (const auto & id : ids) + { + if (isReadOnly(id)) + readonly_ids.push_back(id); + else + { + auto name = tryReadName(id); + if (remove(id, throw_if_not_exists)) + { + removed_ids.push_back(id); + if (name) + removed_names.push_back(std::move(name).value()); + } + } + } + + /// For readonly entities we're still going to call remove() because + /// isReadOnly(id) could change and even if it's not then a storage-specific + /// implementation of removeImpl() will probably generate a better error message. + for (const auto & id : readonly_ids) + { + auto name = tryReadName(id); + if (remove(id, throw_if_not_exists)) + { + removed_ids.push_back(id); + if (name) + removed_names.push_back(std::move(name).value()); + } + } + + return removed_ids; } - - if (tracker.errors()) + catch (Exception & e) { - auto get_name_function = [&](size_t i) { return formatTypeWithNameOrID(*this, ids[i]); }; - tracker.showErrors("Couldn't remove {failed_names}. Successfully removed: {succeeded_names}", get_name_function); + /// Try to add more information to the error message. 
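// [Editorial sketch, not part of the patch] The batch insert()/remove()/update()
// overloads in this file all follow the recovery pattern used around this catch
// block: remember what already succeeded and append that progress to the in-flight
// exception before rethrowing. A reduced, library-free version of the idea (names
// hypothetical, std::runtime_error stands in for DB::Exception):
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

[[noreturn]] void rethrowWithProgress(const std::exception & e,
                                      const std::vector<std::string> & succeeded,
                                      std::size_t total)
{
    std::string names;
    for (const auto & name : succeeded)
    {
        if (!names.empty())
            names += ", ";
        names += name;
    }
    throw std::runtime_error(std::string(e.what()) + " (after successfully processing "
        + std::to_string(succeeded.size()) + "/" + std::to_string(total) + ": " + names + ")");
}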
+ if (!removed_names.empty()) + { + String removed_names_str; + for (const auto & name : removed_names) + { + if (!removed_names_str.empty()) + removed_names_str += ", "; + removed_names_str += backQuote(name); + } + e.addMessage("After successfully removing {}/{}: {}", removed_names.size(), ids.size(), removed_names_str); + } + e.rethrow(); + __builtin_unreachable(); } } bool IAccessStorage::tryRemove(const UUID & id) { - auto func = [&] { removeImpl(id); }; - return tryCall(func); + return remove(id, /* throw_if_not_exists = */ false); } std::vector IAccessStorage::tryRemove(const std::vector & ids) { - std::vector removed_ids; - for (const auto & id : ids) - { - auto func = [&] { removeImpl(id); }; - if (tryCall(func)) - removed_ids.push_back(id); - } - return removed_ids; + return remove(ids, /* throw_if_not_exists = */ false); } -void IAccessStorage::update(const UUID & id, const UpdateFunc & update_func) +bool IAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exists) { - updateImpl(id, update_func); + if (isReadOnly(id)) + { + auto entity = read(id, throw_if_not_exists); + if (!entity) + return false; + throwReadonlyCannotRemove(entity->getType(), entity->getName()); + } + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "removeImpl() is not implemented in {}", getStorageType()); } -void IAccessStorage::update(const std::vector & ids, const UpdateFunc & update_func) +bool IAccessStorage::update(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { - ErrorsTracker tracker(ids.size()); + return updateImpl(id, update_func, throw_if_not_exists); +} - for (const auto & id : ids) + +std::vector IAccessStorage::update(const std::vector & ids, const UpdateFunc & update_func, bool throw_if_not_exists) +{ + if (ids.empty()) + return {}; + if (ids.size() == 1) + return update(ids[0], update_func, throw_if_not_exists) ? ids : std::vector{}; + + Strings names_of_updated; + try { - auto func = [&] { updateImpl(id, update_func); }; - tracker.tryCall(func); + std::vector ids_of_updated; + std::vector readonly_ids; + + /// First we call update() for non-readonly entities. + for (const auto & id : ids) + { + if (isReadOnly(id)) + readonly_ids.push_back(id); + else + { + auto name = tryReadName(id); + if (update(id, update_func, throw_if_not_exists)) + { + ids_of_updated.push_back(id); + if (name) + names_of_updated.push_back(std::move(name).value()); + } + } + } + + /// For readonly entities we're still going to call update() because + /// isReadOnly(id) could change and even if it's not then a storage-specific + /// implementation of updateImpl() will probably generate a better error message. + for (const auto & id : readonly_ids) + { + auto name = tryReadName(id); + if (update(id, update_func, throw_if_not_exists)) + { + ids_of_updated.push_back(id); + if (name) + names_of_updated.push_back(std::move(name).value()); + } + } + + return ids_of_updated; } - - if (tracker.errors()) + catch (Exception & e) { - auto get_name_function = [&](size_t i) { return formatTypeWithNameOrID(*this, ids[i]); }; - tracker.showErrors("Couldn't update {failed_names}. Successfully updated: {succeeded_names}", get_name_function); + /// Try to add more information to the error message. 
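// [Editorial sketch, not part of the patch] Both batch loops above split the ids into
// writable and read-only entities and handle the writable ones first; as the comments
// in the patch note, the read-only ones are still attempted afterwards because
// isReadOnly(id) could have changed and the storage-specific removeImpl()/updateImpl()
// usually produces a better error message. Deferring them presumably also lets the
// writable entries complete before such an error aborts the batch. A generic sketch of
// that ordering (names hypothetical):
#include <vector>

template <typename Id, typename IsReadOnly, typename Apply>
void applyWritableFirst(const std::vector<Id> & ids, IsReadOnly is_read_only, Apply apply)
{
    std::vector<Id> readonly_ids;
    for (const auto & id : ids)
    {
        if (is_read_only(id))
            readonly_ids.push_back(id);   // postponed, likely to throw a descriptive error
        else
            apply(id);
    }
    for (const auto & id : readonly_ids)
        apply(id);                        // still attempted, may throw a "readonly" error
}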
+ if (!names_of_updated.empty()) + { + String names_of_updated_str; + for (const auto & name : names_of_updated) + { + if (!names_of_updated_str.empty()) + names_of_updated_str += ", "; + names_of_updated_str += backQuote(name); + } + e.addMessage("After successfully updating {}/{}: {}", names_of_updated.size(), ids.size(), names_of_updated_str); + } + e.rethrow(); + __builtin_unreachable(); } } bool IAccessStorage::tryUpdate(const UUID & id, const UpdateFunc & update_func) { - auto func = [&] { updateImpl(id, update_func); }; - return tryCall(func); + return update(id, update_func, /* throw_if_not_exists = */ false); } std::vector IAccessStorage::tryUpdate(const std::vector & ids, const UpdateFunc & update_func) { - std::vector updated_ids; - for (const auto & id : ids) + return update(ids, update_func, /* throw_if_not_exists = */ false); +} + + +bool IAccessStorage::updateImpl(const UUID & id, const UpdateFunc &, bool throw_if_not_exists) +{ + if (isReadOnly(id)) { - auto func = [&] { updateImpl(id, update_func); }; - if (tryCall(func)) - updated_ids.push_back(id); + auto entity = read(id, throw_if_not_exists); + if (!entity) + return false; + throwReadonlyCannotUpdate(entity->getType(), entity->getName()); } - return updated_ids; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "updateImpl() is not implemented in {}", getStorageType()); } @@ -421,18 +430,6 @@ scope_guard IAccessStorage::subscribeForChanges(const std::vector & ids, c } -bool IAccessStorage::hasSubscription(AccessEntityType type) const -{ - return hasSubscriptionImpl(type); -} - - -bool IAccessStorage::hasSubscription(const UUID & id) const -{ - return hasSubscriptionImpl(id); -} - - void IAccessStorage::notify(const Notifications & notifications) { for (const auto & [fn, id, new_entity] : notifications) @@ -440,50 +437,53 @@ void IAccessStorage::notify(const Notifications & notifications) } -UUID IAccessStorage::login( - const Credentials & credentials, - const Poco::Net::IPAddress & address, - const ExternalAuthenticators & external_authenticators, - bool replace_exception_with_cannot_authenticate) const -{ - try - { - return loginImpl(credentials, address, external_authenticators); - } - catch (...) 
- { - if (!replace_exception_with_cannot_authenticate) - throw; - - tryLogCurrentException(getLogger(), "from: " + address.toString() + ", user: " + credentials.getUserName() + ": Authentication failed"); - throwCannotAuthenticate(credentials.getUserName()); - } -} - - -UUID IAccessStorage::loginImpl( +UUID IAccessStorage::authenticate( const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const +{ + return *authenticateImpl(credentials, address, external_authenticators, /* throw_if_user_not_exists = */ true); +} + + +std::optional IAccessStorage::authenticate( + const Credentials & credentials, + const Poco::Net::IPAddress & address, + const ExternalAuthenticators & external_authenticators, + bool throw_if_user_not_exists) const +{ + return authenticateImpl(credentials, address, external_authenticators, throw_if_user_not_exists); +} + + +std::optional IAccessStorage::authenticateImpl( + const Credentials & credentials, + const Poco::Net::IPAddress & address, + const ExternalAuthenticators & external_authenticators, + bool throw_if_user_not_exists) const { if (auto id = find(credentials.getUserName())) { if (auto user = tryRead(*id)) { - if (!isAddressAllowedImpl(*user, address)) + if (!isAddressAllowed(*user, address)) throwAddressNotAllowed(address); - if (!areCredentialsValidImpl(*user, credentials, external_authenticators)) + if (!areCredentialsValid(*user, credentials, external_authenticators)) throwInvalidCredentials(); - return *id; + return id; } } - throwNotFound(AccessEntityType::USER, credentials.getUserName()); + + if (throw_if_user_not_exists) + throwNotFound(AccessEntityType::USER, credentials.getUserName()); + else + return std::nullopt; } -bool IAccessStorage::areCredentialsValidImpl( +bool IAccessStorage::areCredentialsValid( const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const @@ -498,24 +498,12 @@ bool IAccessStorage::areCredentialsValidImpl( } -bool IAccessStorage::isAddressAllowedImpl(const User & user, const Poco::Net::IPAddress & address) const +bool IAccessStorage::isAddressAllowed(const User & user, const Poco::Net::IPAddress & address) const { return user.allowed_client_hosts.contains(address); } -UUID IAccessStorage::getIDOfLoggedUser(const String & user_name) const -{ - return getIDOfLoggedUserImpl(user_name); -} - - -UUID IAccessStorage::getIDOfLoggedUserImpl(const String & user_name) const -{ - return getID(user_name); -} - - UUID IAccessStorage::generateRandomID() { static Poco::UUIDGenerator generator; @@ -615,11 +603,4 @@ void IAccessStorage::throwInvalidCredentials() throw Exception("Invalid credentials", ErrorCodes::WRONG_PASSWORD); } -void IAccessStorage::throwCannotAuthenticate(const String & user_name) -{ - /// We use the same message for all authentication failures because we don't want to give away any unnecessary information for security reasons, - /// only the log will show the exact reason. - throw Exception(user_name + ": Authentication failed: password is incorrect or there is no user with such name", ErrorCodes::AUTHENTICATION_FAILED); -} - } diff --git a/src/Access/IAccessStorage.h b/src/Access/IAccessStorage.h index ccbb1ffe5bc..6e533a439d1 100644 --- a/src/Access/IAccessStorage.h +++ b/src/Access/IAccessStorage.h @@ -34,6 +34,12 @@ public: /// Returns a JSON with the parameters of the storage. It's up to the storage type to fill the JSON. 
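// [Editorial sketch, not part of the patch] The header change just below introduces a
// two-level read-only notion: isReadOnly() for the whole storage and isReadOnly(id)
// for a single entity, with the per-entity overload defaulting to the storage-wide
// answer (MultipleAccessStorage, later in this patch, overrides it to forward the
// question to the nested storage owning the id). A reduced model (names hypothetical):
struct ToyId {};  // stand-in for the real UUID

struct ReadOnlyAware
{
    virtual ~ReadOnlyAware() = default;

    /// The whole storage is writable unless a subclass says otherwise.
    virtual bool isReadOnly() const { return false; }

    /// A single entity inherits the storage-wide answer unless overridden.
    virtual bool isReadOnly(const ToyId &) const { return isReadOnly(); }
};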
virtual String getStorageParamsJSON() const { return "{}"; } + /// Returns true if this storage is readonly. + virtual bool isReadOnly() const { return false; } + + /// Returns true if this entity is readonly. + virtual bool isReadOnly(const UUID &) const { return isReadOnly(); } + /// Returns the identifiers of all the entities of a specified type contained in the storage. std::vector findAll(AccessEntityType type) const; @@ -63,14 +69,14 @@ public: std::vector getIDs(const Strings & names) const { return getIDs(EntityClassT::TYPE, names); } /// Returns whether there is an entity with such identifier in the storage. - bool exists(const UUID & id) const; + virtual bool exists(const UUID & id) const = 0; /// Reads an entity. Throws an exception if not found. template - std::shared_ptr read(const UUID & id) const; + std::shared_ptr read(const UUID & id, bool throw_if_not_exists = true) const; template - std::shared_ptr read(const String & name) const; + std::shared_ptr read(const String & name, bool throw_if_not_exists = true) const; /// Reads an entity. Returns nullptr if not found. template @@ -81,18 +87,16 @@ public: /// Reads only name of an entity. String readName(const UUID & id) const; - Strings readNames(const std::vector & ids) const; + std::optional readName(const UUID & id, bool throw_if_not_exists) const; + Strings readNames(const std::vector & ids, bool throw_if_not_exists = true) const; std::optional tryReadName(const UUID & id) const; Strings tryReadNames(const std::vector & ids) const; - /// Returns true if a specified entity can be inserted into this storage. - /// This function doesn't check whether there are no entities with such name in the storage. - bool canInsert(const AccessEntityPtr & entity) const { return canInsertImpl(entity); } - /// Inserts an entity to the storage. Returns ID of a new entry in the storage. /// Throws an exception if the specified name already exists. UUID insert(const AccessEntityPtr & entity); - std::vector insert(const std::vector & multiple_entities); + std::optional insert(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); + std::vector insert(const std::vector & multiple_entities, bool replace_if_exists = false, bool throw_if_exists = true); /// Inserts an entity to the storage. Returns ID of a new entry in the storage. std::optional tryInsert(const AccessEntityPtr & entity); @@ -104,8 +108,8 @@ public: std::vector insertOrReplace(const std::vector & multiple_entities); /// Removes an entity from the storage. Throws an exception if couldn't remove. - void remove(const UUID & id); - void remove(const std::vector & ids); + bool remove(const UUID & id, bool throw_if_not_exists = true); + std::vector remove(const std::vector & ids, bool throw_if_not_exists = true); /// Removes an entity from the storage. Returns false if couldn't remove. bool tryRemove(const UUID & id); @@ -116,8 +120,8 @@ public: using UpdateFunc = std::function; /// Updates an entity stored in the storage. Throws an exception if couldn't update. - void update(const UUID & id, const UpdateFunc & update_func); - void update(const std::vector & ids, const UpdateFunc & update_func); + bool update(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists = true); + std::vector update(const std::vector & ids, const UpdateFunc & update_func, bool throw_if_not_exists = true); /// Updates an entity stored in the storage. Returns false if couldn't update. 
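// [Editorial sketch, not part of the patch] A reduced model of the flag-based update
// contract declared just above: the callback receives the current entity and returns
// its replacement, and an unknown id either throws or yields false depending on
// throw_if_not_exists (which is how tryUpdate() is now implemented). All types below
// are hypothetical stand-ins, not the real IAccessStorage API.
#include <functional>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct ToyEntity { std::string name; };
using ToyEntityPtr = std::shared_ptr<const ToyEntity>;
using ToyUpdateFunc = std::function<ToyEntityPtr(const ToyEntityPtr &)>;

class ToyStorage
{
public:
    bool update(int id, const ToyUpdateFunc & update_func, bool throw_if_not_exists = true)
    {
        auto it = entries.find(id);
        if (it == entries.end())
        {
            if (throw_if_not_exists)
                throw std::runtime_error("entity not found");
            return false;   // tryUpdate()-style behaviour
        }
        it->second = update_func(it->second);
        return true;
    }

private:
    std::map<int, ToyEntityPtr> entries;
};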
bool tryUpdate(const UUID & id, const UpdateFunc & update_func); @@ -139,35 +143,27 @@ public: scope_guard subscribeForChanges(const UUID & id, const OnChangedHandler & handler) const; scope_guard subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler) const; - bool hasSubscription(AccessEntityType type) const; - bool hasSubscription(const UUID & id) const; + virtual bool hasSubscription(AccessEntityType type) const = 0; + virtual bool hasSubscription(const UUID & id) const = 0; /// Finds a user, check the provided credentials and returns the ID of the user if they are valid. /// Throws an exception if no such user or credentials are invalid. - UUID login(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool replace_exception_with_cannot_authenticate = true) const; - - /// Returns the ID of a user who has logged in (maybe on another node). - /// The function assumes that the password has been already checked somehow, so we can skip checking it now. - UUID getIDOfLoggedUser(const String & user_name) const; + UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const; + std::optional authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists) const; protected: virtual std::optional findImpl(AccessEntityType type, const String & name) const = 0; virtual std::vector findAllImpl(AccessEntityType type) const = 0; - virtual bool existsImpl(const UUID & id) const = 0; - virtual AccessEntityPtr readImpl(const UUID & id) const = 0; - virtual String readNameImpl(const UUID & id) const = 0; - virtual bool canInsertImpl(const AccessEntityPtr & entity) const = 0; - virtual UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) = 0; - virtual void removeImpl(const UUID & id) = 0; - virtual void updateImpl(const UUID & id, const UpdateFunc & update_func) = 0; + virtual AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const = 0; + virtual std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const; + virtual std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); + virtual bool removeImpl(const UUID & id, bool throw_if_not_exists); + virtual bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); virtual scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const = 0; virtual scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const = 0; - virtual bool hasSubscriptionImpl(const UUID & id) const = 0; - virtual bool hasSubscriptionImpl(AccessEntityType type) const = 0; - virtual UUID loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const; - virtual bool areCredentialsValidImpl(const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const; - virtual bool isAddressAllowedImpl(const User & user, const Poco::Net::IPAddress & address) const; - virtual UUID getIDOfLoggedUserImpl(const String & user_name) const; + virtual std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & 
external_authenticators, bool throw_if_user_not_exists) const; + virtual bool areCredentialsValid(const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const; + virtual bool isAddressAllowed(const User & user, const Poco::Net::IPAddress & address) const; static UUID generateRandomID(); Poco::Logger * getLogger() const; @@ -184,30 +180,28 @@ protected: [[noreturn]] void throwReadonlyCannotRemove(AccessEntityType type, const String & name) const; [[noreturn]] static void throwAddressNotAllowed(const Poco::Net::IPAddress & address); [[noreturn]] static void throwInvalidCredentials(); - [[noreturn]] static void throwCannotAuthenticate(const String & user_name); using Notification = std::tuple; using Notifications = std::vector; static void notify(const Notifications & notifications); private: - AccessEntityPtr tryReadBase(const UUID & id) const; - const String storage_name; mutable std::atomic log = nullptr; }; template -std::shared_ptr IAccessStorage::read(const UUID & id) const +std::shared_ptr IAccessStorage::read(const UUID & id, bool throw_if_not_exists) const { - auto entity = readImpl(id); + auto entity = readImpl(id, throw_if_not_exists); if constexpr (std::is_same_v) return entity; else { - auto ptr = typeid_cast>(entity); - if (ptr) + if (!entity) + return nullptr; + if (auto ptr = typeid_cast>(entity)) return ptr; throwBadCast(id, entity->getType(), entity->getName(), EntityClassT::TYPE); } @@ -215,26 +209,27 @@ std::shared_ptr IAccessStorage::read(const UUID & id) const template -std::shared_ptr IAccessStorage::read(const String & name) const +std::shared_ptr IAccessStorage::read(const String & name, bool throw_if_not_exists) const { - return read(getID(name)); + if (auto id = find(name)) + return read(*id, throw_if_not_exists); + if (throw_if_not_exists) + throwNotFound(EntityClassT::TYPE, name); + else + return nullptr; } template std::shared_ptr IAccessStorage::tryRead(const UUID & id) const { - auto entity = tryReadBase(id); - if (!entity) - return nullptr; - return typeid_cast>(entity); + return read(id, false); } template std::shared_ptr IAccessStorage::tryRead(const String & name) const { - auto id = find(name); - return id ? 
tryRead(*id) : nullptr; + return read(name, false); } } diff --git a/src/Access/LDAPAccessStorage.cpp b/src/Access/LDAPAccessStorage.cpp index c89d8c1f953..77c3281e5ab 100644 --- a/src/Access/LDAPAccessStorage.cpp +++ b/src/Access/LDAPAccessStorage.cpp @@ -426,52 +426,24 @@ std::vector LDAPAccessStorage::findAllImpl(AccessEntityType type) const } -bool LDAPAccessStorage::existsImpl(const UUID & id) const +bool LDAPAccessStorage::exists(const UUID & id) const { std::scoped_lock lock(mutex); return memory_storage.exists(id); } -AccessEntityPtr LDAPAccessStorage::readImpl(const UUID & id) const +AccessEntityPtr LDAPAccessStorage::readImpl(const UUID & id, bool throw_if_not_exists) const { std::scoped_lock lock(mutex); - return memory_storage.read(id); + return memory_storage.read(id, throw_if_not_exists); } -String LDAPAccessStorage::readNameImpl(const UUID & id) const +std::optional LDAPAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const { std::scoped_lock lock(mutex); - return memory_storage.readName(id); -} - - -bool LDAPAccessStorage::canInsertImpl(const AccessEntityPtr &) const -{ - return false; -} - - -UUID LDAPAccessStorage::insertImpl(const AccessEntityPtr & entity, bool) -{ - throwReadonlyCannotInsert(entity->getType(), entity->getName()); -} - - -void LDAPAccessStorage::removeImpl(const UUID & id) -{ - std::scoped_lock lock(mutex); - auto entity = read(id); - throwReadonlyCannotRemove(entity->getType(), entity->getName()); -} - - -void LDAPAccessStorage::updateImpl(const UUID & id, const UpdateFunc &) -{ - std::scoped_lock lock(mutex); - auto entity = read(id); - throwReadonlyCannotUpdate(entity->getType(), entity->getName()); + return memory_storage.readName(id, throw_if_not_exists); } @@ -489,20 +461,24 @@ scope_guard LDAPAccessStorage::subscribeForChangesImpl(AccessEntityType type, co } -bool LDAPAccessStorage::hasSubscriptionImpl(const UUID & id) const +bool LDAPAccessStorage::hasSubscription(const UUID & id) const { std::scoped_lock lock(mutex); return memory_storage.hasSubscription(id); } -bool LDAPAccessStorage::hasSubscriptionImpl(AccessEntityType type) const +bool LDAPAccessStorage::hasSubscription(AccessEntityType type) const { std::scoped_lock lock(mutex); return memory_storage.hasSubscription(type); } -UUID LDAPAccessStorage::loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const +std::optional LDAPAccessStorage::authenticateImpl( + const Credentials & credentials, + const Poco::Net::IPAddress & address, + const ExternalAuthenticators & external_authenticators, + bool /* throw_if_user_not_exists */) const { std::scoped_lock lock(mutex); LDAPClient::SearchResultsList external_roles; @@ -511,16 +487,19 @@ UUID LDAPAccessStorage::loginImpl(const Credentials & credentials, const Poco::N { auto user = memory_storage.read(*id); - if (!isAddressAllowedImpl(*user, address)) + if (!isAddressAllowed(*user, address)) throwAddressNotAllowed(address); + if (typeid_cast(&credentials)) + return id; + if (!areLDAPCredentialsValidNoLock(*user, credentials, external_authenticators, external_roles)) throwInvalidCredentials(); // Just in case external_roles are changed. This will be no-op if they are not. 
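// [Editorial sketch, not part of the patch] The surrounding authenticateImpl() now
// returns an optional id instead of throwing when the user is unknown, which lets a
// caller such as MultipleAccessStorage (later in this patch) probe several nested
// storages with throw_if_user_not_exists = false and decide only at the end whether
// "not found anywhere" is an error. A reduced sketch of that probing loop; the
// interface is a hypothetical stand-in for the real multi-argument authenticate():
#include <optional>
#include <vector>

struct NestedStorage
{
    virtual ~NestedStorage() = default;
    virtual std::optional<int> authenticate(bool throw_if_user_not_exists) const = 0;
};

std::optional<int> probeForUser(const std::vector<const NestedStorage *> & storages)
{
    for (const auto * storage : storages)
    {
        // Each nested storage is asked not to throw on a missing user...
        if (auto id = storage->authenticate(/* throw_if_user_not_exists = */ false))
            return id;
    }
    // ...so the outer caller alone decides whether a global miss is fatal.
    return std::nullopt;
}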
updateAssignedRolesNoLock(*id, user->getName(), external_roles); - return *id; + return id; } else { @@ -530,9 +509,16 @@ UUID LDAPAccessStorage::loginImpl(const Credentials & credentials, const Poco::N user->auth_data = AuthenticationData(AuthenticationType::LDAP); user->auth_data.setLDAPServerName(ldap_server_name); - if (!isAddressAllowedImpl(*user, address)) + if (!isAddressAllowed(*user, address)) throwAddressNotAllowed(address); + if (typeid_cast(&credentials)) + { + // TODO: mapped external roles are not available here. Without a password we can't authenticate and retrieve roles from LDAP server. + assignRolesNoLock(*user, external_roles); + return memory_storage.insert(user); + } + if (!areLDAPCredentialsValidNoLock(*user, credentials, external_authenticators, external_roles)) throwInvalidCredentials(); @@ -541,31 +527,4 @@ UUID LDAPAccessStorage::loginImpl(const Credentials & credentials, const Poco::N return memory_storage.insert(user); } } - -UUID LDAPAccessStorage::getIDOfLoggedUserImpl(const String & user_name) const -{ - std::scoped_lock lock(mutex); - auto id = memory_storage.find(user_name); - if (id) - { - return *id; - } - else - { - // User does not exist, so we create one, and add it pretending that the authentication is successful. - auto user = std::make_shared(); - user->setName(user_name); - user->auth_data = AuthenticationData(AuthenticationType::LDAP); - user->auth_data.setLDAPServerName(ldap_server_name); - - LDAPClient::SearchResultsList external_roles; - - // TODO: mapped external roles are not available here. Without a password we can't authenticate and retrieve roles from LDAP server. - - assignRolesNoLock(*user, external_roles); - - return memory_storage.insert(user); - } -} - } diff --git a/src/Access/LDAPAccessStorage.h b/src/Access/LDAPAccessStorage.h index feb6ee4d92a..cc93c523516 100644 --- a/src/Access/LDAPAccessStorage.h +++ b/src/Access/LDAPAccessStorage.h @@ -40,23 +40,19 @@ public: public: // IAccessStorage implementations. virtual const char * getStorageType() const override; virtual String getStorageParamsJSON() const override; + virtual bool isReadOnly() const override { return true; } + virtual bool exists(const UUID & id) const override; + virtual bool hasSubscription(const UUID & id) const override; + virtual bool hasSubscription(AccessEntityType type) const override; private: // IAccessStorage implementations. 
virtual std::optional findImpl(AccessEntityType type, const String & name) const override; virtual std::vector findAllImpl(AccessEntityType type) const override; - virtual bool existsImpl(const UUID & id) const override; - virtual AccessEntityPtr readImpl(const UUID & id) const override; - virtual String readNameImpl(const UUID & id) const override; - virtual bool canInsertImpl(const AccessEntityPtr &) const override; - virtual UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) override; - virtual void removeImpl(const UUID & id) override; - virtual void updateImpl(const UUID & id, const UpdateFunc & update_func) override; + virtual AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; + virtual std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; virtual scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; virtual scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - virtual bool hasSubscriptionImpl(const UUID & id) const override; - virtual bool hasSubscriptionImpl(AccessEntityType type) const override; - virtual UUID loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const override; - virtual UUID getIDOfLoggedUserImpl(const String & user_name) const override; + virtual std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists) const override; private: void setConfiguration(AccessControl * access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix); diff --git a/src/Access/LDAPClient.cpp b/src/Access/LDAPClient.cpp index c666520c069..49d01074f6a 100644 --- a/src/Access/LDAPClient.cpp +++ b/src/Access/LDAPClient.cpp @@ -448,7 +448,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params) vals = nullptr; }); - for (std::size_t i = 0; vals[i]; i++) + for (size_t i = 0; vals[i]; ++i) { if (vals[i]->bv_val && vals[i]->bv_len > 0) result.emplace(vals[i]->bv_val, vals[i]->bv_len); @@ -473,7 +473,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params) referrals = nullptr; }); - for (std::size_t i = 0; referrals[i]; i++) + for (size_t i = 0; referrals[i]; ++i) { LOG_WARNING(&Poco::Logger::get("LDAPClient"), "Received reference during LDAP search but not following it: {}", referrals[i]); } diff --git a/src/Access/MemoryAccessStorage.cpp b/src/Access/MemoryAccessStorage.cpp index d3c99204bd3..f7989693d87 100644 --- a/src/Access/MemoryAccessStorage.cpp +++ b/src/Access/MemoryAccessStorage.cpp @@ -38,64 +38,72 @@ std::vector MemoryAccessStorage::findAllImpl(AccessEntityType type) const } -bool MemoryAccessStorage::existsImpl(const UUID & id) const +bool MemoryAccessStorage::exists(const UUID & id) const { std::lock_guard lock{mutex}; return entries_by_id.count(id); } -AccessEntityPtr MemoryAccessStorage::readImpl(const UUID & id) const +AccessEntityPtr MemoryAccessStorage::readImpl(const UUID & id, bool throw_if_not_exists) const { std::lock_guard lock{mutex}; auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return nullptr; + } const Entry & entry = it->second; return entry.entity; } -String 
MemoryAccessStorage::readNameImpl(const UUID & id) const -{ - return readImpl(id)->getName(); -} - - -UUID MemoryAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists) +std::optional MemoryAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { Notifications notifications; SCOPE_EXIT({ notify(notifications); }); UUID id = generateRandomID(); std::lock_guard lock{mutex}; - insertNoLock(id, new_entity, replace_if_exists, notifications); - return id; + if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists, notifications)) + return id; + + return std::nullopt; } -void MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, Notifications & notifications) +bool MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists, Notifications & notifications) { const String & name = new_entity->getName(); AccessEntityType type = new_entity->getType(); /// Check that we can insert. - auto it = entries_by_id.find(id); - if (it != entries_by_id.end()) + auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; + auto it_by_name = entries_by_name.find(name); + bool name_collision = (it_by_name != entries_by_name.end()); + + if (name_collision && !replace_if_exists) { - const auto & existing_entry = it->second; + if (throw_if_exists) + throwNameCollisionCannotInsert(type, name); + else + return false; + } + + auto it_by_id = entries_by_id.find(id); + if (it_by_id != entries_by_id.end()) + { + const auto & existing_entry = it_by_id->second; throwIDCollisionCannotInsert(id, type, name, existing_entry.entity->getType(), existing_entry.entity->getName()); } - auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; - auto it2 = entries_by_name.find(name); - if (it2 != entries_by_name.end()) + if (name_collision && replace_if_exists) { - const auto & existing_entry = *(it2->second); - if (replace_if_exists) - removeNoLock(existing_entry.id, notifications); - else - throwNameCollisionCannotInsert(type, name); + const auto & existing_entry = *(it_by_name->second); + removeNoLock(existing_entry.id, /* throw_if_not_exists = */ false, notifications); } /// Do insertion. 
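// [Editorial sketch, not part of the patch] The collision handling just above in
// MemoryAccessStorage::insertNoLock(), reduced to its decision table (names
// hypothetical; the real code additionally treats an id collision as always fatal):
#include <map>
#include <stdexcept>
#include <string>

/// Returns true if the caller may proceed with the insertion,
/// false if it should silently give up.
bool resolveNameCollision(std::map<std::string, int> & entries_by_name,
                          const std::string & name,
                          bool replace_if_exists,
                          bool throw_if_exists)
{
    auto it = entries_by_name.find(name);
    if (it == entries_by_name.end())
        return true;                    // no collision, just insert
    if (!replace_if_exists)
    {
        if (throw_if_exists)
            throw std::runtime_error("entity '" + name + "' already exists");
        return false;                   // insert(..., throw_if_exists = false)
    }
    entries_by_name.erase(it);          // replace_if_exists: drop the old entry first
    return true;
}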
@@ -104,24 +112,30 @@ void MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & entry.entity = new_entity; entries_by_name[name] = &entry; prepareNotifications(entry, false, notifications); + return true; } -void MemoryAccessStorage::removeImpl(const UUID & id) +bool MemoryAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exists) { Notifications notifications; SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - removeNoLock(id, notifications); + return removeNoLock(id, throw_if_not_exists, notifications); } -void MemoryAccessStorage::removeNoLock(const UUID & id, Notifications & notifications) +bool MemoryAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return false; + } Entry & entry = it->second; const String & name = entry.entity->getName(); @@ -133,24 +147,30 @@ void MemoryAccessStorage::removeNoLock(const UUID & id, Notifications & notifica auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; entries_by_name.erase(name); entries_by_id.erase(it); + return true; } -void MemoryAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func) +bool MemoryAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { Notifications notifications; SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - updateNoLock(id, update_func, notifications); + return updateNoLock(id, update_func, throw_if_not_exists, notifications); } -void MemoryAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, Notifications & notifications) +bool MemoryAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return false; + } Entry & entry = it->second; auto old_entity = entry.entity; @@ -160,7 +180,7 @@ void MemoryAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & updat throwBadCast(id, new_entity->getType(), new_entity->getName(), old_entity->getType()); if (*new_entity == *old_entity) - return; + return true; entry.entity = new_entity; @@ -176,6 +196,7 @@ void MemoryAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & updat } prepareNotifications(entry, false, notifications); + return true; } @@ -235,7 +256,7 @@ void MemoryAccessStorage::setAllNoLock(const std::vector ids_to_remove = std::move(not_used_ids); boost::range::copy(conflicting_ids, std::inserter(ids_to_remove, ids_to_remove.end())); for (const auto & id : ids_to_remove) - removeNoLock(id, notifications); + removeNoLock(id, /* throw_if_not_exists = */ false, notifications); /// Insert or update entities. 
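// [Editorial sketch, not part of the patch] The reconcile step that follows in
// setAllNoLock(): after obsolete and conflicting ids were removed above, every desired
// (id, entity) pair is either updated in place (present but changed) or freshly
// inserted (absent), and unchanged entries are left alone. The same idea on plain
// containers (names hypothetical):
#include <map>
#include <string>
#include <utility>
#include <vector>

void reconcile(std::map<int, std::string> & current,
               const std::vector<std::pair<int, std::string>> & desired)
{
    for (const auto & [id, value] : desired)
    {
        auto it = current.find(id);
        if (it != current.end())
        {
            if (it->second != value)
                it->second = value;     // updateNoLock(...) in the real code
        }
        else
            current.emplace(id, value); // insertNoLock(...) in the real code
    }
}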
for (const auto & [id, entity] : all_entities) @@ -246,11 +267,16 @@ void MemoryAccessStorage::setAllNoLock(const std::vectorsecond.entity) != *entity) { const AccessEntityPtr & changed_entity = entity; - updateNoLock(id, [&changed_entity](const AccessEntityPtr &) { return changed_entity; }, notifications); + updateNoLock(id, + [&changed_entity](const AccessEntityPtr &) { return changed_entity; }, + /* throw_if_not_exists = */ true, + notifications); } } else - insertNoLock(id, entity, false, notifications); + { + insertNoLock(id, entity, /* replace_if_exists = */ false, /* throw_if_exists = */ true, notifications); + } } } @@ -304,7 +330,7 @@ scope_guard MemoryAccessStorage::subscribeForChangesImpl(const UUID & id, const } -bool MemoryAccessStorage::hasSubscriptionImpl(const UUID & id) const +bool MemoryAccessStorage::hasSubscription(const UUID & id) const { std::lock_guard lock{mutex}; auto it = entries_by_id.find(id); @@ -317,7 +343,7 @@ bool MemoryAccessStorage::hasSubscriptionImpl(const UUID & id) const } -bool MemoryAccessStorage::hasSubscriptionImpl(AccessEntityType type) const +bool MemoryAccessStorage::hasSubscription(AccessEntityType type) const { std::lock_guard lock{mutex}; const auto & handlers = handlers_by_type[static_cast(type)]; diff --git a/src/Access/MemoryAccessStorage.h b/src/Access/MemoryAccessStorage.h index ea7b0193471..3e31f155ba7 100644 --- a/src/Access/MemoryAccessStorage.h +++ b/src/Access/MemoryAccessStorage.h @@ -23,20 +23,19 @@ public: void setAll(const std::vector & all_entities); void setAll(const std::vector> & all_entities); + bool exists(const UUID & id) const override; + bool hasSubscription(const UUID & id) const override; + bool hasSubscription(AccessEntityType type) const override; + private: std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; - bool existsImpl(const UUID & id) const override; - AccessEntityPtr readImpl(const UUID & id) const override; - String readNameImpl(const UUID & id) const override; - bool canInsertImpl(const AccessEntityPtr &) const override { return true; } - UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) override; - void removeImpl(const UUID & id) override; - void updateImpl(const UUID & id, const UpdateFunc & update_func) override; + AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; + bool removeImpl(const UUID & id, bool throw_if_not_exists) override; + bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - bool hasSubscriptionImpl(const UUID & id) const override; - bool hasSubscriptionImpl(AccessEntityType type) const override; struct Entry { @@ -45,9 +44,9 @@ private: mutable std::list handlers_by_id; }; - void insertNoLock(const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, Notifications & notifications); - void removeNoLock(const UUID & id, Notifications & notifications); - void updateNoLock(const UUID & id, const UpdateFunc & update_func, Notifications & notifications); + bool insertNoLock(const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, bool 
throw_if_exists, Notifications & notifications); + bool removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications); + bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications); void setAllNoLock(const std::vector> & all_entities, Notifications & notifications); void prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const; diff --git a/src/Access/MultipleAccessStorage.cpp b/src/Access/MultipleAccessStorage.cpp index 61bc84e8ab2..11540dd1b77 100644 --- a/src/Access/MultipleAccessStorage.cpp +++ b/src/Access/MultipleAccessStorage.cpp @@ -13,8 +13,8 @@ namespace DB { namespace ErrorCodes { - extern const int ACCESS_STORAGE_FOR_INSERTION_NOT_FOUND; extern const int ACCESS_ENTITY_ALREADY_EXISTS; + extern const int ACCESS_STORAGE_FOR_INSERTION_NOT_FOUND; } using Storage = IAccessStorage; @@ -129,7 +129,7 @@ std::vector MultipleAccessStorage::findAllImpl(AccessEntityType type) cons } -bool MultipleAccessStorage::existsImpl(const UUID & id) const +bool MultipleAccessStorage::exists(const UUID & id) const { return findStorage(id) != nullptr; } @@ -180,39 +180,59 @@ ConstStoragePtr MultipleAccessStorage::getStorage(const UUID & id) const return const_cast(this)->getStorage(id); } -AccessEntityPtr MultipleAccessStorage::readImpl(const UUID & id) const +AccessEntityPtr MultipleAccessStorage::readImpl(const UUID & id, bool throw_if_not_exists) const { - return getStorage(id)->read(id); + if (auto storage = findStorage(id)) + return storage->read(id, throw_if_not_exists); + + if (throw_if_not_exists) + throwNotFound(id); + else + return nullptr; } -String MultipleAccessStorage::readNameImpl(const UUID & id) const +std::optional MultipleAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const { - return getStorage(id)->readName(id); + if (auto storage = findStorage(id)) + return storage->readName(id, throw_if_not_exists); + + if (throw_if_not_exists) + throwNotFound(id); + else + return std::nullopt; } -bool MultipleAccessStorage::canInsertImpl(const AccessEntityPtr & entity) const +bool MultipleAccessStorage::isReadOnly() const { auto storages = getStoragesInternal(); for (const auto & storage : *storages) { - if (storage->canInsert(entity)) - return true; + if (!storage->isReadOnly()) + return false; } + return true; +} + + +bool MultipleAccessStorage::isReadOnly(const UUID & id) const +{ + auto storage = findStorage(id); + if (storage) + return storage->isReadOnly(id); return false; } -UUID MultipleAccessStorage::insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) +std::optional MultipleAccessStorage::insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) { - auto storages = getStoragesInternal(); - std::shared_ptr storage_for_insertion; + + auto storages = getStoragesInternal(); for (const auto & storage : *storages) { - if (storage->canInsert(entity) || - storage->find(entity->getType(), entity->getName())) + if (!storage->isReadOnly() || storage->find(entity->getType(), entity->getName())) { storage_for_insertion = storage; break; @@ -220,49 +240,73 @@ UUID MultipleAccessStorage::insertImpl(const AccessEntityPtr & entity, bool repl } if (!storage_for_insertion) - throw Exception("Not found a storage to insert " + entity->formatTypeWithName(), ErrorCodes::ACCESS_STORAGE_FOR_INSERTION_NOT_FOUND); + { + throw Exception( + ErrorCodes::ACCESS_STORAGE_FOR_INSERTION_NOT_FOUND, + "Could not insert {} because 
there is no writeable access storage in {}", + entity->formatTypeWithName(), + getStorageName()); + } - auto id = replace_if_exists ? storage_for_insertion->insertOrReplace(entity) : storage_for_insertion->insert(entity); - std::lock_guard lock{mutex}; - ids_cache.set(id, storage_for_insertion); + auto id = storage_for_insertion->insert(entity, replace_if_exists, throw_if_exists); + if (id) + { + std::lock_guard lock{mutex}; + ids_cache.set(*id, storage_for_insertion); + } return id; } -void MultipleAccessStorage::removeImpl(const UUID & id) +bool MultipleAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exists) { - getStorage(id)->remove(id); + if (auto storage = findStorage(id)) + return storage->remove(id, throw_if_not_exists); + + if (throw_if_not_exists) + throwNotFound(id); + else + return false; } -void MultipleAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func) +bool MultipleAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { - auto storage_for_updating = getStorage(id); + auto storage_for_updating = findStorage(id); + if (!storage_for_updating) + { + if (throw_if_not_exists) + throwNotFound(id); + else + return false; + } /// If the updating involves renaming check that the renamed entity will be accessible by name. auto storages = getStoragesInternal(); if ((storages->size() > 1) && (storages->front() != storage_for_updating)) { - auto old_entity = storage_for_updating->read(id); - auto new_entity = update_func(old_entity); - if (new_entity->getName() != old_entity->getName()) + if (auto old_entity = storage_for_updating->tryRead(id)) { - for (const auto & storage : *storages) + auto new_entity = update_func(old_entity); + if (new_entity->getName() != old_entity->getName()) { - if (storage == storage_for_updating) - break; - if (storage->find(new_entity->getType(), new_entity->getName())) + for (const auto & storage : *storages) { - throw Exception( - old_entity->formatTypeWithName() + ": cannot rename to " + backQuote(new_entity->getName()) + " because " - + new_entity->formatTypeWithName() + " already exists in " + storage->getStorageName(), - ErrorCodes::ACCESS_ENTITY_ALREADY_EXISTS); + if (storage == storage_for_updating) + break; + if (storage->find(new_entity->getType(), new_entity->getName())) + { + throw Exception( + old_entity->formatTypeWithName() + ": cannot rename to " + backQuote(new_entity->getName()) + " because " + + new_entity->formatTypeWithName() + " already exists in " + storage->getStorageName(), + ErrorCodes::ACCESS_ENTITY_ALREADY_EXISTS); + } } } } } - storage_for_updating->update(id, update_func); + return storage_for_updating->update(id, update_func, throw_if_not_exists); } @@ -275,7 +319,7 @@ scope_guard MultipleAccessStorage::subscribeForChangesImpl(const UUID & id, cons } -bool MultipleAccessStorage::hasSubscriptionImpl(const UUID & id) const +bool MultipleAccessStorage::hasSubscription(const UUID & id) const { auto storages = getStoragesInternal(); for (const auto & storage : *storages) @@ -307,7 +351,7 @@ scope_guard MultipleAccessStorage::subscribeForChangesImpl(AccessEntityType type } -bool MultipleAccessStorage::hasSubscriptionImpl(AccessEntityType type) const +bool MultipleAccessStorage::hasSubscription(AccessEntityType type) const { std::lock_guard lock{mutex}; const auto & handlers = handlers_by_type[static_cast(type)]; @@ -405,57 +449,24 @@ void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock } -UUID 
MultipleAccessStorage::loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const +std::optional MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists) const { auto storages = getStoragesInternal(); for (const auto & storage : *storages) { - try + auto id = storage->authenticate(credentials, address, external_authenticators, /* throw_if_user_not_exists = */ false); + if (id) { - auto id = storage->login(credentials, address, external_authenticators, /* replace_exception_with_cannot_authenticate = */ false); std::lock_guard lock{mutex}; - ids_cache.set(id, storage); + ids_cache.set(*id, storage); return id; } - catch (...) - { - if (!storage->find(AccessEntityType::USER, credentials.getUserName())) - { - /// The authentication failed because there no users with such name in the `storage` - /// thus we can try to search in other nested storages. - continue; - } - throw; - } } - throwNotFound(AccessEntityType::USER, credentials.getUserName()); -} - -UUID MultipleAccessStorage::getIDOfLoggedUserImpl(const String & user_name) const -{ - auto storages = getStoragesInternal(); - for (const auto & storage : *storages) - { - try - { - auto id = storage->getIDOfLoggedUser(user_name); - std::lock_guard lock{mutex}; - ids_cache.set(id, storage); - return id; - } - catch (...) - { - if (!storage->find(AccessEntityType::USER, user_name)) - { - /// The authentication failed because there no users with such name in the `storage` - /// thus we can try to search in other nested storages. - continue; - } - throw; - } - } - throwNotFound(AccessEntityType::USER, user_name); + if (throw_if_user_not_exists) + throwNotFound(AccessEntityType::USER, credentials.getUserName()); + else + return std::nullopt; } } diff --git a/src/Access/MultipleAccessStorage.h b/src/Access/MultipleAccessStorage.h index 462f97d6fa9..36c316c71f4 100644 --- a/src/Access/MultipleAccessStorage.h +++ b/src/Access/MultipleAccessStorage.h @@ -21,6 +21,8 @@ public: ~MultipleAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } + bool isReadOnly() const override; + bool isReadOnly(const UUID & id) const override; void setStorages(const std::vector & storages); void addStorage(const StoragePtr & new_storage); @@ -34,22 +36,21 @@ public: ConstStoragePtr getStorage(const UUID & id) const; StoragePtr getStorage(const UUID & id); + bool exists(const UUID & id) const override; + bool hasSubscription(const UUID & id) const override; + bool hasSubscription(AccessEntityType type) const override; + protected: std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; - bool existsImpl(const UUID & id) const override; - AccessEntityPtr readImpl(const UUID & id) const override; - String readNameImpl(const UUID &id) const override; - bool canInsertImpl(const AccessEntityPtr & entity) const override; - UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) override; - void removeImpl(const UUID & id) override; - void updateImpl(const UUID & id, const UpdateFunc & update_func) override; + AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional insertImpl(const 
AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; + bool removeImpl(const UUID & id, bool throw_if_not_exists) override; + bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - bool hasSubscriptionImpl(const UUID & id) const override; - bool hasSubscriptionImpl(AccessEntityType type) const override; - UUID loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const override; - UUID getIDOfLoggedUserImpl(const String & user_name) const override; + std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists) const override; private: using Storages = std::vector; diff --git a/src/Access/QuotaCache.cpp b/src/Access/QuotaCache.cpp index 566c2409205..43ab4268b0c 100644 --- a/src/Access/QuotaCache.cpp +++ b/src/Access/QuotaCache.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -22,17 +21,6 @@ namespace ErrorCodes } -namespace -{ - std::chrono::system_clock::duration randomDuration(std::chrono::seconds max) - { - auto count = std::chrono::duration_cast(max).count(); - std::uniform_int_distribution distribution{0, count - 1}; - return std::chrono::system_clock::duration(distribution(thread_local_rng)); - } -} - - void QuotaCache::QuotaInfo::setQuota(const QuotaPtr & quota_, const UUID & quota_id_) { quota = quota_; @@ -94,18 +82,21 @@ boost::shared_ptr QuotaCache::QuotaInfo::getOrBui auto it = key_to_intervals.find(key); if (it != key_to_intervals.end()) return it->second; - return rebuildIntervals(key); + return rebuildIntervals(key, std::chrono::system_clock::now()); } void QuotaCache::QuotaInfo::rebuildAllIntervals() { + if (key_to_intervals.empty()) + return; + auto current_time = std::chrono::system_clock::now(); for (const String & key : key_to_intervals | boost::adaptors::map_keys) - rebuildIntervals(key); + rebuildIntervals(key, current_time); } -boost::shared_ptr QuotaCache::QuotaInfo::rebuildIntervals(const String & key) +boost::shared_ptr QuotaCache::QuotaInfo::rebuildIntervals(const String & key, std::chrono::system_clock::time_point current_time) { auto new_intervals = boost::make_shared(); new_intervals->quota_name = quota->getName(); @@ -115,14 +106,8 @@ boost::shared_ptr QuotaCache::QuotaInfo::rebuildI intervals.reserve(quota->all_limits.size()); for (const auto & limits : quota->all_limits) { - intervals.emplace_back(); + intervals.emplace_back(limits.duration, limits.randomize_interval, current_time); auto & interval = intervals.back(); - interval.duration = limits.duration; - std::chrono::system_clock::time_point end_of_interval{}; - interval.randomize_interval = limits.randomize_interval; - if (limits.randomize_interval) - end_of_interval += randomDuration(limits.duration); - interval.end_of_interval = end_of_interval.time_since_epoch(); for (auto quota_type : collections::range(QuotaType::MAX)) { auto quota_type_i = static_cast(quota_type); diff --git a/src/Access/QuotaCache.h b/src/Access/QuotaCache.h index 77682230370..7298acad415 100644 --- a/src/Access/QuotaCache.h +++ b/src/Access/QuotaCache.h @@ -43,7 +43,7 @@ private: String 
calculateKey(const EnabledQuota & enabled_quota) const; boost::shared_ptr getOrBuildIntervals(const String & key); - boost::shared_ptr rebuildIntervals(const String & key); + boost::shared_ptr rebuildIntervals(const String & key, std::chrono::system_clock::time_point current_time); void rebuildAllIntervals(); QuotaPtr quota; diff --git a/src/Access/ReplicatedAccessStorage.cpp b/src/Access/ReplicatedAccessStorage.cpp index 93b8a5c992a..a564fa65223 100644 --- a/src/Access/ReplicatedAccessStorage.cpp +++ b/src/Access/ReplicatedAccessStorage.cpp @@ -91,7 +91,7 @@ static void retryOnZooKeeperUserError(size_t attempts, Func && function) } } -UUID ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists) +std::optional ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { const UUID id = generateRandomID(); const AccessEntityTypeInfo type_info = AccessEntityTypeInfo::get(new_entity->getType()); @@ -99,7 +99,11 @@ UUID ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & new_entity, boo LOG_DEBUG(getLogger(), "Inserting entity of type {} named {} with id {}", type_info.name, name, toString(id)); auto zookeeper = get_zookeeper(); - retryOnZooKeeperUserError(10, [&]{ insertZooKeeper(zookeeper, id, new_entity, replace_if_exists); }); + bool ok = false; + retryOnZooKeeperUserError(10, [&]{ ok = insertZooKeeper(zookeeper, id, new_entity, replace_if_exists, throw_if_exists); }); + + if (!ok) + return std::nullopt; Notifications notifications; SCOPE_EXIT({ notify(notifications); }); @@ -109,8 +113,12 @@ UUID ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & new_entity, boo } -void ReplicatedAccessStorage::insertZooKeeper( - const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists) +bool ReplicatedAccessStorage::insertZooKeeper( + const zkutil::ZooKeeperPtr & zookeeper, + const UUID & id, + const AccessEntityPtr & new_entity, + bool replace_if_exists, + bool throw_if_exists) { const String & name = new_entity->getName(); const AccessEntityType type = new_entity->getType(); @@ -131,6 +139,7 @@ void ReplicatedAccessStorage::insertZooKeeper( Coordination::Responses responses; const Coordination::Error res = zookeeper->tryMulti(ops, responses); + if (res == Coordination::Error::ZNODEEXISTS) { if (responses[0]->error == Coordination::Error::ZNODEEXISTS) @@ -166,33 +175,47 @@ void ReplicatedAccessStorage::insertZooKeeper( /// If this fails, then we'll just retry from the start. zookeeper->multi(replace_ops); + + /// Everything's fine, the new entity has been inserted instead of an existing entity. + return true; } else { - throwNameCollisionCannotInsert(type, name); + /// Couldn't insert the new entity because there is an existing entity with such name. + if (throw_if_exists) + throwNameCollisionCannotInsert(type, name); + else + return false; } } - else - { - zkutil::KeeperMultiException::check(res, ops, responses); - } + + /// If this fails, then we'll just retry from the start. + zkutil::KeeperMultiException::check(res, ops, responses); + + /// Everything's fine, the new entity has been inserted. 
+ return true; } -void ReplicatedAccessStorage::removeImpl(const UUID & id) +bool ReplicatedAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exists) { LOG_DEBUG(getLogger(), "Removing entity {}", toString(id)); auto zookeeper = get_zookeeper(); - retryOnZooKeeperUserError(10, [&] { removeZooKeeper(zookeeper, id); }); + bool ok = false; + retryOnZooKeeperUserError(10, [&] { ok = removeZooKeeper(zookeeper, id, throw_if_not_exists); }); + + if (!ok) + return false; Notifications notifications; SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; removeEntityNoLock(id, notifications); + return true; } -void ReplicatedAccessStorage::removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id) +bool ReplicatedAccessStorage::removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, bool throw_if_not_exists) { const String entity_uuid = toString(id); const String entity_path = zookeeper_path + "/uuid/" + entity_uuid; @@ -201,7 +224,13 @@ void ReplicatedAccessStorage::removeZooKeeper(const zkutil::ZooKeeperPtr & zooke Coordination::Stat entity_stat; const bool uuid_exists = zookeeper->tryGet(entity_path, entity_definition, &entity_stat); if (!uuid_exists) - throwNotFound(id); + { + /// Couldn't remove, there is no such entity. + if (throw_if_not_exists) + throwNotFound(id); + else + return false; + } const AccessEntityPtr entity = deserializeAccessEntity(entity_definition, entity_path); const AccessEntityTypeInfo type_info = AccessEntityTypeInfo::get(entity->getType()); @@ -212,26 +241,35 @@ void ReplicatedAccessStorage::removeZooKeeper(const zkutil::ZooKeeperPtr & zooke Coordination::Requests ops; ops.emplace_back(zkutil::makeRemoveRequest(entity_path, entity_stat.version)); ops.emplace_back(zkutil::makeRemoveRequest(entity_name_path, -1)); + /// If this fails, then we'll just retry from the start. zookeeper->multi(ops); + + /// Everything's fine, the entity has been removed. 
+ return true; } -void ReplicatedAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func) +bool ReplicatedAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { LOG_DEBUG(getLogger(), "Updating entity {}", toString(id)); auto zookeeper = get_zookeeper(); - retryOnZooKeeperUserError(10, [&] { updateZooKeeper(zookeeper, id, update_func); }); + bool ok = false; + retryOnZooKeeperUserError(10, [&] { ok = updateZooKeeper(zookeeper, id, update_func, throw_if_not_exists); }); + + if (!ok) + return false; Notifications notifications; SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; refreshEntityNoLock(zookeeper, id, notifications); + return true; } -void ReplicatedAccessStorage::updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func) +bool ReplicatedAccessStorage::updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { const String entity_uuid = toString(id); const String entity_path = zookeeper_path + "/uuid/" + entity_uuid; @@ -240,7 +278,12 @@ void ReplicatedAccessStorage::updateZooKeeper(const zkutil::ZooKeeperPtr & zooke Coordination::Stat stat; const bool uuid_exists = zookeeper->tryGet(entity_path, old_entity_definition, &stat); if (!uuid_exists) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return false; + } const AccessEntityPtr old_entity = deserializeAccessEntity(old_entity_definition, entity_path); const AccessEntityPtr new_entity = update_func(old_entity); @@ -276,7 +319,11 @@ void ReplicatedAccessStorage::updateZooKeeper(const zkutil::ZooKeeperPtr & zooke } else { + /// If this fails, then we'll just retry from the start. zkutil::KeeperMultiException::check(res, ops, responses); + + /// Everything's fine, the entity has been updated. + return true; } } @@ -525,30 +572,29 @@ std::vector ReplicatedAccessStorage::findAllImpl(AccessEntityType type) co } -bool ReplicatedAccessStorage::existsImpl(const UUID & id) const +bool ReplicatedAccessStorage::exists(const UUID & id) const { std::lock_guard lock{mutex}; return entries_by_id.count(id); } -AccessEntityPtr ReplicatedAccessStorage::readImpl(const UUID & id) const +AccessEntityPtr ReplicatedAccessStorage::readImpl(const UUID & id, bool throw_if_not_exists) const { std::lock_guard lock{mutex}; const auto it = entries_by_id.find(id); if (it == entries_by_id.end()) - throwNotFound(id); + { + if (throw_if_not_exists) + throwNotFound(id); + else + return nullptr; + } const Entry & entry = it->second; return entry.entity; } -String ReplicatedAccessStorage::readNameImpl(const UUID & id) const -{ - return readImpl(id)->getName(); -} - - void ReplicatedAccessStorage::prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const { const AccessEntityPtr entity = remove ? 
nullptr : entry.entity; @@ -598,7 +644,7 @@ scope_guard ReplicatedAccessStorage::subscribeForChangesImpl(const UUID & id, co } -bool ReplicatedAccessStorage::hasSubscriptionImpl(const UUID & id) const +bool ReplicatedAccessStorage::hasSubscription(const UUID & id) const { std::lock_guard lock{mutex}; const auto & it = entries_by_id.find(id); @@ -611,7 +657,7 @@ bool ReplicatedAccessStorage::hasSubscriptionImpl(const UUID & id) const } -bool ReplicatedAccessStorage::hasSubscriptionImpl(AccessEntityType type) const +bool ReplicatedAccessStorage::hasSubscription(AccessEntityType type) const { std::lock_guard lock{mutex}; const auto & handlers = handlers_by_type[static_cast(type)]; diff --git a/src/Access/ReplicatedAccessStorage.h b/src/Access/ReplicatedAccessStorage.h index 54dbfbf5b7d..8fdd24b6d54 100644 --- a/src/Access/ReplicatedAccessStorage.h +++ b/src/Access/ReplicatedAccessStorage.h @@ -32,6 +32,10 @@ public: virtual void startup(); virtual void shutdown(); + bool exists(const UUID & id) const override; + bool hasSubscription(const UUID & id) const override; + bool hasSubscription(AccessEntityType type) const override; + private: String zookeeper_path; zkutil::GetZooKeeper get_zookeeper; @@ -41,13 +45,13 @@ private: ThreadFromGlobalPool worker_thread; ConcurrentBoundedQueue refresh_queue; - UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) override; - void removeImpl(const UUID & id) override; - void updateImpl(const UUID & id, const UpdateFunc & update_func) override; + std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; + bool removeImpl(const UUID & id, bool throw_if_not_exists) override; + bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; - void insertZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists); - void removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id); - void updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func); + bool insertZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); + bool removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, bool throw_if_not_exists); + bool updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); void runWorkerThread(); void resetAfterError(); @@ -71,16 +75,11 @@ private: std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; - bool existsImpl(const UUID & id) const override; - AccessEntityPtr readImpl(const UUID & id) const override; - String readNameImpl(const UUID & id) const override; - bool canInsertImpl(const AccessEntityPtr &) const override { return true; } + AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; void prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const; scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - bool hasSubscriptionImpl(const UUID & id) const override; - bool hasSubscriptionImpl(AccessEntityType type) const override; mutable std::mutex 
mutex; std::unordered_map entries_by_id; diff --git a/src/Access/SettingsConstraints.cpp b/src/Access/SettingsConstraints.cpp index be59fc13136..9d21ba91f5d 100644 --- a/src/Access/SettingsConstraints.cpp +++ b/src/Access/SettingsConstraints.cpp @@ -15,6 +15,7 @@ namespace ErrorCodes extern const int READONLY; extern const int QUERY_IS_PROHIBITED; extern const int SETTING_CONSTRAINT_VIOLATION; + extern const int UNKNOWN_SETTING; } @@ -200,7 +201,23 @@ bool SettingsConstraints::checkImpl(const Settings & current_settings, SettingCh }; if (reaction == THROW_ON_VIOLATION) - access_control->checkSettingNameIsAllowed(setting_name); + { + try + { + access_control->checkSettingNameIsAllowed(setting_name); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_SETTING) + { + if (const auto hints = current_settings.getHints(change.name); !hints.empty()) + { + e.addMessage(fmt::format("Maybe you meant {}", toString(hints))); + } + } + throw; + } + } else if (!access_control->isSettingNameAllowed(setting_name)) return false; diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index 7b4ff2d3296..5bd2da97445 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -610,41 +610,21 @@ std::vector UsersConfigAccessStorage::findAllImpl(AccessEntityType type) c } -bool UsersConfigAccessStorage::existsImpl(const UUID & id) const +bool UsersConfigAccessStorage::exists(const UUID & id) const { return memory_storage.exists(id); } -AccessEntityPtr UsersConfigAccessStorage::readImpl(const UUID & id) const +AccessEntityPtr UsersConfigAccessStorage::readImpl(const UUID & id, bool throw_if_not_exists) const { - return memory_storage.read(id); + return memory_storage.read(id, throw_if_not_exists); } -String UsersConfigAccessStorage::readNameImpl(const UUID & id) const +std::optional UsersConfigAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const { - return memory_storage.readName(id); -} - - -UUID UsersConfigAccessStorage::insertImpl(const AccessEntityPtr & entity, bool) -{ - throwReadonlyCannotInsert(entity->getType(), entity->getName()); -} - - -void UsersConfigAccessStorage::removeImpl(const UUID & id) -{ - auto entity = read(id); - throwReadonlyCannotRemove(entity->getType(), entity->getName()); -} - - -void UsersConfigAccessStorage::updateImpl(const UUID & id, const UpdateFunc &) -{ - auto entity = read(id); - throwReadonlyCannotUpdate(entity->getType(), entity->getName()); + return memory_storage.readName(id, throw_if_not_exists); } @@ -660,13 +640,13 @@ scope_guard UsersConfigAccessStorage::subscribeForChangesImpl(AccessEntityType t } -bool UsersConfigAccessStorage::hasSubscriptionImpl(const UUID & id) const +bool UsersConfigAccessStorage::hasSubscription(const UUID & id) const { return memory_storage.hasSubscription(id); } -bool UsersConfigAccessStorage::hasSubscriptionImpl(AccessEntityType type) const +bool UsersConfigAccessStorage::hasSubscription(AccessEntityType type) const { return memory_storage.hasSubscription(type); } diff --git a/src/Access/UsersConfigAccessStorage.h b/src/Access/UsersConfigAccessStorage.h index 8f87e5ad928..470990f0706 100644 --- a/src/Access/UsersConfigAccessStorage.h +++ b/src/Access/UsersConfigAccessStorage.h @@ -27,6 +27,7 @@ public: const char * getStorageType() const override { return STORAGE_TYPE; } String getStorageParamsJSON() const override; + bool isReadOnly() const override { return true; } String getPath() const; bool isPathEqual(const String & 
path_) const; @@ -41,22 +42,19 @@ public: void startPeriodicReloading(); void stopPeriodicReloading(); + bool exists(const UUID & id) const override; + bool hasSubscription(const UUID & id) const override; + bool hasSubscription(AccessEntityType type) const override; + private: void parseFromConfig(const Poco::Util::AbstractConfiguration & config); std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; - bool existsImpl(const UUID & id) const override; - AccessEntityPtr readImpl(const UUID & id) const override; - String readNameImpl(const UUID & id) const override; - bool canInsertImpl(const AccessEntityPtr &) const override { return false; } - UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) override; - void removeImpl(const UUID & id) override; - void updateImpl(const UUID & id, const UpdateFunc & update_func) override; + AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - bool hasSubscriptionImpl(const UUID & id) const override; - bool hasSubscriptionImpl(AccessEntityType type) const override; MemoryAccessStorage memory_storage; CheckSettingNameFunction check_setting_name_function; diff --git a/src/AggregateFunctions/AggregateFunctionAvg.h b/src/AggregateFunctions/AggregateFunctionAvg.h index 8ca0ae1dac2..eb061337753 100644 --- a/src/AggregateFunctions/AggregateFunctionAvg.h +++ b/src/AggregateFunctions/AggregateFunctionAvg.h @@ -218,9 +218,9 @@ public: using ColVecType = ColumnVectorOrDecimal; - void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const final + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const final { - this->data(place).numerator += static_cast(*columns[0]).getData()[row_num]; + increment(place, static_cast(*columns[0]).getData()[row_num]); ++this->data(place).denominator; } @@ -240,7 +240,7 @@ public: sum_data.addMany(column.getData().data(), batch_size); this->data(place).denominator += batch_size; } - this->data(place).numerator += sum_data.sum; + increment(place, sum_data.sum); } void addBatchSinglePlaceNotNull( @@ -270,7 +270,7 @@ public: sum_data.addManyNotNull(column.getData().data(), null_map, batch_size); this->data(place).denominator += batch_size - countBytesInFilter(null_map, batch_size); } - this->data(place).numerator += sum_data.sum; + increment(place, sum_data.sum); } String getName() const override { return "avg"; } @@ -298,5 +298,10 @@ public: #endif +private: + void NO_SANITIZE_UNDEFINED increment(AggregateDataPtr __restrict place, Numerator inc) const + { + this->data(place).numerator += inc; + } }; } diff --git a/src/AggregateFunctions/AggregateFunctionForEach.h b/src/AggregateFunctions/AggregateFunctionForEach.h index 0de6272d23e..064b7b00c86 100644 --- a/src/AggregateFunctions/AggregateFunctionForEach.h +++ b/src/AggregateFunctions/AggregateFunctionForEach.h @@ -90,7 +90,7 @@ private: throw; } - for (i = 0; i < old_size; i++) + for (i = 0; i < old_size; ++i) { nested_func->merge(&new_state[i * nested_size_of_data], &old_state[i * nested_size_of_data], diff --git 
a/src/AggregateFunctions/AggregateFunctionGroupBitmap.h b/src/AggregateFunctions/AggregateFunctionGroupBitmap.h index 77dcede4c20..94d64d47b51 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupBitmap.h +++ b/src/AggregateFunctions/AggregateFunctionGroupBitmap.h @@ -54,6 +54,8 @@ public: template class AggregateFunctionBitmapL2 final : public IAggregateFunctionDataHelper> { +private: + static constexpr auto STATE_VERSION_1_MIN_REVISION = 54455; public: AggregateFunctionBitmapL2(const DataTypePtr & type) : IAggregateFunctionDataHelper>({type}, {}) @@ -105,9 +107,38 @@ public: } } - void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional /* version */) const override { this->data(place).rbs.write(buf); } + bool isVersioned() const override { return true; } - void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional /* version */, Arena *) const override { this->data(place).rbs.read(buf); } + size_t getDefaultVersion() const override { return 1; } + + size_t getVersionFromRevision(size_t revision) const override + { + if (revision >= STATE_VERSION_1_MIN_REVISION) + return 1; + else + return 0; + } + + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional version) const override + { + if (!version) + version = getDefaultVersion(); + + if (*version >= 1) + DB::writeBoolText(this->data(place).init, buf); + + this->data(place).rbs.write(buf); + } + + void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional version, Arena *) const override + { + if (!version) + version = getDefaultVersion(); + + if (*version >= 1) + DB::readBoolText(this->data(place).init, buf); + this->data(place).rbs.read(buf); + } void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { diff --git a/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h index 878cbc3219f..eee91904b9b 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h +++ b/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h @@ -421,6 +421,9 @@ public: */ UInt8 rb_contains(UInt64 x) const { + if (!std::is_same_v && x > rb_max()) + return 0; + if (isSmall()) return small.find(x) != small.end(); else @@ -432,6 +435,9 @@ public: */ void rb_remove(UInt64 x) { + if (!std::is_same_v && x > rb_max()) + return; + if (isSmall()) toLarge(); diff --git a/src/AggregateFunctions/AggregateFunctionHistogram.h b/src/AggregateFunctions/AggregateFunctionHistogram.h index 665e505aa4e..b858c6b628c 100644 --- a/src/AggregateFunctions/AggregateFunctionHistogram.h +++ b/src/AggregateFunctions/AggregateFunctionHistogram.h @@ -271,7 +271,7 @@ public: { lower_bound = std::min(lower_bound, other.lower_bound); upper_bound = std::max(upper_bound, other.upper_bound); - for (size_t i = 0; i < other.size; i++) + for (size_t i = 0; i < other.size; ++i) add(other.points[i].mean, other.points[i].weight, max_bins); } diff --git a/src/AggregateFunctions/AggregateFunctionIf.cpp b/src/AggregateFunctions/AggregateFunctionIf.cpp index 4ac6a2dce21..d752900c018 100644 --- a/src/AggregateFunctions/AggregateFunctionIf.cpp +++ b/src/AggregateFunctions/AggregateFunctionIf.cpp @@ -56,7 +56,7 @@ static bool ALWAYS_INLINE inline is_all_zeros(const UInt8 * flags, size_t size) i += 8; } - for (; i < size; i++) + for (; i < size; ++i) if (flags[i]) return false; diff --git a/src/AggregateFunctions/AggregateFunctionTopK.cpp 
b/src/AggregateFunctions/AggregateFunctionTopK.cpp index c3b80cae080..801f3d5e28d 100644 --- a/src/AggregateFunctions/AggregateFunctionTopK.cpp +++ b/src/AggregateFunctions/AggregateFunctionTopK.cpp @@ -7,18 +7,20 @@ #include -#define TOP_K_MAX_SIZE 0xFFFFFF +static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; namespace DB { + struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ARGUMENT_OUT_OF_BOUND; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int LOGICAL_ERROR; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } @@ -42,19 +44,22 @@ class AggregateFunctionTopKDateTime : public AggregateFunctionTopK -static IAggregateFunction * createWithExtraTypes(const DataTypePtr & argument_type, UInt64 threshold, UInt64 load_factor, const Array & params) +static IAggregateFunction * createWithExtraTypes(const DataTypes & argument_types, UInt64 threshold, UInt64 load_factor, const Array & params) { - WhichDataType which(argument_type); + if (argument_types.empty()) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Got empty arguments list"); + + WhichDataType which(argument_types[0]); if (which.idx == TypeIndex::Date) - return new AggregateFunctionTopKDate(threshold, load_factor, {argument_type}, params); + return new AggregateFunctionTopKDate(threshold, load_factor, argument_types, params); if (which.idx == TypeIndex::DateTime) - return new AggregateFunctionTopKDateTime(threshold, load_factor, {argument_type}, params); + return new AggregateFunctionTopKDateTime(threshold, load_factor, argument_types, params); /// Check that we can use plain version of AggregateFunctionTopKGeneric - if (argument_type->isValueUnambiguouslyRepresentedInContiguousMemoryRegion()) - return new AggregateFunctionTopKGeneric(threshold, load_factor, argument_type, params); + if (argument_types[0]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion()) + return new AggregateFunctionTopKGeneric(threshold, load_factor, argument_types, params); else - return new AggregateFunctionTopKGeneric(threshold, load_factor, argument_type, params); + return new AggregateFunctionTopKGeneric(threshold, load_factor, argument_types, params); } @@ -78,40 +83,37 @@ AggregateFunctionPtr createAggregateFunctionTopK(const std::string & name, const if (!params.empty()) { if (params.size() > 2) - throw Exception("Aggregate function " + name + " requires two parameters or less.", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Aggregate function '{}' requires two parameters or less", name); - UInt64 k = applyVisitor(FieldVisitorConvertToNumber(), params[0]); if (params.size() == 2) { load_factor = applyVisitor(FieldVisitorConvertToNumber(), params[1]); if (load_factor < 1) - throw Exception("Too small parameter 'load_factor' for aggregate function " + name + ". Minimum: 1", - ErrorCodes::ARGUMENT_OUT_OF_BOUND); + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "Too small parameter 'load_factor' for aggregate function '{}' (got {}, minimum is 1)", name, load_factor); } - if (k > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || k * load_factor > TOP_K_MAX_SIZE) - throw Exception("Too large parameter(s) for aggregate function " + name + ". 
Maximum: " + toString(TOP_K_MAX_SIZE), - ErrorCodes::ARGUMENT_OUT_OF_BOUND); + threshold = applyVisitor(FieldVisitorConvertToNumber(), params[0]); - if (k == 0) - throw Exception("Parameter 0 is illegal for aggregate function " + name, - ErrorCodes::ARGUMENT_OUT_OF_BOUND); + if (threshold > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || threshold * load_factor > TOP_K_MAX_SIZE) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "Too large parameter(s) for aggregate function '{}' (maximum is {})", name, toString(TOP_K_MAX_SIZE)); - threshold = k; + if (threshold == 0) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Parameter 0 is illegal for aggregate function '{}'", name); } AggregateFunctionPtr res(createWithNumericType( *argument_types[0], threshold, load_factor, argument_types, params)); if (!res) - res = AggregateFunctionPtr(createWithExtraTypes(argument_types[0], threshold, load_factor, params)); + res = AggregateFunctionPtr(createWithExtraTypes(argument_types, threshold, load_factor, params)); if (!res) - throw Exception("Illegal type " + argument_types[0]->getName() + - " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument for aggregate function '{}'", argument_types[0]->getName(), name); return res; } diff --git a/src/AggregateFunctions/AggregateFunctionTopK.h b/src/AggregateFunctions/AggregateFunctionTopK.h index eb84288a1ae..98774254695 100644 --- a/src/AggregateFunctions/AggregateFunctionTopK.h +++ b/src/AggregateFunctions/AggregateFunctionTopK.h @@ -132,8 +132,8 @@ private: public: AggregateFunctionTopKGeneric( - UInt64 threshold_, UInt64 load_factor, const DataTypePtr & input_data_type_, const Array & params) - : IAggregateFunctionDataHelper>({input_data_type_}, params) + UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params) + : IAggregateFunctionDataHelper>(argument_types_, params) , threshold(threshold_), reserved(load_factor * threshold), input_data_type(this->argument_types[0]) {} String getName() const override { return is_weighted ? "topKWeighted" : "topK"; } diff --git a/src/AggregateFunctions/IAggregateFunction.h b/src/AggregateFunctions/IAggregateFunction.h index 1a27f036af7..fd2100cc334 100644 --- a/src/AggregateFunctions/IAggregateFunction.h +++ b/src/AggregateFunctions/IAggregateFunction.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -181,6 +182,13 @@ public: Arena * arena, ssize_t if_argument_pos = -1) const = 0; + /// The version of "addBatch", that handle sparse columns as arguments. + virtual void addBatchSparse( + AggregateDataPtr * places, + size_t place_offset, + const IColumn ** columns, + Arena * arena) const = 0; + virtual void mergeBatch( size_t batch_size, AggregateDataPtr * places, @@ -193,6 +201,10 @@ public: virtual void addBatchSinglePlace( size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const = 0; + /// The version of "addBatchSinglePlace", that handle sparse columns as arguments. + virtual void addBatchSparseSinglePlace( + AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0; + /** The same for single place when need to aggregate only filtered data. 
* Instead of using an if-column, the condition is combined inside the null_map */ @@ -367,6 +379,22 @@ public: } } + void addBatchSparse( + AggregateDataPtr * places, + size_t place_offset, + const IColumn ** columns, + Arena * arena) const override + { + const auto & column_sparse = assert_cast(*columns[0]); + const auto * values = &column_sparse.getValuesColumn(); + size_t batch_size = column_sparse.size(); + auto offset_it = column_sparse.begin(); + + for (size_t i = 0; i < batch_size; ++i, ++offset_it) + static_cast(this)->add(places[offset_it.getCurrentRow()] + place_offset, + &values, offset_it.getValueIndex(), arena); + } + void mergeBatch( size_t batch_size, AggregateDataPtr * places, @@ -398,6 +426,19 @@ public: } } + void addBatchSparseSinglePlace( + AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override + { + /// TODO: add values and defaults separately if order of adding isn't important. + const auto & column_sparse = assert_cast(*columns[0]); + const auto * values = &column_sparse.getValuesColumn(); + size_t batch_size = column_sparse.size(); + auto offset_it = column_sparse.begin(); + + for (size_t i = 0; i < batch_size; ++i, ++offset_it) + static_cast(this)->add(place, &values, offset_it.getValueIndex(), arena); + } + void addBatchSinglePlaceNotNull( size_t batch_size, AggregateDataPtr place, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 908d931afb7..dff659f9f76 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -106,6 +106,10 @@ if (USE_AWS_S3) add_headers_and_sources(dbms Disks/S3) endif() +if (USE_AZURE_BLOB_STORAGE) + add_headers_and_sources(dbms Disks/AzureBlobStorage) +endif() + if (USE_HDFS) add_headers_and_sources(dbms Storages/HDFS) add_headers_and_sources(dbms Disks/HDFS) @@ -450,6 +454,11 @@ if (USE_AWS_S3) target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${AWS_S3_INCLUDE_DIR}) endif() +if (USE_AZURE_BLOB_STORAGE) + target_link_libraries (clickhouse_common_io PUBLIC ${AZURE_BLOB_STORAGE_LIBRARY}) + target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${AZURE_SDK_INCLUDES}) +endif() + if (USE_S2_GEOMETRY) dbms_target_link_libraries (PUBLIC ${S2_GEOMETRY_LIBRARY}) dbms_target_include_directories (SYSTEM BEFORE PUBLIC ${S2_GEOMETRY_INCLUDE_DIR}) @@ -517,6 +526,14 @@ if (USE_BZIP2) target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${BZIP2_INCLUDE_DIR}) endif() +if(USE_SIMDJSON) + dbms_target_link_libraries(PRIVATE simdjson) +endif() + +if(USE_RAPIDJSON) + dbms_target_include_directories(SYSTEM PRIVATE ${RAPIDJSON_INCLUDE_DIR}) +endif() + dbms_target_link_libraries(PUBLIC consistent-hashing) include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake") diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index b97d8342186..75e0588f786 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -4,10 +4,12 @@ #include #include #include +#include +#include #include -#include -#include +#include +#include #include #include #include "Common/Exception.h" @@ -52,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -105,6 +108,99 @@ namespace ProfileEvents namespace DB { +static void incrementProfileEventsBlock(Block & dst, const Block & src) +{ + if (!dst) + { + dst = src; + return; + } + + assertBlocksHaveEqualStructure(src, dst, "ProfileEvents"); + + std::unordered_map name_pos; + for (size_t i = 0; i < dst.columns(); ++i) + name_pos[dst.getByPosition(i).name] = i; + + size_t dst_rows = dst.rows(); + MutableColumns 
mutable_columns = dst.mutateColumns(); + + auto & dst_column_host_name = typeid_cast(*mutable_columns[name_pos["host_name"]]); + auto & dst_array_current_time = typeid_cast(*mutable_columns[name_pos["current_time"]]).getData(); + auto & dst_array_thread_id = typeid_cast(*mutable_columns[name_pos["thread_id"]]).getData(); + auto & dst_array_type = typeid_cast(*mutable_columns[name_pos["type"]]).getData(); + auto & dst_column_name = typeid_cast(*mutable_columns[name_pos["name"]]); + auto & dst_array_value = typeid_cast(*mutable_columns[name_pos["value"]]).getData(); + + const auto & src_column_host_name = typeid_cast(*src.getByName("host_name").column); + const auto & src_array_current_time = typeid_cast(*src.getByName("current_time").column).getData(); + const auto & src_array_thread_id = typeid_cast(*src.getByName("thread_id").column).getData(); + const auto & src_column_name = typeid_cast(*src.getByName("name").column); + const auto & src_array_value = typeid_cast(*src.getByName("value").column).getData(); + + struct Id + { + StringRef name; + StringRef host_name; + UInt64 thread_id; + + bool operator<(const Id & rhs) const + { + return std::tie(name, host_name, thread_id) + < std::tie(rhs.name, rhs.host_name, rhs.thread_id); + } + }; + std::map rows_by_name; + for (size_t src_row = 0; src_row < src.rows(); ++src_row) + { + Id id{ + src_column_name.getDataAt(src_row), + src_column_host_name.getDataAt(src_row), + src_array_thread_id[src_row], + }; + rows_by_name[id] = src_row; + } + + /// Merge src into dst. + for (size_t dst_row = 0; dst_row < dst_rows; ++dst_row) + { + Id id{ + dst_column_name.getDataAt(dst_row), + dst_column_host_name.getDataAt(dst_row), + dst_array_thread_id[dst_row], + }; + + if (auto it = rows_by_name.find(id); it != rows_by_name.end()) + { + size_t src_row = it->second; + dst_array_current_time[dst_row] = src_array_current_time[src_row]; + + switch (dst_array_type[dst_row]) + { + case ProfileEvents::Type::INCREMENT: + dst_array_value[dst_row] += src_array_value[src_row]; + break; + case ProfileEvents::Type::GAUGE: + dst_array_value[dst_row] = src_array_value[src_row]; + break; + } + + rows_by_name.erase(it); + } + } + + /// Copy rows from src that dst does not contains. 
+ for (const auto & [id, pos] : rows_by_name) + { + for (size_t col = 0; col < src.columns(); ++col) + { + mutable_columns[col]->insert((*src.getByPosition(col).column)[pos]); + } + } + + dst.setColumns(std::move(mutable_columns)); +} + std::atomic_flag exit_on_signal = ATOMIC_FLAG_INIT; @@ -465,7 +561,7 @@ void ClientBase::processTextAsSingleQuery(const String & full_query) try { - processParsedSingleQuery(full_query, query_to_execute, parsed_query); + processParsedSingleQuery(full_query, query_to_execute, parsed_query, echo_queries); } catch (Exception & e) { @@ -753,7 +849,7 @@ void ClientBase::onProfileEvents(Block & block) } else { - profile_events.last_block = block; + incrementProfileEventsBlock(profile_events.last_block, block); } } profile_events.watch.restart(); @@ -1414,9 +1510,6 @@ void ClientBase::runInteractive() highlight_callback = highlight; ReplxxLineReader lr(*suggest, history_file, config().has("multiline"), query_extenders, query_delimiters, highlight_callback); - -#elif defined(USE_READLINE) && USE_READLINE - ReadlineLineReader lr(*suggest, history_file, config().has("multiline"), query_extenders, query_delimiters); #else LineReader lr(history_file, config().has("multiline"), query_extenders, query_delimiters); #endif @@ -1494,17 +1587,14 @@ void ClientBase::runNonInteractive() { auto process_multi_query_from_file = [&](const String & file) { - auto text = getQueryTextPrefix(); String queries_from_file; ReadBufferFromFile in(file); readStringUntilEOF(queries_from_file, in); - text += queries_from_file; - return executeMultiQuery(text); + return executeMultiQuery(queries_from_file); }; - /// Read all queries into `text`. for (const auto & queries_file : queries_files) { for (const auto & interleave_file : interleave_queries_files) @@ -1519,9 +1609,6 @@ void ClientBase::runNonInteractive() } String text; - if (is_multiquery) - text = getQueryTextPrefix(); - if (config().has("query")) { text += config().getRawString("query"); /// Poco configuration should not process substitutions in form of ${...} inside query. @@ -1644,7 +1731,13 @@ void ClientBase::parseAndCheckOptions(OptionsDescription & options_description, /// Check unrecognized options without positional options. auto unrecognized_options = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::exclude_positional); if (!unrecognized_options.empty()) + { + auto hints = this->getHints(unrecognized_options[0]); + if (!hints.empty()) + throw Exception(ErrorCodes::UNRECOGNIZED_ARGUMENTS, "Unrecognized option '{}'. Maybe you meant {}", unrecognized_options[0], toString(hints)); + throw Exception(ErrorCodes::UNRECOGNIZED_ARGUMENTS, "Unrecognized option '{}'", unrecognized_options[0]); + } /// Check positional options (options after ' -- ', ex: clickhouse-client -- ). 
unrecognized_options = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::include_positional); @@ -1722,6 +1815,25 @@ void ClientBase::init(int argc, char ** argv) ; addOptions(options_description); + + auto getter = [](const auto & op) + { + String op_long_name = op->long_name(); + return "--" + String(op_long_name); + }; + + if (options_description.main_description) + { + const auto & main_options = options_description.main_description->options(); + std::transform(main_options.begin(), main_options.end(), std::back_inserter(cmd_options), getter); + } + + if (options_description.external_description) + { + const auto & external_options = options_description.external_description->options(); + std::transform(external_options.begin(), external_options.end(), std::back_inserter(cmd_options), getter); + } + parseAndCheckOptions(options_description, options, common_arguments); po::notify(options); diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index bad1395e699..1926df5afea 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -1,5 +1,6 @@ #pragma once +#include "Common/NamePrompter.h" #include #include #include @@ -37,7 +38,7 @@ void interruptSignalHandler(int signum); class InternalTextLogs; -class ClientBase : public Poco::Util::Application +class ClientBase : public Poco::Util::Application, public IHints<2, ClientBase> { public: @@ -48,6 +49,8 @@ public: void init(int argc, char ** argv); + std::vector getAllRegisteredNames() const override { return cmd_options; } + protected: void runInteractive(); void runNonInteractive(); @@ -78,9 +81,6 @@ protected: String & query_to_execute, ASTPtr & parsed_query, const String & all_queries_text, std::optional & current_exception); - /// For non-interactive multi-query mode get queries text prefix. - virtual String getQueryTextPrefix() { return ""; } - static void clearTerminal(); void showClientVersion(); @@ -100,9 +100,10 @@ protected: const std::vector & external_tables_arguments) = 0; virtual void processConfig() = 0; -private: +protected: bool processQueryText(const String & text); +private: void receiveResult(ASTPtr parsed_query); bool receiveAndProcessPacket(ASTPtr parsed_query, bool cancelled); void receiveLogs(ASTPtr parsed_query); @@ -147,6 +148,7 @@ protected: std::vector queries_files; /// If not empty, queries will be read from these files std::vector interleave_queries_files; /// If not empty, run queries from these files before processing every file from 'queries_files'. + std::vector cmd_options; bool stdin_is_a_tty = false; /// stdin is a terminal. bool stdout_is_a_tty = false; /// stdout is a terminal. 
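
The ClientBase.cpp change above stops overwriting profile_events.last_block and instead accumulates incoming ProfileEvents packets via incrementProfileEventsBlock: rows are matched by (event name, host name, thread id), INCREMENT-type values are summed, GAUGE-type values take the latest observation, and unmatched source rows are appended. The sketch below illustrates only that merge rule with plain standard-library types; EventRow, EventType and mergeProfileEvents are illustrative stand-ins, not ClickHouse's Block/Column API.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>

enum class EventType { Increment, Gauge };

struct EventRow
{
    std::string name;
    std::string host_name;
    uint64_t thread_id = 0;
    EventType type = EventType::Increment;
    uint64_t value = 0;
};

using EventKey = std::tuple<std::string, std::string, uint64_t>;

/// Merge `src` into `dst`: counters accumulate, gauges keep the latest value,
/// rows not present in `dst` are appended.
void mergeProfileEvents(std::vector<EventRow> & dst, const std::vector<EventRow> & src)
{
    std::map<EventKey, size_t> dst_index;
    for (size_t i = 0; i < dst.size(); ++i)
        dst_index[{dst[i].name, dst[i].host_name, dst[i].thread_id}] = i;

    for (const auto & row : src)
    {
        auto it = dst_index.find({row.name, row.host_name, row.thread_id});
        if (it == dst_index.end())
        {
            /// Row not seen before: copy it and remember its position.
            dst.push_back(row);
            dst_index[{row.name, row.host_name, row.thread_id}] = dst.size() - 1;
            continue;
        }

        EventRow & existing = dst[it->second];
        if (row.type == EventType::Increment)
            existing.value += row.value;   /// counters are summed
        else
            existing.value = row.value;    /// gauges are overwritten
    }
}

int main()
{
    std::vector<EventRow> dst{{"SelectedRows", "host1", 1, EventType::Increment, 10}};
    std::vector<EventRow> src{{"SelectedRows", "host1", 1, EventType::Increment, 5},
                              {"MemoryTracking", "host1", 1, EventType::Gauge, 42}};
    mergeProfileEvents(dst, src);
    for (const auto & row : dst)
        std::cout << row.name << " = " << row.value << '\n';   /// prints 15 and 42
    return 0;
}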
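
The parseAndCheckOptions and SettingsConstraints changes above attach a "Maybe you meant ..." hint to UNRECOGNIZED_ARGUMENTS and UNKNOWN_SETTING errors by matching the unknown name against a list of registered names (ClientBase now collects its command-line options into cmd_options and exposes them through getAllRegisteredNames()). The sketch below shows one common way such hints can be computed, using Levenshtein edit distance over the candidate list; the helper names and the distance threshold are assumptions for illustration, not the actual IHints/NamePrompter implementation.

#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

/// Classic two-row dynamic-programming Levenshtein distance.
static size_t editDistance(const std::string & a, const std::string & b)
{
    std::vector<size_t> prev(b.size() + 1), cur(b.size() + 1);
    for (size_t j = 0; j <= b.size(); ++j)
        prev[j] = j;
    for (size_t i = 1; i <= a.size(); ++i)
    {
        cur[0] = i;
        for (size_t j = 1; j <= b.size(); ++j)
        {
            size_t substitution = prev[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1);
            cur[j] = std::min({prev[j] + 1, cur[j - 1] + 1, substitution});
        }
        std::swap(prev, cur);
    }
    return prev[b.size()];
}

/// Return known names within a small edit-distance budget, closest first.
std::vector<std::string> getHints(const std::string & wrong, const std::vector<std::string> & known, size_t max_distance = 2)
{
    std::vector<std::pair<size_t, std::string>> scored;
    for (const auto & name : known)
    {
        size_t distance = editDistance(wrong, name);
        if (distance <= max_distance)
            scored.emplace_back(distance, name);
    }
    std::sort(scored.begin(), scored.end());

    std::vector<std::string> result;
    for (const auto & candidate : scored)
        result.push_back(candidate.second);
    return result;
}

int main()
{
    std::vector<std::string> options{"--query", "--multiquery", "--database", "--format"};
    for (const auto & hint : getHints("--qurey", options))
        std::cout << "Maybe you meant " << hint << '\n';
    return 0;
}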
diff --git a/src/Client/ClientBaseHelpers.cpp b/src/Client/ClientBaseHelpers.cpp index e1c1481c5b4..3a5d4f4cf33 100644 --- a/src/Client/ClientBaseHelpers.cpp +++ b/src/Client/ClientBaseHelpers.cpp @@ -1,8 +1,8 @@ #include "ClientBaseHelpers.h" -#include -#include +#include +#include #include #include diff --git a/src/Client/IConnections.cpp b/src/Client/IConnections.cpp index dc57cae61a4..1ab7ba6e5d8 100644 --- a/src/Client/IConnections.cpp +++ b/src/Client/IConnections.cpp @@ -25,7 +25,12 @@ struct PocoSocketWrapper : public Poco::Net::SocketImpl void IConnections::DrainCallback::operator()(int fd, Poco::Timespan, const std::string fd_description) const { if (!PocoSocketWrapper(fd).poll(drain_timeout, Poco::Net::Socket::SELECT_READ)) - throw Exception(ErrorCodes::SOCKET_TIMEOUT, "Read timeout while draining from {}", fd_description); + { + throw Exception(ErrorCodes::SOCKET_TIMEOUT, + "Read timeout ({} ms) while draining from {}", + drain_timeout.totalMilliseconds(), + fd_description); + } } } diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp index 9eaa9ce883a..8ad853950b2 100644 --- a/src/Client/LocalConnection.cpp +++ b/src/Client/LocalConnection.cpp @@ -214,15 +214,15 @@ bool LocalConnection::poll(size_t) if (next_packet_type) return true; - if (send_progress && (state->after_send_progress.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay)) - { - state->after_send_progress.restart(); - next_packet_type = Protocol::Server::Progress; - return true; - } - if (!state->is_finished) { + if (send_progress && (state->after_send_progress.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay)) + { + state->after_send_progress.restart(); + next_packet_type = Protocol::Server::Progress; + return true; + } + try { pollImpl(); @@ -282,6 +282,18 @@ bool LocalConnection::poll(size_t) } } + if (state->is_finished && !state->sent_profile_info) + { + state->sent_profile_info = true; + + if (state->executor) + { + next_packet_type = Protocol::Server::ProfileInfo; + state->profile_info = state->executor->getProfileInfo(); + return true; + } + } + if (state->is_finished) { finishQuery(); @@ -349,6 +361,16 @@ Packet LocalConnection::receivePacket() next_packet_type.reset(); break; } + case Protocol::Server::ProfileInfo: + { + if (state->profile_info) + { + packet.profile_info = std::move(*state->profile_info); + state->profile_info.reset(); + } + next_packet_type.reset(); + break; + } case Protocol::Server::TableColumns: { if (state->columns_description) diff --git a/src/Client/LocalConnection.h b/src/Client/LocalConnection.h index fbd054506e7..92c2af30c80 100644 --- a/src/Client/LocalConnection.h +++ b/src/Client/LocalConnection.h @@ -35,6 +35,7 @@ struct LocalQueryState /// Current block to be sent next. std::optional block; std::optional columns_description; + std::optional profile_info; /// Is request cancelled bool is_cancelled = false; @@ -43,6 +44,7 @@ struct LocalQueryState bool sent_totals = false; bool sent_extremes = false; bool sent_progress = false; + bool sent_profile_info = false; /// To output progress, the difference after the previous sending of progress. 
Progress progress; diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index c3000443a9c..37a372dfb45 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -395,17 +395,17 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead read_list.push_back(*connection->socket); } + auto timeout = is_draining ? drain_timeout : receive_timeout; int n = Poco::Net::Socket::select( read_list, write_list, except_list, - is_draining ? drain_timeout : receive_timeout); + timeout); /// We treat any error as timeout for simplicity. /// And we also check if read_list is still empty just in case. if (n <= 0 || read_list.empty()) { - auto err_msg = fmt::format("Timeout exceeded while reading from {}", dumpAddressesUnlocked()); for (ReplicaState & state : replica_states) { Connection * connection = state.connection; @@ -415,7 +415,10 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead invalidateReplica(state); } } - throw Exception(err_msg, ErrorCodes::TIMEOUT_EXCEEDED); + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Timeout ({} ms) exceeded while reading from {}", + timeout.totalMilliseconds(), + dumpAddressesUnlocked()); } } diff --git a/src/Columns/ColumnAggregateFunction.h b/src/Columns/ColumnAggregateFunction.h index b5efff928bb..cfc4f74ac26 100644 --- a/src/Columns/ColumnAggregateFunction.h +++ b/src/Columns/ColumnAggregateFunction.h @@ -133,6 +133,11 @@ public: void get(size_t n, Field & res) const override; + bool isDefaultAt(size_t) const override + { + throw Exception("Method isDefaultAt is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED); + } + StringRef getDataAt(size_t n) const override; void insertData(const char * pos, size_t length) override; @@ -208,6 +213,16 @@ public: throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED); } + double getRatioOfDefaultRows(double) const override + { + throw Exception("Method getRatioOfDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED); + } + + void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override + { + throw Exception("Method getIndicesOfNonDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED); + } + void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override; diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 5e40b89cc7e..929c0153a0a 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -182,6 +182,13 @@ StringRef ColumnArray::getDataAt(size_t n) const } +bool ColumnArray::isDefaultAt(size_t n) const +{ + const auto & offsets_data = getOffsets(); + return offsets_data[n] == offsets_data[static_cast(n) - 1]; +} + + void ColumnArray::insertData(const char * pos, size_t length) { /** Similarly - only for arrays of fixed length values. 
@@ -576,7 +583,8 @@ void ColumnArray::expand(const IColumn::Filter & mask, bool inverted) } if (from != -1) - throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);} + throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR); +} template ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const @@ -868,6 +876,16 @@ ColumnPtr ColumnArray::compress() const }); } +double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const +{ + return getRatioOfDefaultRowsImpl(sample_ratio); +} + +void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const { diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index 48c54eca319..4d15e123770 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -60,6 +60,7 @@ public: Field operator[](size_t n) const override; void get(size_t n, Field & res) const override; StringRef getDataAt(size_t n) const override; + bool isDefaultAt(size_t n) const override; void insertData(const char * pos, size_t length) override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; const char * deserializeAndInsertFromArena(const char * pos) override; @@ -143,6 +144,10 @@ public: return false; } + double getRatioOfDefaultRows(double sample_ratio) const override; + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override; + bool isCollationSupported() const override { return getData().isCollationSupported(); } private: diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index 92bdca7cd72..e7bbed53b8d 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -82,6 +82,7 @@ public: Field operator[](size_t) const override { throwMustBeDecompressed(); } void get(size_t, Field &) const override { throwMustBeDecompressed(); } StringRef getDataAt(size_t) const override { throwMustBeDecompressed(); } + bool isDefaultAt(size_t) const override { throwMustBeDecompressed(); } void insert(const Field &) override { throwMustBeDecompressed(); } void insertRangeFrom(const IColumn &, size_t, size_t) override { throwMustBeDecompressed(); } void insertData(const char *, size_t) override { throwMustBeDecompressed(); } @@ -113,6 +114,8 @@ public: void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); } void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); } size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); } + double getRatioOfDefaultRows(double) const override { throwMustBeDecompressed(); } + void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); } protected: size_t rows; diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index 1faf2a999b2..1ddc8789e7d 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -115,6 +116,11 @@ public: return data->getFloat32(0); } + bool isDefaultAt(size_t) const override + { + return data->isDefaultAt(0); + } + bool isNullAt(size_t) const override { return data->isNullAt(0); @@ -239,6 +245,27 @@ public: return false; } + double getRatioOfDefaultRows(double) const override + { + return data->isDefaultAt(0) ? 
1.0 : 0.0; + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + if (!data->isDefaultAt(0)) + { + size_t to = limit && from + limit < size() ? from + limit : size(); + indices.reserve(indices.size() + to - from); + for (size_t i = from; i < to; ++i) + indices.push_back(i); + } + } + + SerializationInfoPtr getSerializationInfo() const override + { + return data->getSerializationInfo(); + } + bool isNullable() const override { return isColumnNullable(*data); } bool onlyNull() const override { return data->isNullAt(0); } bool isNumeric() const override { return data->isNumeric(); } diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index 70e2b4a6d96..99085f0f976 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -331,7 +331,8 @@ void ColumnDecimal::gather(ColumnGathererStream & gatherer) template ColumnPtr ColumnDecimal::compress() const { - size_t source_size = data.size() * sizeof(T); + const size_t data_size = data.size(); + const size_t source_size = data_size * sizeof(T); /// Don't compress small blocks. if (source_size < 4096) /// A wild guess. @@ -342,8 +343,9 @@ ColumnPtr ColumnDecimal::compress() const if (!compressed) return ColumnCompressed::wrap(this->getPtr()); - return ColumnCompressed::create(data.size(), compressed->size(), - [compressed = std::move(compressed), column_size = data.size(), scale = this->scale] + const size_t compressed_size = compressed->size(); + return ColumnCompressed::create(data_size, compressed_size, + [compressed = std::move(compressed), column_size = data_size, scale = this->scale] { auto res = ColumnDecimal::create(column_size, scale); ColumnCompressed::decompressBuffer( diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h index ee13c7e7b2b..18d4526e0f3 100644 --- a/src/Columns/ColumnDecimal.h +++ b/src/Columns/ColumnDecimal.h @@ -177,8 +177,17 @@ public: return false; } - ColumnPtr compress() const override; + double getRatioOfDefaultRows(double sample_ratio) const override + { + return this->template getRatioOfDefaultRowsImpl(sample_ratio); + } + void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override + { + return this->template getIndicesOfNonDefaultRowsImpl(indices, from, limit); + } + + ColumnPtr compress() const override; void insertValue(const T value) { data.push_back(value); } Container & getData() { return data; } diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index 1eb2d4d5b1f..0828f8ebd89 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -51,6 +51,12 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const return new_col_holder; } +bool ColumnFixedString::isDefaultAt(size_t index) const +{ + assert(index < size()); + return memoryIsZero(chars.data() + index * n, n); +} + void ColumnFixedString::insert(const Field & x) { const String & s = DB::get(x); @@ -409,9 +415,9 @@ ColumnPtr ColumnFixedString::compress() const if (!compressed) return ColumnCompressed::wrap(this->getPtr()); - size_t column_size = size(); - - return ColumnCompressed::create(column_size, compressed->size(), + const size_t column_size = size(); + const size_t compressed_size = compressed->size(); + return ColumnCompressed::create(column_size, compressed_size, [compressed = std::move(compressed), column_size, n = n] { size_t chars_size = n * column_size; diff --git a/src/Columns/ColumnFixedString.h 
b/src/Columns/ColumnFixedString.h index 4a66a429d96..f813ef47f21 100644 --- a/src/Columns/ColumnFixedString.h +++ b/src/Columns/ColumnFixedString.h @@ -88,6 +88,8 @@ public: return StringRef(&chars[n * index], n); } + bool isDefaultAt(size_t index) const override; + void insert(const Field & x) override; void insertFrom(const IColumn & src_, size_t index) override; @@ -173,6 +175,11 @@ public: chars.reserve(n * size); } + void resize(size_t size) + { + chars.resize(n * size); + } + void getExtremes(Field & min, Field & max) const override; bool structureEquals(const IColumn & rhs) const override @@ -182,6 +189,16 @@ public: return false; } + double getRatioOfDefaultRows(double sample_ratio) const override + { + return getRatioOfDefaultRowsImpl(sample_ratio); + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); + } + bool canBeInsideNullable() const override { return true; } bool isFixedAndContiguous() const override { return true; } diff --git a/src/Columns/ColumnFunction.h b/src/Columns/ColumnFunction.h index 2354a4f0cb3..2592dc01f98 100644 --- a/src/Columns/ColumnFunction.h +++ b/src/Columns/ColumnFunction.h @@ -24,7 +24,12 @@ class ColumnFunction final : public COWHelper private: friend class COWHelper; - ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_ = false, bool is_function_compiled_ = false); + ColumnFunction( + size_t size, + FunctionBasePtr function_, + const ColumnsWithTypeAndName & columns_to_capture, + bool is_short_circuit_argument_ = false, + bool is_function_compiled_ = false); public: const char * getFamilyName() const override { return "Function"; } @@ -68,6 +73,11 @@ public: throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + bool isDefaultAt(size_t) const override + { + throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + void insert(const Field &) override { throw Exception("Cannot insert into " + getName(), ErrorCodes::NOT_IMPLEMENTED); @@ -153,6 +163,16 @@ public: throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + double getRatioOfDefaultRows(double) const override + { + throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + + void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override + { + throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + bool isShortCircuitArgument() const { return is_short_circuit_argument; } DataTypePtr getResultType() const; diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index 13957357637..0baed4cfb2d 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -64,6 +64,7 @@ public: return getDictionary().getDataAtWithTerminatingZero(getIndexes().getUInt(n)); } + bool isDefaultAt(size_t n) const override { return getDictionary().isDefaultAt(getIndexes().getUInt(n)); } UInt64 get64(size_t n) const override { return getDictionary().get64(getIndexes().getUInt(n)); } UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); } Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); } @@ -180,6 +181,16 @@ public: return 
false; } + double getRatioOfDefaultRows(double sample_ratio) const override + { + return getIndexes().getRatioOfDefaultRows(sample_ratio); + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit); + } + bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); } bool isFixedAndContiguous() const override { return false; } size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); } diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 99d965a4ec1..e595525d9e8 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -81,6 +81,11 @@ void ColumnMap::get(size_t n, Field & res) const getNestedData().get(offset + i, map[i]); } +bool ColumnMap::isDefaultAt(size_t n) const +{ + return nested->isDefaultAt(n); +} + StringRef ColumnMap::getDataAt(size_t) const { throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); @@ -273,6 +278,16 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const return false; } +double ColumnMap::getRatioOfDefaultRows(double sample_ratio) const +{ + return getRatioOfDefaultRowsImpl(sample_ratio); +} + +void ColumnMap::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + ColumnPtr ColumnMap::compress() const { auto compressed = nested->compress(); diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index aa13bfd3d68..fb69541c363 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -51,6 +51,7 @@ public: Field operator[](size_t n) const override; void get(size_t n, Field & res) const override; + bool isDefaultAt(size_t n) const override; StringRef getDataAt(size_t n) const override; void insertData(const char * pos, size_t length) override; void insert(const Field & x) override; @@ -85,6 +86,8 @@ public: void protect() override; void forEachSubcolumn(ColumnCallback callback) override; bool structureEquals(const IColumn & rhs) const override; + double getRatioOfDefaultRows(double sample_ratio) const override; + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override; const ColumnArray & getNestedColumn() const { return assert_cast(*nested); } ColumnArray & getNestedColumn() { return assert_cast(*nested); } diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 4b3340a6b50..7a4ce8288a3 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -648,6 +648,29 @@ void ColumnNullable::checkConsistency() const ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT); } +ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const +{ + ColumnPtr new_values; + ColumnPtr new_null_map; + + if (default_field.getType() == Field::Types::Null) + { + auto default_column = nested_column->cloneEmpty(); + default_column->insertDefault(); + + /// Value in main column, when null map is 1 is implementation defined. So, take any value. 
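ColumnNullable::createWithOffsets above splits the work between the nested column and the null map; when the requested default is NULL, the nested value stored under a set null bit is arbitrary. A simplified model with plain vectors (hypothetical helper, not the library API):

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct FullNullable
{
    std::vector<int64_t> values;
    std::vector<uint8_t> null_map; // 1 means NULL
};

// 'offsets' are positions of non-default rows, 'non_default_values' their payloads.
// std::nullopt models a NULL default: null map defaults to 1, nested value is arbitrary (0).
FullNullable createWithOffsets(
    const std::vector<size_t> & offsets,
    const std::vector<int64_t> & non_default_values,
    std::optional<int64_t> default_value,
    size_t total_rows)
{
    FullNullable res;
    res.values.assign(total_rows, default_value.value_or(0));
    res.null_map.assign(total_rows, default_value ? 0 : 1);

    for (size_t i = 0; i < offsets.size(); ++i)
    {
        res.values[offsets[i]] = non_default_values[i];
        res.null_map[offsets[i]] = 0;
    }
    return res;
}

int main()
{
    auto col = createWithOffsets({1, 4}, {10, 20}, std::nullopt, 6);
    for (size_t i = 0; i < col.values.size(); ++i)
        std::cout << (col.null_map[i] ? "NULL" : std::to_string(col.values[i])) << ' ';
    std::cout << '\n'; // NULL 10 NULL NULL 20 NULL
}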
+ new_values = nested_column->createWithOffsets(offsets, (*default_column)[0], total_rows, shift); + new_null_map = null_map->createWithOffsets(offsets, Field(1u), total_rows, shift); + } + else + { + new_values = nested_column->createWithOffsets(offsets, default_field, total_rows, shift); + new_null_map = null_map->createWithOffsets(offsets, Field(0u), total_rows, shift); + } + + return ColumnNullable::create(new_values, new_null_map); +} + ColumnPtr makeNullable(const ColumnPtr & column) { if (isColumnNullable(*column)) diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 9da7b0dac1c..3e99a25a445 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -54,6 +54,7 @@ public: void get(size_t n, Field & res) const override; bool getBool(size_t n) const override { return isNullAt(n) ? false : nested_column->getBool(n); } UInt64 get64(size_t n) const override { return nested_column->get64(n); } + bool isDefaultAt(size_t n) const override { return isNullAt(n); } /** * If isNullAt(n) returns false, returns the nested column's getDataAt(n), otherwise returns a special value @@ -137,6 +138,18 @@ public: return false; } + double getRatioOfDefaultRows(double sample_ratio) const override + { + return null_map->getRatioOfDefaultRows(sample_ratio); + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + null_map->getIndicesOfNonDefaultRows(indices, from, limit); + } + + ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override; + bool isNullable() const override { return true; } bool isFixedAndContiguous() const override { return false; } bool valuesHaveFixedSize() const override { return nested_column->valuesHaveFixedSize(); } diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp new file mode 100644 index 00000000000..e9bdc3971c0 --- /dev/null +++ b/src/Columns/ColumnSparse.cpp @@ -0,0 +1,780 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; +} + +ColumnSparse::ColumnSparse(MutableColumnPtr && values_) + : values(std::move(values_)), _size(0) +{ + if (!values->empty()) + throw Exception("Not empty values passed to ColumnSparse, but no offsets passed", ErrorCodes::LOGICAL_ERROR); + + values->insertDefault(); + offsets = ColumnUInt64::create(); +} + +ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_) + : values(std::move(values_)), offsets(std::move(offsets_)), _size(size_) +{ + const ColumnUInt64 * offsets_concrete = typeid_cast(offsets.get()); + + if (!offsets_concrete) + throw Exception(ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName()); + + /// 'values' should contain one extra element: default value at 0 position. 
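The constructor checks in this hunk pin down the ColumnSparse representation: a values column with a reserved default element at index 0, plus strictly increasing offsets of the non-default rows, where the logical size is at least offsets.size() and greater than the last offset. A toy version over int64 that builds the representation from a full column and asserts the same invariants:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Simplified stand-in for ColumnSparse: values[0] is the default slot,
// values[i + 1] corresponds to offsets[i].
struct SparseInt64
{
    std::vector<int64_t> values{0};   // reserved default at index 0
    std::vector<uint64_t> offsets;    // strictly increasing row numbers of non-default rows
    size_t size = 0;                  // logical number of rows

    void check() const
    {
        assert(offsets.size() + 1 == values.size());
        assert(size >= offsets.size());
        assert(offsets.empty() || offsets.back() < size);
        assert(std::adjacent_find(offsets.begin(), offsets.end(),
                                  std::greater_equal<>()) == offsets.end());
    }
};

SparseInt64 fromFull(const std::vector<int64_t> & full)
{
    SparseInt64 res;
    res.size = full.size();
    for (size_t row = 0; row < full.size(); ++row)
    {
        if (full[row] != 0)   // non-default value
        {
            res.values.push_back(full[row]);
            res.offsets.push_back(row);
        }
    }
    res.check();
    return res;
}

int main()
{
    auto sparse = fromFull({0, 0, 5, 0, 7});
    std::cout << sparse.values.size() << " values, " << sparse.offsets.size()
              << " offsets, size " << sparse.size << '\n'; // 3 values, 2 offsets, size 5
}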
+ if (offsets->size() + 1 != values->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size()); + + if (_size < offsets->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size()); + + if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Size of sparse column ({}) should be greater than last position of non-default value ({})", + _size, offsets_concrete->getData().back()); + +#ifndef NDEBUG + const auto & offsets_data = getOffsetsData(); + const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal()); + if (it != offsets_data.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Offsets of ColumnSparse must be strictly sorted"); +#endif +} + +MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const +{ + if (new_size == 0) + return ColumnSparse::create(values->cloneEmpty()); + + if (new_size >= _size) + return ColumnSparse::create(IColumn::mutate(values), IColumn::mutate(offsets), new_size); + + auto res = ColumnSparse::create(values->cloneEmpty()); + res->insertRangeFrom(*this, 0, new_size); + return res; +} + +bool ColumnSparse::isDefaultAt(size_t n) const +{ + return getValueIndex(n) == 0; +} + +bool ColumnSparse::isNullAt(size_t n) const +{ + return values->isNullAt(getValueIndex(n)); +} + +Field ColumnSparse::operator[](size_t n) const +{ + return (*values)[getValueIndex(n)]; +} + +void ColumnSparse::get(size_t n, Field & res) const +{ + values->get(getValueIndex(n), res); +} + +bool ColumnSparse::getBool(size_t n) const +{ + return values->getBool(getValueIndex(n)); +} + +Float64 ColumnSparse::getFloat64(size_t n) const +{ + return values->getFloat64(getValueIndex(n)); +} + +Float32 ColumnSparse::getFloat32(size_t n) const +{ + return values->getFloat32(getValueIndex(n)); +} + +UInt64 ColumnSparse::getUInt(size_t n) const +{ + return values->getUInt(getValueIndex(n)); +} + +Int64 ColumnSparse::getInt(size_t n) const +{ + return values->getInt(getValueIndex(n)); +} + +UInt64 ColumnSparse::get64(size_t n) const +{ + return values->get64(getValueIndex(n)); +} + +StringRef ColumnSparse::getDataAt(size_t n) const +{ + return values->getDataAt(getValueIndex(n)); +} + +ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const +{ + return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1); +} + +void ColumnSparse::insertSingleValue(const Inserter & inserter) +{ + inserter(*values); + + size_t last_idx = values->size() - 1; + if (values->isDefaultAt(last_idx)) + values->popBack(1); + else + getOffsetsData().push_back(_size); + + ++_size; +} + +void ColumnSparse::insertData(const char * pos, size_t length) +{ + insertSingleValue([&](IColumn & column) { column.insertData(pos, length); }); +} + +StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const +{ + return values->serializeValueIntoArena(getValueIndex(n), arena, begin); +} + +const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos) +{ + const char * res = nullptr; + insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); }); + return res; +} + +const char * ColumnSparse::skipSerializedInArena(const char * pos) const +{ + return values->skipSerializedInArena(pos); +} + +void 
ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t length) +{ + if (length == 0) + return; + + if (start + length > src.size()) + throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.", + ErrorCodes::LOGICAL_ERROR); + + auto & offsets_data = getOffsetsData(); + + size_t end = start + length; + if (const auto * src_sparse = typeid_cast(&src)) + { + const auto & src_offsets = src_sparse->getOffsetsData(); + const auto & src_values = src_sparse->getValuesColumn(); + + size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin(); + size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin(); + assert(offset_start <= offset_end); + + if (offset_start != offset_end) + { + offsets_data.reserve(offsets_data.size() + offset_end - offset_start); + insertManyDefaults(src_offsets[offset_start] - start); + offsets_data.push_back(_size); + ++_size; + + for (size_t i = offset_start + 1; i < offset_end; ++i) + { + size_t current_diff = src_offsets[i] - src_offsets[i - 1]; + insertManyDefaults(current_diff - 1); + offsets_data.push_back(_size); + ++_size; + } + + /// 'end' <= 'src_offsets[offsets_end]', but end is excluded, so index is 'offsets_end' - 1. + /// Since 'end' is excluded, need to subtract one more row from result. + insertManyDefaults(end - src_offsets[offset_end - 1] - 1); + values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start); + } + else + { + insertManyDefaults(length); + } + } + else + { + for (size_t i = start; i < end; ++i) + { + if (!src.isDefaultAt(i)) + { + values->insertFrom(src, i); + offsets_data.push_back(_size); + } + + ++_size; + } + } +} + +void ColumnSparse::insert(const Field & x) +{ + insertSingleValue([&](IColumn & column) { column.insert(x); }); +} + +void ColumnSparse::insertFrom(const IColumn & src, size_t n) +{ + if (const auto * src_sparse = typeid_cast(&src)) + { + if (size_t value_index = src_sparse->getValueIndex(n)) + { + getOffsetsData().push_back(_size); + values->insertFrom(src_sparse->getValuesColumn(), value_index); + } + } + else + { + if (!src.isDefaultAt(n)) + { + values->insertFrom(src, n); + getOffsetsData().push_back(_size); + } + } + + ++_size; +} + +void ColumnSparse::insertDefault() +{ + ++_size; +} + +void ColumnSparse::insertManyDefaults(size_t length) +{ + _size += length; +} + +void ColumnSparse::popBack(size_t n) +{ + assert(n < _size); + + auto & offsets_data = getOffsetsData(); + size_t new_size = _size - n; + + size_t removed_values = 0; + while (!offsets_data.empty() && offsets_data.back() >= new_size) + { + offsets_data.pop_back(); + ++removed_values; + } + + if (removed_values) + values->popBack(removed_values); + + _size = new_size; +} + +ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const +{ + if (_size != filt.size()) + throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + + if (offsets->empty()) + { + auto res = cloneEmpty(); + res->insertManyDefaults(countBytesInFilter(filt)); + return res; + } + + auto res_offsets = offsets->cloneEmpty(); + auto & res_offsets_data = assert_cast(*res_offsets).getData(); + + Filter values_filter; + values_filter.reserve(values->size()); + values_filter.push_back(1); + size_t values_result_size_hint = 1; + + size_t res_offset = 0; + auto offset_it = begin(); + for (size_t i = 0; i < _size; ++i, ++offset_it) + { + if (!offset_it.isDefault()) + { + if (filt[i]) + { + 
res_offsets_data.push_back(res_offset); + values_filter.push_back(1); + ++res_offset; + ++values_result_size_hint; + } + else + { + values_filter.push_back(0); + } + } + else + { + res_offset += filt[i] != 0; + } + } + + auto res_values = values->filter(values_filter, values_result_size_hint); + return this->create(std::move(res_values), std::move(res_offsets), res_offset); +} + +void ColumnSparse::expand(const Filter & mask, bool inverted) +{ + if (mask.size() < _size) + throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR); + + auto res_offsets = offsets->cloneEmpty(); + auto & res_offsets_data = assert_cast(*res_offsets).getData(); + + auto it = begin(); + for (size_t i = 0; i < mask.size(); ++i) + { + if (!!mask[i] ^ inverted) + { + if (it.getCurrentRow() == _size) + throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR); + + if (!it.isDefault()) + res_offsets_data[it.getCurrentOffset()] = i; + + ++it; + } + } + + _size = mask.size(); +} + +ColumnPtr ColumnSparse::permute(const Permutation & perm, size_t limit) const +{ + return permuteImpl(*this, perm, limit); +} + +ColumnPtr ColumnSparse::index(const IColumn & indexes, size_t limit) const +{ + return selectIndexImpl(*this, indexes, limit); +} + +template +ColumnPtr ColumnSparse::indexImpl(const PaddedPODArray & indexes, size_t limit) const +{ + assert(limit <= indexes.size()); + if (limit == 0) + return ColumnSparse::create(values->cloneEmpty()); + + if (offsets->empty()) + { + auto res = cloneEmpty(); + res->insertManyDefaults(limit); + return res; + } + + auto res_offsets = offsets->cloneEmpty(); + auto & res_offsets_data = assert_cast(*res_offsets).getData(); + auto res_values = values->cloneEmpty(); + res_values->insertDefault(); + + /// If we need to permute full column, or if limit is large enough, + /// it's better to save indexes of values in O(size) + /// and avoid binary search for obtaining every index. + /// 3 is just a guess for overhead on copying indexes. + bool execute_linear = + limit == _size || limit * std::bit_width(offsets->size()) > _size * 3; + + if (execute_linear) + { + PaddedPODArray values_index(_size); + auto offset_it = begin(); + for (size_t i = 0; i < _size; ++i, ++offset_it) + values_index[i] = offset_it.getValueIndex(); + + for (size_t i = 0; i < limit; ++i) + { + size_t index = values_index[indexes[i]]; + if (index != 0) + { + res_values->insertFrom(*values, index); + res_offsets_data.push_back(i); + } + } + } + else + { + for (size_t i = 0; i < limit; ++i) + { + size_t index = getValueIndex(indexes[i]); + if (index != 0) + { + res_values->insertFrom(*values, index); + res_offsets_data.push_back(i); + } + } + } + + return ColumnSparse::create(std::move(res_values), std::move(res_offsets), limit); +} + +int ColumnSparse::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const +{ + if (const auto * rhs_sparse = typeid_cast(&rhs_)) + return values->compareAt(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint); + + return values->compareAt(getValueIndex(n), m, rhs_, null_direction_hint); +} + +void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num, + PaddedPODArray * row_indexes, PaddedPODArray & compare_results, + int direction, int nan_direction_hint) const +{ + if (row_indexes) + { + /// TODO: implement without conversion to full column. 
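indexImpl in this hunk chooses between precomputing all value indexes in one linear pass and doing a binary search per requested row; the comparison limit * bit_width(offsets.size()) > size * 3 is a rough cost model, with 3 as a guessed constant for the copying overhead. The decision in isolation:

#include <bit>
#include <cstddef>
#include <iostream>

// Either pay O(size) once to materialize every value index, or pay
// ~log2(number of non-default rows) per requested row.
bool preferLinearScan(size_t column_size, size_t limit, size_t num_non_default)
{
    return limit == column_size
        || limit * std::bit_width(num_non_default) > column_size * 3;
}

int main()
{
    std::cout << preferLinearScan(1'000'000, 10, 1000) << '\n';       // 0: few lookups, binary search wins
    std::cout << preferLinearScan(1'000'000, 500'000, 1000) << '\n';  // 1: many lookups, linear scan wins
}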
+ auto this_full = convertToFullColumnIfSparse(); + auto rhs_full = rhs.convertToFullColumnIfSparse(); + this_full->compareColumn(*rhs_full, rhs_row_num, row_indexes, compare_results, direction, nan_direction_hint); + } + else + { + const auto & rhs_sparse = assert_cast(rhs); + PaddedPODArray nested_result; + values->compareColumn(rhs_sparse.getValuesColumn(), rhs_sparse.getValueIndex(rhs_row_num), + nullptr, nested_result, direction, nan_direction_hint); + + const auto & offsets_data = getOffsetsData(); + compare_results.resize_fill(_size, nested_result[0]); + for (size_t i = 0; i < offsets_data.size(); ++i) + compare_results[offsets_data[i]] = nested_result[i + 1]; + } +} + +int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const +{ + if (const auto * rhs_sparse = typeid_cast(&rhs)) + return values->compareAtWithCollation(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint, collator); + + return values->compareAtWithCollation(getValueIndex(n), m, rhs, null_direction_hint, collator); +} + +bool ColumnSparse::hasEqualValues() const +{ + size_t num_defaults = getNumberOfDefaults(); + if (num_defaults == _size) + return true; + + /// Have at least 1 default and 1 non-default values. + if (num_defaults != 0) + return false; + + /// Check that probably all non-default values are equal. + /// It's suboptiomal, but it's a rare case. + for (size_t i = 2; i < values->size(); ++i) + if (values->compareAt(1, i, *values, 1) != 0) + return false; + + return true; +} + +void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const +{ + if (_size == 0) + return; + + res.resize(_size); + if (offsets->empty()) + { + for (size_t i = 0; i < _size; ++i) + res[i] = i; + return; + } + + if (limit == 0 || limit > _size) + limit = _size; + + Permutation perm; + /// Firstly we sort all values. + /// limit + 1 for case when there are 0 default values. + if (collator) + values->getPermutationWithCollation(*collator, reverse, limit + 1, null_direction_hint, perm); + else + values->getPermutation(reverse, limit + 1, null_direction_hint, perm); + + size_t num_of_defaults = getNumberOfDefaults(); + size_t row = 0; + + const auto & offsets_data = getOffsetsData(); + + /// Fill the permutation. + for (size_t i = 0; i < perm.size() && row < limit; ++i) + { + if (perm[i] == 0) + { + if (!num_of_defaults) + continue; + + /// Fill the positions of default values in the required quantity. 
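getPermutationImpl above sorts only the stored values and then expands index 0 (the shared default) into all default row positions. A simplified standalone model of that expansion; it ignores the limit handling of the real code:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Sort the distinct stored values (index 0 is the shared default), then expand
// index 0 into every default row and index i > 0 into its single offsets[i - 1] row.
std::vector<size_t> sparsePermutation(
    const std::vector<int64_t> & values,   // values[0] is the default
    const std::vector<size_t> & offsets,   // rows of values[1..]
    size_t size)
{
    std::vector<size_t> perm(values.size());
    std::iota(perm.begin(), perm.end(), 0);
    std::sort(perm.begin(), perm.end(),
              [&](size_t a, size_t b) { return values[a] < values[b]; });

    std::vector<uint8_t> is_non_default(size, 0);
    for (size_t off : offsets)
        is_non_default[off] = 1;

    std::vector<size_t> res;
    res.reserve(size);
    for (size_t idx : perm)
    {
        if (idx == 0)   // the default slot stands for all default rows
        {
            for (size_t row = 0; row < size; ++row)
                if (!is_non_default[row])
                    res.push_back(row);
        }
        else
            res.push_back(offsets[idx - 1]);
    }
    return res;
}

int main()
{
    // Full column: [0, 9, 0, 3, 0]  ->  values {0, 9, 3}, offsets {1, 3}
    for (size_t row : sparsePermutation({0, 9, 3}, {1, 3}, 5))
        std::cout << row << ' ';
    std::cout << '\n'; // 0 2 4 3 1 (ascending: 0, 0, 0, 3, 9)
}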
+ auto offset_it = begin(); + while (row < limit) + { + while (offset_it.getCurrentRow() < _size && !offset_it.isDefault()) + ++offset_it; + + if (offset_it.getCurrentRow() == _size) + break; + + res[row++] = offset_it.getCurrentRow(); + ++offset_it; + } + } + else + { + res[row++] = offsets_data[perm[i] - 1]; + } + } + + assert(row == limit); +} + +void ColumnSparse::getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const +{ + return getPermutationImpl(reverse, limit, null_direction_hint, res, nullptr); +} + +void ColumnSparse::updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const +{ + auto this_full = convertToFullColumnIfSparse(); + this_full->updatePermutation(reverse, limit, null_direction_hint, res, equal_range); +} + +void ColumnSparse::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const +{ + return getPermutationImpl(reverse, limit, null_direction_hint, res, &collator); +} + +void ColumnSparse::updatePermutationWithCollation( + const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const +{ + auto this_full = convertToFullColumnIfSparse(); + this_full->updatePermutationWithCollation(collator, reverse, limit, null_direction_hint, res, equal_range); +} + +size_t ColumnSparse::byteSize() const +{ + return values->byteSize() + offsets->byteSize() + sizeof(_size); +} + +size_t ColumnSparse::byteSizeAt(size_t n) const +{ + size_t index = getValueIndex(n); + size_t res = values->byteSizeAt(index); + if (index) + res += sizeof(UInt64); + + return res; +} + +size_t ColumnSparse::allocatedBytes() const +{ + return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size); +} + +void ColumnSparse::protect() +{ + values->protect(); + offsets->protect(); +} + +ColumnPtr ColumnSparse::replicate(const Offsets & replicate_offsets) const +{ + /// TODO: implement specializations. 
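replicate() takes cumulative counts: row i of the source appears replicate_offsets[i] - replicate_offsets[i - 1] times in the result. A dense toy version that makes those semantics explicit before reading the sparse implementation in this hunk:

#include <cstdint>
#include <iostream>
#include <vector>

// Dense replicate: expand each row according to the prefix-sum counts.
std::vector<int64_t> replicate(
    const std::vector<int64_t> & data,
    const std::vector<uint64_t> & replicate_offsets)
{
    std::vector<int64_t> res;
    res.reserve(replicate_offsets.empty() ? 0 : replicate_offsets.back());

    uint64_t prev = 0;
    for (size_t i = 0; i < data.size(); ++i)
    {
        for (uint64_t j = prev; j < replicate_offsets[i]; ++j)
            res.push_back(data[i]);
        prev = replicate_offsets[i];
    }
    return res;
}

int main()
{
    for (auto v : replicate({7, 8, 9}, {2, 2, 5}))
        std::cout << v << ' ';
    std::cout << '\n'; // 7 7 9 9 9  (row 1 is repeated zero times)
}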
+ if (_size != replicate_offsets.size()) + throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + + if (_size == 0) + return ColumnSparse::create(values->cloneEmpty()); + + auto res_offsets = offsets->cloneEmpty(); + auto & res_offsets_data = assert_cast(*res_offsets).getData(); + auto res_values = values->cloneEmpty(); + res_values->insertDefault(); + + auto offset_it = begin(); + for (size_t i = 0; i < _size; ++i, ++offset_it) + { + if (!offset_it.isDefault()) + { + size_t replicate_size = replicate_offsets[i] - replicate_offsets[i - 1]; + res_offsets_data.reserve(res_offsets_data.size() + replicate_size); + for (size_t row = replicate_offsets[i - 1]; row < replicate_offsets[i]; ++row) + { + res_offsets_data.push_back(row); + res_values->insertFrom(*values, offset_it.getValueIndex()); + } + } + } + + return ColumnSparse::create(std::move(res_values), std::move(res_offsets), replicate_offsets.back()); +} + +void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const +{ + values->updateHashWithValue(getValueIndex(n), hash); +} + +void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const +{ + if (hash.getData().size() != _size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: " + "column size is {}, hash size is {}", _size, hash.getData().size()); + + auto offset_it = begin(); + auto & hash_data = hash.getData(); + for (size_t i = 0; i < _size; ++i, ++offset_it) + { + size_t value_index = offset_it.getValueIndex(); + auto data_ref = values->getDataAt(value_index); + hash_data[i] = ::updateWeakHash32(reinterpret_cast(data_ref.data), data_ref.size, hash_data[i]); + } +} + +void ColumnSparse::updateHashFast(SipHash & hash) const +{ + values->updateHashFast(hash); + offsets->updateHashFast(hash); + hash.update(_size); +} + +void ColumnSparse::getExtremes(Field & min, Field & max) const +{ + if (_size == 0) + { + values->get(0, min); + values->get(0, max); + return; + } + + if (getNumberOfDefaults() == 0) + { + size_t min_idx = 1; + size_t max_idx = 1; + + for (size_t i = 2; i < values->size(); ++i) + { + if (values->compareAt(i, min_idx, *values, 1) < 0) + min_idx = i; + else if (values->compareAt(i, max_idx, *values, 1) > 0) + max_idx = i; + } + + values->get(min_idx, min); + values->get(max_idx, max); + return; + } + + values->getExtremes(min, max); +} + +void ColumnSparse::getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const +{ + const auto & offsets_data = getOffsetsData(); + const auto * start = from ? std::lower_bound(offsets_data.begin(), offsets_data.end(), from) : offsets_data.begin(); + const auto * end = limit ? 
std::lower_bound(offsets_data.begin(), offsets_data.end(), from + limit) : offsets_data.end(); + + indices.insert(start, end); +} + +double ColumnSparse::getRatioOfDefaultRows(double) const +{ + return static_cast(getNumberOfDefaults()) / _size; +} + +MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const +{ + return scatterImpl(num_columns, selector); +} + +void ColumnSparse::gather(ColumnGathererStream & gatherer_stream) +{ + gatherer_stream.gather(*this); +} + +ColumnPtr ColumnSparse::compress() const +{ + auto values_compressed = values->compress(); + auto offsets_compressed = offsets->compress(); + + size_t byte_size = values_compressed->byteSize() + offsets_compressed->byteSize(); + + return ColumnCompressed::create(size(), byte_size, + [values_compressed = std::move(values_compressed), offsets_compressed = std::move(offsets_compressed), size = size()] + { + return ColumnSparse::create(values_compressed->decompress(), offsets_compressed->decompress(), size); + }); +} + +bool ColumnSparse::structureEquals(const IColumn & rhs) const +{ + if (const auto * rhs_sparse = typeid_cast(&rhs)) + return values->structureEquals(*rhs_sparse->values); + return false; +} + +void ColumnSparse::forEachSubcolumn(ColumnCallback callback) +{ + callback(values); + callback(offsets); +} + +const IColumn::Offsets & ColumnSparse::getOffsetsData() const +{ + return assert_cast(*offsets).getData(); +} + +IColumn::Offsets & ColumnSparse::getOffsetsData() +{ + return assert_cast(*offsets).getData(); +} + +size_t ColumnSparse::getValueIndex(size_t n) const +{ + assert(n < _size); + + const auto & offsets_data = getOffsetsData(); + const auto * it = std::lower_bound(offsets_data.begin(), offsets_data.end(), n); + if (it == offsets_data.end() || *it != n) + return 0; + + return it - offsets_data.begin() + 1; +} + +ColumnPtr recursiveRemoveSparse(const ColumnPtr & column) +{ + if (!column) + return column; + + if (const auto * column_tuple = typeid_cast(column.get())) + { + auto columns = column_tuple->getColumns(); + for (auto & element : columns) + element = recursiveRemoveSparse(element); + + return ColumnTuple::create(columns); + } + + return column->convertToFullColumnIfSparse(); +} + +} diff --git a/src/Columns/ColumnSparse.h b/src/Columns/ColumnSparse.h new file mode 100644 index 00000000000..6eb09642510 --- /dev/null +++ b/src/Columns/ColumnSparse.h @@ -0,0 +1,231 @@ +#pragma once + +#include +#include +#include +#include +#include + +class Collator; + +namespace DB +{ + + +/** Column for spare representation. + * It stores column with non-default values and column + * with their sorted positions in original column. Column with + * values contains also one default value at 0 position to make + * implementation of execution of functions and sorting more convenient. 
+ */ +class ColumnSparse final : public COWHelper +{ +private: + friend class COWHelper; + + explicit ColumnSparse(MutableColumnPtr && values_); + ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_); + ColumnSparse(const ColumnSparse &) = default; + +public: + static constexpr auto DEFAULT_ROWS_SEARCH_SAMPLE_RATIO = 0.1; + static constexpr auto DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION = 0.95; + + using Base = COWHelper; + static Ptr create(const ColumnPtr & values_, const ColumnPtr & offsets_, size_t size_) + { + return Base::create(values_->assumeMutable(), offsets_->assumeMutable(), size_); + } + + template ::value>::type> + static MutablePtr create(TColumnPtr && values_, TColumnPtr && offsets_, size_t size_) + { + return Base::create(std::move(values_), std::move(offsets_), size_); + } + + static Ptr create(const ColumnPtr & values_) + { + return Base::create(values_->assumeMutable()); + } + + template ::value>::type> + static MutablePtr create(TColumnPtr && values_) + { + return Base::create(std::forward(values_)); + } + + bool isSparse() const override { return true; } + const char * getFamilyName() const override { return "Sparse"; } + std::string getName() const override { return "Sparse(" + values->getName() + ")"; } + TypeIndex getDataType() const override { return values->getDataType(); } + MutableColumnPtr cloneResized(size_t new_size) const override; + size_t size() const override { return _size; } + bool isDefaultAt(size_t n) const override; + bool isNullAt(size_t n) const override; + Field operator[](size_t n) const override; + void get(size_t n, Field & res) const override; + bool getBool(size_t n) const override; + Float64 getFloat64(size_t n) const override; + Float32 getFloat32(size_t n) const override; + UInt64 getUInt(size_t n) const override; + Int64 getInt(size_t n) const override; + UInt64 get64(size_t n) const override; + StringRef getDataAt(size_t n) const override; + + ColumnPtr convertToFullColumnIfSparse() const override; + + /// Will insert null value if pos=nullptr + void insertData(const char * pos, size_t length) override; + StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; + const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char *) const override; + void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; + void insert(const Field & x) override; + void insertFrom(const IColumn & src, size_t n) override; + void insertDefault() override; + void insertManyDefaults(size_t length) override; + + void popBack(size_t n) override; + ColumnPtr filter(const Filter & filt, ssize_t) const override; + void expand(const Filter & mask, bool inverted) override; + ColumnPtr permute(const Permutation & perm, size_t limit) const override; + + ColumnPtr index(const IColumn & indexes, size_t limit) const override; + + template + ColumnPtr indexImpl(const PaddedPODArray & indexes, size_t limit) const; + + int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override; + void compareColumn(const IColumn & rhs, size_t rhs_row_num, + PaddedPODArray * row_indexes, PaddedPODArray & compare_results, + int direction, int nan_direction_hint) const override; + + int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const override; + bool hasEqualValues() const override; + + void getPermutationImpl(bool reverse, size_t limit, int 
null_direction_hint, Permutation & res, const Collator * collator) const; + + void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override; + void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override; + void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override; + void updatePermutationWithCollation( + const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const override; + + size_t byteSize() const override; + size_t byteSizeAt(size_t n) const override; + size_t allocatedBytes() const override; + void protect() override; + ColumnPtr replicate(const Offsets & replicate_offsets) const override; + void updateHashWithValue(size_t n, SipHash & hash) const override; + void updateWeakHash32(WeakHash32 & hash) const override; + void updateHashFast(SipHash & hash) const override; + void getExtremes(Field & min, Field & max) const override; + + void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override; + double getRatioOfDefaultRows(double sample_ratio) const override; + + MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override; + + void gather(ColumnGathererStream & gatherer_stream) override; + + ColumnPtr compress() const override; + + void forEachSubcolumn(ColumnCallback callback) override; + + bool structureEquals(const IColumn & rhs) const override; + + bool isNullable() const override { return values->isNullable(); } + bool isFixedAndContiguous() const override { return false; } + bool valuesHaveFixedSize() const override { return values->valuesHaveFixedSize(); } + size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + values->sizeOfValueIfFixed(); } + bool isCollationSupported() const override { return values->isCollationSupported(); } + + size_t getNumberOfDefaults() const { return _size - offsets->size(); } + size_t getNumberOfTrailingDefaults() const + { + return offsets->empty() ? _size : _size - getOffsetsData().back() - 1; + } + + /// Return position of element in 'values' columns, + /// that corresponds to n-th element of full column. + /// O(log(offsets.size())) complexity, + size_t getValueIndex(size_t n) const; + + const IColumn & getValuesColumn() const { return *values; } + IColumn & getValuesColumn() { return *values; } + + const ColumnPtr & getValuesPtr() const { return values; } + ColumnPtr & getValuesPtr() { return values; } + + const IColumn::Offsets & getOffsetsData() const; + IColumn::Offsets & getOffsetsData(); + + const ColumnPtr & getOffsetsPtr() const { return offsets; } + ColumnPtr & getOffsetsPtr() { return offsets; } + + const IColumn & getOffsetsColumn() const { return *offsets; } + IColumn & getOffsetsColumn() { return *offsets; } + + /// This class helps to iterate over all values in ColumnSparse. + class Iterator + { + public: + Iterator(const PaddedPODArray & offsets_, size_t size_, size_t current_offset_, size_t current_row_) + : offsets(offsets_), size(size_), current_offset(current_offset_), current_row(current_row_) + { + } + + bool ALWAYS_INLINE isDefault() const { return current_offset == offsets.size() || current_row != offsets[current_offset]; } + size_t ALWAYS_INLINE getValueIndex() const { return isDefault() ? 
0 : current_offset + 1; } + size_t ALWAYS_INLINE getCurrentRow() const { return current_row; } + size_t ALWAYS_INLINE getCurrentOffset() const { return current_offset; } + + bool operator==(const Iterator & other) const + { + return size == other.size + && current_offset == other.current_offset + && current_row == other.current_row; + } + + bool operator!=(const Iterator & other) const { return !(*this == other); } + + Iterator operator++() + { + if (!isDefault()) + ++current_offset; + ++current_row; + return *this; + } + + private: + const PaddedPODArray & offsets; + const size_t size; + size_t current_offset; + size_t current_row; + }; + + Iterator begin() const { return Iterator(getOffsetsData(), _size, 0, 0); } + Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); } + +private: + using Inserter = std::function; + + /// Inserts value to 'values' column via callback. + /// Properly handles cases, when inserted value is default. + /// Used, when it's unknown in advance if inserted value is default. + void insertSingleValue(const Inserter & inserter); + + /// Contains default value at 0 position. + /// It's convenient, because it allows to execute, e.g functions or sorting, + /// for this column without handling different cases. + WrappedPtr values; + + /// Sorted offsets of non-default values in the full column. + /// 'offsets[i]' corresponds to 'values[i + 1]'. + WrappedPtr offsets; + size_t _size; +}; + +ColumnPtr recursiveRemoveSparse(const ColumnPtr & column); + +} diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 2beb9add318..cd8a3e698d8 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -474,8 +474,9 @@ void ColumnString::getExtremes(Field & min, Field & max) const ColumnPtr ColumnString::compress() const { - size_t source_chars_size = chars.size(); - size_t source_offsets_size = offsets.size() * sizeof(Offset); + const size_t source_chars_size = chars.size(); + const size_t source_offsets_elements = offsets.size(); + const size_t source_offsets_size = source_offsets_elements * sizeof(Offset); /// Don't compress small blocks. if (source_chars_size < 4096) /// A wild guess. 
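The Iterator in this hunk walks every logical row while advancing through the offsets only once, so isDefault() is a single comparison rather than a binary search per row. A stripped-down analogue over plain vectors:

#include <cstdint>
#include <iostream>
#include <vector>

// Minimal analogue of ColumnSparse::Iterator: current_offset counts how many
// non-default rows were already passed; the current row is non-default exactly
// when it equals offsets[current_offset].
struct SparseIterator
{
    const std::vector<uint64_t> & offsets;
    size_t current_offset = 0;
    size_t current_row = 0;

    bool isDefault() const
    {
        return current_offset == offsets.size() || current_row != offsets[current_offset];
    }
    size_t valueIndex() const { return isDefault() ? 0 : current_offset + 1; }
    void next()
    {
        if (!isDefault())
            ++current_offset;
        ++current_row;
    }
};

int main()
{
    std::vector<uint64_t> offsets{1, 3};   // non-default rows
    std::vector<int64_t> values{0, 9, 3};  // values[0] is the default
    SparseIterator it{offsets};
    for (size_t row = 0; row < 5; ++row, it.next())
        std::cout << values[it.valueIndex()] << ' ';
    std::cout << '\n'; // 0 9 0 3 0
}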
@@ -489,12 +490,14 @@ ColumnPtr ColumnString::compress() const auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true); - return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(), + const size_t chars_compressed_size = chars_compressed->size(); + const size_t offsets_compressed_size = offsets_compressed->size(); + return ColumnCompressed::create(source_offsets_elements, chars_compressed_size + offsets_compressed_size, [ chars_compressed = std::move(chars_compressed), offsets_compressed = std::move(offsets_compressed), source_chars_size, - source_offsets_elements = offsets.size() + source_offsets_elements ] { auto res = ColumnString::create(); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 0ab4ed8e49d..a7b2c60d9a1 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -107,6 +107,12 @@ public: return StringRef(&chars[offsetAt(n)], sizeAt(n)); } + bool isDefaultAt(size_t n) const override + { + assert(n < size()); + return sizeAt(n) == 1; + } + /// Suppress gcc 7.3.1 warning: '*((void*)& +8)' may be used uninitialized in this function #if !defined(__clang__) #pragma GCC diagnostic push @@ -278,6 +284,16 @@ public: return typeid(rhs) == typeid(ColumnString); } + double getRatioOfDefaultRows(double sample_ratio) const override + { + return getRatioOfDefaultRowsImpl(sample_ratio); + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); + } + Chars & getChars() { return chars; } const Chars & getChars() const { return chars; } diff --git a/src/Columns/ColumnStringHelpers.h b/src/Columns/ColumnStringHelpers.h new file mode 100644 index 00000000000..851486e490a --- /dev/null +++ b/src/Columns/ColumnStringHelpers.h @@ -0,0 +1,91 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int TOO_LARGE_STRING_SIZE; +} + +namespace ColumnStringHelpers +{ + +/** Simplifies writing data to the ColumnString or ColumnFixedString via WriteBuffer. + * + * Take care of little subtle details, like padding or proper offsets. + */ +template +class WriteHelper +{ + ColumnType & col; + WriteBufferFromVector buffer; + size_t prev_row_buffer_size = 0; + + static ColumnType & resizeColumn(ColumnType & column, size_t rows) + { + if constexpr (std::is_same_v) + column.resize(rows); + else + { + column.getOffsets().reserve(rows); + /// Using coefficient 2 for initial size is arbitrary. + column.getChars().resize(rows * 2); + } + return column; + } + +public: + WriteHelper(ColumnType & col_, size_t expected_rows) + : col(resizeColumn(col_, expected_rows)) + , buffer(col.getChars()) + {} + + ~WriteHelper() = default; + + void finalize() + { + buffer.finalize(); + } + + auto & getWriteBuffer() + { + return buffer; + } + + inline void rowWritten() + { + if constexpr (std::is_same_v) + { + if (buffer.count() > prev_row_buffer_size + col.getN()) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "Too large string for FixedString column"); + + // Pad with zeroes on the right to maintain FixedString invariant. 
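rowWritten() for FixedString pads each row with zero bytes up to the fixed width N, which is the storage invariant of ColumnFixedString (every row occupies exactly N bytes in the flat chars buffer). A minimal sketch of that padding rule, using a plain std::vector in place of the column's chars buffer (hypothetical helper, not the library API):

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Append one row of a FixedString(n)-like buffer: write the value, then pad
// with zero bytes so the row occupies exactly n bytes.
void appendFixedStringRow(std::vector<char> & chars, const std::string & value, size_t n)
{
    if (value.size() > n)
        throw std::length_error("Too large string for FixedString column");
    chars.insert(chars.end(), value.begin(), value.end());
    chars.insert(chars.end(), n - value.size(), '\0');
}

int main()
{
    std::vector<char> chars;
    appendFixedStringRow(chars, "ab", 4);
    appendFixedStringRow(chars, "cdef", 4);
    std::cout << chars.size() << '\n'; // 8: two rows of exactly 4 bytes each
}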
+ const auto excess_bytes = buffer.count() % col.getN(); + const auto fill_bytes = col.getN() - excess_bytes; + writeChar(0, fill_bytes, buffer); + } + else + { + writeChar(0, buffer); + col.getOffsets().push_back(buffer.count()); + } + + prev_row_buffer_size = buffer.count(); + } +}; + +} + +} diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index d157f18bf32..d667b264d55 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB @@ -113,6 +114,15 @@ void ColumnTuple::get(size_t n, Field & res) const res = tuple; } +bool ColumnTuple::isDefaultAt(size_t n) const +{ + const size_t tuple_size = columns.size(); + for (size_t i = 0; i < tuple_size; ++i) + if (!columns[i]->isDefaultAt(n)) + return false; + return true; +} + StringRef ColumnTuple::getDataAt(size_t) const { throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); @@ -536,4 +546,25 @@ ColumnPtr ColumnTuple::compress() const }); } +double ColumnTuple::getRatioOfDefaultRows(double sample_ratio) const +{ + return getRatioOfDefaultRowsImpl(sample_ratio); +} + +void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + +SerializationInfoPtr ColumnTuple::getSerializationInfo() const +{ + MutableSerializationInfos infos; + infos.reserve(columns.size()); + + for (const auto & column : columns) + infos.push_back(const_pointer_cast(column->getSerializationInfo())); + + return std::make_shared(std::move(infos), SerializationInfo::Settings{}); +} + } diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index 54723239a06..b4c7f6bdf9a 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -53,6 +53,7 @@ public: Field operator[](size_t n) const override; void get(size_t n, Field & res) const override; + bool isDefaultAt(size_t n) const override; StringRef getDataAt(size_t n) const override; void insertData(const char * pos, size_t length) override; void insert(const Field & x) override; @@ -93,6 +94,9 @@ public: bool structureEquals(const IColumn & rhs) const override; bool isCollationSupported() const override; ColumnPtr compress() const override; + double getRatioOfDefaultRows(double sample_ratio) const override; + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override; + SerializationInfoPtr getSerializationInfo() const override; size_t tupleSize() const { return columns.size(); } diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index 66bb56983fc..59eca547852 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -68,6 +68,7 @@ public: Field operator[](size_t n) const override { return (*getNestedColumn())[n]; } void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); } + bool isDefaultAt(size_t n) const override { return n == 0; } StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); } StringRef getDataAtWithTerminatingZero(size_t n) const override { @@ -122,6 +123,16 @@ public: return false; } + double getRatioOfDefaultRows(double) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemented for ColumnUnique"); + } + + void getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 
'getIndicesOfNonDefaultRows' not implemented for ColumnUnique"); + } + const UInt64 * tryGetSavedHash() const override { return reverse_index.tryGetSavedHash(); } UInt128 getHash() const override { return hash.getHash(*getRawColumnPtr()); } diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 13ba522b2ac..9808acf48c8 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -481,7 +481,8 @@ void ColumnVector::getExtremes(Field & min, Field & max) const template ColumnPtr ColumnVector::compress() const { - size_t source_size = data.size() * sizeof(T); + const size_t data_size = data.size(); + const size_t source_size = data_size * sizeof(T); /// Don't compress small blocks. if (source_size < 4096) /// A wild guess. @@ -492,8 +493,9 @@ ColumnPtr ColumnVector::compress() const if (!compressed) return ColumnCompressed::wrap(this->getPtr()); - return ColumnCompressed::create(data.size(), compressed->size(), - [compressed = std::move(compressed), column_size = data.size()] + const size_t compressed_size = compressed->size(); + return ColumnCompressed::create(data_size, compressed_size, + [compressed = std::move(compressed), column_size = data_size] { auto res = ColumnVector::create(column_size); ColumnCompressed::decompressBuffer( @@ -502,6 +504,24 @@ ColumnPtr ColumnVector::compress() const }); } +template +ColumnPtr ColumnVector::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const +{ + if (offsets.size() + shift != size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size()); + + auto res = this->create(); + auto & res_data = res->getData(); + + T default_value = safeGet(default_field); + res_data.resize_fill(total_rows, default_value); + for (size_t i = 0; i < offsets.size(); ++i) + res_data[offsets[i]] = data[i + shift]; + + return res; +} + /// Explicit template instantiations - to avoid code bloat in headers. template class ColumnVector; template class ColumnVector; diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index d3e5e8d2bdd..bee7bfa738c 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -328,11 +328,25 @@ public: return StringRef(reinterpret_cast(&data[n]), sizeof(data[n])); } + bool isDefaultAt(size_t n) const override { return data[n] == T{}; } + bool structureEquals(const IColumn & rhs) const override { return typeid(rhs) == typeid(ColumnVector); } + double getRatioOfDefaultRows(double sample_ratio) const override + { + return this->template getRatioOfDefaultRowsImpl(sample_ratio); + } + + void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override + { + return this->template getIndicesOfNonDefaultRowsImpl(indices, from, limit); + } + + ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override; + ColumnPtr compress() const override; /// Replace elements that match the filter with zeroes. If inverted replaces not matched elements. 
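ColumnVector::createWithOffsets above pre-fills the result with the default value and then scatters the stored values to their positions; shift = 1 is used when expanding from ColumnSparse, whose values column keeps the reserved default at index 0. In miniature:

#include <cstdint>
#include <iostream>
#include <vector>

// Fill the whole result with the default, then place each stored value at its row.
std::vector<int64_t> createWithOffsets(
    const std::vector<int64_t> & data,
    const std::vector<uint64_t> & offsets,
    int64_t default_value,
    size_t total_rows,
    size_t shift)
{
    std::vector<int64_t> res(total_rows, default_value);
    for (size_t i = 0; i < offsets.size(); ++i)
        res[offsets[i]] = data[i + shift];
    return res;
}

int main()
{
    // values column of a sparse column: [default, 9, 3], non-default rows 1 and 3
    for (auto v : createWithOffsets({0, 9, 3}, {1, 3}, 0, 5, /*shift=*/ 1))
        std::cout << v << ' ';
    std::cout << '\n'; // 0 9 0 3 0
}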
diff --git a/src/Columns/FilterDescription.cpp b/src/Columns/FilterDescription.cpp index c9968d841c2..973d5bc4391 100644 --- a/src/Columns/FilterDescription.cpp +++ b/src/Columns/FilterDescription.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -50,6 +51,9 @@ ConstantFilterDescription::ConstantFilterDescription(const IColumn & column) FilterDescription::FilterDescription(const IColumn & column_) { + if (column_.isSparse()) + data_holder = recursiveRemoveSparse(column_.getPtr()); + if (column_.lowCardinality()) data_holder = column_.convertToFullColumnIfLowCardinality(); diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp index a3ed0885651..e7caee3b23d 100644 --- a/src/Columns/IColumn.cpp +++ b/src/Columns/IColumn.cpp @@ -4,11 +4,17 @@ #include #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + String IColumn::dumpStructure() const { WriteBufferFromOwnString res; @@ -30,6 +36,39 @@ void IColumn::insertFrom(const IColumn & src, size_t n) insert(src[n]); } +ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const +{ + if (offsets.size() + shift != size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size()); + + auto res = cloneEmpty(); + res->reserve(total_rows); + + ssize_t current_offset = -1; + for (size_t i = 0; i < offsets.size(); ++i) + { + ssize_t offsets_diff = static_cast(offsets[i]) - current_offset; + current_offset = offsets[i]; + + if (offsets_diff > 1) + res->insertMany(default_field, offsets_diff - 1); + + res->insertFrom(*this, i + shift); + } + + ssize_t offsets_diff = static_cast(total_rows) - current_offset; + if (offsets_diff > 1) + res->insertMany(default_field, offsets_diff - 1); + + return res; +} + +SerializationInfoPtr IColumn::getSerializationInfo() const +{ + return std::make_shared(ISerialization::getKind(*this), SerializationInfo::Settings{}); +} + bool isColumnNullable(const IColumn & column) { return checkColumn(column); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index e4f99cc0a1d..b1a6e83ee98 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -26,9 +26,8 @@ class ColumnGathererStream; class Field; class WeakHash32; -class ISerialization; -using SerializationPtr = std::shared_ptr; - +class SerializationInfo; +using SerializationInfoPtr = std::shared_ptr; /* * Represents a set of equal ranges in previous column to perform sorting in current column. @@ -64,9 +63,18 @@ public: virtual Ptr convertToFullColumnIfConst() const { return getPtr(); } /// If column isn't ColumnLowCardinality, return itself. - /// If column is ColumnLowCardinality, transforms is to full column. + /// If column is ColumnLowCardinality, transforms it to full column. virtual Ptr convertToFullColumnIfLowCardinality() const { return getPtr(); } + /// If column isn't ColumnSparse, return itself. + /// If column is ColumnSparse, transforms it to full column. + virtual Ptr convertToFullColumnIfSparse() const { return getPtr(); } + + Ptr convertToFullIfNeeded() const + { + return convertToFullColumnIfSparse()->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality(); + } + /// Creates empty column with the same type. 
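The generic IColumn::createWithOffsets in this hunk cannot pre-fill typed memory, so it instead walks the offsets and inserts a run of defaults between consecutive stored values (the current_offset / offsets_diff bookkeeping). A simplified vector-based rendering of the same loop, assuming sorted offsets:

#include <cstdint>
#include <iostream>
#include <vector>

// Walk the offsets, filling each gap with the default value, then append the
// next stored value; finish with the trailing run of defaults.
std::vector<int64_t> createWithOffsetsGeneric(
    const std::vector<int64_t> & data,      // data[shift..] are the stored values
    const std::vector<uint64_t> & offsets,
    int64_t default_value,
    size_t total_rows,
    size_t shift)
{
    std::vector<int64_t> res;
    res.reserve(total_rows);

    int64_t current_offset = -1;
    for (size_t i = 0; i < offsets.size(); ++i)
    {
        int64_t diff = static_cast<int64_t>(offsets[i]) - current_offset;
        current_offset = static_cast<int64_t>(offsets[i]);
        res.insert(res.end(), diff - 1, default_value);  // fill the gap with defaults
        res.push_back(data[i + shift]);
    }
    res.insert(res.end(), static_cast<int64_t>(total_rows) - current_offset - 1, default_value);
    return res;
}

int main()
{
    for (auto v : createWithOffsetsGeneric({0, 9, 3}, {1, 3}, 0, 5, /*shift=*/ 1))
        std::cout << v << ' ';
    std::cout << '\n'; // 0 9 0 3 0
}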
virtual MutablePtr cloneEmpty() const { return cloneResized(0); } @@ -133,7 +141,7 @@ public: throw Exception("Method getInt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } - virtual bool isDefaultAt(size_t n) const { return get64(n) == 0; } + virtual bool isDefaultAt(size_t n) const = 0; virtual bool isNullAt(size_t /*n*/) const { return false; } /** If column is numeric, return value of n-th element, casted to bool. @@ -173,6 +181,13 @@ public: insertFrom(src, position); } + /// Appends one field multiple times. Can be optimized in inherited classes. + virtual void insertMany(const Field & field, size_t length) + { + for (size_t i = 0; i < length; ++i) + insert(field); + } + /// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented). /// Is used to optimize some computations (in aggregation, for example). /// Parameter length could be ignored if column values have fixed size. @@ -375,6 +390,22 @@ public: throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + /// Returns ration of values in column, that equal to default value of column. + /// Checks only @sample_ratio ratio of rows. + virtual double getRatioOfDefaultRows(double sample_ratio = 1.0) const = 0; + + /// Returns indices of values in column, that not equal to default value of column. + virtual void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const = 0; + + /// Returns column with @total_size elements. + /// In result column values from current column are at positions from @offsets. + /// Other values are filled by @default_value. + /// @shift means how much rows to skip from the beginning of current column. + /// Used to create full column from sparse. + virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const; + + virtual SerializationInfoPtr getSerializationInfo() const; + /// Compress column in memory to some representation that allows to decompress it back. /// Return itself if compression is not applicable for this column type. virtual Ptr compress() const @@ -457,6 +488,8 @@ public: virtual bool lowCardinality() const { return false; } + virtual bool isSparse() const { return false; } + virtual bool isCollationSupported() const { return false; } virtual ~IColumn() = default; @@ -468,7 +501,6 @@ public: String dumpStructure() const; protected: - /// Template is to devirtualize calls to insertFrom method. /// In derived classes (that use final keyword), implement scatter method as call to scatterImpl. template @@ -489,6 +521,13 @@ protected: template bool hasEqualValuesImpl() const; + /// Template is to devirtualize calls to 'isDefaultAt' method. + template + double getRatioOfDefaultRowsImpl(double sample_ratio) const; + + template + void getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const; + /// Uses std::sort and partial_sort as default algorithms. /// Implements 'less' and 'equals' via comparator. 
/// If 'less' and 'equals' can be implemented more optimal diff --git a/src/Columns/IColumnDummy.h b/src/Columns/IColumnDummy.h index ff45cf28737..89844f4b0b3 100644 --- a/src/Columns/IColumnDummy.h +++ b/src/Columns/IColumnDummy.h @@ -46,6 +46,7 @@ public: Field operator[](size_t) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); } void get(size_t, Field &) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); } void insert(const Field &) override { throw Exception("Cannot insert element into " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + bool isDefaultAt(size_t) const override { throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } StringRef getDataAt(size_t) const override { @@ -161,6 +162,16 @@ public: return res; } + double getRatioOfDefaultRows(double) const override + { + throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + + void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override + { + throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + void gather(ColumnGathererStream &) override { throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); diff --git a/src/Columns/IColumnImpl.h b/src/Columns/IColumnImpl.h index d45867e289d..1be52087d11 100644 --- a/src/Columns/IColumnImpl.h +++ b/src/Columns/IColumnImpl.h @@ -16,6 +16,7 @@ namespace DB namespace ErrorCodes { extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; + extern const int LOGICAL_ERROR; } template @@ -141,6 +142,56 @@ bool IColumn::hasEqualValuesImpl() const return true; } +template +double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const +{ + if (sample_ratio <= 0.0 || sample_ratio > 1.0) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio); + + /// Randomize a little to avoid boundary effects. + std::uniform_int_distribution dist(1, static_cast(1.0 / sample_ratio)); + + size_t num_rows = size(); + size_t num_sampled_rows = static_cast(num_rows * sample_ratio); + size_t num_checked_rows = dist(thread_local_rng); + num_sampled_rows = std::min(num_sampled_rows + dist(thread_local_rng), num_rows); + size_t res = 0; + + if (num_sampled_rows == num_rows) + { + for (size_t i = 0; i < num_rows; ++i) + res += static_cast(*this).isDefaultAt(i); + num_checked_rows = num_rows; + } + else if (num_sampled_rows != 0) + { + for (size_t i = num_checked_rows; i < num_rows; ++i) + { + if (num_checked_rows * num_rows <= i * num_sampled_rows) + { + res += static_cast(*this).isDefaultAt(i); + ++num_checked_rows; + } + } + } + + return static_cast(res) / num_checked_rows; +} + +template +void IColumn::getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const +{ + size_t to = limit && from + limit < size() ? 
from + limit : size(); + indices.reserve(indices.size() + to - from); + + for (size_t i = from; i < to; ++i) + { + if (!static_cast(*this).isDefaultAt(i)) + indices.push_back(i); + } +} + template void IColumn::updatePermutationImpl( size_t limit, diff --git a/src/Columns/MaskOperations.cpp b/src/Columns/MaskOperations.cpp index 9499185da30..1641bdf5a4c 100644 --- a/src/Columns/MaskOperations.cpp +++ b/src/Columns/MaskOperations.cpp @@ -293,7 +293,7 @@ void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty) column.column = column_function->getResultType()->createColumn(); } -int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments) +int checkShortCircuitArguments(const ColumnsWithTypeAndName & arguments) { int last_short_circuit_argument_index = -1; for (size_t i = 0; i != arguments.size(); ++i) diff --git a/src/Columns/MaskOperations.h b/src/Columns/MaskOperations.h index bd6c5e8fe2c..e43b4588258 100644 --- a/src/Columns/MaskOperations.h +++ b/src/Columns/MaskOperations.h @@ -66,7 +66,7 @@ void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty = false); /// Check if arguments contain lazy executed argument. If contain, return index of the last one, /// otherwise return -1. -int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments); +int checkShortCircuitArguments(const ColumnsWithTypeAndName & arguments); void copyMask(const PaddedPODArray & from, PaddedPODArray & to); diff --git a/src/Columns/tests/gtest_column_sparse.cpp b/src/Columns/tests/gtest_column_sparse.cpp new file mode 100644 index 00000000000..56284b75204 --- /dev/null +++ b/src/Columns/tests/gtest_column_sparse.cpp @@ -0,0 +1,327 @@ +#include +#include + +#include +#include +#include + +#include +#include + +#include + +using namespace DB; +pcg64 rng(randomSeed()); + +std::pair createColumns(size_t n, size_t k) +{ + auto values = ColumnVector::create(); + auto offsets = ColumnVector::create(); + auto full = ColumnVector::create(); + + auto & values_data = values->getData(); + auto & offsets_data = offsets->getData(); + auto & full_data = full->getData(); + + values_data.push_back(0); + + for (size_t i = 0; i < n; ++i) + { + bool not_zero = rng() % k == 0; + size_t value = not_zero ? rng() % 1000000 : 0; + full_data.push_back(value); + + if (not_zero) + { + values_data.push_back(value); + offsets_data.push_back(i); + } + } + + auto sparse = ColumnSparse::create(std::move(values), std::move(offsets), n); + return std::make_pair(std::move(sparse), std::move(full)); +} + +bool checkEquals(const IColumn & lhs, const IColumn & rhs) +{ + if (lhs.size() != rhs.size()) + return false; + + for (size_t i = 0; i < lhs.size(); ++i) + if (lhs.compareAt(i, i, rhs, 0) != 0) + return false; + + return true; +} + +// Can't use ErrorCodes, because of 'using namespace DB'. 
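Editor note: getRatioOfDefaultRowsImpl above estimates the share of default values by inspecting roughly sample_ratio of the rows, with a randomized starting point to avoid boundary effects. A small self-contained sketch of the same estimation idea over a plain vector follows; it uses simplified stride-based sampling rather than the exact loop above, and the data layout is just an example.

#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

// Estimate the fraction of zero (default) elements by checking ~sample_ratio of the rows.
static double ratioOfDefaultRows(const std::vector<int> & data, double sample_ratio, std::mt19937_64 & rng)
{
    size_t step = static_cast<size_t>(1.0 / sample_ratio);
    std::uniform_int_distribution<size_t> dist(0, step - 1);

    size_t checked = 0;
    size_t defaults = 0;
    for (size_t i = dist(rng); i < data.size(); i += step)  // randomized start, fixed stride
    {
        ++checked;
        defaults += (data[i] == 0);
    }
    return checked ? static_cast<double>(defaults) / checked : 0.0;
}

int main()
{
    std::mt19937_64 rng(42);
    std::vector<int> data(100000);
    for (auto & x : data)
        x = (rng() % 10 == 0) ? 1 : 0;   // roughly 90% of the elements stay at the default value 0

    std::cout << "estimated ratio of default rows: " << ratioOfDefaultRows(data, 0.1, rng) << "\n";
}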
+constexpr int error_code = 12345; + +constexpr size_t T = 5000; +constexpr size_t MAX_ROWS = 10000; +constexpr size_t sparse_ratios[] = {1, 2, 5, 10, 32, 50, 64, 100, 256, 500, 1000, 5000, 10000}; +constexpr size_t K = sizeof(sparse_ratios) / sizeof(sparse_ratios[0]); + +#define DUMP_COLUMN(column) std::cerr << #column << ": " << (column)->dumpStructure() << "\n" + +TEST(ColumnSparse, InsertRangeFrom) +{ + auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t from, size_t len) + { + auto [sparse_dst, full_dst] = createColumns(n1, k1); + auto [sparse_src, full_src] = createColumns(n2, k2); + + sparse_dst->insertRangeFrom(*sparse_src, from, len); + full_dst->insertRangeFrom(*full_src, from, len); + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + throw Exception(error_code, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n1 = rng() % MAX_ROWS + 1; + size_t k1 = sparse_ratios[rng() % K]; + + size_t n2 = rng() % MAX_ROWS + 1; + size_t k2 = sparse_ratios[rng() % K]; + + size_t from = rng() % n2; + size_t to = rng() % n2; + + if (from > to) + std::swap(from, to); + + test_case(n1, k1, n2, k2, from, to - from); + } + } + catch (const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, PopBack) +{ + auto test_case = [&](size_t n, size_t k, size_t m) + { + auto [sparse_dst, full_dst] = createColumns(n, k); + + sparse_dst->popBack(m); + full_dst->popBack(m); + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + throw Exception(error_code, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + size_t m = rng() % n; + + test_case(n, k, m); + } + } + catch (const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, Filter) +{ + auto test_case = [&](size_t n, size_t k, size_t m) + { + auto [sparse_src, full_src] = createColumns(n, k); + + PaddedPODArray filt(n); + for (size_t i = 0; i < n; ++i) + filt[i] = rng() % m == 0; + + auto sparse_dst = sparse_src->filter(filt, -1); + auto full_dst = full_src->filter(filt, -1); + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + throw Exception(error_code, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + size_t m = sparse_ratios[rng() % K]; + + test_case(n, k, m); + } + } + catch (const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, Permute) +{ + auto test_case = [&](size_t n, size_t k, size_t limit) + { + auto [sparse_src, full_src] = createColumns(n, k); + + IColumn::Permutation perm(n); + std::iota(perm.begin(), perm.end(), 0); + std::shuffle(perm.begin(), perm.end(), rng); + + auto sparse_dst = sparse_src->permute(perm, limit); + auto full_dst = full_src->permute(perm, limit); + + if (limit) + { + sparse_dst = sparse_dst->cut(0, limit); + full_dst = full_dst->cut(0, limit); + } + + if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_dst); + DUMP_COLUMN(full_dst); + throw 
Exception(error_code, "Columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + size_t limit = rng() % 2 ? 0 : rng() % n; + + test_case(n, k, limit); + } + } + catch (const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, CompareColumn) +{ + auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t row_num) + { + auto [sparse_src1, full_src1] = createColumns(n1, k1); + auto [sparse_src2, full_src2] = createColumns(n2, k2); + + PaddedPODArray comp_sparse; + PaddedPODArray comp_full; + + sparse_src1->compareColumn(*sparse_src2, row_num, nullptr, comp_sparse, 1, 1); + full_src1->compareColumn(*full_src2, row_num, nullptr, comp_full, 1, 1); + + if (comp_sparse != comp_full) + { + DUMP_COLUMN(sparse_src1); + DUMP_COLUMN(full_src1); + DUMP_COLUMN(sparse_src2); + DUMP_COLUMN(full_src2); + throw Exception(error_code, "Compare results are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n1 = rng() % MAX_ROWS + 1; + size_t k1 = sparse_ratios[rng() % K]; + + size_t n2 = rng() % MAX_ROWS + 1; + size_t k2 = sparse_ratios[rng() % K]; + + size_t row_num = rng() % n2; + + test_case(n1, k1, n2, k2, row_num); + } + } + catch (const Exception & e) + { + FAIL() << e.displayText(); + } +} + +TEST(ColumnSparse, GetPermutation) +{ + auto test_case = [&](size_t n, size_t k, size_t limit, bool reverse) + { + auto [sparse_src, full_src] = createColumns(n, k); + + IColumn::Permutation perm_sparse; + IColumn::Permutation perm_full; + + sparse_src->getPermutation(reverse, limit, 1, perm_sparse); + full_src->getPermutation(reverse, limit, 1, perm_full); + + auto sparse_sorted = sparse_src->permute(perm_sparse, limit); + auto full_sorted = full_src->permute(perm_full, limit); + + if (limit) + { + sparse_sorted = sparse_sorted->cut(0, limit); + full_sorted = full_sorted->cut(0, limit); + } + + if (!checkEquals(*sparse_sorted->convertToFullColumnIfSparse(), *full_sorted)) + { + DUMP_COLUMN(sparse_src); + DUMP_COLUMN(full_src); + DUMP_COLUMN(sparse_sorted); + DUMP_COLUMN(full_sorted); + throw Exception(error_code, "Sorted columns are unequal"); + } + }; + + try + { + for (size_t i = 0; i < T; ++i) + { + size_t n = rng() % MAX_ROWS + 1; + size_t k = sparse_ratios[rng() % K]; + + size_t limit = rng() % 2 ? 0 : rng() % n; + bool reverse = rng() % 2; + + test_case(n, k, limit, reverse); + } + } + catch (const Exception & e) + { + FAIL() << e.displayText(); + } +} + +#undef DUMP_COLUMN +#undef DUMP_NON_DEFAULTS diff --git a/src/Common/CMakeLists.txt b/src/Common/CMakeLists.txt index 1935fe4fed1..1e7d3591a48 100644 --- a/src/Common/CMakeLists.txt +++ b/src/Common/CMakeLists.txt @@ -1,8 +1,9 @@ add_subdirectory(StringUtils) -# after common_io -#add_subdirectory(ZooKeeper) -#add_subdirectory(ConfigProcessor) if (ENABLE_EXAMPLES) add_subdirectory(examples) endif() + +if (USE_MYSQL) + add_subdirectory (mysqlxx) +endif () diff --git a/src/Common/Config/ConfigProcessor.cpp b/src/Common/Config/ConfigProcessor.cpp index da7405b993f..41e9a53e50f 100644 --- a/src/Common/Config/ConfigProcessor.cpp +++ b/src/Common/Config/ConfigProcessor.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,24 +41,6 @@ namespace ErrorCodes /// For cutting preprocessed path to this base static std::string main_config_path; -/// Extracts from a string the first encountered number consisting of at least two digits. 
-static std::string numberFromHost(const std::string & s) -{ - for (size_t i = 0; i < s.size(); ++i) - { - std::string res; - size_t j = i; - while (j < s.size() && isNumericASCII(s[j])) - res += s[j++]; - if (res.size() >= 2) - { - while (res[0] == '0') - res.erase(res.begin()); - return res; - } - } - return ""; -} bool ConfigProcessor::isPreprocessedFile(const std::string & path) { @@ -245,19 +227,6 @@ void ConfigProcessor::merge(XMLDocumentPtr config, XMLDocumentPtr with) mergeRecursive(config, config_root, with_root); } -static std::string layerFromHost() -{ - struct utsname buf; - if (uname(&buf)) - throw Poco::Exception(std::string("uname failed: ") + errnoToString(errno)); - - std::string layer = numberFromHost(buf.nodename); - if (layer.empty()) - throw Poco::Exception(std::string("no layer in host name: ") + buf.nodename); - - return layer; -} - void ConfigProcessor::doIncludesRecursive( XMLDocumentPtr config, XMLDocumentPtr include_from, @@ -288,18 +257,6 @@ void ConfigProcessor::doIncludesRecursive( if (node->nodeType() != Node::ELEMENT_NODE) return; - /// Substitute for the number extracted from the hostname only if there is an - /// empty tag without attributes in the original file. - if (node->nodeName() == "layer" - && !node->hasAttributes() - && !node->hasChildNodes() - && node->nodeValue().empty()) - { - NodePtr new_node = config->createTextNode(layerFromHost()); - node->appendChild(new_node); - return; - } - std::map attr_nodes; NamedNodeMapPtr attributes = node->attributes(); size_t substs_count = 0; diff --git a/src/Common/Config/ConfigProcessor.h b/src/Common/Config/ConfigProcessor.h index 2a92a709934..04278d72303 100644 --- a/src/Common/Config/ConfigProcessor.h +++ b/src/Common/Config/ConfigProcessor.h @@ -59,7 +59,6 @@ public: /// 4) If zk_node_cache is non-NULL, replace elements matching the "" pattern with /// "contents of the /bar ZooKeeper node". /// If has_zk_includes is non-NULL and there are such elements, set has_zk_includes to true. - /// 5) (Yandex.Metrika-specific) Substitute "" with "layer number from the hostname". 
XMLDocumentPtr processConfig( bool * has_zk_includes = nullptr, zkutil::ZooKeeperNodeCache * zk_node_cache = nullptr, diff --git a/base/base/DateLUT.cpp b/src/Common/DateLUT.cpp similarity index 100% rename from base/base/DateLUT.cpp rename to src/Common/DateLUT.cpp diff --git a/base/base/DateLUT.h b/src/Common/DateLUT.h similarity index 98% rename from base/base/DateLUT.h rename to src/Common/DateLUT.h index 31fc6b1e24b..edf09250e6a 100644 --- a/base/base/DateLUT.h +++ b/src/Common/DateLUT.h @@ -2,7 +2,7 @@ #include "DateLUTImpl.h" -#include "defines.h" +#include #include diff --git a/base/base/DateLUTImpl.cpp b/src/Common/DateLUTImpl.cpp similarity index 99% rename from base/base/DateLUTImpl.cpp rename to src/Common/DateLUTImpl.cpp index bbce3b111d3..ebf32c4dbd9 100644 --- a/base/base/DateLUTImpl.cpp +++ b/src/Common/DateLUTImpl.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/base/base/DateLUTImpl.h b/src/Common/DateLUTImpl.h similarity index 99% rename from base/base/DateLUTImpl.h rename to src/Common/DateLUTImpl.h index 012d2cefe84..e52e6547fa2 100644 --- a/base/base/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -1,8 +1,8 @@ #pragma once -#include "DayNum.h" -#include "defines.h" -#include "types.h" +#include +#include +#include #include #include diff --git a/src/Common/Dwarf.cpp b/src/Common/Dwarf.cpp index 18e9315d5c3..a85bbe818b5 100644 --- a/src/Common/Dwarf.cpp +++ b/src/Common/Dwarf.cpp @@ -838,7 +838,7 @@ bool Dwarf::findLocation( // The next inlined subroutine's call file and call line is the current // caller's location. - for (size_t i = 0; i < num_found - 1; i++) + for (size_t i = 0; i < num_found - 1; ++i) { call_locations[i].file = call_locations[i + 1].file; call_locations[i].line = call_locations[i + 1].line; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 54785f92926..16f85fcae61 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -470,6 +470,7 @@ M(497, ACCESS_DENIED) \ M(498, LIMIT_BY_WITH_TIES_IS_NOT_SUPPORTED) \ M(499, S3_ERROR) \ + M(500, AZURE_BLOB_STORAGE_ERROR) \ M(501, CANNOT_CREATE_DATABASE) \ M(502, CANNOT_SIGQUEUE) \ M(503, AGGREGATE_FUNCTION_THROW) \ @@ -601,6 +602,9 @@ M(631, UNKNOWN_FILE_SIZE) \ M(632, UNEXPECTED_DATA_AFTER_PARSED_VALUE) \ M(633, QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW) \ + M(634, MONGODB_ERROR) \ + M(635, CANNOT_POLL) \ + M(636, CANNOT_EXTRACT_TABLE_STRUCTURE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h index 94d4cee197c..f5baa1b92b3 100644 --- a/src/Common/FiberStack.h +++ b/src/Common/FiberStack.h @@ -4,7 +4,7 @@ #include #include #include - +#include #include #include #include @@ -36,7 +36,7 @@ public: explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) { - page_size = ::sysconf(_SC_PAGESIZE); + page_size = getPageSize(); } boost::context::stack_context allocate() diff --git a/base/base/LocalDate.h b/src/Common/LocalDate.h similarity index 90% rename from base/base/LocalDate.h rename to src/Common/LocalDate.h index d199b9dbb4b..6c847ceff0e 100644 --- a/base/base/LocalDate.h +++ b/src/Common/LocalDate.h @@ -2,9 +2,8 @@ #include #include -#include #include -#include +#include /** Stores a calendar date in broken-down form (year, month, day-in-month). @@ -154,19 +153,6 @@ public: { return !(*this == other); } - - /// NOTE Inefficient. 
- std::string toString(char separator = '-') const - { - std::stringstream ss; - if (separator) - ss << year() << separator << (month() / 10) << (month() % 10) - << separator << (day() / 10) << (day() % 10); - else - ss << year() << (month() / 10) << (month() % 10) - << (day() / 10) << (day() % 10); - return ss.str(); - } }; static_assert(sizeof(LocalDate) == 4); diff --git a/base/base/LocalDateTime.h b/src/Common/LocalDateTime.h similarity index 98% rename from base/base/LocalDateTime.h rename to src/Common/LocalDateTime.h index 282a56ac640..fa0a680d274 100644 --- a/base/base/LocalDateTime.h +++ b/src/Common/LocalDateTime.h @@ -3,8 +3,8 @@ #include #include #include -#include -#include +#include +#include /** Stores calendar date and time in broken-down form. diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index 4a44a3adeaa..74725ede08c 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -102,6 +102,13 @@ public: return prompter.getHints(name, getAllRegisteredNames()); } + IHints() = default; + + IHints(const IHints &) = default; + IHints(IHints &&) = default; + IHints & operator=(const IHints &) = default; + IHints & operator=(IHints &&) = default; + virtual ~IHints() = default; private: diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 8a621b8db1c..b312fbda21c 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -37,6 +37,7 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_MPROTECT; + extern const int CANNOT_ALLOCATE_MEMORY; } /** A dynamic array for POD types. @@ -104,7 +105,13 @@ protected: char * c_end_of_storage = null; /// Does not include pad_right. /// The amount of memory occupied by the num_elements of the elements. - static size_t byte_size(size_t num_elements) { return num_elements * ELEMENT_SIZE; } + static size_t byte_size(size_t num_elements) + { + size_t amount; + if (__builtin_mul_overflow(num_elements, ELEMENT_SIZE, &amount)) + throw Exception("Amount of memory requested to allocate is more than allowed", ErrorCodes::CANNOT_ALLOCATE_MEMORY); + return amount; + } /// Minimum amount of memory to allocate for num_elements, including padding. 
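Editor note: the PODArray::byte_size change above guards the num_elements * ELEMENT_SIZE multiplication with __builtin_mul_overflow, so an absurd allocation request fails loudly instead of silently wrapping around. A minimal sketch of that pattern follows; it assumes a GCC/Clang compiler (which provides the builtin) and throws std::length_error in place of the ClickHouse Exception class.

#include <cstddef>
#include <iostream>
#include <stdexcept>

// Multiply element count by element size, throwing instead of wrapping on overflow.
static size_t checkedByteSize(size_t num_elements, size_t element_size)
{
    size_t amount;
    if (__builtin_mul_overflow(num_elements, element_size, &amount))
        throw std::length_error("Amount of memory requested to allocate is more than allowed");
    return amount;
}

int main()
{
    std::cout << checkedByteSize(1024, 8) << "\n";   // prints 8192

    try
    {
        checkedByteSize(static_cast<size_t>(-1), 16);   // would overflow size_t
    }
    catch (const std::length_error & e)
    {
        std::cout << "caught: " << e.what() << "\n";
    }
}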
static size_t minimum_memory_for_elements(size_t num_elements) { return byte_size(num_elements) + pad_right + pad_left; } diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 982523a3ef2..878930f58d9 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -259,6 +259,8 @@ M(RemoteFSUnusedPrefetches, "Number of prefetches pending at buffer destruction") \ M(RemoteFSPrefetchedReads, "Number of reads from prefecthed buffer") \ M(RemoteFSUnprefetchedReads, "Number of reads from unprefetched buffer") \ + M(RemoteFSLazySeeks, "Number of lazy seeks") \ + M(RemoteFSSeeksWithReset, "Number of seeks which lead to a new connection") \ M(RemoteFSBuffers, "Number of buffers created for asynchronous reading from remote filesystem") \ \ M(ReadBufferSeekCancelConnection, "Number of seeks which lead to new connection (s3, http)") \ @@ -274,7 +276,8 @@ M(ThreadPoolReaderPageCacheMissElapsedMicroseconds, "Time spent reading data inside the asynchronous job in ThreadPoolReader - when read was not done from page cache.") \ \ M(AsynchronousReadWaitMicroseconds, "Time spent in waiting for asynchronous reads.") \ - + \ + M(MainConfigLoads, "Number of times the main configuration was reloaded.") \ namespace ProfileEvents { diff --git a/src/Common/ShellCommand.cpp b/src/Common/ShellCommand.cpp index f238e976f8a..0093d72e766 100644 --- a/src/Common/ShellCommand.cpp +++ b/src/Common/ShellCommand.cpp @@ -65,14 +65,14 @@ ShellCommand::~ShellCommand() size_t try_wait_timeout = config.terminate_in_destructor_strategy.wait_for_normal_exit_before_termination_seconds; bool process_terminated_normally = tryWaitProcessWithTimeout(try_wait_timeout); - if (!process_terminated_normally) - { - LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); + if (process_terminated_normally) + return; - int retcode = kill(pid, SIGTERM); - if (retcode != 0) - LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode)); - } + LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); + + int retcode = kill(pid, SIGTERM); + if (retcode != 0) + LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode)); } else { @@ -91,7 +91,7 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) { int status = 0; - LOG_TRACE(getLogger(), "Try wait for shell command pid ({}) with timeout ({})", pid, timeout_in_seconds); + LOG_TRACE(getLogger(), "Try wait for shell command pid {} with timeout {}", pid, timeout_in_seconds); wait_called = true; struct timespec interval {.tv_sec = 1, .tv_nsec = 0}; @@ -119,7 +119,9 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) bool process_terminated_normally = (waitpid_res == pid); if (process_terminated_normally) + { return true; + } else if (waitpid_res == 0) { --timeout_in_seconds; @@ -128,7 +130,9 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) continue; } else if (waitpid_res == -1 && errno != EINTR) + { return false; + } } return false; @@ -155,12 +159,17 @@ std::unique_ptr ShellCommand::executeImpl( { logCommand(filename, argv); +#if !defined(USE_MUSL) /** Here it is written that with a normal call `vfork`, there is a chance of deadlock in multithreaded programs, * because of the resolving of symbols in the shared library * http://www.oracle.com/technetwork/server-storage/solaris10/subprocess-136439.html * Therefore, separate the resolving of the symbol from the call. 
*/ static void * real_vfork = dlsym(RTLD_DEFAULT, "vfork"); +#else + /// If we use Musl with static linking, there is no dlsym and no issue with vfork. + static void * real_vfork = reinterpret_cast(&vfork); +#endif if (!real_vfork) throwFromErrno("Cannot find symbol vfork in myself", ErrorCodes::CANNOT_DLSYM); diff --git a/src/Common/SparseHashMap.h b/src/Common/SparseHashMap.h deleted file mode 100644 index 3f38d52a2b8..00000000000 --- a/src/Common/SparseHashMap.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -/// SparseHashMap is a wrapper for google::sparse_hash_map. - -#include - -template , - class EqualKey = std::equal_to, - class Alloc = google::libc_allocator_with_realloc>> -using SparseHashMap = google::sparse_hash_map; diff --git a/src/Common/StatusFile.cpp b/src/Common/StatusFile.cpp index 5e4c31b149e..cfeab149d30 100644 --- a/src/Common/StatusFile.cpp +++ b/src/Common/StatusFile.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index 2d875b7042d..32c1a15337c 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -5,6 +5,7 @@ #include #include +#include #include @@ -85,12 +86,48 @@ namespace /// https://stackoverflow.com/questions/32088140/multiple-string-tables-in-elf-object +void updateResources(ElfW(Addr) base_address, std::string_view object_name, std::string_view name, const void * address, SymbolIndex::Resources & resources) +{ + const char * char_address = static_cast(address); + + if (name.starts_with("_binary_") || name.starts_with("binary_")) + { + if (name.ends_with("_start")) + { + name = name.substr((name[0] == '_') + strlen("binary_")); + name = name.substr(0, name.size() - strlen("_start")); + + resources.emplace(name, SymbolIndex::ResourcesBlob{ + base_address, + object_name, + std::string_view{char_address, 0}, // NOLINT + }); + } + else if (name.ends_with("_end")) + { + name = name.substr((name[0] == '_') + strlen("binary_")); + name = name.substr(0, name.size() - strlen("_end")); + + auto it = resources.find(name); + if (it != resources.end() && it->second.base_address == base_address && it->second.data.empty()) + { + const char * start = it->second.data.data(); + assert(char_address >= start); + it->second.data = std::string_view{start, static_cast(char_address - start)}; + } + } + } +} + + /// Based on the code of musl-libc and the answer of Kanalpiroge on /// https://stackoverflow.com/questions/15779185/list-all-the-functions-symbols-on-the-fly-in-c-code-on-a-linux-architecture /// It does not extract all the symbols (but only public - exported and used for dynamic linking), /// but will work if we cannot find or parse ELF files. -void collectSymbolsFromProgramHeaders(dl_phdr_info * info, - std::vector & symbols) +void collectSymbolsFromProgramHeaders( + dl_phdr_info * info, + std::vector & symbols, + SymbolIndex::Resources & resources) { /* Iterate over all headers of the current shared lib * (first call is for the executable itself) @@ -121,10 +158,12 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, size_t sym_cnt = 0; for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + // TODO: this branch leads to invalid address of the hash table. Need further investigation. 
// if (it->d_tag == DT_HASH) // { - // const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + // const ElfW(Word) * hash = reinterpret_cast(base_address); // sym_cnt = hash[1]; // break; // } @@ -135,7 +174,7 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, const uint32_t * buckets = nullptr; const uint32_t * hashval = nullptr; - const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Word) * hash = reinterpret_cast(base_address); buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); @@ -164,9 +203,11 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, const char * strtab = nullptr; for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + if (it->d_tag == DT_STRTAB) { - strtab = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + strtab = reinterpret_cast(base_address); break; } } @@ -176,18 +217,16 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + if (it->d_tag == DT_SYMTAB) { /* Get the pointer to the first entry of the symbol table */ - const ElfW(Sym) * elf_sym = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Sym) * elf_sym = reinterpret_cast(base_address); /* Iterate over the symbol table */ for (ElfW(Word) sym_index = 0; sym_index < ElfW(Word)(sym_cnt); ++sym_index) { - /// We are not interested in empty symbols. - if (!elf_sym[sym_index].st_size) - continue; - /* Get the name of the sym_index-th symbol. * This is located at the address of st_name relative to the beginning of the string table. */ @@ -197,10 +236,18 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, continue; SymbolIndex::Symbol symbol; - symbol.address_begin = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value); - symbol.address_end = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value + elf_sym[sym_index].st_size); + symbol.address_begin = reinterpret_cast( + info->dlpi_addr + elf_sym[sym_index].st_value); + symbol.address_end = reinterpret_cast( + info->dlpi_addr + elf_sym[sym_index].st_value + elf_sym[sym_index].st_size); symbol.name = sym_name; - symbols.push_back(symbol); + + /// We are not interested in empty symbols. + if (elf_sym[sym_index].st_size) + symbols.push_back(symbol); + + /// But resources can be represented by a pair of empty symbols (indicating their boundaries). + updateResources(base_address, info->dlpi_name, symbol.name, symbol.address_begin, resources); } break; @@ -210,6 +257,7 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, } +#if !defined USE_MUSL String getBuildIDFromProgramHeaders(dl_phdr_info * info) { for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) @@ -222,6 +270,7 @@ String getBuildIDFromProgramHeaders(dl_phdr_info * info) } return {}; } +#endif void collectSymbolsFromELFSymbolTable( @@ -229,7 +278,8 @@ void collectSymbolsFromELFSymbolTable( const Elf & elf, const Elf::Section & symbol_table, const Elf::Section & string_table, - std::vector & symbols) + std::vector & symbols, + SymbolIndex::Resources & resources) { /// Iterate symbol table. 
const ElfSym * symbol_table_entry = reinterpret_cast(symbol_table.begin()); @@ -241,7 +291,6 @@ void collectSymbolsFromELFSymbolTable( { if (!symbol_table_entry->st_name || !symbol_table_entry->st_value - || !symbol_table_entry->st_size || strings + symbol_table_entry->st_name >= elf.end()) continue; @@ -252,10 +301,16 @@ void collectSymbolsFromELFSymbolTable( continue; SymbolIndex::Symbol symbol; - symbol.address_begin = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value); - symbol.address_end = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value + symbol_table_entry->st_size); + symbol.address_begin = reinterpret_cast( + info->dlpi_addr + symbol_table_entry->st_value); + symbol.address_end = reinterpret_cast( + info->dlpi_addr + symbol_table_entry->st_value + symbol_table_entry->st_size); symbol.name = symbol_name; - symbols.push_back(symbol); + + if (symbol_table_entry->st_size) + symbols.push_back(symbol); + + updateResources(info->dlpi_addr, info->dlpi_name, symbol.name, symbol.address_begin, resources); } } @@ -265,7 +320,8 @@ bool searchAndCollectSymbolsFromELFSymbolTable( const Elf & elf, unsigned section_header_type, const char * string_table_name, - std::vector & symbols) + std::vector & symbols, + SymbolIndex::Resources & resources) { std::optional symbol_table; std::optional string_table; @@ -283,31 +339,45 @@ bool searchAndCollectSymbolsFromELFSymbolTable( return false; } - collectSymbolsFromELFSymbolTable(info, elf, *symbol_table, *string_table, symbols); + collectSymbolsFromELFSymbolTable(info, elf, *symbol_table, *string_table, symbols, resources); return true; } -void collectSymbolsFromELF(dl_phdr_info * info, +void collectSymbolsFromELF( + dl_phdr_info * info, std::vector & symbols, std::vector & objects, + SymbolIndex::Resources & resources, String & build_id) { + String object_name; + String our_build_id; + +#if defined (USE_MUSL) + object_name = "/proc/self/exe"; + our_build_id = Elf(object_name).getBuildID(); + build_id = our_build_id; +#else /// MSan does not know that the program segments in memory are initialized. __msan_unpoison_string(info->dlpi_name); - std::string object_name = info->dlpi_name; - - String our_build_id = getBuildIDFromProgramHeaders(info); + object_name = info->dlpi_name; + our_build_id = getBuildIDFromProgramHeaders(info); /// If the name is empty and there is a non-empty build-id - it's main executable. /// Find a elf file for the main executable and set the build-id. if (object_name.empty()) { object_name = "/proc/self/exe"; + + if (our_build_id.empty()) + our_build_id = Elf(object_name).getBuildID(); + if (build_id.empty()) build_id = our_build_id; } +#endif std::error_code ec; std::filesystem::path canonical_path = std::filesystem::canonical(object_name, ec); @@ -377,10 +447,12 @@ void collectSymbolsFromELF(dl_phdr_info * info, object.name = object_name; objects.push_back(std::move(object)); - searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_SYMTAB, ".strtab", symbols); + searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_SYMTAB, ".strtab", symbols, resources); - /// Unneeded because they were parsed from "program headers" of loaded objects. - //searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_DYNSYM, ".dynstr", symbols); + /// Unneeded if they were parsed from "program headers" of loaded objects. 
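Editor note: updateResources above recognizes pairs of _binary_<name>_start / _binary_<name>_end symbols and records the byte range between them as an embedded resource. The sketch below shows only that naming and range bookkeeping with simulated addresses; the ELF iteration, base-address comparison and object names from the real code are left out, and it assumes C++20 for string_view::starts_with/ends_with.

#include <cassert>
#include <iostream>
#include <string>
#include <string_view>
#include <unordered_map>

struct Blob { const char * start = nullptr; const char * end = nullptr; };

// Strip the "_binary_"/"binary_" prefix and the "_start"/"_end" suffix
// and remember the corresponding boundary for that resource name.
static void registerSymbol(std::string_view name, const char * address, std::unordered_map<std::string, Blob> & resources)
{
    std::string_view prefix = (!name.empty() && name[0] == '_') ? "_binary_" : "binary_";
    if (!name.starts_with(prefix))
        return;
    name.remove_prefix(prefix.size());

    if (name.ends_with("_start"))
    {
        name.remove_suffix(std::string_view("_start").size());
        resources[std::string(name)].start = address;
    }
    else if (name.ends_with("_end"))
    {
        name.remove_suffix(std::string_view("_end").size());
        resources[std::string(name)].end = address;
    }
}

int main()
{
    static const char payload[] = "embedded config";
    std::unordered_map<std::string, Blob> resources;

    registerSymbol("_binary_config_xml_start", payload, resources);
    registerSymbol("_binary_config_xml_end", payload + sizeof(payload) - 1, resources);

    const Blob & blob = resources.at("config_xml");
    assert(blob.start && blob.end && blob.end >= blob.start);
    std::cout << std::string_view(blob.start, static_cast<size_t>(blob.end - blob.start)) << "\n";
}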
+#if defined USE_MUSL + searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_DYNSYM, ".dynstr", symbols, resources); +#endif } @@ -392,8 +464,8 @@ int collectSymbols(dl_phdr_info * info, size_t, void * data_ptr) { SymbolIndex::Data & data = *reinterpret_cast(data_ptr); - collectSymbolsFromProgramHeaders(info, data.symbols); - collectSymbolsFromELF(info, data.symbols, data.objects, data.build_id); + collectSymbolsFromProgramHeaders(info, data.symbols, data.resources); + collectSymbolsFromELF(info, data.symbols, data.objects, data.resources, data.build_id); /* Continue iterations */ return 0; @@ -424,7 +496,7 @@ const T * find(const void * address, const std::vector & vec) void SymbolIndex::update() { - dl_iterate_phdr(collectSymbols, &data.symbols); + dl_iterate_phdr(collectSymbols, &data); std::sort(data.objects.begin(), data.objects.end(), [](const Object & a, const Object & b) { return a.address_begin < b.address_begin; }); std::sort(data.symbols.begin(), data.symbols.end(), [](const Symbol & a, const Symbol & b) { return a.address_begin < b.address_begin; }); diff --git a/src/Common/SymbolIndex.h b/src/Common/SymbolIndex.h index 37862987bd2..1331cf81cf7 100644 --- a/src/Common/SymbolIndex.h +++ b/src/Common/SymbolIndex.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -47,15 +48,37 @@ public: const std::vector & symbols() const { return data.symbols; } const std::vector & objects() const { return data.objects; } + std::string_view getResource(String name) const + { + if (auto it = data.resources.find(name); it != data.resources.end()) + return it->second.data; + return {}; + } + /// The BuildID that is generated by compiler. String getBuildID() const { return data.build_id; } String getBuildIDHex() const; + struct ResourcesBlob + { + /// Symbol can be presented in multiple shared objects, + /// base_address will be used to compare only symbols from the same SO. + ElfW(Addr) base_address; + /// Just a human name of the SO. + std::string_view object_name; + /// Data blob. + std::string_view data; + }; + using Resources = std::unordered_map; + struct Data { std::vector symbols; std::vector objects; String build_id; + + /// Resources (embedded binary data) are located by symbols in form of _binary_name_start and _binary_name_end. + Resources resources; }; private: Data data; diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index c976e4ca16a..411f725f2db 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -72,6 +72,24 @@ static thread_local bool has_alt_stack = false; #endif +std::vector ThreadGroupStatus::getProfileEventsCountersAndMemoryForThreads() +{ + std::lock_guard guard(mutex); + + /// It is OK to move it, since it is enough to report statistics for the thread at least once. + auto stats = std::move(finished_threads_counters_memory); + for (auto * thread : threads) + { + stats.emplace_back(ProfileEventsCountersAndMemory{ + thread->performance_counters.getPartiallyAtomicSnapshot(), + thread->memory_tracker.get(), + thread->thread_id, + }); + } + + return stats; +} + ThreadStatus::ThreadStatus() : thread_id{getThreadId()} { @@ -139,11 +157,17 @@ ThreadStatus::~ThreadStatus() { /// It's a minor tracked memory leak here (not the memory itself but it's counter). /// We've already allocated a little bit more than the limit and cannot track it in the thread memory tracker or its parent. 
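Editor note: ThreadGroupStatus::getProfileEventsCountersAndMemoryForThreads above takes the group mutex, moves out the snapshots already saved for finished threads, and appends a fresh snapshot for every still-attached thread. A stripped-down sketch of that "finished plus live" aggregation pattern follows, with plain integers standing in for ProfileEvents snapshots and memory trackers; all types and names here are illustrative.

#include <cstdint>
#include <iostream>
#include <mutex>
#include <unordered_set>
#include <vector>

struct ThreadStats { uint64_t thread_id; int64_t memory_usage; };

struct ThreadGroup
{
    std::mutex mutex;
    std::unordered_set<const ThreadStats *> live_threads;   // currently attached threads
    std::vector<ThreadStats> finished_threads_stats;        // snapshots saved when threads exited

    // Drain the finished-thread snapshots (reporting them once is enough)
    // and add a fresh snapshot for each live thread, all under the group mutex.
    std::vector<ThreadStats> collect()
    {
        std::lock_guard<std::mutex> lock(mutex);
        std::vector<ThreadStats> stats = std::move(finished_threads_stats);
        finished_threads_stats.clear();
        for (const ThreadStats * t : live_threads)
            stats.push_back(*t);
        return stats;
    }
};

int main()
{
    ThreadGroup group;
    ThreadStats live1{1, 1 << 20};
    group.live_threads.insert(&live1);
    group.finished_threads_stats.push_back({2, 512});   // a thread that has already exited

    for (const auto & s : group.collect())
        std::cout << "thread " << s.thread_id << " memory " << s.memory_usage << "\n";
}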
+ tryLogCurrentException(log); } if (thread_group) { std::lock_guard guard(thread_group->mutex); + thread_group->finished_threads_counters_memory.emplace_back(ThreadGroupStatus::ProfileEventsCountersAndMemory{ + performance_counters.getPartiallyAtomicSnapshot(), + memory_tracker.get(), + thread_id, + }); thread_group->threads.erase(this); } diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 97ddda1ea30..f3920474111 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -61,6 +61,13 @@ using ThreadStatusPtr = ThreadStatus *; class ThreadGroupStatus { public: + struct ProfileEventsCountersAndMemory + { + ProfileEvents::Counters::Snapshot counters; + Int64 memory_usage; + UInt64 thread_id; + }; + mutable std::mutex mutex; ProfileEvents::Counters performance_counters{VariableContext::Process}; @@ -83,6 +90,10 @@ public: String query; UInt64 normalized_query_hash = 0; + + std::vector finished_threads_counters_memory; + + std::vector getProfileEventsCountersAndMemoryForThreads(); }; using ThreadGroupStatusPtr = std::shared_ptr; diff --git a/src/Common/TraceCollector.cpp b/src/Common/TraceCollector.cpp index d84202449d1..523251fa2a2 100644 --- a/src/Common/TraceCollector.cpp +++ b/src/Common/TraceCollector.cpp @@ -153,7 +153,7 @@ void TraceCollector::run() Array trace; trace.reserve(trace_size); - for (size_t i = 0; i < trace_size; i++) + for (size_t i = 0; i < trace_size; ++i) { uintptr_t addr = 0; readPODBinary(addr, in); diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index f05a10b8815..c8753c8edaf 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -26,6 +26,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; } } @@ -1133,4 +1134,54 @@ Coordination::RequestPtr makeCheckRequest(const std::string & path, int version) return request; } +std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log) +{ + if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + { + /// Do not allow this for new tables, print warning for tables created in old versions + if (check_starts_with_slash) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path); + if (log) + LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. 
It will not be supported in future releases"); + zookeeper_path = "/" + zookeeper_path; + } + + return zookeeper_path; +} + +String extractZooKeeperName(const String & path) +{ + static constexpr auto default_zookeeper_name = "default"; + if (path.empty()) + throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS); + if (path[0] == '/') + return default_zookeeper_name; + auto pos = path.find(":/"); + if (pos != String::npos && pos < path.find('/')) + { + auto zookeeper_name = path.substr(0, pos); + if (zookeeper_name.empty()) + throw DB::Exception("Zookeeper path should start with '/' or ':/'", DB::ErrorCodes::BAD_ARGUMENTS); + return zookeeper_name; + } + return default_zookeeper_name; +} + +String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log) +{ + if (path.empty()) + throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS); + if (path[0] == '/') + return normalizeZooKeeperPath(path, check_starts_with_slash, log); + auto pos = path.find(":/"); + if (pos != String::npos && pos < path.find('/')) + { + return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log); + } + return normalizeZooKeeperPath(path, check_starts_with_slash, log); +} + } diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 8e015b1f331..371f93f6df3 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -379,4 +379,11 @@ private: }; using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr; + +String normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr); + +String extractZooKeeperName(const String & path); + +String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr); + } diff --git a/src/Common/config.h.in b/src/Common/config.h.in index 9e97490e2ad..c53abd17bc2 100644 --- a/src/Common/config.h.in +++ b/src/Common/config.h.in @@ -9,6 +9,7 @@ #cmakedefine01 USE_HDFS #cmakedefine01 USE_INTERNAL_HDFS3_LIBRARY #cmakedefine01 USE_AWS_S3 +#cmakedefine01 USE_AZURE_BLOB_STORAGE #cmakedefine01 USE_BROTLI #cmakedefine01 USE_UNWIND #cmakedefine01 USE_OPENCL diff --git a/src/Common/getRandomASCIIString.cpp b/src/Common/getRandomASCIIString.cpp new file mode 100644 index 00000000000..788c0d05ff5 --- /dev/null +++ b/src/Common/getRandomASCIIString.cpp @@ -0,0 +1,17 @@ +#include +#include +#include + +namespace DB +{ + +String getRandomASCIIString(size_t len, char first, char last) +{ + std::uniform_int_distribution distribution(first, last); + String res(len, ' '); + for (auto & c : res) + c = distribution(thread_local_rng); + return res; +} + +} diff --git a/src/Common/getRandomASCIIString.h b/src/Common/getRandomASCIIString.h new file mode 100644 index 00000000000..69684a9bef2 --- /dev/null +++ b/src/Common/getRandomASCIIString.h @@ -0,0 +1,10 @@ +#pragma once +#include + +namespace DB +{ +/// Slow random string. Useful for random names and things like this. Not for +/// generating data. 
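Editor note: extractZooKeeperName and extractZooKeeperPath above split a path of the form "<zookeeper_name>:/path" into the auxiliary ZooKeeper name and the chroot-style path, falling back to "default" when the path simply starts with '/'. The sketch below captures just that parsing rule; it is simplified in that malformed paths are rejected with std::runtime_error instead of being normalized or logged as in the real helpers, and the sample paths are made up.

#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

// Split "name:/path" into {name, "/path"}; a plain "/path" belongs to the "default" ZooKeeper.
static std::pair<std::string, std::string> splitZooKeeperPath(const std::string & path)
{
    if (path.empty())
        throw std::runtime_error("ZooKeeper path should not be empty");
    if (path[0] == '/')
        return {"default", path};

    auto pos = path.find(":/");
    if (pos == std::string::npos || pos == 0 || pos >= path.find('/'))
        throw std::runtime_error("ZooKeeper path should start with '/' or '<name>:/'");

    return {path.substr(0, pos), path.substr(pos + 1)};
}

int main()
{
    auto [name1, path1] = splitZooKeeperPath("/clickhouse/tables/t1");
    auto [name2, path2] = splitZooKeeperPath("aux_zk:/clickhouse/tables/t1");
    std::cout << name1 << " -> " << path1 << "\n";   // default -> /clickhouse/tables/t1
    std::cout << name2 << " -> " << path2 << "\n";   // aux_zk -> /clickhouse/tables/t1
}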
+String getRandomASCIIString(size_t len = 32, char first = 'a', char last = 'z'); + +} diff --git a/base/base/getResource.cpp b/src/Common/getResource.cpp similarity index 80% rename from base/base/getResource.cpp rename to src/Common/getResource.cpp index 6682ae0a01f..fe603fcc550 100644 --- a/base/base/getResource.cpp +++ b/src/Common/getResource.cpp @@ -1,8 +1,9 @@ #include "getResource.h" -#include "unaligned.h" #include #include #include +#include + std::string_view getResource(std::string_view name) { @@ -13,6 +14,11 @@ std::string_view getResource(std::string_view name) std::replace(name_replaced.begin(), name_replaced.end(), '.', '_'); boost::replace_all(name_replaced, "+", "_PLUS_"); +#if defined USE_MUSL + /// If static linking is used, we cannot use dlsym and have to parse ELF symbol table by ourself. + return DB::SymbolIndex::instance()->getResource(name_replaced); + +#else // In most `dlsym(3)` APIs, one passes the symbol name as it appears via // something like `nm` or `objdump -t`. For example, a symbol `_foo` would be // looked up with the string `"_foo"`. @@ -33,8 +39,8 @@ std::string_view getResource(std::string_view name) std::string symbol_name_start = prefix + name_replaced + "_start"; std::string symbol_name_end = prefix + name_replaced + "_end"; - const char* sym_start = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_start.c_str())); - const char* sym_end = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_end.c_str())); + const char * sym_start = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_start.c_str())); + const char * sym_end = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_end.c_str())); if (sym_start && sym_end) { @@ -42,4 +48,5 @@ std::string_view getResource(std::string_view name) return { sym_start, resource_size }; } return {}; +#endif } diff --git a/base/base/getResource.h b/src/Common/getResource.h similarity index 100% rename from base/base/getResource.h rename to src/Common/getResource.h diff --git a/src/Common/mysqlxx/CMakeLists.txt b/src/Common/mysqlxx/CMakeLists.txt new file mode 100644 index 00000000000..76005651e61 --- /dev/null +++ b/src/Common/mysqlxx/CMakeLists.txt @@ -0,0 +1,24 @@ +add_library (mysqlxx + Connection.cpp + Exception.cpp + Query.cpp + ResultBase.cpp + UseQueryResult.cpp + Row.cpp + Value.cpp + Pool.cpp + PoolFactory.cpp + PoolWithFailover.cpp +) + +target_include_directories (mysqlxx PUBLIC .) + +target_link_libraries (mysqlxx + clickhouse_common_io + ${MYSQLCLIENT_LIBRARIES} + ${ZLIB_LIBRARIES} +) + +if (ENABLE_TESTS) + add_subdirectory (tests) +endif () diff --git a/base/mysqlxx/Connection.cpp b/src/Common/mysqlxx/Connection.cpp similarity index 100% rename from base/mysqlxx/Connection.cpp rename to src/Common/mysqlxx/Connection.cpp diff --git a/base/mysqlxx/Exception.cpp b/src/Common/mysqlxx/Exception.cpp similarity index 70% rename from base/mysqlxx/Exception.cpp rename to src/Common/mysqlxx/Exception.cpp index 188e7bd740d..0f5320da754 100644 --- a/base/mysqlxx/Exception.cpp +++ b/src/Common/mysqlxx/Exception.cpp @@ -4,6 +4,7 @@ #include #endif #include +#include namespace mysqlxx @@ -11,11 +12,7 @@ namespace mysqlxx std::string errorMessage(MYSQL * driver) { - std::stringstream res; - res << mysql_error(driver) - << " (" << (driver->host ? driver->host : "(nullptr)") - << ":" << driver->port << ")"; - return res.str(); + return fmt::format("{} ({}:{})", mysql_error(driver), driver->host ? 
driver->host : "(nullptr)", driver->port); } void checkError(MYSQL * driver) diff --git a/base/mysqlxx/Pool.cpp b/src/Common/mysqlxx/Pool.cpp similarity index 100% rename from base/mysqlxx/Pool.cpp rename to src/Common/mysqlxx/Pool.cpp diff --git a/base/mysqlxx/PoolFactory.cpp b/src/Common/mysqlxx/PoolFactory.cpp similarity index 100% rename from base/mysqlxx/PoolFactory.cpp rename to src/Common/mysqlxx/PoolFactory.cpp diff --git a/base/mysqlxx/PoolWithFailover.cpp b/src/Common/mysqlxx/PoolWithFailover.cpp similarity index 93% rename from base/mysqlxx/PoolWithFailover.cpp rename to src/Common/mysqlxx/PoolWithFailover.cpp index 14c0db9ecd5..e4da07c7a79 100644 --- a/base/mysqlxx/PoolWithFailover.cpp +++ b/src/Common/mysqlxx/PoolWithFailover.cpp @@ -3,13 +3,8 @@ #include #include #include - - -/// Duplicate of code from StringUtils.h. Copied here for less dependencies. -static bool startsWith(const std::string & s, const char * prefix) -{ - return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix)); -} +#include +#include using namespace mysqlxx; @@ -31,7 +26,7 @@ PoolWithFailover::PoolWithFailover( for (const auto & replica_config_key : replica_keys) { /// There could be another elements in the same level in configuration file, like "password", "port"... - if (startsWith(replica_config_key, "replica")) + if (replica_config_key.starts_with("replica")) { std::string replica_name = config_name_ + "." + replica_config_key; @@ -82,7 +77,9 @@ PoolWithFailover::PoolWithFailover( unsigned default_connections_, unsigned max_connections_, size_t max_tries_, - uint64_t wait_timeout_) + uint64_t wait_timeout_, + size_t connect_timeout_, + size_t rw_timeout_) : max_tries(max_tries_) , shareable(false) , wait_timeout(wait_timeout_) @@ -93,8 +90,8 @@ PoolWithFailover::PoolWithFailover( replicas_by_priority[0].emplace_back(std::make_shared(database, host, user, password, port, /* socket_ = */ "", - MYSQLXX_DEFAULT_TIMEOUT, - MYSQLXX_DEFAULT_RW_TIMEOUT, + connect_timeout_, + rw_timeout_, default_connections_, max_connections_)); } @@ -179,7 +176,7 @@ PoolWithFailover::Entry PoolWithFailover::get() return (*full_pool)->get(wait_timeout); } - std::stringstream message; + DB::WriteBufferFromOwnString message; message << "Connections to all replicas failed: "; for (auto it = replicas_by_priority.begin(); it != replicas_by_priority.end(); ++it) for (auto jt = it->second.begin(); jt != it->second.end(); ++jt) diff --git a/base/mysqlxx/Query.cpp b/src/Common/mysqlxx/Query.cpp similarity index 82% rename from base/mysqlxx/Query.cpp rename to src/Common/mysqlxx/Query.cpp index d4514c3e988..11e72e14f9d 100644 --- a/base/mysqlxx/Query.cpp +++ b/src/Common/mysqlxx/Query.cpp @@ -21,10 +21,7 @@ Query::Query(Connection * conn_, const std::string & query_string) : conn(conn_) /// Важно в случае, если Query используется не из того же потока, что Connection. mysql_thread_init(); - if (!query_string.empty()) - query_buf << query_string; - - query_buf.imbue(std::locale::classic()); + query = query_string; } Query::Query(const Query & other) : conn(other.conn) @@ -32,9 +29,7 @@ Query::Query(const Query & other) : conn(other.conn) /// Важно в случае, если Query используется не из того же потока, что Connection. 
mysql_thread_init(); - query_buf.imbue(std::locale::classic()); - - *this << other.str(); + query = other.query; } Query & Query::operator= (const Query & other) @@ -43,8 +38,7 @@ Query & Query::operator= (const Query & other) return *this; conn = other.conn; - - query_buf.str(other.str()); + query = other.query; return *this; } @@ -54,20 +48,13 @@ Query::~Query() mysql_thread_end(); } -void Query::reset() -{ - query_buf.str({}); -} - void Query::executeImpl() { - std::string query_string = query_buf.str(); - MYSQL* mysql_driver = conn->getDriver(); auto & logger = Poco::Logger::get("mysqlxx::Query"); logger.trace("Running MySQL query using connection %lu", mysql_thread_id(mysql_driver)); - if (mysql_real_query(mysql_driver, query_string.data(), query_string.size())) + if (mysql_real_query(mysql_driver, query.data(), query.size())) { const auto err_no = mysql_errno(mysql_driver); switch (err_no) diff --git a/base/mysqlxx/ResultBase.cpp b/src/Common/mysqlxx/ResultBase.cpp similarity index 100% rename from base/mysqlxx/ResultBase.cpp rename to src/Common/mysqlxx/ResultBase.cpp diff --git a/base/mysqlxx/Row.cpp b/src/Common/mysqlxx/Row.cpp similarity index 66% rename from base/mysqlxx/Row.cpp rename to src/Common/mysqlxx/Row.cpp index aecec46e519..861a04f8ece 100644 --- a/base/mysqlxx/Row.cpp +++ b/src/Common/mysqlxx/Row.cpp @@ -21,4 +21,12 @@ Value Row::operator[] (const char * name) const throw Exception(std::string("Unknown column ") + name); } +enum enum_field_types Row::getFieldType(size_t i) +{ + if (i >= res->getNumFields()) + throw Exception(std::string("Array Index Overflow")); + MYSQL_FIELDS fields = res->getFields(); + return fields[i].type; +} + } diff --git a/base/mysqlxx/UseQueryResult.cpp b/src/Common/mysqlxx/UseQueryResult.cpp similarity index 100% rename from base/mysqlxx/UseQueryResult.cpp rename to src/Common/mysqlxx/UseQueryResult.cpp diff --git a/base/mysqlxx/Value.cpp b/src/Common/mysqlxx/Value.cpp similarity index 93% rename from base/mysqlxx/Value.cpp rename to src/Common/mysqlxx/Value.cpp index ed66167e8ea..85b63b722a2 100644 --- a/base/mysqlxx/Value.cpp +++ b/src/Common/mysqlxx/Value.cpp @@ -156,19 +156,21 @@ void Value::throwException(const char * text) const { static constexpr size_t preview_length = 1000; - std::stringstream info; - info << text; + std::string info(text); if (!isNull()) { - info << ": "; - info.write(m_data, m_length); + info.append(": "); + info.append(m_data, m_length); } if (res && res->getQuery()) - info << ", query: " << res->getQuery()->str().substr(0, preview_length); + { + info.append(", query: "); + info.append(res->getQuery()->str().substr(0, preview_length)); + } - throw CannotParseValue(info.str()); + throw CannotParseValue(info); } } diff --git a/base/mysqlxx/Connection.h b/src/Common/mysqlxx/mysqlxx/Connection.h similarity index 99% rename from base/mysqlxx/Connection.h rename to src/Common/mysqlxx/mysqlxx/Connection.h index 65955136eb1..548e75a1fef 100644 --- a/base/mysqlxx/Connection.h +++ b/src/Common/mysqlxx/mysqlxx/Connection.h @@ -154,7 +154,7 @@ public: bool ping(); /// Creates query. It can be set with query string or later. - Query query(const std::string & str = ""); + Query query(const std::string & str); /// Get MySQL C API MYSQL object. 
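Editor note: several hunks above (mysqlxx::errorMessage, Value::throwException, PoolWithFailover::get, the Query refactor) replace std::stringstream-based message building with fmt::format, std::string::append or WriteBufferFromOwnString, since stringstream is avoided in this codebase. The fmt-based variant looks roughly like the sketch below; it assumes the {fmt} library (fmt/format.h) is available, and the helper name and sample values are illustrative only.

#include <iostream>
#include <string>

#include <fmt/format.h>

// Build "message (host:port)" without std::stringstream, in the spirit of mysqlxx::errorMessage().
static std::string formatError(const std::string & message, const char * host, unsigned port)
{
    return fmt::format("{} ({}:{})", message, host ? host : "(nullptr)", port);
}

int main()
{
    std::cout << formatError("Lost connection to MySQL server", "127.0.0.1", 3306) << "\n";
    std::cout << formatError("Lost connection to MySQL server", nullptr, 0) << "\n";
}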
MYSQL * getDriver(); diff --git a/base/mysqlxx/Exception.h b/src/Common/mysqlxx/mysqlxx/Exception.h similarity index 100% rename from base/mysqlxx/Exception.h rename to src/Common/mysqlxx/mysqlxx/Exception.h diff --git a/base/mysqlxx/Null.h b/src/Common/mysqlxx/mysqlxx/Null.h similarity index 100% rename from base/mysqlxx/Null.h rename to src/Common/mysqlxx/mysqlxx/Null.h diff --git a/base/mysqlxx/Pool.h b/src/Common/mysqlxx/mysqlxx/Pool.h similarity index 100% rename from base/mysqlxx/Pool.h rename to src/Common/mysqlxx/mysqlxx/Pool.h diff --git a/base/mysqlxx/PoolFactory.h b/src/Common/mysqlxx/mysqlxx/PoolFactory.h similarity index 100% rename from base/mysqlxx/PoolFactory.h rename to src/Common/mysqlxx/mysqlxx/PoolFactory.h diff --git a/base/mysqlxx/PoolWithFailover.h b/src/Common/mysqlxx/mysqlxx/PoolWithFailover.h similarity index 93% rename from base/mysqlxx/PoolWithFailover.h rename to src/Common/mysqlxx/mysqlxx/PoolWithFailover.h index 2bd5ec9f30a..17870d141e1 100644 --- a/base/mysqlxx/PoolWithFailover.h +++ b/src/Common/mysqlxx/mysqlxx/PoolWithFailover.h @@ -6,6 +6,7 @@ #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS 1 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS 16 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES 3 +#define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT 5 /// in seconds namespace mysqlxx @@ -121,7 +122,9 @@ namespace mysqlxx unsigned default_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, unsigned max_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS, size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES, - uint64_t wait_timeout_ = UINT64_MAX); + uint64_t wait_timeout_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT, + size_t connect_timeout = MYSQLXX_DEFAULT_TIMEOUT, + size_t rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT); PoolWithFailover(const PoolWithFailover & other); diff --git a/base/mysqlxx/Query.h b/src/Common/mysqlxx/mysqlxx/Query.h similarity index 74% rename from base/mysqlxx/Query.h rename to src/Common/mysqlxx/mysqlxx/Query.h index 036e8952bc3..49aa3f223e7 100644 --- a/base/mysqlxx/Query.h +++ b/src/Common/mysqlxx/mysqlxx/Query.h @@ -13,9 +13,7 @@ namespace mysqlxx * Ссылается на Connection. Если уничтожить Connection, то Query станет некорректным и пользоваться им будет нельзя. * * Пример использования: - * mysqlxx::Query query = connection.query(); - * query << "SELECT 1 AS x, 2 AS y, 3 AS z"; - * query << " LIMIT 1"; + * mysqlxx::Query query = connection.query("SELECT 1 AS x, 2 AS y, 3 AS z LIMIT 1"); * mysqlxx::UseQueryResult result = query.use(); * * while (mysqlxx::Row row = result.fetch()) @@ -29,14 +27,11 @@ namespace mysqlxx class Query { public: - Query(Connection * conn_, const std::string & query_string = ""); + Query(Connection * conn_, const std::string & query_string); Query(const Query & other); Query & operator= (const Query & other); ~Query(); - /** Сбросить текст запроса. Это используется, если нужно написать новый запрос в том же объекте. */ - void reset(); - /** Выполнить запрос, результат которого не имеет значения (почти всё кроме SELECT). */ void execute(); @@ -54,24 +49,12 @@ public: /// Получить текст запроса (например, для вывода его в лог). См. ещё operator<< ниже. 
std::string str() const { - return query_buf.str(); - } - - auto rdbuf() const - { - return query_buf.rdbuf(); - } - - template - inline Query & operator<< (T && x) - { - query_buf << std::forward(x); - return *this; + return query; } private: Connection * conn; - std::ostringstream query_buf; + std::string query; void executeImpl(); }; @@ -80,7 +63,7 @@ private: /// Вывести текст запроса в ostream. inline std::ostream & operator<< (std::ostream & ostr, const Query & query) { - return ostr << query.rdbuf(); + return ostr << query.str(); } diff --git a/base/mysqlxx/ResultBase.h b/src/Common/mysqlxx/mysqlxx/ResultBase.h similarity index 100% rename from base/mysqlxx/ResultBase.h rename to src/Common/mysqlxx/mysqlxx/ResultBase.h diff --git a/base/mysqlxx/Row.h b/src/Common/mysqlxx/mysqlxx/Row.h similarity index 98% rename from base/mysqlxx/Row.h rename to src/Common/mysqlxx/mysqlxx/Row.h index d668fdbd29a..b11d7d628ef 100644 --- a/base/mysqlxx/Row.h +++ b/src/Common/mysqlxx/mysqlxx/Row.h @@ -79,6 +79,8 @@ public: */ operator private_bool_type() const { return row == nullptr ? nullptr : &Row::row; } + enum enum_field_types getFieldType(size_t i); + private: MYSQL_ROW row{}; ResultBase * res{}; diff --git a/base/mysqlxx/Transaction.h b/src/Common/mysqlxx/mysqlxx/Transaction.h similarity index 100% rename from base/mysqlxx/Transaction.h rename to src/Common/mysqlxx/mysqlxx/Transaction.h diff --git a/base/mysqlxx/Types.h b/src/Common/mysqlxx/mysqlxx/Types.h similarity index 94% rename from base/mysqlxx/Types.h rename to src/Common/mysqlxx/mysqlxx/Types.h index b5ed70916fa..5fd9aa8bbc8 100644 --- a/base/mysqlxx/Types.h +++ b/src/Common/mysqlxx/mysqlxx/Types.h @@ -16,6 +16,8 @@ using MYSQL_ROW = char**; struct st_mysql_field; using MYSQL_FIELD = st_mysql_field; +enum struct enum_field_types; + #endif namespace mysqlxx diff --git a/base/mysqlxx/UseQueryResult.h b/src/Common/mysqlxx/mysqlxx/UseQueryResult.h similarity index 100% rename from base/mysqlxx/UseQueryResult.h rename to src/Common/mysqlxx/mysqlxx/UseQueryResult.h diff --git a/base/mysqlxx/Value.h b/src/Common/mysqlxx/mysqlxx/Value.h similarity index 99% rename from base/mysqlxx/Value.h rename to src/Common/mysqlxx/mysqlxx/Value.h index 6d3b2e96ebd..797a65a63f9 100644 --- a/base/mysqlxx/Value.h +++ b/src/Common/mysqlxx/mysqlxx/Value.h @@ -10,10 +10,10 @@ #include #include -#include +#include #include -#include +#include namespace mysqlxx diff --git a/base/mysqlxx/mysqlxx.h b/src/Common/mysqlxx/mysqlxx/mysqlxx.h similarity index 94% rename from base/mysqlxx/mysqlxx.h rename to src/Common/mysqlxx/mysqlxx/mysqlxx.h index 0caadcbb720..785d4361fd7 100644 --- a/base/mysqlxx/mysqlxx.h +++ b/src/Common/mysqlxx/mysqlxx/mysqlxx.h @@ -3,8 +3,8 @@ #include #include #include -#include -#include +#include +#include #include @@ -23,7 +23,7 @@ * where values are stored consecutively as (non-zero-terminated) strings. * * 2. Too slow methods for converting values to numbers. - * In mysql++, it is done through std::stringstream. + * In mysql++, it is done through std::s*****stream (it is banned in our codebase). * This is slower than POSIX functions (strtoul, etc). * In turn, this is slower than simple hand-coded functions, * that doesn't respect locales and unused by MySQL number representations. 
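// --- Illustrative sketch, not part of the patch above ---
// The mysqlxx.h comment just above argues that converting result values through
// a string stream is slower than POSIX functions such as strtoul, which in turn
// are slower than a locale-agnostic hand-coded loop over the (non-zero-terminated)
// buffers MySQL returns. A minimal standalone approximation of that hand-coded
// variant, assuming plain ASCII digits and skipping overflow/validity checks:

#include <cstdint>
#include <cstddef>
#include <cassert>

// Parses an unsigned decimal integer from a non-null-terminated buffer,
// roughly the way a Value-like class could do it without streams or strtoul.
static uint64_t parseUnsigned(const char * data, size_t length)
{
    uint64_t res = 0;
    for (size_t i = 0; i < length; ++i)
        res = res * 10 + static_cast<uint64_t>(data[i] - '0');
    return res;
}

int main()
{
    const char buf[] = {'1', '2', '3', '4'};   // deliberately not zero-terminated
    assert(parseUnsigned(buf, sizeof(buf)) == 1234);
    return 0;
}
// --- end of sketch ---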
diff --git a/base/mysqlxx/tests/CMakeLists.txt b/src/Common/mysqlxx/tests/CMakeLists.txt similarity index 100% rename from base/mysqlxx/tests/CMakeLists.txt rename to src/Common/mysqlxx/tests/CMakeLists.txt diff --git a/base/mysqlxx/tests/mysqlxx_pool_test.cpp b/src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp similarity index 90% rename from base/mysqlxx/tests/mysqlxx_pool_test.cpp rename to src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp index 3dc23e4da85..61d6a117285 100644 --- a/base/mysqlxx/tests/mysqlxx_pool_test.cpp +++ b/src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp @@ -2,7 +2,6 @@ #include #include -#include #include @@ -41,10 +40,7 @@ mysqlxx::Pool::Entry getWithFailover(mysqlxx::Pool & connections_pool) std::this_thread::sleep_for(1s); } - std::stringstream message; - message << "Connections to all replicas failed: " << connections_pool.getDescription(); - - throw Poco::Exception(message.str()); + throw Poco::Exception("Connections to all replicas failed: " + connections_pool.getDescription()); } } @@ -69,8 +65,7 @@ int main(int, char **) std::clog << "Preparing query (5s sleep) ..."; std::this_thread::sleep_for(5s); - mysqlxx::Query query = worker->query(); - query << test_query; + mysqlxx::Query query = worker->query(test_query); std::clog << "ok" << std::endl; std::clog << "Querying result (5s sleep) ..."; diff --git a/src/Common/parseRemoteDescription.cpp b/src/Common/parseRemoteDescription.cpp index 7c8053037ea..fa5d3a8fbd5 100644 --- a/src/Common/parseRemoteDescription.cpp +++ b/src/Common/parseRemoteDescription.cpp @@ -41,7 +41,7 @@ static void append(std::vector & to, const std::vector & what, s static bool parseNumber(const String & description, size_t l, size_t r, size_t & res) { res = 0; - for (size_t pos = l; pos < r; pos ++) + for (size_t pos = l; pos < r; ++pos) { if (!isNumericASCII(description[pos])) return false; diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp index a621f05c517..1220c50b409 100644 --- a/src/Common/tests/gtest_DateLUTImpl.cpp +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include diff --git a/src/Common/tests/gtest_local_date_time_comparison.cpp b/src/Common/tests/gtest_local_date_time_comparison.cpp index 9f66da51c94..8aea710ea55 100644 --- a/src/Common/tests/gtest_local_date_time_comparison.cpp +++ b/src/Common/tests/gtest_local_date_time_comparison.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include void fillStackWithGarbage() diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index b5f00c60827..d87d0f8b4ee 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -117,7 +117,7 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( }; ISerialization::SubstreamPath path; - column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type, nullptr); + column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type); if (!result_codec) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. 
It's a bug", column_type->getName()); diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 2c730ee16ed..74e093284a8 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -252,7 +252,7 @@ public: catch (const Exception & ex) { if (ex.code() == ErrorCodes::UNKNOWN_FORMAT_VERSION) - throw ex; + throw; result.error = true; LOG_WARNING(log, "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 171fa2986eb..82ea100bccb 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -75,6 +75,17 @@ std::string checkAndGetSuperdigest(const String & user_and_digest) return user_and_digest; } +int32_t getValueOrMaxInt32AndLogWarning(uint64_t value, const std::string & name, Poco::Logger * log) +{ + if (value > std::numeric_limits::max()) + { + LOG_WARNING(log, "Got {} value for setting '{}' which is bigger than int32_t max value, lowering value to {}.", value, name, std::numeric_limits::max()); + return std::numeric_limits::max(); + } + + return static_cast(value); +} + } KeeperServer::KeeperServer( @@ -134,18 +145,18 @@ void KeeperServer::startup() } nuraft::raft_params params; - params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds(); - params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(); - params.election_timeout_upper_bound_ = coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(); - - params.reserved_log_items_ = coordination_settings->reserved_log_items; - params.snapshot_distance_ = coordination_settings->snapshot_distance; - params.stale_log_gap_ = coordination_settings->stale_log_gap; - params.fresh_log_gap_ = coordination_settings->fresh_log_gap; - params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds(); + params.heart_beat_interval_ = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log); + params.election_timeout_lower_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(), "election_timeout_lower_bound_ms", log); + params.election_timeout_upper_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(), "election_timeout_upper_bound_ms", log); + params.reserved_log_items_ = getValueOrMaxInt32AndLogWarning(coordination_settings->reserved_log_items, "reserved_log_items", log); + params.snapshot_distance_ = getValueOrMaxInt32AndLogWarning(coordination_settings->snapshot_distance, "snapshot_distance", log); + params.stale_log_gap_ = getValueOrMaxInt32AndLogWarning(coordination_settings->stale_log_gap, "stale_log_gap", log); + params.fresh_log_gap_ = getValueOrMaxInt32AndLogWarning(coordination_settings->fresh_log_gap, "fresh_log_gap", log); + params.client_req_timeout_ = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds(), "operation_timeout_ms", log); params.auto_forwarding_ = coordination_settings->auto_forwarding; - params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2; - params.max_append_size_ = coordination_settings->max_requests_batch_size; + params.auto_forwarding_req_timeout_ = 
std::max(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, std::numeric_limits::max()); + params.auto_forwarding_req_timeout_ = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, "operation_timeout_ms", log); + params.max_append_size_ = getValueOrMaxInt32AndLogWarning(coordination_settings->max_requests_batch_size, "max_requests_batch_size", log); params.return_method_ = nuraft::raft_params::async_handler; diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index be6d4db4219..518d569ca67 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -55,7 +55,7 @@ namespace return "/"; } - void writeNode(const KeeperStorage::Node & node, WriteBuffer & out) + void writeNode(const KeeperStorage::Node & node, SnapshotVersion version, WriteBuffer & out) { writeBinary(node.data, out); @@ -76,6 +76,11 @@ namespace writeBinary(node.stat.pzxid, out); writeBinary(node.seq_num, out); + + if (version >= SnapshotVersion::V4) + { + writeBinary(node.size_bytes, out); + } } void readNode(KeeperStorage::Node & node, ReadBuffer & in, SnapshotVersion version, ACLMap & acl_map) @@ -124,6 +129,11 @@ namespace readBinary(node.stat.numChildren, in); readBinary(node.stat.pzxid, in); readBinary(node.seq_num, in); + + if (version >= SnapshotVersion::V4) + { + readBinary(node.size_bytes, in); + } } void serializeSnapshotMetadata(const SnapshotMetadataPtr & snapshot_meta, WriteBuffer & out) @@ -176,7 +186,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to serialize node with mzxid {}, but last snapshot index {}", node.stat.mzxid, snapshot.snapshot_meta->get_last_log_idx()); writeBinary(path, out); - writeNode(node, out); + writeNode(node, snapshot.version, out); /// Last iteration: check and exit here without iterator increment. Otherwise /// false positive race condition on list end is possible. 
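// --- Illustrative sketch, not the real KeeperSnapshotManager API ---
// The hunk above writes the new size_bytes field only when the snapshot version
// is at least V4, so snapshots produced before the field existed still load.
// A standalone approximation of that compatibility pattern; the names
// SketchVersion, MiniNode and the append/read helpers are assumptions:

#include <cstdint>
#include <cstring>
#include <vector>
#include <cassert>

enum class SketchVersion : uint8_t { V3 = 3, V4 = 4 };

struct MiniNode
{
    int32_t seq_num = 0;
    uint64_t size_bytes = 0;    // field that appeared only in V4
};

static void appendPod(std::vector<uint8_t> & out, const void * p, size_t n)
{
    const auto * bytes = static_cast<const uint8_t *>(p);
    out.insert(out.end(), bytes, bytes + n);
}

static void writeNodeSketch(const MiniNode & node, SketchVersion version, std::vector<uint8_t> & out)
{
    appendPod(out, &node.seq_num, sizeof(node.seq_num));
    if (version >= SketchVersion::V4)                       // gate the new field by version
        appendPod(out, &node.size_bytes, sizeof(node.size_bytes));
}

static void readNodeSketch(MiniNode & node, SketchVersion version, const std::vector<uint8_t> & in)
{
    size_t pos = 0;
    std::memcpy(&node.seq_num, in.data() + pos, sizeof(node.seq_num));
    pos += sizeof(node.seq_num);
    if (version >= SketchVersion::V4)                       // older snapshots simply stop here
        std::memcpy(&node.size_bytes, in.data() + pos, sizeof(node.size_bytes));
}

int main()
{
    MiniNode n{42, 128}, restored;
    std::vector<uint8_t> buf;
    writeNodeSketch(n, SketchVersion::V4, buf);
    readNodeSketch(restored, SketchVersion::V4, buf);
    assert(restored.seq_num == 42 && restored.size_bytes == 128);
    return 0;
}
// --- end of sketch ---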
diff --git a/src/Coordination/KeeperSnapshotManager.h b/src/Coordination/KeeperSnapshotManager.h index 2889ec493df..174864a0ceb 100644 --- a/src/Coordination/KeeperSnapshotManager.h +++ b/src/Coordination/KeeperSnapshotManager.h @@ -18,9 +18,10 @@ enum SnapshotVersion : uint8_t V1 = 1, /// with ACL map V2 = 2, /// with 64 bit buffer header V3 = 3, /// compress snapshots with ZSTD codec + V4 = 4, /// add Node size to snapshots }; -static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V3; +static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V4; /// What is stored in binary shapsnot struct SnapshotDeserializationResult diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 81bb3d0dd7d..a64a7d425f6 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -91,8 +91,7 @@ static bool checkACL(int32_t permission, const Coordination::ACLs & node_acls, c static bool fixupACL( const std::vector & request_acls, const std::vector & current_ids, - std::vector & result_acls, - bool hash_acls) + std::vector & result_acls) { if (request_acls.empty()) return true; @@ -125,29 +124,12 @@ static bool fixupACL( return false; valid_found = true; - if (hash_acls) - new_acl.id = generateDigest(new_acl.id); result_acls.push_back(new_acl); } } return valid_found; } -uint64_t KeeperStorage::Node::sizeInBytes() const -{ - uint64_t total_size{0}; - for (const auto & child : children) - total_size += child.size(); - - total_size += data.size(); - - total_size += sizeof(acl_id); - total_size += sizeof(is_sequental); - total_size += sizeof(stat); - total_size += sizeof(seq_num); - return total_size; -} - static KeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches, Coordination::Event event_type) { KeeperStorage::ResponsesForSessions result; @@ -325,7 +307,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr KeeperStorage::Node created_node; Coordination::ACLs node_acls; - if (!fixupACL(request.acls, session_auth_ids, node_acls, !request.restored_from_zookeeper_log)) + if (!fixupACL(request.acls, session_auth_ids, node_acls)) { response.error = Coordination::Error::ZINVALIDACL; return {response_ptr, {}}; @@ -354,6 +336,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr { parent.children.insert(child_path); + parent.size_bytes += child_path.size(); prev_parent_cversion = parent.stat.cversion; prev_parent_zxid = parent.stat.pzxid; @@ -391,6 +374,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr undo_parent.stat.cversion = prev_parent_cversion; undo_parent.stat.pzxid = prev_parent_zxid; undo_parent.children.erase(child_path); + undo_parent.size_bytes -= child_path.size(); }); }; @@ -524,6 +508,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr --parent.stat.numChildren; ++parent.stat.cversion; parent.children.erase(child_basename); + parent.size_bytes -= child_basename.size(); }); response.error = Coordination::Error::ZOK; @@ -543,6 +528,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr ++parent.stat.numChildren; --parent.stat.cversion; parent.children.insert(child_basename); + parent.size_bytes += child_basename.size(); }); }; } @@ -621,11 +607,11 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce auto itr = 
container.updateValue(request.path, [zxid, request] (KeeperStorage::Node & value) { - value.data = request.data; value.stat.version++; value.stat.mzxid = zxid; value.stat.mtime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1); value.stat.dataLength = request.data.length(); + value.size_bytes = value.size_bytes + request.data.size() - value.data.size(); value.data = request.data; }); @@ -789,7 +775,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr auto & session_auth_ids = storage.session_and_auth[session_id]; Coordination::ACLs node_acls; - if (!fixupACL(request.acls, session_auth_ids, node_acls, !request.restored_from_zookeeper_log)) + if (!fixupACL(request.acls, session_auth_ids, node_acls)) { response.error = Coordination::Error::ZINVALIDACL; return {response_ptr, {}}; @@ -1110,6 +1096,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina --parent.stat.numChildren; ++parent.stat.cversion; parent.children.erase(getBaseName(ephemeral_path)); + parent.size_bytes -= getBaseName(ephemeral_path).size(); }); auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED); diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index 44dc1b2b43b..f61b17a88a6 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -35,9 +35,22 @@ public: Coordination::Stat stat{}; int32_t seq_num = 0; ChildrenSet children{}; + uint64_t size_bytes; // save size to avoid calculate every time + Node() + { + size_bytes = sizeof(size_bytes); + size_bytes += data.size(); + size_bytes += sizeof(acl_id); + size_bytes += sizeof(is_sequental); + size_bytes += sizeof(stat); + size_bytes += sizeof(seq_num); + } /// Object memory size - uint64_t sizeInBytes() const; + uint64_t sizeInBytes() const + { + return size_bytes; + } }; struct ResponseForSession diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index d5498a1bc13..d274ee34a88 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1,6 +1,5 @@ #include -#include #include "config_core.h" #if USE_NURAFT @@ -15,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -979,24 +977,24 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize) world.disableSnapshotMode(); world.insert("world", n1); - EXPECT_EQ(world.getApproximateDataSize(), 94); + EXPECT_EQ(world.getApproximateDataSize(), 98); world.updateValue("world", [&](Node & value) { value = n2; }); - EXPECT_EQ(world.getApproximateDataSize(), 96); + EXPECT_EQ(world.getApproximateDataSize(), 98); world.erase("world"); EXPECT_EQ(world.getApproximateDataSize(), 0); world.enableSnapshotMode(); world.insert("world", n1); - EXPECT_EQ(world.getApproximateDataSize(), 94); + EXPECT_EQ(world.getApproximateDataSize(), 98); world.updateValue("world", [&](Node & value) { value = n2; }); - EXPECT_EQ(world.getApproximateDataSize(), 190); + EXPECT_EQ(world.getApproximateDataSize(), 196); world.clearOutdatedNodes(); - EXPECT_EQ(world.getApproximateDataSize(), 96); + EXPECT_EQ(world.getApproximateDataSize(), 98); world.erase("world"); - EXPECT_EQ(world.getApproximateDataSize(), 96); + EXPECT_EQ(world.getApproximateDataSize(), 98); world.clear(); EXPECT_EQ(world.getApproximateDataSize(), 0); diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 168ee346626..85eb6264220 100644 --- a/src/Core/Block.cpp 
+++ b/src/Core/Block.cpp @@ -9,6 +9,7 @@ #include #include +#include #include @@ -37,7 +38,7 @@ static ReturnType onError(const std::string & message [[maybe_unused]], int code template static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, const ColumnWithTypeAndName & expected, - const std::string & context_description, bool allow_remove_constants, int code) + const std::string & context_description, bool allow_materialize, int code) { if (actual.name != expected.name) return onError("Block structure mismatch in " + context_description + " stream: different names of columns:\n" @@ -52,11 +53,16 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con const IColumn * actual_column = actual.column.get(); - /// If we allow to remove constants, and expected column is not const, then unwrap actual constant column. - if (allow_remove_constants && !isColumnConst(*expected.column)) + /// If we allow to materialize, and expected column is not const or sparse, then unwrap actual column. + if (allow_materialize) { - if (const auto * column_const = typeid_cast(actual_column)) - actual_column = &column_const->getDataColumn(); + if (!isColumnConst(*expected.column)) + if (const auto * column_const = typeid_cast(actual_column)) + actual_column = &column_const->getDataColumn(); + + if (!expected.column->isSparse()) + if (const auto * column_sparse = typeid_cast(actual_column)) + actual_column = &column_sparse->getValuesColumn(); } if (actual_column->getName() != expected.column->getName()) @@ -79,7 +85,7 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con template -static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool allow_remove_constants) +static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool allow_materialize) { size_t columns = rhs.columns(); if (lhs.columns() != columns) @@ -93,11 +99,11 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons if constexpr (std::is_same_v) { - if (!checkColumnStructure(actual, expected, context_description, allow_remove_constants, ErrorCodes::LOGICAL_ERROR)) + if (!checkColumnStructure(actual, expected, context_description, allow_materialize, ErrorCodes::LOGICAL_ERROR)) return false; } else - checkColumnStructure(actual, expected, context_description, allow_remove_constants, ErrorCodes::LOGICAL_ERROR); + checkColumnStructure(actual, expected, context_description, allow_materialize, ErrorCodes::LOGICAL_ERROR); } return ReturnType(true); @@ -139,15 +145,17 @@ void Block::insert(size_t position, ColumnWithTypeAndName elem) if (elem.name.empty()) throw Exception("Column name in Block cannot be empty", ErrorCodes::AMBIGUOUS_COLUMN_NAME); - for (auto & name_pos : index_by_name) - if (name_pos.second >= position) - ++name_pos.second; - - auto [it, inserted] = index_by_name.emplace(elem.name, position); + auto [new_it, inserted] = index_by_name.emplace(elem.name, position); if (!inserted) - checkColumnStructure(data[it->second], elem, + checkColumnStructure(data[new_it->second], elem, "(columns with identical name must have identical structure)", true, ErrorCodes::AMBIGUOUS_COLUMN_NAME); + for (auto it = index_by_name.begin(); it != index_by_name.end(); ++it) + { + if (it->second >= position && (!inserted || it != new_it)) + ++it->second; + } + data.emplace(data.begin() + position, std::move(elem)); } @@ -203,7 +211,7 @@ void 
Block::eraseImpl(size_t position) for (auto it = index_by_name.begin(); it != index_by_name.end();) { if (it->second == position) - index_by_name.erase(it++); + it = index_by_name.erase(it); else { if (it->second > position) @@ -706,6 +714,11 @@ void Block::updateHash(SipHash & hash) const col.column->updateHashWithValue(row_no, hash); } +void convertToFullIfSparse(Block & block) +{ + for (auto & column : block) + column.column = recursiveRemoveSparse(column.column); +} ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column) { @@ -729,7 +742,7 @@ Block materializeBlock(const Block & block) for (size_t i = 0; i < columns; ++i) { auto & element = res.getByPosition(i); - element.column = element.column->convertToFullColumnIfConst(); + element.column = recursiveRemoveSparse(element.column->convertToFullColumnIfConst()); } return res; @@ -738,7 +751,7 @@ Block materializeBlock(const Block & block) void materializeBlockInplace(Block & block) { for (size_t i = 0; i < block.columns(); ++i) - block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst(); + block.getByPosition(i).column = recursiveRemoveSparse(block.getByPosition(i).column->convertToFullColumnIfConst()); } } diff --git a/src/Core/Block.h b/src/Core/Block.h index c0c9391e3b2..cad29dea7e6 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -193,6 +193,8 @@ void assertCompatibleHeader(const Block & actual, const Block & desired, const s /// Calculate difference in structure of blocks and write description into output strings. NOTE It doesn't compare values of constant columns. void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out_lhs_diff, std::string & out_rhs_diff); +void convertToFullIfSparse(Block & block); + /// Helps in-memory storages to extract columns from block. /// Properly handles cases, when column is a subcolumn and when it is compressed. 
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column); diff --git a/src/Core/ExternalTable.cpp b/src/Core/ExternalTable.cpp index b4adbcc0662..3b515fab5c9 100644 --- a/src/Core/ExternalTable.cpp +++ b/src/Core/ExternalTable.cpp @@ -169,7 +169,7 @@ void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, processors.push_back(std::move(sink)); processors.push_back(std::move(exception_handling)); - auto executor = std::make_shared(processors); + auto executor = std::make_shared(processors, getContext()->getProcessListElement()); executor->execute(/*num_threads = */ 1); /// We are ready to receive the next file, for this we clear all the information received diff --git a/src/Core/Field.h b/src/Core/Field.h index a9fb73393cf..19573ed9831 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -808,11 +808,27 @@ auto & Field::safeGet() template T & Field::reinterpret() { + assert(which != Types::String); // See specialization for char using ValueType = std::decay_t; ValueType * MAY_ALIAS ptr = reinterpret_cast(&storage); return *ptr; } +// Specialize reinterpreting to char (used in ColumnUnique) to make sure Strings are reinterpreted correctly +// inline to avoid multiple definitions +template <> +inline char & Field::reinterpret() +{ + if (which == Types::String) + { + // For String we want to return a pointer to the data, not the start of the class + // as the layout of std::string depends on the STD version and options + char * ptr = reinterpret_cast(&storage)->data(); + return *ptr; + } + return *reinterpret_cast(&storage); +} + template T get(const Field & field) { diff --git a/src/Core/MySQL/Authentication.cpp b/src/Core/MySQL/Authentication.cpp index 4dd20ff585e..0492211c51f 100644 --- a/src/Core/MySQL/Authentication.cpp +++ b/src/Core/MySQL/Authentication.cpp @@ -71,7 +71,7 @@ Native41::Native41(const String & password_, const String & scramble_) const Poco::SHA1Engine::Digest & digest = engine3.digest(); scramble.resize(SCRAMBLE_LENGTH); - for (size_t i = 0; i < SCRAMBLE_LENGTH; i++) + for (size_t i = 0; i < SCRAMBLE_LENGTH; ++i) scramble[i] = static_cast(password_sha1[i] ^ digest[i]); } @@ -191,7 +191,7 @@ void Sha256Password::authenticate( } password.resize(plaintext_size); - for (int i = 0; i < plaintext_size; i++) + for (int i = 0; i < plaintext_size; ++i) { password[i] = plaintext[i] ^ static_cast(scramble[i % SCRAMBLE_LENGTH]); } diff --git a/src/Core/MySQL/MySQLGtid.cpp b/src/Core/MySQL/MySQLGtid.cpp index a441bccb076..bfd0bd02b45 100644 --- a/src/Core/MySQL/MySQLGtid.cpp +++ b/src/Core/MySQL/MySQLGtid.cpp @@ -41,7 +41,7 @@ void GTIDSets::parse(const String gtid_format) GTIDSet set; set.uuid = DB::parse(server_ids[0]); - for (size_t k = 1; k < server_ids.size(); k++) + for (size_t k = 1; k < server_ids.size(); ++k) { std::vector inters; boost::split(inters, server_ids[k], [](char c) { return c == '-'; }); @@ -74,7 +74,7 @@ void GTIDSets::update(const GTID & other) { if (set.uuid == other.uuid) { - for (auto i = 0U; i < set.intervals.size(); i++) + for (auto i = 0U; i < set.intervals.size(); ++i) { auto & current = set.intervals[i]; @@ -134,7 +134,7 @@ String GTIDSets::toString() const { WriteBufferFromOwnString buffer; - for (size_t i = 0; i < sets.size(); i++) + for (size_t i = 0; i < sets.size(); ++i) { GTIDSet set = sets[i]; writeUUIDText(set.uuid, buffer); diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index f734154f4ba..fb230f412f0 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ 
b/src/Core/MySQL/MySQLReplication.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -159,7 +159,7 @@ namespace MySQLReplication payload.ignore(1); column_count = readLengthEncodedNumber(payload); - for (auto i = 0U; i < column_count; i++) + for (auto i = 0U; i < column_count; ++i) { UInt8 v = 0x00; payload.readStrict(reinterpret_cast(&v), 1); @@ -188,7 +188,7 @@ namespace MySQLReplication { auto pos = 0; column_meta.reserve(column_count); - for (auto i = 0U; i < column_count; i++) + for (auto i = 0U; i < column_count; ++i) { UInt16 typ = column_type[i]; switch (typ) @@ -230,6 +230,7 @@ namespace MySQLReplication pos += 2; break; } + case MYSQL_TYPE_BIT: case MYSQL_TYPE_VARCHAR: case MYSQL_TYPE_VAR_STRING: { /// Little-Endian @@ -255,7 +256,7 @@ namespace MySQLReplication out << "Table Len: " << std::to_string(this->table_len) << '\n'; out << "Table: " << this->table << '\n'; out << "Column Count: " << this->column_count << '\n'; - for (auto i = 0U; i < column_count; i++) + for (UInt32 i = 0; i < column_count; ++i) { out << "Column Type [" << i << "]: " << std::to_string(column_type[i]) << ", Meta: " << column_meta[i] << '\n'; } @@ -312,7 +313,7 @@ namespace MySQLReplication UInt32 null_index = 0; UInt32 re_count = 0; - for (auto i = 0U; i < number_columns; i++) + for (UInt32 i = 0; i < number_columns; ++i) { if (bitmap[i]) re_count++; @@ -321,7 +322,7 @@ namespace MySQLReplication boost::dynamic_bitset<> columns_null_set; readBitmap(payload, columns_null_set, re_count); - for (auto i = 0U; i < number_columns; i++) + for (UInt32 i = 0; i < number_columns; ++i) { UInt32 field_len = 0; @@ -523,7 +524,7 @@ namespace MySQLReplication res += (val ^ (mask & compressed_integer_align_numbers[compressed_integers])); } - for (auto k = 0U; k < uncompressed_integers; k++) + for (size_t k = 0; k < uncompressed_integers; ++k) { UInt32 val = 0; readBigEndianStrict(payload, reinterpret_cast(&val), 4); @@ -536,7 +537,7 @@ namespace MySQLReplication size_t uncompressed_decimals = scale / digits_per_integer; size_t compressed_decimals = scale - (uncompressed_decimals * digits_per_integer); - for (auto k = 0U; k < uncompressed_decimals; k++) + for (size_t k = 0; k < uncompressed_decimals; ++k) { UInt32 val = 0; readBigEndianStrict(payload, reinterpret_cast(&val), 4); @@ -584,6 +585,15 @@ namespace MySQLReplication } break; } + case MYSQL_TYPE_BIT: + { + UInt32 bits = ((meta >> 8) * 8) + (meta & 0xff); + UInt32 size = (bits + 7) / 8; + UInt64 val = 0UL; + readBigEndianStrict(payload, reinterpret_cast(&val), size); + row.push_back(val); + break; + } case MYSQL_TYPE_VARCHAR: case MYSQL_TYPE_VAR_STRING: { @@ -669,7 +679,7 @@ namespace MySQLReplication header.dump(out); out << "Schema: " << this->schema << '\n'; out << "Table: " << this->table << '\n'; - for (auto i = 0U; i < rows.size(); i++) + for (size_t i = 0; i < rows.size(); ++i) { out << "Row[" << i << "]: " << applyVisitor(to_string, rows[i]) << '\n'; } diff --git a/src/Core/MySQL/PacketsProtocolText.cpp b/src/Core/MySQL/PacketsProtocolText.cpp index 0494a146c47..728e8061e87 100644 --- a/src/Core/MySQL/PacketsProtocolText.cpp +++ b/src/Core/MySQL/PacketsProtocolText.cpp @@ -15,7 +15,7 @@ namespace ProtocolText ResultSetRow::ResultSetRow(const Serializations & serializations, const Columns & columns_, int row_num_) : columns(columns_), row_num(row_num_) { - for (size_t i = 0; i < columns.size(); i++) + for (size_t i = 0; i < columns.size(); ++i) { if (columns[i]->isNullAt(row_num)) { @@ -39,7 +39,7 @@ size_t 
ResultSetRow::getPayloadSize() const void ResultSetRow::writePayloadImpl(WriteBuffer & buffer) const { - for (size_t i = 0; i < columns.size(); i++) + for (size_t i = 0; i < columns.size(); ++i) { if (columns[i]->isNullAt(row_num)) buffer.write(serialized[i].data(), 1); diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index b47f5a6823b..b9098d3308d 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB @@ -43,6 +44,17 @@ String NameAndTypePair::getSubcolumnName() const return name.substr(*subcolumn_delimiter_position + 1, name.size() - *subcolumn_delimiter_position); } +String NameAndTypePair::dump() const +{ + WriteBufferFromOwnString out; + out << "name: " << name << "\n" + << "type: " << type->getName() << "\n" + << "name in storage: " << getNameInStorage() << "\n" + << "type in storage: " << getTypeInStorage()->getName(); + + return out.str(); +} + void NamesAndTypesList::readText(ReadBuffer & buf) { const DataTypeFactory & data_type_factory = DataTypeFactory::instance(); diff --git a/src/Core/NamesAndTypes.h b/src/Core/NamesAndTypes.h index 58b5189db63..3ac9ad2fa02 100644 --- a/src/Core/NamesAndTypes.h +++ b/src/Core/NamesAndTypes.h @@ -40,6 +40,8 @@ public: return name == rhs.name && type->equals(*rhs.type); } + String dump() const; + String name; DataTypePtr type; diff --git a/src/Core/PostgreSQL/Connection.cpp b/src/Core/PostgreSQL/Connection.cpp index 75786a51d92..f97a35a9e92 100644 --- a/src/Core/PostgreSQL/Connection.cpp +++ b/src/Core/PostgreSQL/Connection.cpp @@ -12,10 +12,7 @@ Connection::Connection(const ConnectionInfo & connection_info_, bool replication , log(&Poco::Logger::get("PostgreSQLReplicaConnection")) { if (replication) - { - connection_info = std::make_pair( - fmt::format("{} replication=database", connection_info.first), connection_info.second); - } + connection_info = {fmt::format("{} replication=database", connection_info.connection_string), connection_info.host_port}; } void Connection::execWithRetry(const std::function & exec) @@ -61,11 +58,14 @@ void Connection::updateConnection() { if (connection) connection->close(); + /// Always throws if there is no connection. - connection = std::make_unique(connection_info.first); + connection = std::make_unique(connection_info.connection_string); + if (replication) connection->set_variable("default_transaction_isolation", "'repeatable read'"); - LOG_DEBUG(&Poco::Logger::get("PostgreSQLConnection"), "New connection to {}", connection_info.second); + + LOG_DEBUG(&Poco::Logger::get("PostgreSQLConnection"), "New connection to {}", connection_info.host_port); } void Connection::connect() diff --git a/src/Core/PostgreSQL/Connection.h b/src/Core/PostgreSQL/Connection.h index d65c38643c1..8c5609dc66b 100644 --- a/src/Core/PostgreSQL/Connection.h +++ b/src/Core/PostgreSQL/Connection.h @@ -8,19 +8,26 @@ #include #include -/* Methods to work with PostgreSQL connection object. +/** Methods to work with PostgreSQL connection object. * Should only be used in case there has to be a single connection object, which * is long-lived and there are no concurrent connection queries. - * Now only use case - for replication handler for replication from PostgreSQL. - * In all other integration engine use pool with failover. 
- **/ + */ namespace Poco { class Logger; } +namespace pqxx +{ + using ConnectionPtr = std::unique_ptr; +} + namespace postgres { -using ConnectionInfo = std::pair; -using ConnectionPtr = std::unique_ptr; + +struct ConnectionInfo +{ + String connection_string; + String host_port; /// For logs. +}; class Connection : private boost::noncopyable { @@ -33,14 +40,17 @@ public: void connect(); + void updateConnection(); + void tryUpdateConnection(); const ConnectionInfo & getConnectionInfo() { return connection_info; } -private: - void updateConnection(); + String getInfoForLog() const { return connection_info.host_port; } - ConnectionPtr connection; +private: + + pqxx::ConnectionPtr connection; ConnectionInfo connection_info; bool replication; @@ -48,6 +58,9 @@ private: Poco::Logger * log; }; + +using ConnectionPtr = std::unique_ptr; + } #endif diff --git a/src/Core/PostgreSQL/ConnectionHolder.h b/src/Core/PostgreSQL/ConnectionHolder.h index d0d64935e91..38e321e222c 100644 --- a/src/Core/PostgreSQL/ConnectionHolder.h +++ b/src/Core/PostgreSQL/ConnectionHolder.h @@ -7,12 +7,12 @@ #include #include #include +#include "Connection.h" namespace postgres { -using ConnectionPtr = std::unique_ptr; using Pool = BorrowedObjectPool; using PoolPtr = std::shared_ptr; @@ -28,8 +28,12 @@ public: pqxx::connection & get() { - assert(connection != nullptr); - return *connection; + return connection->getRef(); + } + + void update() + { + connection->updateConnection(); } private: diff --git a/src/Core/PostgreSQL/PoolWithFailover.cpp b/src/Core/PostgreSQL/PoolWithFailover.cpp index 3addb511c3b..844c60087e0 100644 --- a/src/Core/PostgreSQL/PoolWithFailover.cpp +++ b/src/Core/PostgreSQL/PoolWithFailover.cpp @@ -32,9 +32,9 @@ PoolWithFailover::PoolWithFailover( { for (const auto & replica_configuration : configurations) { - auto connection_string = formatConnectionString(replica_configuration.database, - replica_configuration.host, replica_configuration.port, replica_configuration.username, replica_configuration.password).first; - replicas_with_priority[priority].emplace_back(connection_string, pool_size, getConnectionForLog(replica_configuration.host, replica_configuration.port)); + auto connection_info = formatConnectionString(replica_configuration.database, + replica_configuration.host, replica_configuration.port, replica_configuration.username, replica_configuration.password); + replicas_with_priority[priority].emplace_back(connection_info, pool_size); } } } @@ -52,8 +52,8 @@ PoolWithFailover::PoolWithFailover( for (const auto & [host, port] : configuration.addresses) { LOG_DEBUG(&Poco::Logger::get("PostgreSQLPoolWithFailover"), "Adding address host: {}, port: {} to connection pool", host, port); - auto connection_string = formatConnectionString(configuration.database, host, port, configuration.username, configuration.password).first; - replicas_with_priority[0].emplace_back(connection_string, pool_size, getConnectionForLog(host, port)); + auto connection_string = formatConnectionString(configuration.database, host, port, configuration.username, configuration.password); + replicas_with_priority[0].emplace_back(connection_string, pool_size); } } @@ -83,16 +83,18 @@ ConnectionHolderPtr PoolWithFailover::get() try { /// Create a new connection or reopen an old connection if it became invalid. 
- if (!connection || !connection->is_open()) + if (!connection) { - connection = std::make_unique(replica.connection_string); - LOG_DEBUG(log, "New connection to {}:{}", connection->hostname(), connection->port()); + connection = std::make_unique(replica.connection_info); + LOG_DEBUG(log, "New connection to {}", connection->getInfoForLog()); } + + connection->connect(); } catch (const pqxx::broken_connection & pqxx_error) { LOG_ERROR(log, "Connection error: {}", pqxx_error.what()); - error_message << "Try " << try_idx + 1 << ". Connection to `" << replica.name_for_log << "` failed: " << pqxx_error.what() << "\n"; + error_message << "Try " << try_idx + 1 << ". Connection to `" << replica.connection_info.host_port << "` failed: " << pqxx_error.what() << "\n"; replica.pool->returnObject(std::move(connection)); continue; diff --git a/src/Core/PostgreSQL/PoolWithFailover.h b/src/Core/PostgreSQL/PoolWithFailover.h index c59010a5d43..e6f691ed2dd 100644 --- a/src/Core/PostgreSQL/PoolWithFailover.h +++ b/src/Core/PostgreSQL/PoolWithFailover.h @@ -44,12 +44,11 @@ public: private: struct PoolHolder { - String connection_string; + ConnectionInfo connection_info; PoolPtr pool; - String name_for_log; - PoolHolder(const String & connection_string_, size_t pool_size, const String & name_for_log_) - : connection_string(connection_string_), pool(std::make_shared(pool_size)), name_for_log(name_for_log_) {} + PoolHolder(const ConnectionInfo & connection_info_, size_t pool_size) + : connection_info(connection_info_), pool(std::make_shared(pool_size)) {} }; /// Highest priority is 0, the bigger the number in map, the less the priority diff --git a/src/Core/PostgreSQL/Utils.cpp b/src/Core/PostgreSQL/Utils.cpp index 60b13218202..b4ad19c819a 100644 --- a/src/Core/PostgreSQL/Utils.cpp +++ b/src/Core/PostgreSQL/Utils.cpp @@ -17,7 +17,7 @@ ConnectionInfo formatConnectionString(String dbname, String host, UInt16 port, S << " user=" << DB::quote << user << " password=" << DB::quote << password << " connect_timeout=10"; - return std::make_pair(out.str(), host + ':' + DB::toString(port)); + return {out.str(), host + ':' + DB::toString(port)}; } String getConnectionForLog(const String & host, UInt16 port) diff --git a/src/Core/PostgreSQL/insertPostgreSQLValue.cpp b/src/Core/PostgreSQL/insertPostgreSQLValue.cpp index 1c3230ec826..f4d47049554 100644 --- a/src/Core/PostgreSQL/insertPostgreSQLValue.cpp +++ b/src/Core/PostgreSQL/insertPostgreSQLValue.cpp @@ -85,7 +85,7 @@ void insertPostgreSQLValue( assert_cast(column).insertData(value.data(), value.size()); break; case ExternalResultDescription::ValueType::vtUUID: - assert_cast(column).insert(parse(value.data(), value.size())); + assert_cast(column).insertValue(parse(value.data(), value.size())); break; case ExternalResultDescription::ValueType::vtDate: assert_cast(column).insertValue(UInt16{LocalDate{std::string(value)}.getDayNum()}); diff --git a/src/Core/ProtocolDefines.h b/src/Core/ProtocolDefines.h index 36820788b91..93f44b02ce3 100644 --- a/src/Core/ProtocolDefines.h +++ b/src/Core/ProtocolDefines.h @@ -44,6 +44,8 @@ #define DBMS_MIN_PROTOCOL_VERSION_WITH_INCREMENTAL_PROFILE_EVENTS 54451 +#define DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION 54454 + /// Version of ClickHouse TCP protocol. /// /// Should be incremented manually on protocol changes. 
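// --- Illustrative sketch, not part of the patch above ---
// The PostgreSQL changes above replace a std::pair<String, String> with a named
// ConnectionInfo aggregate, so call sites read connection_string / host_port
// instead of .first / .second. A standalone approximation of what
// formatConnectionString builds under that assumption; the real code also
// quotes the values (DB::quote), which is omitted here:

#include <cstdint>
#include <string>
#include <iostream>

struct ConnectionInfoSketch
{
    std::string connection_string;
    std::string host_port;          // kept separately, used only for logs
};

static ConnectionInfoSketch formatConnectionStringSketch(
    const std::string & dbname, const std::string & host, uint16_t port,
    const std::string & user, const std::string & password)
{
    std::string conn = "dbname=" + dbname + " host=" + host
        + " port=" + std::to_string(port)
        + " user=" + user + " password=" + password
        + " connect_timeout=10";
    return {conn, host + ':' + std::to_string(port)};
}

int main()
{
    auto info = formatConnectionStringSketch("db", "localhost", 5432, "u", "p");
    std::cout << "connecting to " << info.host_port << '\n';   // log-friendly field, no credentials
    return 0;
}
// --- end of sketch ---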
@@ -51,7 +53,6 @@ /// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing common with VERSION_REVISION, /// later is just a number for server version (one number instead of commit SHA) /// for simplicity (sometimes it may be more convenient in some use cases). - -#define DBMS_TCP_PROTOCOL_VERSION 54453 +#define DBMS_TCP_PROTOCOL_VERSION 54455 #define DBMS_MIN_PROTOCOL_VERSION_WITH_INITIAL_QUERY_START_TIME 54449 diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 11c625007d9..8daf39d9928 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -117,6 +117,16 @@ void Settings::checkNoSettingNamesAtTopLevel(const Poco::Util::AbstractConfigura } } +std::vector Settings::getAllRegisteredNames() const +{ + std::vector all_settings; + for (const auto & setting_field : all()) + { + all_settings.push_back(setting_field.getName()); + } + return all_settings; +} + IMPLEMENT_SETTINGS_TRAITS(FormatFactorySettingsTraits, FORMAT_FACTORY_SETTINGS) } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 47b01655c26..6e53fa4342c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -45,7 +46,6 @@ class IColumn; M(UInt64, max_insert_threads, 0, "The maximum number of threads to execute the INSERT SELECT query. Values 0 or 1 means that INSERT SELECT is not run in parallel. Higher values will lead to higher memory usage. Parallel INSERT SELECT has effect only if the SELECT part is run on parallel, see 'max_threads' setting.", 0) \ M(UInt64, max_final_threads, 16, "The maximum number of threads to read from table with FINAL.", 0) \ M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. By default, it is determined automatically.", 0) \ - M(MaxThreads, max_alter_threads, 0, "The maximum number of threads to execute the ALTER requests. 
By default, it is determined automatically.", 0) \ M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ M(UInt64, max_distributed_connections, 1024, "The maximum number of connections for distributed processing of one query (should be greater than max_threads).", 0) \ M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, "Which part of the query can be read into RAM for parsing (the remaining data for INSERT, if any, is read later)", 0) \ @@ -55,7 +55,7 @@ class IColumn; M(Milliseconds, connect_timeout_with_failover_secure_ms, 100, "Connection timeout for selecting first healthy replica (for secure connections).", 0) \ M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "", 0) \ M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "", 0) \ - M(Seconds, drain_timeout, 3, "", 0) \ + M(Seconds, drain_timeout, 3, "Timeout for draining remote connections, -1 means synchronous drain w/o ignoring errors", 0) \ M(Seconds, tcp_keep_alive_timeout, 290 /* less than DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC */, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \ M(Milliseconds, hedged_connection_timeout_ms, 100, "Connection timeout for establishing connection with replica for Hedged requests", 0) \ M(Milliseconds, receive_data_timeout_ms, 2000, "Connection timeout for receiving first packet of data or packet with positive progress from replica", 0) \ @@ -496,8 +496,12 @@ class IColumn; M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \ M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \ M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \ + \ M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \ M(UInt64, external_storage_max_read_bytes, 0, "Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \ + M(UInt64, external_storage_connect_timeout_sec, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connect timeout in seconds. Now supported only for MySQL", 0) \ + M(UInt64, external_storage_rw_timeout_sec, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Read/write timeout in seconds. Now supported only for MySQL", 0) \ + \ M(UnionMode, union_default_mode, UnionMode::Unspecified, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. 
If empty, query without Union Mode will throw exception.", 0) \ M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \ M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \ @@ -567,7 +571,7 @@ class IColumn; MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60) \ MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_parts_interval_seconds, 1) \ MAKE_OBSOLETE(M, UInt64, partial_merge_join_optimizations, 0) \ - + MAKE_OBSOLETE(M, MaxThreads, max_alter_threads, 0) \ /** The section above is for obsolete settings. Do not add anything there. */ @@ -589,8 +593,11 @@ class IColumn; M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \ M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \ M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \ + M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ + M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \ + M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \ \ M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \ M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \ @@ -656,6 +663,7 @@ class IColumn; M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0)\ + // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. @@ -670,7 +678,7 @@ DECLARE_SETTINGS_TRAITS_ALLOW_CUSTOM_SETTINGS(SettingsTraits, LIST_OF_SETTINGS) /** Settings of query execution. * These settings go to users.xml. */ -struct Settings : public BaseSettings +struct Settings : public BaseSettings, public IHints<2, Settings> { /// For initialization from empty initializer-list to be "value initialization", not "aggregate initialization" in C++14. /// http://en.cppreference.com/w/cpp/language/aggregate_initialization @@ -694,6 +702,8 @@ struct Settings : public BaseSettings /// Check that there is no user-level settings at the top level in config. /// This is a common source of mistake (user don't know where to write user-level setting). 
static void checkNoSettingNamesAtTopLevel(const Poco::Util::AbstractConfiguration & config, const String & config_path); + + std::vector getAllRegisteredNames() const override; }; /* diff --git a/src/DataStreams/PushingToViewsBlockOutputStream.cpp b/src/DataStreams/PushingToViewsBlockOutputStream.cpp deleted file mode 100644 index 76c378c07e0..00000000000 --- a/src/DataStreams/PushingToViewsBlockOutputStream.cpp +++ /dev/null @@ -1,401 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( - const StoragePtr & storage_, - const StorageMetadataPtr & metadata_snapshot_, - ContextPtr context_, - const ASTPtr & query_ptr_, - bool no_destination) - : WithContext(context_) - , storage(storage_) - , metadata_snapshot(metadata_snapshot_) - , log(&Poco::Logger::get("PushingToViewsBlockOutputStream")) - , query_ptr(query_ptr_) -{ - checkStackSize(); - - /** TODO This is a very important line. At any insertion into the table one of streams should own lock. - * Although now any insertion into the table is done via PushingToViewsBlockOutputStream, - * but it's clear that here is not the best place for this functionality. - */ - addTableLock( - storage->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout)); - - /// If the "root" table deduplicates blocks, there are no need to make deduplication for children - /// Moreover, deduplication for AggregatingMergeTree children could produce false positives due to low size of inserting blocks - bool disable_deduplication_for_children = false; - if (!getContext()->getSettingsRef().deduplicate_blocks_in_dependent_materialized_views) - disable_deduplication_for_children = !no_destination && storage->supportsDeduplication(); - - auto table_id = storage->getStorageID(); - Dependencies dependencies = DatabaseCatalog::instance().getDependencies(table_id); - - /// We need special context for materialized views insertions - if (!dependencies.empty()) - { - select_context = Context::createCopy(context); - insert_context = Context::createCopy(context); - - const auto & insert_settings = insert_context->getSettingsRef(); - - // Do not deduplicate insertions into MV if the main insertion is Ok - if (disable_deduplication_for_children) - insert_context->setSetting("insert_deduplicate", Field{false}); - - // Separate min_insert_block_size_rows/min_insert_block_size_bytes for children - if (insert_settings.min_insert_block_size_rows_for_materialized_views) - insert_context->setSetting("min_insert_block_size_rows", insert_settings.min_insert_block_size_rows_for_materialized_views.value); - if (insert_settings.min_insert_block_size_bytes_for_materialized_views) - insert_context->setSetting("min_insert_block_size_bytes", insert_settings.min_insert_block_size_bytes_for_materialized_views.value); - } - - for (const auto & database_table : dependencies) - { - auto dependent_table = DatabaseCatalog::instance().getTable(database_table, getContext()); - auto dependent_metadata_snapshot = dependent_table->getInMemoryMetadataPtr(); - - ASTPtr query; - BlockOutputStreamPtr out; - - if (auto * materialized_view = dynamic_cast(dependent_table.get())) - { - addTableLock( - materialized_view->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout)); - - 
StoragePtr inner_table = materialized_view->getTargetTable(); - auto inner_table_id = inner_table->getStorageID(); - auto inner_metadata_snapshot = inner_table->getInMemoryMetadataPtr(); - query = dependent_metadata_snapshot->getSelectQuery().inner_query; - - std::unique_ptr insert = std::make_unique(); - insert->table_id = inner_table_id; - - /// Get list of columns we get from select query. - auto header = InterpreterSelectQuery(query, select_context, SelectQueryOptions().analyze()) - .getSampleBlock(); - - /// Insert only columns returned by select. - auto list = std::make_shared(); - const auto & inner_table_columns = inner_metadata_snapshot->getColumns(); - for (const auto & column : header) - { - /// But skip columns which storage doesn't have. - if (inner_table_columns.hasPhysical(column.name)) - list->children.emplace_back(std::make_shared(column.name)); - } - - insert->columns = std::move(list); - - ASTPtr insert_query_ptr(insert.release()); - InterpreterInsertQuery interpreter(insert_query_ptr, insert_context); - BlockIO io = interpreter.execute(); - out = io.out; - } - else if ( - dynamic_cast(dependent_table.get()) || dynamic_cast(dependent_table.get())) - out = std::make_shared( - dependent_table, dependent_metadata_snapshot, insert_context, ASTPtr(), true); - else - out = std::make_shared( - dependent_table, dependent_metadata_snapshot, insert_context, ASTPtr()); - - views.emplace_back(ViewInfo{std::move(query), database_table, std::move(out), nullptr, 0 /* elapsed_ms */}); - } - - /// Do not push to destination table if the flag is set - if (!no_destination) - { - output = storage->write(query_ptr, storage->getInMemoryMetadataPtr(), getContext()); - replicated_output = dynamic_cast(output.get()); - } -} - - -Block PushingToViewsBlockOutputStream::getHeader() const -{ - /// If we don't write directly to the destination - /// then expect that we're inserting with precalculated virtual columns - if (output) - return metadata_snapshot->getSampleBlock(); - else - return metadata_snapshot->getSampleBlockWithVirtuals(storage->getVirtuals()); -} - - -void PushingToViewsBlockOutputStream::write(const Block & block) -{ - /** Throw an exception if the sizes of arrays - elements of nested data structures doesn't match. - * We have to make this assertion before writing to table, because storage engine may assume that they have equal sizes. - * NOTE It'd better to do this check in serialization of nested structures (in place when this assumption is required), - * but currently we don't have methods for serialization of nested structures "as a whole". - */ - Nested::validateArraySizes(block); - - if (auto * live_view = dynamic_cast(storage.get())) - { - StorageLiveView::writeIntoLiveView(*live_view, block, getContext()); - } - else if (auto * window_view = dynamic_cast(storage.get())) - { - StorageWindowView::writeIntoWindowView(*window_view, block, getContext()); - } - else - { - if (output) - /// TODO: to support virtual and alias columns inside MVs, we should return here the inserted block extended - /// with additional columns directly from storage and pass it to MVs instead of raw block. 
- output->write(block); - } - - /// Don't process materialized views if this block is duplicate - if (!getContext()->getSettingsRef().deduplicate_blocks_in_dependent_materialized_views && replicated_output && replicated_output->lastBlockIsDuplicate()) - return; - - // Insert data into materialized views only after successful insert into main table - const Settings & settings = getContext()->getSettingsRef(); - if (settings.parallel_view_processing && views.size() > 1) - { - // Push to views concurrently if enabled and more than one view is attached - ThreadPool pool(std::min(size_t(settings.max_threads), views.size())); - for (auto & view : views) - { - auto thread_group = CurrentThread::getGroup(); - pool.scheduleOrThrowOnError([=, &view, this] - { - setThreadName("PushingToViews"); - if (thread_group) - CurrentThread::attachToIfDetached(thread_group); - process(block, view); - }); - } - // Wait for concurrent view processing - pool.wait(); - } - else - { - // Process sequentially - for (auto & view : views) - { - process(block, view); - - if (view.exception) - std::rethrow_exception(view.exception); - } - } -} - -void PushingToViewsBlockOutputStream::writePrefix() -{ - if (output) - output->writePrefix(); - - for (auto & view : views) - { - try - { - view.out->writePrefix(); - } - catch (Exception & ex) - { - ex.addMessage("while write prefix to view " + view.table_id.getNameForLogs()); - throw; - } - } -} - -void PushingToViewsBlockOutputStream::writeSuffix() -{ - if (output) - output->writeSuffix(); - - std::exception_ptr first_exception; - - const Settings & settings = getContext()->getSettingsRef(); - bool parallel_processing = false; - - /// Run writeSuffix() for views in separate thread pool. - /// In could have been done in PushingToViewsBlockOutputStream::process, however - /// it is not good if insert into main table fail but into view succeed. - if (settings.parallel_view_processing && views.size() > 1) - { - parallel_processing = true; - - // Push to views concurrently if enabled and more than one view is attached - ThreadPool pool(std::min(size_t(settings.max_threads), views.size())); - auto thread_group = CurrentThread::getGroup(); - - for (auto & view : views) - { - if (view.exception) - continue; - - pool.scheduleOrThrowOnError([thread_group, &view, this] - { - setThreadName("PushingToViews"); - if (thread_group) - CurrentThread::attachToIfDetached(thread_group); - - Stopwatch watch; - try - { - view.out->writeSuffix(); - } - catch (...) 
- { - view.exception = std::current_exception(); - } - view.elapsed_ms += watch.elapsedMilliseconds(); - - LOG_TRACE(log, "Pushing from {} to {} took {} ms.", - storage->getStorageID().getNameForLogs(), - view.table_id.getNameForLogs(), - view.elapsed_ms); - }); - } - // Wait for concurrent view processing - pool.wait(); - } - - for (auto & view : views) - { - if (view.exception) - { - if (!first_exception) - first_exception = view.exception; - - continue; - } - - if (parallel_processing) - continue; - - Stopwatch watch; - try - { - view.out->writeSuffix(); - } - catch (Exception & ex) - { - ex.addMessage("while write prefix to view " + view.table_id.getNameForLogs()); - throw; - } - view.elapsed_ms += watch.elapsedMilliseconds(); - - LOG_TRACE(log, "Pushing from {} to {} took {} ms.", - storage->getStorageID().getNameForLogs(), - view.table_id.getNameForLogs(), - view.elapsed_ms); - } - - if (first_exception) - std::rethrow_exception(first_exception); - - UInt64 milliseconds = main_watch.elapsedMilliseconds(); - if (views.size() > 1) - { - LOG_DEBUG(log, "Pushing from {} to {} views took {} ms.", - storage->getStorageID().getNameForLogs(), views.size(), - milliseconds); - } -} - -void PushingToViewsBlockOutputStream::flush() -{ - if (output) - output->flush(); - - for (auto & view : views) - view.out->flush(); -} - -void PushingToViewsBlockOutputStream::process(const Block & block, ViewInfo & view) -{ - Stopwatch watch; - - try - { - BlockInputStreamPtr in; - - /// We need keep InterpreterSelectQuery, until the processing will be finished, since: - /// - /// - We copy Context inside InterpreterSelectQuery to support - /// modification of context (Settings) for subqueries - /// - InterpreterSelectQuery lives shorter than query pipeline. - /// It's used just to build the query pipeline and no longer needed - /// - ExpressionAnalyzer and then, Functions, that created in InterpreterSelectQuery, - /// **can** take a reference to Context from InterpreterSelectQuery - /// (the problem raises only when function uses context from the - /// execute*() method, like FunctionDictGet do) - /// - These objects live inside query pipeline (DataStreams) and the reference become dangling. - std::optional select; - - if (view.query) - { - /// We create a table with the same name as original table and the same alias columns, - /// but it will contain single block (that is INSERT-ed into main table). - /// InterpreterSelectQuery will do processing of alias columns. - - auto local_context = Context::createCopy(select_context); - local_context->addViewSource( - StorageValues::create(storage->getStorageID(), metadata_snapshot->getColumns(), block, storage->getVirtuals())); - select.emplace(view.query, local_context, SelectQueryOptions()); - in = std::make_shared(select->execute().getInputStream()); - - /// Squashing is needed here because the materialized view query can generate a lot of blocks - /// even when only one block is inserted into the parent table (e.g. if the query is a GROUP BY - /// and two-level aggregation is triggered). 
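Squashing at this point means that the many small blocks a materialized view query may emit for a single inserted block are merged into larger ones before being written to the inner table, bounded by min_insert_block_size_rows and min_insert_block_size_bytes. A rough standalone sketch of that accumulation logic, with a toy block type instead of the real Block (names and thresholds are illustrative only):

#include <cstddef>
#include <optional>

struct MiniBlock
{
    size_t rows = 0;
    size_t bytes = 0;
    void append(const MiniBlock & other) { rows += other.rows; bytes += other.bytes; }
};

class SquashingSketch
{
public:
    SquashingSketch(size_t min_rows_, size_t min_bytes_) : min_rows(min_rows_), min_bytes(min_bytes_) {}

    // Returns a squashed block once the accumulated buffer reaches one of the thresholds.
    std::optional<MiniBlock> add(const MiniBlock & block)
    {
        buffer.append(block);
        if (buffer.rows >= min_rows || buffer.bytes >= min_bytes)
        {
            MiniBlock ready = buffer;
            buffer = {};
            return ready;
        }
        return std::nullopt;
    }

    // Flush whatever remains at end of stream.
    std::optional<MiniBlock> finish()
    {
        if (buffer.rows == 0)
            return std::nullopt;
        MiniBlock ready = buffer;
        buffer = {};
        return ready;
    }

private:
    size_t min_rows;
    size_t min_bytes;
    MiniBlock buffer;
};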
- in = std::make_shared( - in, getContext()->getSettingsRef().min_insert_block_size_rows, getContext()->getSettingsRef().min_insert_block_size_bytes); - in = std::make_shared(in, view.out->getHeader(), ConvertingBlockInputStream::MatchColumnsMode::Name); - } - else - in = std::make_shared(block); - - in->readPrefix(); - - while (Block result_block = in->read()) - { - Nested::validateArraySizes(result_block); - view.out->write(result_block); - } - - in->readSuffix(); - } - catch (Exception & ex) - { - ex.addMessage("while pushing to view " + view.table_id.getNameForLogs()); - view.exception = std::current_exception(); - } - catch (...) - { - view.exception = std::current_exception(); - } - - view.elapsed_ms += watch.elapsedMilliseconds(); -} - -} diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index e500bf2858a..39fbfb62917 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -66,6 +66,7 @@ public: bool shouldAlignRightInPrettyFormats() const override { return false; } SerializationPtr doGetDefaultSerialization() const override; + bool supportsSparseSerialization() const override { return false; } bool isVersioned() const { return function->isVersioned(); } diff --git a/src/DataTypes/DataTypeDate32.h b/src/DataTypes/DataTypeDate32.h index 9b93e5feb16..f0f78849e06 100644 --- a/src/DataTypes/DataTypeDate32.h +++ b/src/DataTypes/DataTypeDate32.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/DataTypes/DataTypeDecimalBase.h b/src/DataTypes/DataTypeDecimalBase.h index c0585095eeb..dc8c99b06bc 100644 --- a/src/DataTypes/DataTypeDecimalBase.h +++ b/src/DataTypes/DataTypeDecimalBase.h @@ -59,6 +59,7 @@ class DataTypeDecimalBase : public IDataType public: using FieldType = T; using ColumnType = ColumnDecimal; + static constexpr auto type_id = TypeId; static constexpr bool is_parametric = true; diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index 92c72b87afa..2f607fc2aa6 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -38,6 +38,7 @@ class DataTypeEnum final : public IDataTypeEnum, public EnumValues public: using FieldType = Type; using ColumnType = ColumnVector; + static constexpr auto type_id = sizeof(FieldType) == 1 ? TypeIndex::Enum8 : TypeIndex::Enum16; using typename EnumValues::Values; static constexpr bool is_parametric = true; @@ -52,7 +53,7 @@ public: std::string doGetName() const override { return type_name; } const char * getFamilyName() const override; - TypeIndex getTypeId() const override { return sizeof(FieldType) == 1 ? 
TypeIndex::Enum8 : TypeIndex::Enum16; } + TypeIndex getTypeId() const override { return type_id; } FieldType readValue(ReadBuffer & istr) const { diff --git a/src/DataTypes/DataTypeFixedString.h b/src/DataTypes/DataTypeFixedString.h index f88d2f5337a..a53fde42b29 100644 --- a/src/DataTypes/DataTypeFixedString.h +++ b/src/DataTypes/DataTypeFixedString.h @@ -10,6 +10,8 @@ namespace DB { +class ColumnFixedString; + namespace ErrorCodes { extern const int ARGUMENT_OUT_OF_BOUND; @@ -22,7 +24,10 @@ private: size_t n; public: + using ColumnType = ColumnFixedString; + static constexpr bool is_parametric = true; + static constexpr auto type_id = TypeIndex::FixedString; DataTypeFixedString(size_t n_) : n(n_) { @@ -33,7 +38,7 @@ public: } std::string doGetName() const override; - TypeIndex getTypeId() const override { return TypeIndex::FixedString; } + TypeIndex getTypeId() const override { return type_id; } const char * getFamilyName() const override { return "FixedString"; } diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 7f4286046d9..38b2109eec6 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -51,6 +51,7 @@ public: bool isNullable() const override { return false; } bool onlyNull() const override { return false; } bool lowCardinality() const override { return true; } + bool supportsSparseSerialization() const override { return false; } bool isLowCardinalityNullable() const override { return dictionary_type->isNullable(); } static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type); diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h index 95975051600..59dc26ed13a 100644 --- a/src/DataTypes/DataTypeNumberBase.h +++ b/src/DataTypes/DataTypeNumberBase.h @@ -20,6 +20,7 @@ class DataTypeNumberBase : public IDataType public: static constexpr bool is_parametric = false; static constexpr auto family_name = TypeName; + static constexpr auto type_id = TypeId; using FieldType = T; using ColumnType = ColumnVector; diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h index fd674505bc0..5f3bde43a13 100644 --- a/src/DataTypes/DataTypeString.h +++ b/src/DataTypes/DataTypeString.h @@ -6,10 +6,13 @@ namespace DB { +class ColumnString; + class DataTypeString final : public IDataType { public: using FieldType = String; + using ColumnType = ColumnString; static constexpr bool is_parametric = false; static constexpr auto type_id = TypeIndex::String; diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index 0660f371258..ad6d4e2943b 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -6,8 +6,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -152,6 +154,20 @@ MutableColumnPtr DataTypeTuple::createColumn() const return ColumnTuple::create(std::move(tuple_columns)); } +MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const +{ + const auto & element_serializations = + assert_cast(serialization).getElementsSerializations(); + + size_t size = elems.size(); + assert(element_serializations.size() == size); + MutableColumns tuple_columns(size); + for (size_t i = 0; i < size; ++i) + tuple_columns[i] = elems[i]->createColumn(*element_serializations[i]->getNested()); + + return ColumnTuple::create(std::move(tuple_columns)); +} + Field DataTypeTuple::getDefault() const { return Tuple(collections::map(elems, [] 
(const DataTypePtr & elem) { return elem->getDefault(); })); @@ -248,21 +264,33 @@ SerializationPtr DataTypeTuple::doGetDefaultSerialization() const return std::make_shared(std::move(serializations), use_explicit_names); } -SerializationPtr DataTypeTuple::getSerialization(const String & column_name, const StreamExistenceCallback & callback) const +SerializationPtr DataTypeTuple::getSerialization(const SerializationInfo & info) const { SerializationTuple::ElementSerializations serializations(elems.size()); + const auto & info_tuple = assert_cast(info); bool use_explicit_names = have_explicit_names && serialize_names; + for (size_t i = 0; i < elems.size(); ++i) { String elem_name = use_explicit_names ? names[i] : toString(i + 1); - auto subcolumn_name = Nested::concatenateName(column_name, elem_name); - auto serializaion = elems[i]->getSerialization(subcolumn_name, callback); - serializations[i] = std::make_shared(serializaion, elem_name); + auto serialization = elems[i]->getSerialization(*info_tuple.getElementInfo(i)); + serializations[i] = std::make_shared(serialization, elem_name); } return std::make_shared(std::move(serializations), use_explicit_names); } +MutableSerializationInfoPtr DataTypeTuple::createSerializationInfo(const SerializationInfo::Settings & settings) const +{ + MutableSerializationInfos infos; + infos.reserve(elems.size()); + for (const auto & elem : elems) + infos.push_back(elem->createSerializationInfo(settings)); + + return std::make_shared(std::move(infos), settings); +} + + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.empty()) diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index d168d73efbf..c56e87ca22d 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -36,8 +36,10 @@ public: const char * getFamilyName() const override { return "Tuple"; } bool canBeInsideNullable() const override { return false; } + bool supportsSparseSerialization() const override { return true; } MutableColumnPtr createColumn() const override; + MutableColumnPtr createColumn(const ISerialization & serialization) const override; Field getDefault() const override; void insertDefaultInto(IColumn & column) const override; @@ -52,9 +54,9 @@ public: size_t getMaximumSizeOfValueInMemory() const override; size_t getSizeOfValueInMemory() const override; - SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const override; - SerializationPtr doGetDefaultSerialization() const override; + SerializationPtr getSerialization(const SerializationInfo & info) const override; + MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const override; const DataTypePtr & getElement(size_t i) const { return elems[i]; } const DataTypes & getElements() const { return elems; } diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index 5ed7a912607..af9f1f35ca5 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -15,9 +15,10 @@ public: using FieldType = UUID; using ColumnType = ColumnVector; + static constexpr auto type_id = TypeIndex::UUID; const char * getFamilyName() const override { return "UUID"; } - TypeIndex getTypeId() const override { return TypeIndex::UUID; } + TypeIndex getTypeId() const override { return type_id; } Field getDefault() const override; diff --git a/src/DataTypes/DataTypesNumber.cpp b/src/DataTypes/DataTypesNumber.cpp index fef4c34d8b0..0c9a410077f 100644 
--- a/src/DataTypes/DataTypesNumber.cpp +++ b/src/DataTypes/DataTypesNumber.cpp @@ -86,6 +86,7 @@ void registerDataTypeNumbers(DataTypeFactory & factory) factory.registerAlias("INT UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive); factory.registerAlias("INTEGER UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive); factory.registerAlias("BIGINT UNSIGNED", "UInt64", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive); } } diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 669876c792d..edc9e4159f4 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -10,6 +11,8 @@ #include #include #include +#include +#include namespace DB @@ -40,6 +43,15 @@ void IDataType::updateAvgValueSizeHint(const IColumn & column, double & avg_valu } } +MutableColumnPtr IDataType::createColumn(const ISerialization & serialization) const +{ + auto column = createColumn(); + if (serialization.getKind() == ISerialization::Kind::SPARSE) + return ColumnSparse::create(std::move(column)); + + return column; +} + ColumnPtr IDataType::createColumnConst(size_t size, const Field & field) const { auto column = createColumn(); @@ -65,9 +77,7 @@ size_t IDataType::getSizeOfValueInMemory() const void IDataType::forEachSubcolumn( const SubcolumnCallback & callback, - const SerializationPtr & serialization, - const DataTypePtr & type, - const ColumnPtr & column) + const SubstreamData & data) { ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) { @@ -76,66 +86,59 @@ void IDataType::forEachSubcolumn( if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1)) { auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1); - auto data = ISerialization::createFromPath(subpath, i); - callback(subpath, name, data); + auto subdata = ISerialization::createFromPath(subpath, i); + callback(subpath, name, subdata); } subpath[i].visited = true; } }; - ISerialization::SubstreamPath path; - serialization->enumerateStreams(path, callback_with_data, type, column); + SubstreamPath path; + data.serialization->enumerateStreams(path, callback_with_data, data); } -DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const +template +Ptr IDataType::getForSubcolumn( + const String & subcolumn_name, + const SubstreamData & data, + Ptr SubstreamData::*member, + bool throw_if_null) const { - DataTypePtr res; - forEachSubcolumn([&](const auto &, const auto & name, const auto & data) + Ptr res; + forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata) { if (name == subcolumn_name) - res = data.type; - }, getDefaultSerialization(), getPtr(), nullptr); + res = subdata.*member; + }, data); + + if (!res && throw_if_null) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); return res; } +DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const +{ + SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr }; + return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, false); +} + DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const { - auto subcolumn_type = tryGetSubcolumnType(subcolumn_name); - if (subcolumn_type) - return subcolumn_type; - - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + 
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr }; + return getForSubcolumn(subcolumn_name, data, &SubstreamData::type); } SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const { - SerializationPtr res; - forEachSubcolumn([&](const auto &, const auto & name, const auto & data) - { - if (name == subcolumn_name) - res = data.serialization; - }, serialization, nullptr, nullptr); - - if (res) - return res; - - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + SubstreamData data = { serialization, nullptr, nullptr, nullptr }; + return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization); } ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const { - ColumnPtr res; - forEachSubcolumn([&](const auto &, const auto & name, const auto & data) - { - if (name == subcolumn_name) - res = data.column; - }, getDefaultSerialization(), nullptr, column); - - if (res) - return res; - - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr }; + return getForSubcolumn(subcolumn_name, data, &SubstreamData::column); } Names IDataType::getSubcolumnNames() const @@ -144,7 +147,7 @@ Names IDataType::getSubcolumnNames() const forEachSubcolumn([&](const auto &, const auto & name, const auto &) { res.push_back(name); - }, getDefaultSerialization(), nullptr, nullptr); + }, { getDefaultSerialization(), nullptr, nullptr, nullptr }); return res; } @@ -163,6 +166,12 @@ void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const custom_serialization = std::move(custom_desc_->serialization); } +MutableSerializationInfoPtr IDataType::createSerializationInfo( + const SerializationInfo::Settings & settings) const +{ + return std::make_shared(ISerialization::Kind::DEFAULT, settings); +} + SerializationPtr IDataType::getDefaultSerialization() const { if (custom_serialization) @@ -171,22 +180,48 @@ SerializationPtr IDataType::getDefaultSerialization() const return doGetDefaultSerialization(); } +SerializationPtr IDataType::getSparseSerialization() const +{ + return std::make_shared(getDefaultSerialization()); +} + +SerializationPtr IDataType::getSerialization(ISerialization::Kind kind) const +{ + if (supportsSparseSerialization() && kind == ISerialization::Kind::SPARSE) + return getSparseSerialization(); + + return getDefaultSerialization(); +} + +SerializationPtr IDataType::getSerialization(const SerializationInfo & info) const +{ + return getSerialization(info.getKind()); +} + // static -SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const IDataType::StreamExistenceCallback & callback) +SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const SerializationInfo & info) { if (column.isSubcolumn()) { const auto & type_in_storage = column.getTypeInStorage(); - auto default_serialization = type_in_storage->getDefaultSerialization(); - return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), default_serialization); + auto serialization = type_in_storage->getSerialization(info); + return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization); } - return column.type->getSerialization(column.name, callback); + return column.type->getSerialization(info); } 
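The two static getSerialization overloads above centralize the dispatch: for a subcolumn, the serialization is first resolved for the type in storage and then narrowed to the requested subcolumn, otherwise the column's own type is asked directly. A simplified model of that dispatch with toy types (these are not the real IDataType/NameAndTypePair interfaces, just an illustration of the control flow):

#include <memory>
#include <string>

struct SerializationSketch { std::string name; };
using SerializationSketchPtr = std::shared_ptr<SerializationSketch>;

struct TypeSketch
{
    SerializationSketchPtr defaultSerialization() const
    {
        return std::make_shared<SerializationSketch>(SerializationSketch{"Default"});
    }
    SerializationSketchPtr subcolumnSerialization(const std::string & subcolumn, const SerializationSketchPtr & base) const
    {
        return std::make_shared<SerializationSketch>(SerializationSketch{base->name + "." + subcolumn});
    }
};

struct ColumnRefSketch
{
    TypeSketch type_in_storage;  // type of the top-level column
    std::string subcolumn_name;  // empty for a plain column, e.g. "size0" or "null" for a subcolumn
    bool isSubcolumn() const { return !subcolumn_name.empty(); }
};

// Mirrors the fallback overload that takes no SerializationInfo: always the default kind.
SerializationSketchPtr getSerializationSketch(const ColumnRefSketch & column)
{
    auto base = column.type_in_storage.defaultSerialization();
    if (column.isSubcolumn())
        return column.type_in_storage.subcolumnSerialization(column.subcolumn_name, base);
    return base;
}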
-SerializationPtr IDataType::getSerialization(const String &, const StreamExistenceCallback &) const +// static +SerializationPtr IDataType::getSerialization(const NameAndTypePair & column) { - return getDefaultSerialization(); + if (column.isSubcolumn()) + { + const auto & type_in_storage = column.getTypeInStorage(); + auto serialization = type_in_storage->getDefaultSerialization(); + return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization); + } + + return column.type->getDefaultSerialization(); } } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index fc42d678d57..85644b6f6ca 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -6,7 +6,8 @@ #include #include #include - +#include +#include namespace DB { @@ -27,7 +28,6 @@ using DataTypePtr = std::shared_ptr; using DataTypes = std::vector; struct NameAndTypePair; -class SerializationInfo; struct DataTypeWithConstInfo { @@ -84,45 +84,54 @@ public: SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const; ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const; + using SubstreamData = ISerialization::SubstreamData; + using SubstreamPath = ISerialization::SubstreamPath; + using SubcolumnCallback = std::function; + const SubstreamData &)>; static void forEachSubcolumn( const SubcolumnCallback & callback, - const SerializationPtr & serialization, - const DataTypePtr & type, - const ColumnPtr & column); + const SubstreamData & data); Names getSubcolumnNames() const; - /// Returns default serialization of data type. + virtual MutableSerializationInfoPtr createSerializationInfo( + const SerializationInfo::Settings & settings) const; + + /// TODO: support more types. + virtual bool supportsSparseSerialization() const { return !haveSubtypes(); } + SerializationPtr getDefaultSerialization() const; + SerializationPtr getSparseSerialization() const; - /// Asks whether the stream with given name exists in table. - /// If callback returned true for all streams, which are required for - /// one of serialization types, that serialization will be chosen for reading. - /// If callback always returned false, the default serialization will be chosen. - using StreamExistenceCallback = std::function; + /// Chooses serialization according to serialization kind. + SerializationPtr getSerialization(ISerialization::Kind kind) const; - /// Chooses serialization for reading of one column or subcolumns by - /// checking existence of substreams using callback. - static SerializationPtr getSerialization( - const NameAndTypePair & column, - const StreamExistenceCallback & callback = [](const String &) { return false; }); + /// Chooses serialization according to collected information about content of column. + virtual SerializationPtr getSerialization(const SerializationInfo & info) const; - virtual SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const; + /// Chooses between subcolumn serialization and regular serialization according to @column. + /// This method typically should be used to get serialization for reading column or subcolumn. 
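The supportsSparseSerialization / getSparseSerialization / getSerialization(Kind) trio introduced here is deliberately forgiving: asking a type that has no sparse implementation for the SPARSE kind quietly falls back to the default serialization. A toy model of that fallback (illustrative names only, not the real class hierarchy):

#include <memory>

enum class KindSketch { Default, Sparse };

struct SerializationModel { KindSketch kind; };
using SerializationModelPtr = std::shared_ptr<SerializationModel>;

struct DataTypeModel
{
    bool supports_sparse = false;

    SerializationModelPtr getDefault() const { return std::make_shared<SerializationModel>(SerializationModel{KindSketch::Default}); }
    SerializationModelPtr getSparse() const { return std::make_shared<SerializationModel>(SerializationModel{KindSketch::Sparse}); }

    // Unsupported kinds degrade to the default serialization instead of failing.
    SerializationModelPtr getSerialization(KindSketch kind) const
    {
        if (kind == KindSketch::Sparse && supports_sparse)
            return getSparse();
        return getDefault();
    }
};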
+ static SerializationPtr getSerialization(const NameAndTypePair & column, const SerializationInfo & info); + + static SerializationPtr getSerialization(const NameAndTypePair & column); protected: virtual String doGetName() const { return getFamilyName(); } virtual SerializationPtr doGetDefaultSerialization() const = 0; public: - /** Create empty column for corresponding type. + /** Create empty column for corresponding type and default serialization. */ virtual MutableColumnPtr createColumn() const = 0; + /** Create empty column for corresponding type and serialization. + */ + virtual MutableColumnPtr createColumn(const ISerialization & serialization) const; + /** Create ColumnConst for corresponding type, with specified size and value. */ ColumnPtr createColumnConst(size_t size, const Field & field) const; @@ -292,6 +301,14 @@ protected: public: const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } const ISerialization * getCustomSerialization() const { return custom_serialization.get(); } + +private: + template + Ptr getForSubcolumn( + const String & subcolumn_name, + const SubstreamData & data, + Ptr SubstreamData::*member, + bool throw_if_null = true) const; }; @@ -360,6 +377,8 @@ struct WhichDataType constexpr bool isNullable() const { return idx == TypeIndex::Nullable; } constexpr bool isFunction() const { return idx == TypeIndex::Function; } constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; } + + constexpr bool isLowCarnality() const { return idx == TypeIndex::LowCardinality; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) @@ -495,6 +514,11 @@ inline bool isCompilableType(const DataTypePtr & data_type) return data_type->isValueRepresentedByNumber() && !isDecimal(data_type); } +inline bool isBool(const DataTypePtr & data_type) +{ + return data_type->getName() == "Bool"; +} + template constexpr bool IsDataTypeDecimal = false; template constexpr bool IsDataTypeNumber = false; template constexpr bool IsDataTypeDateOrDateTime = false; diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index 4f804a0ca50..b35a0713519 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -36,9 +36,18 @@ std::string concatenateName(const std::string & nested_table_name, const std::st /** Name can be treated as compound if it contains dot (.) in the middle. */ -std::pair splitName(const std::string & name) +std::pair splitName(const std::string & name, bool reverse) { - auto idx = name.find_first_of('.'); + auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.')); + if (idx == std::string::npos || idx == 0 || idx + 1 == name.size()) + return {name, {}}; + + return {name.substr(0, idx), name.substr(idx + 1)}; +} + +std::pair splitName(const std::string_view & name, bool reverse) +{ + auto idx = (reverse ? 
name.find_last_of('.') : name.find_first_of('.')); if (idx == std::string::npos || idx == 0 || idx + 1 == name.size()) return {name, {}}; @@ -211,6 +220,7 @@ void validateArraySizes(const Block & block) } } + std::unordered_set getAllTableNames(const Block & block) { std::unordered_set nested_table_names; diff --git a/src/DataTypes/NestedUtils.h b/src/DataTypes/NestedUtils.h index d16e309fc81..2ca5c17dc74 100644 --- a/src/DataTypes/NestedUtils.h +++ b/src/DataTypes/NestedUtils.h @@ -11,7 +11,9 @@ namespace Nested { std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name); - std::pair splitName(const std::string & name); + /// Splits name of compound identifier by first/last dot (depending on 'reverse' parameter). + std::pair splitName(const std::string & name, bool reverse = false); + std::pair splitName(const std::string_view & name, bool reverse = false); /// Returns the prefix of the name to the first '.'. Or the name is unchanged if there is no dot. std::string extractTableName(const std::string & nested_name); diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 6fa18eee061..5cdc037d5cb 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -16,12 +16,43 @@ namespace ErrorCodes { extern const int MULTIPLE_STREAMS_REQUIRED; extern const int UNEXPECTED_DATA_AFTER_PARSED_VALUE; + extern const int LOGICAL_ERROR; +} + +ISerialization::Kind ISerialization::getKind(const IColumn & column) +{ + if (column.isSparse()) + return Kind::SPARSE; + + return Kind::DEFAULT; +} + +String ISerialization::kindToString(Kind kind) +{ + switch (kind) + { + case Kind::DEFAULT: + return "Default"; + case Kind::SPARSE: + return "Sparse"; + } + __builtin_unreachable(); +} + +ISerialization::Kind ISerialization::stringToKind(const String & str) +{ + if (str == "Default") + return Kind::DEFAULT; + else if (str == "Sparse") + return Kind::SPARSE; + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind '{}'", str); } String ISerialization::Substream::toString() const { if (type == TupleElement) - return fmt::format("TupleElement({}, escape_tuple_delimiter={})", + return fmt::format("TupleElement({}, escape_tuple_delimiter = {})", tuple_element_name, escape_tuple_delimiter ? 
"true" : "false"); return String(magic_enum::enum_name(type)); @@ -44,18 +75,22 @@ String ISerialization::SubstreamPath::toString() const void ISerialization::enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { path.push_back(Substream::Regular); - path.back().data = {type, column, getPtr(), nullptr}; + path.back().data = data; callback(path); path.pop_back(); } void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const { - enumerateStreams(path, callback, nullptr, nullptr); + enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr}); +} + +void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const +{ + enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr}); } void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const @@ -147,11 +182,23 @@ String ISerialization::getFileNameForStream(const NameAndTypePair & column, cons return getFileNameForStream(column.getNameInStorage(), path); } +static size_t isOffsetsOfNested(const ISerialization::SubstreamPath & path) +{ + if (path.empty()) + return false; + + for (const auto & elem : path) + if (elem.type == ISerialization::Substream::ArrayElements) + return false; + + return path.back().type == ISerialization::Substream::ArraySizes; +} + String ISerialization::getFileNameForStream(const String & name_in_storage, const SubstreamPath & path) { String stream_name; auto nested_storage_name = Nested::extractTableName(name_in_storage); - if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == ISerialization::Substream::ArraySizes)) + if (name_in_storage != nested_storage_name && isOffsetsOfNested(path)) stream_name = escapeForFileName(nested_storage_name); else stream_name = escapeForFileName(name_in_storage); @@ -242,10 +289,9 @@ ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath assert(prefix_len < path.size()); SubstreamData res = path[prefix_len].data; - res.creator.reset(); for (ssize_t i = static_cast(prefix_len) - 1; i >= 0; --i) { - const auto & creator = path[i].data.creator; + const auto & creator = path[i].creator; if (creator) { res.type = res.type ? creator->create(res.type) : res.type; diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 6338bb8a437..b1fd4d0a9da 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -2,15 +2,25 @@ #include #include +#include +#include #include #include #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +class IDataType; + class ReadBuffer; class WriteBuffer; class ProtobufReader; @@ -22,19 +32,40 @@ using DataTypePtr = std::shared_ptr; class ISerialization; using SerializationPtr = std::shared_ptr; +class SerializationInfo; +using SerializationInfoPtr = std::shared_ptr; + class Field; struct FormatSettings; struct NameAndTypePair; +/** Represents serialization of data type. + * Has methods to serialize/deserialize column in binary and several text formats. + * Every data type has default serialization, but can be serialized in different representations. + * Default serialization can be wrapped to one of the special kind of serializations. + * Currently there is only one special serialization: Sparse. 
+ * Each serialization has its own implementation of IColumn as its in-memory representation. + */ class ISerialization : private boost::noncopyable, public std::enable_shared_from_this { public: ISerialization() = default; virtual ~ISerialization() = default; + enum class Kind : UInt8 + { + DEFAULT = 0, + SPARSE = 1, + }; + + virtual Kind getKind() const { return Kind::DEFAULT; } SerializationPtr getPtr() const { return shared_from_this(); } + static Kind getKind(const IColumn & column); + static String kindToString(Kind kind); + static Kind stringToKind(const String & str); + /** Binary serialization for range of values in column - for writing to disk/network, etc. * * Some data types are represented in multiple streams while being serialized. @@ -70,10 +101,10 @@ public: struct SubstreamData { + SerializationPtr serialization; DataTypePtr type; ColumnPtr column; - SerializationPtr serialization; - SubcolumnCreatorPtr creator; + SerializationInfoPtr serialization_info; }; struct Substream @@ -108,6 +139,9 @@ public: /// Data for current substream. SubstreamData data; + /// Creator of subcolumn for current substream. + SubcolumnCreatorPtr creator = nullptr; + /// Flag, that may help to traverse substream paths. mutable bool visited = false; @@ -130,13 +164,14 @@ public: virtual void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const; + const SubstreamData & data) const; void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const; void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); } void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); } + void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const; + using OutputStreamGetter = std::function; using InputStreamGetter = std::function; @@ -300,16 +335,41 @@ public: static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); static bool isSpecialCompressionAllowed(const SubstreamPath & path); - static size_t getArrayLevel(const SubstreamPath & path); + static size_t getArrayLevel(const SubstreamPath & path); static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len); static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len); protected: + template + State * checkAndGetState(const StatePtr & state) const; + [[noreturn]] void throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings &, const String & type_name) const; }; using SerializationPtr = std::shared_ptr; using Serializations = std::vector; +using SerializationByName = std::unordered_map; + +template +State * ISerialization::checkAndGetState(const StatePtr & state) const +{ + if (!state) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Got empty state for {}", demangle(typeid(*this).name())); + + auto * state_concrete = typeid_cast(state.get()); + if (!state_concrete) + { + auto & state_ref = *state; + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Invalid State for {}. 
Expected: {}, got {}", + demangle(typeid(*this).name()), + demangle(typeid(State).name()), + demangle(typeid(state_ref).name())); + } + + return state_concrete; +} } diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 956ada2436f..e3b535a2a11 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -198,33 +198,38 @@ ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) c void SerializationArray::enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { - const auto * type_array = type ? &assert_cast(*type) : nullptr; - const auto * column_array = column ? &assert_cast(*column) : nullptr; + const auto * type_array = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_array = data.column ? &assert_cast(*data.column) : nullptr; auto offsets_column = column_array ? column_array->getOffsetsPtr() : nullptr; path.push_back(Substream::ArraySizes); path.back().data = { - type ? std::make_shared() : nullptr, - offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr, std::make_shared( std::make_shared>(), "size" + std::to_string(getArrayLevel(path)), false), - nullptr, + data.type ? std::make_shared() : nullptr, + offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr, + data.serialization_info, }; callback(path); path.back() = Substream::ArrayElements; - path.back().data = {type, column, getPtr(), std::make_shared(offsets_column)}; + path.back().data = data; + path.back().creator = std::make_shared(offsets_column); - auto next_type = type_array ? type_array->getNestedType() : nullptr; - auto next_column = column_array ? column_array->getDataPtr() : nullptr; + SubstreamData next_data = + { + nested, + type_array ? type_array->getNestedType() : nullptr, + column_array ? 
column_array->getDataPtr() : nullptr, + data.serialization_info, + }; - nested->enumerateStreams(path, callback, next_type, next_column); + nested->enumerateStreams(path, callback, next_data); path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index f766083623d..cd8cac54881 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -38,8 +38,7 @@ public: void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/Serializations/SerializationBool.cpp b/src/DataTypes/Serializations/SerializationBool.cpp index def2b565afc..1efacaaecc5 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -6,69 +6,47 @@ #include #include #include +#include + +#include namespace DB { namespace ErrorCodes { - extern const int CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING; extern const int ILLEGAL_COLUMN; + extern const int CANNOT_PARSE_BOOL; } -SerializationBool::SerializationBool(const SerializationPtr &nested_) - : SerializationCustomSimpleText(nested_) +namespace { -} -void SerializationBool::serializeText(const IColumn &column, size_t row_num, WriteBuffer &ostr, const FormatSettings &) const +constexpr char str_true[5] = "true"; +constexpr char str_false[6] = "false"; + +const ColumnUInt8 * checkAndGetSerializeColumnType(const IColumn & column) { - const auto *col = checkAndGetColumn(&column); - if (!col) + const auto * col = checkAndGetColumn(&column); + if (!checkAndGetColumn(&column)) throw Exception("Bool type can only serialize columns of type UInt8." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); - - if (col->getData()[row_num]) - ostr.write(str_true, sizeof(str_true) - 1); - else - ostr.write(str_false, sizeof(str_false) - 1); + return col; } -void SerializationBool::deserializeText(IColumn &column, ReadBuffer &istr, const FormatSettings & settings, bool whole) const +ColumnUInt8 * checkAndGetDeserializeColumnType(IColumn & column) { - ColumnUInt8 *col = typeid_cast(&column); - if (!col) - { + auto * col = typeid_cast(&column); + if (!checkAndGetColumn(&column)) throw Exception("Bool type can only deserialize columns of type UInt8." 
+ column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - if (!istr.eof()) - { - bool value = false; - - if (*istr.position() == 't' || *istr.position() == 'f' || *istr.position() == 'T' || *istr.position() == 'F') - readBoolTextWord(value, istr, true); - else if (*istr.position() == '1' || *istr.position() == '0') - readBoolText(value, istr); - else - throw Exception("Invalid boolean value, should be true/false, TRUE/FALSE, 1/0.", - ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - col->insert(value); - } - else - throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - - if (whole && !istr.eof()) - throwUnexpectedDataAfterParsedValue(column, istr, settings, "Bool"); + return col; } -void SerializationBool::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +void serializeCustom(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) { - const auto *col = checkAndGetColumn(&column); - if (!col) - throw Exception("Bool type can only serialize columns of type UInt8." + column.getName(), - ErrorCodes::ILLEGAL_COLUMN); + const auto * col = checkAndGetSerializeColumnType(column); + if (col->getData()[row_num]) { writeString(settings.bool_true_representation, ostr); @@ -79,91 +57,278 @@ void SerializationBool::serializeTextEscaped(const IColumn & column, size_t row_ } } +void serializeSimple(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) +{ + const auto * col = checkAndGetSerializeColumnType(column); + + if (col->getData()[row_num]) + ostr.write(str_true, sizeof(str_true) - 1); + else + ostr.write(str_false, sizeof(str_false) - 1); +} + +bool tryDeserializeAllVariants(ColumnUInt8 * column, ReadBuffer & istr) +{ + if (checkCharCaseInsensitive('1', istr)) + { + column->insert(true); + } + else if (checkCharCaseInsensitive('0', istr)) + { + column->insert(false); + } + /// 'True' and 'T' + else if (checkCharCaseInsensitive('t', istr)) + { + /// Check if it's just short form `T` or full form `True` + if (checkCharCaseInsensitive('r', istr)) + { + if (!checkStringCaseInsensitive("ue", istr)) + return false; + } + column->insert(true); + } + /// 'False' and 'F' + else if (checkCharCaseInsensitive('f', istr)) + { + /// Check if it's just short form `F` or full form `False` + if (checkCharCaseInsensitive('a', istr)) + { + if (!checkStringCaseInsensitive("lse", istr)) + return false; + } + column->insert(false); + } + /// 'Yes' and 'Y' + else if (checkCharCaseInsensitive('y', istr)) + { + /// Check if it's just short form `Y` or full form `Yes` + if (checkCharCaseInsensitive('e', istr)) + { + if (!checkCharCaseInsensitive('s', istr)) + return false; + } + column->insert(true); + } + /// 'No' and 'N' + else if (checkCharCaseInsensitive('n', istr)) + { + /// Check if it's just short form `N` or full form `No` + checkCharCaseInsensitive('o', istr); + column->insert(false); + } + /// 'On' and 'Off' + else if (checkCharCaseInsensitive('o', istr)) + { + if (checkCharCaseInsensitive('n', istr)) + column->insert(true); + else if (checkStringCaseInsensitive("ff", istr)) + { + column->insert(false); + } + else + return false; + } + /// 'Enable' and 'Enabled' + else if (checkStringCaseInsensitive("enable", istr)) + { + /// Check if it's 'enable' or 'enabled' + checkCharCaseInsensitive('d', istr); + column->insert(true); + } + /// 'Disable' and 'Disabled' + else if (checkStringCaseInsensitive("disable", istr)) + { + /// 
Check if it's 'disable' or 'disabled' + checkCharCaseInsensitive('d', istr); + column->insert(false); + } + else + { + return false; + } + + return true; +} + +void deserializeImpl( + IColumn & column, ReadBuffer & istr, const FormatSettings & settings, std::function check_end_of_value) +{ + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + + PeekableReadBuffer buf(istr); + buf.setCheckpoint(); + if (checkString(settings.bool_true_representation, buf) && check_end_of_value(buf)) + { + col->insert(true); + return; + } + + buf.rollbackToCheckpoint(); + if (checkString(settings.bool_false_representation, buf) && check_end_of_value(buf)) + { + col->insert(false); + buf.dropCheckpoint(); + if (buf.hasUnreadData()) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return; + } + + buf.rollbackToCheckpoint(); + if (tryDeserializeAllVariants(col, buf) && check_end_of_value(buf)) + { + buf.dropCheckpoint(); + if (buf.hasUnreadData()) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return; + } + + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " + "bool_false_representation or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", + String(buf.position(), std::min(10lu, buf.available())), + settings.bool_true_representation, settings.bool_false_representation); +} + +} + + +SerializationBool::SerializationBool(const SerializationPtr &nested_) + : SerializationWrapper(nested_) +{ +} + +void SerializationBool::serializeText(const IColumn & column, size_t row_num, WriteBuffer &ostr, const FormatSettings & settings) const +{ + serializeCustom(column, row_num, ostr, settings); +} + +void SerializationBool::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeCustom(column, row_num, ostr, settings); +} + void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { if (istr.eof()) - throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - String input; - readEscapedString(input, istr); - deserializeFromString(column, input, settings); + throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_BOOL); + + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } void SerializationBool::serializeTextJSON(const IColumn &column, size_t row_num, WriteBuffer &ostr, const FormatSettings &settings) const { - serializeText(column, row_num, ostr, settings); + serializeSimple(column, row_num, ostr, settings); } void SerializationBool::deserializeTextJSON(IColumn &column, ReadBuffer &istr, const FormatSettings &) const { - ColumnUInt8 *col = typeid_cast(&column); - if (!col) - { - throw Exception("Bool type can only deserialize columns of type 
UInt8." + column.getName(), - ErrorCodes::ILLEGAL_COLUMN); - } + if (istr.eof()) + throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_BOOL); - if (!istr.eof()) - { - bool value = false; + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + bool value = false; - if (*istr.position() == 't' || *istr.position() == 'f') - readBoolTextWord(value, istr); - else if (*istr.position() == '1' || *istr.position() == '0') - readBoolText(value, istr); - else - throw Exception("Invalid boolean value, should be true/false, 1/0.", - ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - col->insert(value); - } + if (*istr.position() == 't' || *istr.position() == 'f') + readBoolTextWord(value, istr); + else if (*istr.position() == '1' || *istr.position() == '0') + readBoolText(value, istr); else - throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + throw Exception("Invalid boolean value, should be true/false, 1/0.", + ErrorCodes::CANNOT_PARSE_BOOL); + col->insert(value); } void SerializationBool::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { - serializeTextEscaped(column, row_num, ostr, settings); + serializeCustom(column, row_num, ostr, settings); } void SerializationBool::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { if (istr.eof()) - throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - String input; - readCSVString(input, istr, settings.csv); - deserializeFromString(column, input, settings); + throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_BOOL); + + deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n'; }); } void SerializationBool::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { - serializeTextEscaped(column, row_num, ostr, settings); + serializeCustom(column, row_num, ostr, settings); } void SerializationBool::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { if (istr.eof()) - throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - String input; - readString(input, istr); - deserializeFromString(column, input, settings); + throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_BOOL); + + deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } -void SerializationBool::deserializeFromString(IColumn & column, String & input, const FormatSettings & settings) +void SerializationBool::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { - ColumnUInt8 * col = typeid_cast(&column); - if (!col) - { - throw Exception("Bool type can only deserialize columns of type UInt8." 
+ column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } + serializeSimple(column, row_num, ostr, settings); +} - if (settings.bool_true_representation == input) +void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_BOOL); + + auto * col = checkAndGetDeserializeColumnType(column); + + char symbol = toLowerIfAlphaASCII(*istr.position()); + switch (symbol) { - col->insert(true); + case 't': + assertStringCaseInsensitive("true", istr); + col->insert(true); + break; + case 'f': + assertStringCaseInsensitive("false", istr); + col->insert(false); + break; + case '1': + col->insert(true); + break; + case '0': + col->insert(false); + break; + case '\'': + ++istr.position(); + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return !buf.eof() && *buf.position() == '\''; }); + assertChar('\'', istr); + break; + default: + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be true/false, 1/0 or on of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", + String(istr.position(), std::min(10ul, istr.available()))); } - else if (settings.bool_false_representation == input) - { - col->insert(false); - } - else - throw Exception("Invalid boolean value, should be " + settings.bool_true_representation + " or " + settings.bool_false_representation + " controlled by setting bool_true_representation and bool_false_representation.", ErrorCodes::ILLEGAL_COLUMN); } + +void SerializationBool::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + throw Exception("Expected boolean value but get EOF.", ErrorCodes::CANNOT_PARSE_BOOL); + + deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof(); }); +} + +void SerializationBool::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeSimple(column, row_num, ostr, settings); +} + } diff --git a/src/DataTypes/Serializations/SerializationBool.h b/src/DataTypes/Serializations/SerializationBool.h index eda37864db5..a9f4c6404b3 100644 --- a/src/DataTypes/Serializations/SerializationBool.h +++ b/src/DataTypes/Serializations/SerializationBool.h @@ -1,26 +1,23 @@ #pragma once -#include +#include +#include +#include namespace DB { -class SerializationBool final : public SerializationCustomSimpleText +class SerializationBool final : public SerializationWrapper { -private: - static constexpr char str_true[5] = "true"; - static constexpr char str_false[6] = "false"; - public: SerializationBool(const SerializationPtr & nested_); void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,bool whole) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; 
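The deserialization paths implemented above accept a fairly wide set of spellings in addition to the configurable bool_true_representation / bool_false_representation: 1/0, true/false, T/F, yes/no, Y/N, on/off and enable(d)/disable(d), all case-insensitively. A compact standalone parser expressing the same acceptance set for a whole token (illustrative only, not the streaming ClickHouse implementation, which reads character by character from a buffer):

#include <algorithm>
#include <cctype>
#include <optional>
#include <string>
#include <string_view>

// Case-insensitive parse of the boolean spellings accepted by the text formats.
std::optional<bool> parseBoolWordSketch(std::string_view token)
{
    std::string lower(token.size(), '\0');
    std::transform(token.begin(), token.end(), lower.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

    static const std::string_view true_words[] = {"1", "t", "true", "y", "yes", "on", "enable", "enabled"};
    static const std::string_view false_words[] = {"0", "f", "false", "n", "no", "off", "disable", "disabled"};

    for (auto word : true_words)
        if (std::string_view(lower) == word)
            return true;
    for (auto word : false_words)
        if (std::string_view(lower) == word)
            return false;

    return std::nullopt; // caller reports CANNOT_PARSE_BOOL
}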
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; @@ -29,8 +26,12 @@ public: void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; -protected: - static void deserializeFromString(IColumn & column, String & input, const FormatSettings & settings); + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; }; } diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.h b/src/DataTypes/Serializations/SerializationCustomSimpleText.h index f1b24f65b22..ba7c712f86c 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.h +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.h @@ -10,7 +10,7 @@ class WriteBuffer; struct FormatSettings; class IColumn; -/** Simple IDataTypeCustomTextSerialization that uses serializeText/deserializeText +/** Simple ISerialization that uses serializeText/deserializeText * for all serialization and deserialization. */ class SerializationCustomSimpleText : public SerializationWrapper { diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index ce64bfd785a..b4269fb0f8c 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index aeba7e7e341..b9ed5bd4a02 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/DataTypes/Serializations/SerializationInfo.cpp b/src/DataTypes/Serializations/SerializationInfo.cpp new file mode 100644 index 00000000000..42d3d14b672 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationInfo.cpp @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CORRUPTED_DATA; +} + +namespace +{ + +constexpr auto KEY_VERSION = "version"; +constexpr auto KEY_NUM_ROWS = "num_rows"; +constexpr auto KEY_COLUMNS = "columns"; +constexpr auto KEY_NUM_DEFAULTS = "num_defaults"; +constexpr auto KEY_KIND = "kind"; +constexpr auto KEY_NAME = "name"; + +} + +void SerializationInfo::Data::add(const IColumn & column) +{ + size_t rows = column.size(); + double ratio = column.getRatioOfDefaultRows(ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO); + + num_rows += rows; + num_defaults += static_cast(ratio * rows); +} + +void SerializationInfo::Data::add(const Data & other) +{ + num_rows 
+= other.num_rows; + num_defaults += other.num_defaults; +} + +SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_) + : settings(settings_) + , kind(kind_) +{ +} + +void SerializationInfo::add(const IColumn & column) +{ + data.add(column); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::add(const SerializationInfo & other) +{ + data.add(other.data); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::replaceData(const SerializationInfo & other) +{ + data = other.data; +} + +MutableSerializationInfoPtr SerializationInfo::clone() const +{ + auto res = std::make_shared(kind, settings); + res->data = data; + return res; +} + +void SerializationInfo::serialializeKindBinary(WriteBuffer & out) const +{ + writeBinary(static_cast(kind), out); +} + +void SerializationInfo::deserializeFromKindsBinary(ReadBuffer & in) +{ + UInt8 kind_num; + readBinary(kind_num, in); + auto maybe_kind = magic_enum::enum_cast(kind_num); + if (!maybe_kind) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Unknown serialization kind " + std::to_string(kind_num)); + + kind = *maybe_kind; +} + +Poco::JSON::Object SerializationInfo::toJSON() const +{ + Poco::JSON::Object object; + object.set(KEY_KIND, ISerialization::kindToString(kind)); + object.set(KEY_NUM_DEFAULTS, data.num_defaults); + object.set(KEY_NUM_ROWS, data.num_rows); + return object; +} + +void SerializationInfo::fromJSON(const Poco::JSON::Object & object) +{ + if (!object.has(KEY_KIND) || !object.has(KEY_NUM_DEFAULTS) || !object.has(KEY_NUM_ROWS)) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field '{}' or '{}' or '{}' in SerializationInfo of columns", + KEY_KIND, KEY_NUM_DEFAULTS, KEY_NUM_ROWS); + + data.num_rows = object.getValue(KEY_NUM_ROWS); + data.num_defaults = object.getValue(KEY_NUM_DEFAULTS); + kind = ISerialization::stringToKind(object.getValue(KEY_KIND)); +} + +ISerialization::Kind SerializationInfo::chooseKind(const Data & data, const Settings & settings) +{ + double ratio = data.num_rows ? std::min(static_cast(data.num_defaults) / data.num_rows, 1.0) : 0.0; + return ratio > settings.ratio_of_defaults_for_sparse ? 
ISerialization::Kind::SPARSE : ISerialization::Kind::DEFAULT; +} + +SerializationInfoByName::SerializationInfoByName( + const NamesAndTypesList & columns, + const SerializationInfo::Settings & settings) +{ + if (settings.isAlwaysDefault()) + return; + + for (const auto & column : columns) + if (column.type->supportsSparseSerialization()) + emplace(column.name, column.type->createSerializationInfo(settings)); +} + +void SerializationInfoByName::add(const Block & block) +{ + for (const auto & column : block) + { + auto it = find(column.name); + if (it == end()) + continue; + + it->second->add(*column.column); + } +} + +void SerializationInfoByName::add(const SerializationInfoByName & other) +{ + for (const auto & [name, info] : other) + { + auto it = find(name); + if (it == end()) + continue; + + it->second->add(*info); + } +} + +void SerializationInfoByName::writeJSON(WriteBuffer & out) const +{ + Poco::JSON::Object object; + object.set(KEY_VERSION, SERIALIZATION_INFO_VERSION); + + Poco::JSON::Array column_infos; + for (const auto & [name, info] : *this) + { + auto info_json = info->toJSON(); + info_json.set(KEY_NAME, name); + column_infos.add(std::move(info_json)); + } + + object.set(KEY_COLUMNS, std::move(column_infos)); + + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(object, oss); + + return writeString(oss.str(), out); +} + +void SerializationInfoByName::readJSON(ReadBuffer & in) +{ + String json_str; + readString(json_str, in); + + Poco::JSON::Parser parser; + auto object = parser.parse(json_str).extract(); + + if (!object->has(KEY_VERSION)) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Missed version of serialization infos"); + + if (object->getValue(KEY_VERSION) > SERIALIZATION_INFO_VERSION) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Unknown version of serialization infos ({}). Should be less or equal than {}", + object->getValue(KEY_VERSION), SERIALIZATION_INFO_VERSION); + + if (object->has(KEY_COLUMNS)) + { + auto array = object->getArray(KEY_COLUMNS); + for (const auto & elem : *array) + { + auto elem_object = elem.extract(); + + if (!elem_object->has(KEY_NAME)) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field '{}' in SerializationInfo of columns", KEY_NAME); + + auto name = elem_object->getValue(KEY_NAME); + auto it = find(name); + + if (it == end()) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "There is no column {} in serialization infos", name); + + it->second->fromJSON(*elem_object); + } + } +} + +} diff --git a/src/DataTypes/Serializations/SerializationInfo.h b/src/DataTypes/Serializations/SerializationInfo.h new file mode 100644 index 00000000000..f7af5d77217 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationInfo.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class ReadBuffer; +class WriteBuffer; +class NamesAndTypesList; +class Block; + +constexpr auto SERIALIZATION_INFO_VERSION = 0; + +/** Contains information about kind of serialization of column and its subcolumns. + * Also contains information about content of columns, + * that helps to choose kind of serialization of column. + * + * Currently has only information about number of default rows, + * that helps to choose sparse serialization. + * + * Should be extended, when new kinds of serialization will be implemented. 
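+ *
+ * Example (illustrative numbers): with ratio_of_defaults_for_sparse = 0.9 and
+ * choose_kind = true, a column of 1000 rows with 950 default values has ratio
+ * 0.95 > 0.9, so chooseKind() returns ISerialization::Kind::SPARSE; with only
+ * 500 defaults (ratio 0.5) it stays ISerialization::Kind::DEFAULT.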
+ */ +class SerializationInfo +{ +public: + struct Data + { + size_t num_rows = 0; + size_t num_defaults = 0; + + void add(const IColumn & column); + void add(const Data & other); + }; + + struct Settings + { + const double ratio_of_defaults_for_sparse = 1.0; + const bool choose_kind = false; + + bool isAlwaysDefault() const { return ratio_of_defaults_for_sparse >= 1.0; } + }; + + SerializationInfo(ISerialization::Kind kind_, const Settings & settings_); + + virtual ~SerializationInfo() = default; + + virtual bool hasCustomSerialization() const { return kind != ISerialization::Kind::DEFAULT; } + + virtual void add(const IColumn & column); + virtual void add(const SerializationInfo & other); + virtual void replaceData(const SerializationInfo & other); + virtual std::shared_ptr clone() const; + + virtual void serialializeKindBinary(WriteBuffer & out) const; + virtual void deserializeFromKindsBinary(ReadBuffer & in); + + virtual Poco::JSON::Object toJSON() const; + virtual void fromJSON(const Poco::JSON::Object & object); + + const Settings & getSettings() const { return settings; } + const Data & getData() const { return data; } + ISerialization::Kind getKind() const { return kind; } + + static ISerialization::Kind chooseKind(const Data & data, const Settings & settings); + +protected: + const Settings settings; + + ISerialization::Kind kind; + Data data; +}; + +using SerializationInfoPtr = std::shared_ptr; +using MutableSerializationInfoPtr = std::shared_ptr; + +using SerializationInfos = std::vector; +using MutableSerializationInfos = std::vector; + +class SerializationInfoByName : public std::unordered_map +{ +public: + SerializationInfoByName() = default; + SerializationInfoByName(const NamesAndTypesList & columns, const SerializationInfo::Settings & settings); + + void add(const Block & block); + void add(const SerializationInfoByName & other); + + void writeJSON(WriteBuffer & out) const; + void readJSON(ReadBuffer & in); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationInfoTuple.cpp b/src/DataTypes/Serializations/SerializationInfoTuple.cpp new file mode 100644 index 00000000000..378bed2af53 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationInfoTuple.cpp @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CORRUPTED_DATA; + extern const int THERE_IS_NO_COLUMN; +} + +SerializationInfoTuple::SerializationInfoTuple( + MutableSerializationInfos elems_, const Settings & settings_) + : SerializationInfo(ISerialization::Kind::DEFAULT, settings_) + , elems(std::move(elems_)) +{ +} + +bool SerializationInfoTuple::hasCustomSerialization() const +{ + return std::any_of(elems.begin(), elems.end(), [](const auto & elem) { return elem->hasCustomSerialization(); }); +} + +void SerializationInfoTuple::add(const IColumn & column) +{ + SerializationInfo::add(column); + + const auto & column_tuple = assert_cast(column); + const auto & right_elems = column_tuple.getColumns(); + assert(elems.size() == right_elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->add(*right_elems[i]); +} + +void SerializationInfoTuple::add(const SerializationInfo & other) +{ + SerializationInfo::add(other); + + const auto & info_tuple = assert_cast(other); + assert(elems.size() == info_tuple.elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->add(*info_tuple.elems[i]); +} + +void SerializationInfoTuple::replaceData(const SerializationInfo & other) +{ + SerializationInfo::replaceData(other); + + const
auto & info_tuple = assert_cast(other); + assert(elems.size() == info_tuple.elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->replaceData(*info_tuple.elems[i]); +} + +MutableSerializationInfoPtr SerializationInfoTuple::clone() const +{ + MutableSerializationInfos elems_cloned; + elems_cloned.reserve(elems.size()); + for (const auto & elem : elems) + elems_cloned.push_back(elem->clone()); + + return std::make_shared(std::move(elems_cloned), settings); +} + +void SerializationInfoTuple::serialializeKindBinary(WriteBuffer & out) const +{ + SerializationInfo::serialializeKindBinary(out); + for (const auto & elem : elems) + elem->serialializeKindBinary(out); +} + +void SerializationInfoTuple::deserializeFromKindsBinary(ReadBuffer & in) +{ + SerializationInfo::deserializeFromKindsBinary(in); + for (const auto & elem : elems) + elem->deserializeFromKindsBinary(in); +} + +Poco::JSON::Object SerializationInfoTuple::toJSON() const +{ + auto object = SerializationInfo::toJSON(); + Poco::JSON::Array subcolumns; + for (const auto & elem : elems) + subcolumns.add(elem->toJSON()); + + object.set("subcolumns", std::move(subcolumns)); + return object; +} + +void SerializationInfoTuple::fromJSON(const Poco::JSON::Object & object) +{ + SerializationInfo::fromJSON(object); + + if (!object.has("subcolumns")) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field '{}' in SerializationInfo of columns SerializationInfoTuple"); + + auto subcolumns = object.getArray("subcolumns"); + if (elems.size() != subcolumns->size()) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, + "Mismatched number of subcolumns between JSON and SerializationInfoTuple." + "Expected: {}, got: {}", elems.size(), subcolumns->size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->fromJSON(*subcolumns->getObject(i)); +} + +} diff --git a/src/DataTypes/Serializations/SerializationInfoTuple.h b/src/DataTypes/Serializations/SerializationInfoTuple.h new file mode 100644 index 00000000000..d196f80393e --- /dev/null +++ b/src/DataTypes/Serializations/SerializationInfoTuple.h @@ -0,0 +1,31 @@ +#pragma once +#include + +namespace DB +{ + +class SerializationInfoTuple : public SerializationInfo +{ +public: + SerializationInfoTuple(MutableSerializationInfos elems_, const Settings & settings_); + + bool hasCustomSerialization() const override; + void add(const IColumn & column) override; + void add(const SerializationInfo & other) override; + void replaceData(const SerializationInfo & other) override; + + MutableSerializationInfoPtr clone() const override; + void serialializeKindBinary(WriteBuffer & out) const override; + void deserializeFromKindsBinary(ReadBuffer & in) override; + + Poco::JSON::Object toJSON() const override; + void fromJSON(const Poco::JSON::Object & object) override; + + MutableSerializationInfoPtr getElementInfo(size_t i) const { return elems[i]; } + ISerialization::Kind getElementKind(size_t i) const { return elems[i]->getKind(); } + +private: + MutableSerializationInfos elems; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index d83a6c0ee83..c79f588e46c 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -43,23 +43,25 @@ SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dic void SerializationLowCardinality::enumerateStreams( SubstreamPath & path, const StreamCallback & 
callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { - const auto * column_lc = column ? &getColumnLowCardinality(*column) : nullptr; + const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr; - SubstreamData data; - data.type = type ? dictionary_type : nullptr; - data.column = column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr; - data.serialization = dict_inner_serialization; + SubstreamData dict_data = + { + dict_inner_serialization, + data.type ? dictionary_type : nullptr, + column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr, + data.serialization_info, + }; path.push_back(Substream::DictionaryKeys); - path.back().data = data; + path.back().data = dict_data; - dict_inner_serialization->enumerateStreams(path, callback, data.type, data.column); + dict_inner_serialization->enumerateStreams(path, callback, dict_data); path.back() = Substream::DictionaryIndexes; - path.back().data = {type, column, getPtr(), nullptr}; + path.back().data = data; callback(path); path.pop_back(); @@ -222,42 +224,6 @@ struct DeserializeStateLowCardinality : public ISerialization::DeserializeBinary explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} }; -static SerializeStateLowCardinality * checkAndGetLowCardinalitySerializeState( - ISerialization::SerializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = typeid_cast(state.get()); - if (!low_cardinality_state) - { - auto & state_ref = *state; - throw Exception("Invalid SerializeBinaryBulkState for SerializationLowCardinality. Expected: " - + demangle(typeid(SerializeStateLowCardinality).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return low_cardinality_state; -} - -static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeState( - ISerialization::DeserializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = typeid_cast(state.get()); - if (!low_cardinality_state) - { - auto & state_ref = *state; - throw Exception("Invalid DeserializeBinaryBulkState for SerializationLowCardinality. 
Expected: " - + demangle(typeid(DeserializeStateLowCardinality).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return low_cardinality_state; -} - void SerializationLowCardinality::serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const @@ -282,7 +248,7 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix( SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { - auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); + auto * low_cardinality_state = checkAndGetState(state); KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size) @@ -521,7 +487,7 @@ void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams( const ColumnLowCardinality & low_cardinality_column = typeid_cast(column); - auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); + auto * low_cardinality_state = checkAndGetState(state); auto & global_dictionary = low_cardinality_state->shared_dictionary; KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); @@ -620,7 +586,7 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( if (!indexes_stream) throw Exception("Got empty stream for SerializationLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); - auto * low_cardinality_state = checkAndGetLowCardinalityDeserializeState(state); + auto * low_cardinality_state = checkAndGetState(state); KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); auto read_dictionary = [this, low_cardinality_state, keys_stream]() @@ -670,6 +636,9 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( if (!low_cardinality_state->index_type.need_global_dictionary) { + if (additional_keys == nullptr) + throw Exception("No additional keys found.", ErrorCodes::INCORRECT_DATA); + ColumnPtr keys_column = additional_keys; if (low_cardinality_state->null_map) keys_column = ColumnNullable::create(additional_keys, low_cardinality_state->null_map); @@ -696,6 +665,9 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( if (!maps.additional_keys_map->empty()) { + if (additional_keys == nullptr) + throw Exception("No additional keys found.", ErrorCodes::INCORRECT_DATA); + auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0); if (dictionary_type->isNullable()) diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index af26405fcfa..5f8a2a95a25 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -20,8 +20,7 @@ public: void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index d909b455441..3f17061a744 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -250,13 +250,17 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c 
void SerializationMap::enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { - auto next_type = type ? assert_cast(*type).getNestedType() : nullptr; - auto next_column = column ? assert_cast(*column).getNestedColumnPtr() : nullptr; + SubstreamData next_data = + { + nested, + data.type ? assert_cast(*data.type).getNestedType() : nullptr, + data.column ? assert_cast(*data.column).getNestedColumnPtr() : nullptr, + data.serialization_info, + }; - nested->enumerateStreams(path, callback, next_type, next_column); + nested->enumerateStreams(path, callback, next_data); } void SerializationMap::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index b679a8cf4c6..93b3e179499 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -34,8 +34,7 @@ public: void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index 4ef4d4527f8..097e9cedfbe 100644 --- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -6,12 +6,13 @@ namespace DB void SerializationNamed::enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { addToPath(path); - path.back().data = {type, column, getPtr(), std::make_shared(name, escape_delimiter)}; - nested_serialization->enumerateStreams(path, callback, type, column); + path.back().data = data; + path.back().creator = std::make_shared(name, escape_delimiter); + + nested_serialization->enumerateStreams(path, callback, data); path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationNamed.h b/src/DataTypes/Serializations/SerializationNamed.h index 20dd15a20ba..91db0cf67f4 100644 --- a/src/DataTypes/Serializations/SerializationNamed.h +++ b/src/DataTypes/Serializations/SerializationNamed.h @@ -23,8 +23,7 @@ public: void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 261d0ff3c5d..a6273deaa30 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -40,30 +40,35 @@ ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev void SerializationNullable::enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { - const auto * type_nullable = type ? &assert_cast(*type) : nullptr; - const auto * column_nullable = column ? &assert_cast(*column) : nullptr; + const auto * type_nullable = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_nullable = data.column ? 
&assert_cast(*data.column) : nullptr; path.push_back(Substream::NullMap); path.back().data = { + std::make_shared(std::make_shared>(), "null", false), type_nullable ? std::make_shared() : nullptr, column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr, - std::make_shared(std::make_shared>(), "null", false), - nullptr, + data.serialization_info, }; callback(path); path.back() = Substream::NullableElements; - path.back().data = {type, column, getPtr(), std::make_shared(path.back().data.column)}; + path.back().creator = std::make_shared(path.back().data.column); + path.back().data = data; - auto next_type = type_nullable ? type_nullable->getNestedType() : nullptr; - auto next_column = column_nullable ? column_nullable->getNestedColumnPtr() : nullptr; + SubstreamData next_data = + { + nested, + type_nullable ? type_nullable->getNestedType() : nullptr, + column_nullable ? column_nullable->getNestedColumnPtr() : nullptr, + data.serialization_info, + }; - nested->enumerateStreams(path, callback, next_type, next_column); + nested->enumerateStreams(path, callback, next_data); path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index c514234127c..eb3e9bfb430 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -16,8 +16,7 @@ public: void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp new file mode 100644 index 00000000000..64db248c5fc --- /dev/null +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -0,0 +1,380 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; +} + +namespace +{ + +/// 2^62, because VarInt supports only values < 2^63. +constexpr auto END_OF_GRANULE_FLAG = 1ULL << 62; + +struct DeserializeStateSparse : public ISerialization::DeserializeBinaryBulkState +{ + /// Number of default values, that remain from previous read. + size_t num_trailing_defaults = 0; + /// Do we have non-default value after @num_trailing_defaults? + bool has_value_after_defaults = false; + ISerialization::DeserializeBinaryBulkStatePtr nested; + + void reset() + { + num_trailing_defaults = 0; + has_value_after_defaults = false; + } +}; + +void serializeOffsets(const IColumn::Offsets & offsets, WriteBuffer & ostr, size_t start, size_t end) +{ + size_t size = offsets.size(); + for (size_t i = 0; i < size; ++i) + { + size_t group_size = offsets[i] - start; + writeVarUInt(group_size, ostr); + start += group_size + 1; + } + + size_t group_size = start < end ? end - start : 0; + group_size |= END_OF_GRANULE_FLAG; + writeVarUInt(group_size, ostr); +} + + +/// Returns number of read rows. +/// @start is the size of column before reading offsets. +size_t deserializeOffsets(IColumn::Offsets & offsets, + ReadBuffer & istr, size_t start, size_t limit, DeserializeStateSparse & state) +{ + if (limit && state.num_trailing_defaults >= limit) + { + state.num_trailing_defaults -= limit; + return limit; + } + + /// Just try to guess number of offsets. 
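+ /// Example (illustrative, assuming ColumnSparse::DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION is 0.95):
+ /// for limit = 8192 the reserve below adds room for about 8192 * 0.05 = ~409 offsets,
+ /// i.e. it expects roughly 5% of the rows read in this call to be non-default.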
+ offsets.reserve(offsets.size() + + static_cast(limit * (1.0 - ColumnSparse::DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION))); + + bool first = true; + size_t total_rows = state.num_trailing_defaults; + if (state.has_value_after_defaults) + { + offsets.push_back(start + state.num_trailing_defaults); + first = false; + + state.has_value_after_defaults = false; + state.num_trailing_defaults = 0; + ++total_rows; + } + + size_t group_size; + while (!istr.eof()) + { + readVarUInt(group_size, istr); + + bool end_of_granule = group_size & END_OF_GRANULE_FLAG; + group_size &= ~END_OF_GRANULE_FLAG; + + size_t next_total_rows = total_rows + group_size; + group_size += state.num_trailing_defaults; + + if (limit && next_total_rows >= limit) + { + /// If it was not last group in granule, + /// we have to add current non-default value at further reads. + state.num_trailing_defaults = next_total_rows - limit; + state.has_value_after_defaults = !end_of_granule; + return limit; + } + + if (end_of_granule) + { + state.has_value_after_defaults = false; + state.num_trailing_defaults = group_size; + } + else + { + /// If we add value to column for first time in current read, + /// start from column's current size, because it can have some defaults after last offset, + /// otherwise just start from previous offset. + size_t start_of_group = start; + if (!first && !offsets.empty()) + start_of_group = offsets.back() + 1; + if (first) + first = false; + + offsets.push_back(start_of_group + group_size); + + state.num_trailing_defaults = 0; + state.has_value_after_defaults = false; + ++next_total_rows; + } + + total_rows = next_total_rows; + } + + return total_rows; +} + +} + +SerializationSparse::SerializationSparse(const SerializationPtr & nested_) + : nested(nested_) +{ +} + +SerializationPtr SerializationSparse::SubcolumnCreator::create(const SerializationPtr & prev) const +{ + return std::make_shared(prev); +} + +ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev) const +{ + return ColumnSparse::create(prev, offsets, size); +} + +void SerializationSparse::enumerateStreams( + SubstreamPath & path, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * column_sparse = data.column ? &assert_cast(*data.column) : nullptr; + + size_t column_size = column_sparse ? column_sparse->size() : 0; + + path.push_back(Substream::SparseOffsets); + path.back().data = + { + std::make_shared>(), + data.type ? std::make_shared() : nullptr, + column_sparse ? column_sparse->getOffsetsPtr() : nullptr, + data.serialization_info, + }; + + callback(path); + + path.back() = Substream::SparseElements; + path.back().creator = std::make_shared(path.back().data.column, column_size); + path.back().data = data; + + SubstreamData next_data = + { + nested, + data.type, + column_sparse ? 
column_sparse->getValuesPtr() : nullptr, + data.serialization_info, + }; + + nested->enumerateStreams(path, callback, next_data); + path.pop_back(); +} + +void SerializationSparse::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::SparseElements); + nested->serializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + +void SerializationSparse::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + size_t size = column.size(); + + auto offsets_column = DataTypeNumber().createColumn(); + auto & offsets_data = assert_cast &>(*offsets_column).getData(); + column.getIndicesOfNonDefaultRows(offsets_data, offset, limit); + + settings.path.push_back(Substream::SparseOffsets); + if (auto * stream = settings.getter(settings.path)) + { + size_t end = limit && offset + limit < size ? offset + limit : size; + serializeOffsets(offsets_data, *stream, offset, end); + } + + if (!offsets_data.empty()) + { + settings.path.back() = Substream::SparseElements; + if (const auto * column_sparse = typeid_cast(&column)) + { + const auto & values = column_sparse->getValuesColumn(); + size_t begin = column_sparse->getValueIndex(offsets_data[0]); + size_t end = column_sparse->getValueIndex(offsets_data.back()); + nested->serializeBinaryBulkWithMultipleStreams(values, begin, end - begin + 1, settings, state); + } + else + { + auto values = column.index(*offsets_column, 0); + nested->serializeBinaryBulkWithMultipleStreams(*values, 0, values->size(), settings, state); + } + } + + settings.path.pop_back(); +} + +void SerializationSparse::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::SparseElements); + nested->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + +void SerializationSparse::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto state_sparse = std::make_shared(); + + settings.path.push_back(Substream::SparseElements); + nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested); + settings.path.pop_back(); + + state = std::move(state_sparse); +} + +void SerializationSparse::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto * state_sparse = checkAndGetState(state); + + if (!settings.continuous_reading) + state_sparse->reset(); + + auto mutable_column = column->assumeMutable(); + auto & column_sparse = assert_cast(*mutable_column); + auto & offsets_data = column_sparse.getOffsetsData(); + + size_t old_size = offsets_data.size(); + + size_t read_rows = 0; + settings.path.push_back(Substream::SparseOffsets); + if (auto * stream = settings.getter(settings.path)) + read_rows = deserializeOffsets(offsets_data, *stream, column_sparse.size(), limit, *state_sparse); + + auto & values_column = column_sparse.getValuesPtr(); + size_t values_limit = offsets_data.size() - old_size; + + settings.path.back() = Substream::SparseElements; + nested->deserializeBinaryBulkWithMultipleStreams(values_column, values_limit, settings, state_sparse->nested, cache); + 
settings.path.pop_back(); + + if (offsets_data.size() + 1 != values_column->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Inconsistent sizes of values and offsets in SerializationSparse." + " Offsets size: {}, values size: {}", offsets_data.size(), values_column->size()); + + /// 'insertManyDefaults' just increases size of column. + column_sparse.insertManyDefaults(read_rows); + column = std::move(mutable_column); +} + +/// All methods below just wrap nested serialization. + +void SerializationSparse::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + nested->serializeBinary(field, ostr); +} + +void SerializationSparse::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + nested->deserializeBinary(field, istr); +} + +void SerializationSparse::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + const auto & column_sparse = assert_cast(column); + nested->serializeBinary(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr); +} + +void SerializationSparse::deserializeBinary(IColumn &, ReadBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeBinary' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast(column); + nested->serializeTextEscaped(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextEscaped(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextEscaped' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast(column); + nested->serializeTextQuoted(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextQuoted(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextQuoted' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast(column); + nested->serializeTextCSV(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextCSV(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextCSV' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast(column); + nested->serializeText(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeWholeText(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeWholeText' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextJSON(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast(column); + nested->serializeTextJSON(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextJSON(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextJSON' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast(column); + nested->serializeTextXML(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +} diff --git a/src/DataTypes/Serializations/SerializationSparse.h b/src/DataTypes/Serializations/SerializationSparse.h new file mode 100644 index 00000000000..51d9df2cb5d --- /dev/null +++ b/src/DataTypes/Serializations/SerializationSparse.h @@ -0,0 +1,103 @@ +#pragma once + +#include + +namespace DB +{ + + +/** Serialization for sparse representation. + * Only '{serialize,deserialize}BinaryBulk' makes sense. + * Format: + * Values and offsets are written to separate substreams. + * There are written only non-default values. + * + * Offsets have position independent format: as i-th offset there + * is written number of default values, that precedes the i-th non-default value. + * Offsets are written in VarInt encoding. + * Additionally at the end of every call of 'serializeBinaryBulkWithMultipleStreams' + * there is written number of default values in the suffix of part of column, + * that we currently writing. This value also marked with a flag, that means the end of portion of data. + * This value is used, e.g. to allow independent reading of granules in MergeTree. + */ +class SerializationSparse final : public ISerialization +{ +public: + SerializationSparse(const SerializationPtr & nested_); + + Kind getKind() const override { return Kind::SPARSE; } + + virtual void enumerateStreams( + SubstreamPath & path, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + /// Allows to write ColumnSparse and other columns in sparse serialization. + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + /// Allows to read only ColumnSparse. 
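+ /// Illustrative example of the offsets format described above (values assume a numeric
+ /// column whose default is 0): serializing rows [0, 0, 7, 0, 0, 0, 9, 0] in one call writes
+ /// the non-default values 7 and 9 to the elements substream and the VarInt group sizes
+ /// 2 and 3 to the offsets substream (number of defaults before each non-default value),
+ /// followed by 1 | END_OF_GRANULE_FLAG for the single trailing default row.
+ /// On reading, these group sizes are turned back into the offsets [2, 6] and two values are read.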
+ void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + struct SubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr offsets; + const size_t size; + + SubcolumnCreator(const ColumnPtr & offsets_, size_t size_) + : offsets(offsets_), size(size_) {} + + DataTypePtr create(const DataTypePtr & prev) const override { return prev; } + SerializationPtr create(const SerializationPtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + }; + + SerializationPtr nested; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index cb4e7f9666e..cd5a6b65a3c 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -16,7 +17,6 @@ namespace ErrorCodes { extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH; extern const int NOT_FOUND_COLUMN_IN_BLOCK; - extern const int LOGICAL_ERROR; } @@ -260,7 +260,7 @@ void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num for (const auto i : collections::range(0, elems.size())) { if (i != 0) - writeChar(',', ostr); + writeChar(settings.csv.tuple_delimiter, ostr); elems[i]->serializeTextCSV(extractElementColumn(column, i), row_num, ostr, settings); } } @@ -275,7 +275,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, if (i != 0) { skipWhitespaceIfAny(istr); - assertChar(settings.csv.delimiter, istr); + assertChar(settings.csv.tuple_delimiter, istr); skipWhitespaceIfAny(istr); } elems[i]->deserializeTextCSV(extractElementColumn(column, i), istr, settings); @@ -286,18 +286,23 @@ void 
SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, void SerializationTuple::enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { - const auto * type_tuple = type ? &assert_cast(*type) : nullptr; - const auto * column_tuple = column ? &assert_cast(*column) : nullptr; + const auto * type_tuple = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_tuple = data.column ? &assert_cast(*data.column) : nullptr; + const auto * info_tuple = data.serialization_info ? &assert_cast(*data.serialization_info) : nullptr; for (size_t i = 0; i < elems.size(); ++i) { - auto next_type = type_tuple ? type_tuple->getElement(i) : nullptr; - auto next_column = column_tuple ? column_tuple->getColumnPtr(i) : nullptr; + SubstreamData next_data = + { + elems[i], + type_tuple ? type_tuple->getElement(i) : nullptr, + column_tuple ? column_tuple->getColumnPtr(i) : nullptr, + info_tuple ? info_tuple->getElementInfo(i) : nullptr, + }; - elems[i]->enumerateStreams(path, callback, next_type, next_column); + elems[i]->enumerateStreams(path, callback, next_data); } } @@ -311,39 +316,6 @@ struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinar std::vector states; }; -static SerializeBinaryBulkStateTuple * checkAndGetTupleSerializeState(ISerialization::SerializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR); - - auto * tuple_state = typeid_cast(state.get()); - if (!tuple_state) - { - auto & state_ref = *state; - throw Exception("Invalid SerializeBinaryBulkState for DataTypeTuple. Expected: " - + demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return tuple_state; -} - -static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(ISerialization::DeserializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR); - - auto * tuple_state = typeid_cast(state.get()); - if (!tuple_state) - { - auto & state_ref = *state; - throw Exception("Invalid DeserializeBinaryBulkState for DataTypeTuple. 
Expected: " - + demangle(typeid(DeserializeBinaryBulkStateTuple).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return tuple_state; -} void SerializationTuple::serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, @@ -362,7 +334,7 @@ void SerializationTuple::serializeBinaryBulkStateSuffix( SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { - auto * tuple_state = checkAndGetTupleSerializeState(state); + auto * tuple_state = checkAndGetState(state); for (size_t i = 0; i < elems.size(); ++i) elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]); @@ -388,7 +360,7 @@ void SerializationTuple::serializeBinaryBulkWithMultipleStreams( SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const { - auto * tuple_state = checkAndGetTupleSerializeState(state); + auto * tuple_state = checkAndGetState(state); for (const auto i : collections::range(0, elems.size())) { @@ -404,7 +376,7 @@ void SerializationTuple::deserializeBinaryBulkWithMultipleStreams( DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const { - auto * tuple_state = checkAndGetTupleDeserializeState(state); + auto * tuple_state = checkAndGetState(state); auto mutable_column = column->assumeMutable(); auto & column_tuple = assert_cast(*mutable_column); diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 0eb178f8301..e82d8473645 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -13,7 +13,9 @@ public: using ElementSerializations = std::vector; SerializationTuple(const ElementSerializations & elems_, bool have_explicit_names_) - : elems(elems_), have_explicit_names(have_explicit_names_) {} + : elems(elems_), have_explicit_names(have_explicit_names_) + { + } void serializeBinary(const Field & field, WriteBuffer & ostr) const override; void deserializeBinary(Field & field, ReadBuffer & istr) const override; @@ -34,8 +36,7 @@ public: void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, @@ -63,6 +64,8 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; + const ElementSerializations & getElementsSerializations() const { return elems; } + private: ElementSerializations elems; bool have_explicit_names; diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index c0829ab1b26..271c53dfcf1 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -7,10 +7,9 @@ namespace DB void SerializationWrapper::enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const + const SubstreamData & data) const { - nested_serialization->enumerateStreams(path, callback, type, column); + nested_serialization->enumerateStreams(path, callback, data); } void SerializationWrapper::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index c48278d53db..4cdcffc21a8 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ 
b/src/DataTypes/Serializations/SerializationWrapper.h @@ -16,11 +16,14 @@ protected: public: SerializationWrapper(const SerializationPtr & nested_serialization_) : nested_serialization(nested_serialization_) {} + const SerializationPtr & getNested() const { return nested_serialization; } + + Kind getKind() const override { return nested_serialization->getKind(); } + void enumerateStreams( SubstreamPath & path, const StreamCallback & callback, - DataTypePtr type, - ColumnPtr column) const override; + const SubstreamData & data) const override; void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/TimezoneMixin.h b/src/DataTypes/TimezoneMixin.h index e6e9f7a7989..03ecde5dd0a 100644 --- a/src/DataTypes/TimezoneMixin.h +++ b/src/DataTypes/TimezoneMixin.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include class DateLUTImpl; diff --git a/src/DataTypes/convertMySQLDataType.cpp b/src/DataTypes/convertMySQLDataType.cpp index 1b5e20bddce..ee897de9597 100644 --- a/src/DataTypes/convertMySQLDataType.cpp +++ b/src/DataTypes/convertMySQLDataType.cpp @@ -91,6 +91,10 @@ DataTypePtr convertMySQLDataType(MultiEnum type_support, res = std::make_shared(scale); } } + else if (type_name == "bit") + { + res = std::make_shared(); + } else if (type_support.isSet(MySQLDataTypesSupport::DECIMAL) && (type_name == "numeric" || type_name == "decimal")) { if (precision <= DecimalUtils::max_precision) diff --git a/src/DataTypes/tests/gtest_split_name.cpp b/src/DataTypes/tests/gtest_split_name.cpp new file mode 100644 index 00000000000..04ce4d5e108 --- /dev/null +++ b/src/DataTypes/tests/gtest_split_name.cpp @@ -0,0 +1,32 @@ +#include + +#include + +using namespace DB; + +TEST(SplitName, forward) +{ + ASSERT_EQ(Nested::splitName(String("abc")), (std::pair{"abc", ""})); + ASSERT_EQ(Nested::splitName(String("a.b")), (std::pair{"a", "b"})); + ASSERT_EQ(Nested::splitName(String("a.b.c")), (std::pair{"a", "b.c"})); + ASSERT_EQ(Nested::splitName(String("a.1")), (std::pair{"a", "1"})); + ASSERT_EQ(Nested::splitName(String("a.1.b")), (std::pair{"a", "1.b"})); + ASSERT_EQ(Nested::splitName(String("1.a")), (std::pair{"1", "a"})); + ASSERT_EQ(Nested::splitName(String("a.b1.b2")), (std::pair{"a", "b1.b2"})); + ASSERT_EQ(Nested::splitName(String("a.b1.2a.3a")), (std::pair{"a", "b1.2a.3a"})); + ASSERT_EQ(Nested::splitName(String("..")), (std::pair{"..", ""})); +} + +TEST(SplitName, reverse) +{ + ASSERT_EQ(Nested::splitName(String("abc"), true), (std::pair{"abc", ""})); + ASSERT_EQ(Nested::splitName(String("a.b"), true), (std::pair{"a", "b"})); + ASSERT_EQ(Nested::splitName(String("a.b.c"), true), (std::pair{"a.b", "c"})); + ASSERT_EQ(Nested::splitName(String("a.1"), true), (std::pair{"a", "1"})); + ASSERT_EQ(Nested::splitName(String("a.1a.b"), true), (std::pair{"a.1a", "b"})); + ASSERT_EQ(Nested::splitName(String("1a.b"), true), (std::pair{"1a", "b"})); + ASSERT_EQ(Nested::splitName(String("a.b1.b2"), true), (std::pair{"a.b1", "b2"})); + ASSERT_EQ(Nested::splitName(String("a.b1.2a.3a"), true), (std::pair{"a.b1.2a", "3a"})); + ASSERT_EQ(Nested::splitName(String("a.b1.b2.b3"), true), (std::pair{"a.b1.b2", "b3"})); + ASSERT_EQ(Nested::splitName(String(".."), true), (std::pair{"..", ""})); +} diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index c898d5ee943..cb0c1cdae95 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -43,6 +43,7 @@ DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, c 
, db_uuid(uuid) { assert(db_uuid != UUIDHelpers::Nil); + fs::create_directories(fs::path(getContext()->getPath()) / "metadata"); fs::create_directories(path_to_table_symlinks); tryCreateMetadataSymlink(); } diff --git a/src/Databases/DatabaseDictionary.cpp b/src/Databases/DatabaseDictionary.cpp index db7da95fb27..82766c1e384 100644 --- a/src/Databases/DatabaseDictionary.cpp +++ b/src/Databases/DatabaseDictionary.cpp @@ -29,10 +29,13 @@ namespace return nullptr; DictionaryStructure dictionary_structure = ExternalDictionariesLoader::getDictionaryStructure(*load_result.config); + auto comment = load_result.config->config->getString("dictionary.comment", ""); + return StorageDictionary::create( StorageID(database_name, load_result.name), load_result.name, dictionary_structure, + comment, StorageDictionary::Location::DictionaryDatabase, context); } diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index ca48e2847ed..3f6cb49fda7 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -23,6 +23,8 @@ # include # include # include +# include +# include # include # include #endif @@ -117,6 +119,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String static const std::unordered_set engines_with_arguments{"MySQL", "MaterializeMySQL", "MaterializedMySQL", "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite"}; + static const std::unordered_set engines_with_table_overrides{"MaterializeMySQL", "MaterializedMySQL", "MaterializedPostgreSQL"}; bool engine_may_have_arguments = engines_with_arguments.contains(engine_name); if (engine_define->engine->arguments && !engine_may_have_arguments) @@ -131,6 +134,9 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_AST, "Database engine `{}` cannot have parameters, primary_key, order_by, sample_by, settings", engine_name); + if (create.table_overrides && !engines_with_table_overrides.contains(engine_name)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Database engine `{}` cannot have table overrides", engine_name); + if (engine_name == "Ordinary") return std::make_shared(database_name, metadata_path, context); else if (engine_name == "Atomic") @@ -194,13 +200,15 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine_name == "MySQL") { auto mysql_database_settings = std::make_unique(); - auto mysql_pool = mysqlxx::PoolWithFailover(configuration.database, configuration.addresses, configuration.username, configuration.password); + MySQLSettings mysql_settings; + auto mysql_pool = createMySQLPoolWithFailover(configuration, mysql_settings); mysql_database_settings->loadFromQueryContext(context); mysql_database_settings->loadFromQuery(*engine_define); /// higher priority return std::make_shared( - context, database_name, metadata_path, engine_define, configuration.database, std::move(mysql_database_settings), std::move(mysql_pool)); + context, database_name, metadata_path, engine_define, configuration.database, + std::move(mysql_database_settings), std::move(mysql_pool), create.attach); } MySQLClient client(configuration.host, configuration.port, configuration.username, configuration.password); diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index e9944b592ed..165bad950f5 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -76,10 +76,16 @@ std::pair createTableFromAST( /// - 
the database has not been loaded yet; /// - the code is simpler, since the query is already brought to a suitable form. if (!ast_create_query.columns_list || !ast_create_query.columns_list->columns) - throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); - - columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); - constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); + { + if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(ast_create_query.storage->engine->name)) + throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); + /// Leave columns empty. + } + else + { + columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); + constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); + } } return diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index ffb39f5b113..1c3f417b431 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -30,27 +30,33 @@ void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemo auto & ast_create_query = query->as(); bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns; + if (ast_create_query.as_table_function && !has_structure) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" " and doesn't have structure in metadata", backQuote(ast_create_query.getTable())); - assert(has_structure); - ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); - ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); - ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); - ASTPtr new_projections = InterpreterCreateQuery::formatProjections(metadata.projections); + if (!has_structure && !ast_create_query.is_dictionary) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot alter table {} metadata doesn't have structure", backQuote(ast_create_query.getTable())); - ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->projections, new_projections); + if (!ast_create_query.is_dictionary) + { + ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); + ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); + ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); + ASTPtr new_projections = InterpreterCreateQuery::formatProjections(metadata.projections); + + ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->projections, new_projections); + } if (metadata.select.select_query) { 
query->replace(ast_create_query.select, metadata.select.select_query); } - /// MaterializedView is one type of CREATE query without storage. + /// MaterializedView, Dictionary are types of CREATE query without storage. if (ast_create_query.storage) { ASTStorage & storage_ast = *ast_create_query.storage; diff --git a/src/Databases/MySQL/ConnectionMySQLSettings.cpp b/src/Databases/MySQL/ConnectionMySQLSettings.cpp index 1026d14018b..87da701e481 100644 --- a/src/Databases/MySQL/ConnectionMySQLSettings.cpp +++ b/src/Databases/MySQL/ConnectionMySQLSettings.cpp @@ -14,7 +14,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -IMPLEMENT_SETTINGS_TRAITS(ConnectionMySQLSettingsTraits, LIST_OF_CONNECTION_MYSQL_SETTINGS) +IMPLEMENT_SETTINGS_TRAITS(ConnectionMySQLSettingsTraits, LIST_OF_MYSQL_DATABASE_SETTINGS) void ConnectionMySQLSettings::loadFromQuery(ASTStorage & storage_def) { diff --git a/src/Databases/MySQL/ConnectionMySQLSettings.h b/src/Databases/MySQL/ConnectionMySQLSettings.h index f05985a0cda..8b17d4e7898 100644 --- a/src/Databases/MySQL/ConnectionMySQLSettings.h +++ b/src/Databases/MySQL/ConnectionMySQLSettings.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -17,7 +18,11 @@ class ASTStorage; #define APPLY_FOR_IMMUTABLE_CONNECTION_MYSQL_SETTINGS(M) \ M(mysql_datatypes_support_level) -DECLARE_SETTINGS_TRAITS(ConnectionMySQLSettingsTraits, LIST_OF_CONNECTION_MYSQL_SETTINGS) +#define LIST_OF_MYSQL_DATABASE_SETTINGS(M) \ + LIST_OF_CONNECTION_MYSQL_SETTINGS(M) \ + LIST_OF_MYSQL_SETTINGS(M) + +DECLARE_SETTINGS_TRAITS(ConnectionMySQLSettingsTraits, LIST_OF_MYSQL_DATABASE_SETTINGS) /** Settings for the MySQL database engine. diff --git a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp index 30cdc9fb501..13f55eab9e2 100644 --- a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp +++ b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp @@ -9,7 +9,6 @@ # include # include # include -# include # include # include diff --git a/src/Databases/MySQL/DatabaseMySQL.cpp b/src/Databases/MySQL/DatabaseMySQL.cpp index f62e06aff8d..cc6d808a564 100644 --- a/src/Databases/MySQL/DatabaseMySQL.cpp +++ b/src/Databases/MySQL/DatabaseMySQL.cpp @@ -53,7 +53,8 @@ DatabaseMySQL::DatabaseMySQL( const ASTStorage * database_engine_define_, const String & database_name_in_mysql_, std::unique_ptr settings_, - mysqlxx::PoolWithFailover && pool) + mysqlxx::PoolWithFailover && pool, + bool attach) : IDatabase(database_name_) , WithContext(context_->getGlobalContext()) , metadata_path(metadata_path_) @@ -62,7 +63,19 @@ DatabaseMySQL::DatabaseMySQL( , database_settings(std::move(settings_)) , mysql_pool(std::move(pool)) { - empty(); /// test database is works fine. + try + { + /// Test that the database is working fine; it will also fetch tables. + empty(); + } + catch (...) 
+ { + if (attach) + tryLogCurrentException("DatabaseMySQL"); + else + throw; + } + thread = ThreadFromGlobalPool{&DatabaseMySQL::cleanOutdatedTables, this}; } diff --git a/src/Databases/MySQL/DatabaseMySQL.h b/src/Databases/MySQL/DatabaseMySQL.h index e57ac442db1..1ee090ecd52 100644 --- a/src/Databases/MySQL/DatabaseMySQL.h +++ b/src/Databases/MySQL/DatabaseMySQL.h @@ -45,7 +45,8 @@ public: const ASTStorage * database_engine_define, const String & database_name_in_mysql, std::unique_ptr settings_, - mysqlxx::PoolWithFailover && pool); + mysqlxx::PoolWithFailover && pool, + bool attach); String getEngineName() const override { return "MySQL"; } diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 723457fba5b..7da25298cf2 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -110,12 +110,12 @@ std::exception_ptr CacheDictionary::getLastException() cons } template -const IDictionarySource * CacheDictionary::getSource() const +DictionarySourcePtr CacheDictionary::getSource() const { /// Mutex required here because of the getSourceAndUpdateIfNeeded() function /// which is used from another thread. std::lock_guard lock(source_mutex); - return source_ptr.get(); + return source_ptr; } template @@ -602,6 +602,7 @@ void CacheDictionary::update(CacheDictionaryUpdateUnitPtr update_queue; diff --git a/src/Dictionaries/CassandraDictionarySource.h b/src/Dictionaries/CassandraDictionarySource.h index 35419d3ea7d..76ad2316366 100644 --- a/src/Dictionaries/CassandraDictionarySource.h +++ b/src/Dictionaries/CassandraDictionarySource.h @@ -61,7 +61,7 @@ public: DictionarySourcePtr clone() const override { - return std::make_unique(dict_struct, configuration, sample_block); + return std::make_shared(dict_struct, configuration, sample_block); } Pipe loadIds(const std::vector & ids) override; diff --git a/src/Dictionaries/ClickHouseDictionarySource.cpp b/src/Dictionaries/ClickHouseDictionarySource.cpp index 1ddcdd96454..6abd5f317e2 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.cpp +++ b/src/Dictionaries/ClickHouseDictionarySource.cpp @@ -17,7 +17,6 @@ #include "DictionaryStructure.h" #include "ExternalQueryBuilder.h" #include "readInvalidateQuery.h" -#include "writeParenthesisedString.h" #include "DictionaryFactory.h" #include "DictionarySourceHelpers.h" diff --git a/src/Dictionaries/ClickHouseDictionarySource.h b/src/Dictionaries/ClickHouseDictionarySource.h index be09fa415fd..cdcc0ee824f 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.h +++ b/src/Dictionaries/ClickHouseDictionarySource.h @@ -60,7 +60,7 @@ public: bool hasUpdateField() const override; - DictionarySourcePtr clone() const override { return std::make_unique(*this); } + DictionarySourcePtr clone() const override { return std::make_shared(*this); } std::string toString() const override; diff --git a/src/Dictionaries/DictionaryHelpers.h b/src/Dictionaries/DictionaryHelpers.h index 26fbb6f193f..b59e29c327e 100644 --- a/src/Dictionaries/DictionaryHelpers.h +++ b/src/Dictionaries/DictionaryHelpers.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -411,7 +412,7 @@ public: if constexpr (key_type == DictionaryKeyType::Simple) { - key_columns[0] = key_columns[0]->convertToFullColumnIfConst(); + key_columns[0] = recursiveRemoveSparse(key_columns[0]->convertToFullColumnIfConst()); const auto * vector_col = checkAndGetColumn>(key_columns[0].get()); if (!vector_col) @@ -574,6 +575,8 @@ void mergeBlockWithPipe( while 
(executor.pull(block)) { + convertToFullIfSparse(block); + Columns block_key_columns; block_key_columns.reserve(key_columns_size); @@ -633,7 +636,7 @@ static const PaddedPODArray & getColumnVectorData( PaddedPODArray & backup_storage) { bool is_const_column = isColumnConst(*column); - auto full_column = column->convertToFullColumnIfConst(); + auto full_column = recursiveRemoveSparse(column->convertToFullColumnIfConst()); auto vector_col = checkAndGetColumn>(full_column.get()); if (!vector_col) diff --git a/src/Dictionaries/DirectDictionary.cpp b/src/Dictionaries/DirectDictionary.cpp index 551f485e5bb..12c624a6859 100644 --- a/src/Dictionaries/DirectDictionary.cpp +++ b/src/Dictionaries/DirectDictionary.cpp @@ -75,6 +75,8 @@ Columns DirectDictionary::getColumns( Block block; while (executor.pull(block)) { + convertToFullIfSparse(block); + /// Split into keys columns and attribute columns for (size_t i = 0; i < dictionary_keys_size; ++i) block_key_columns.emplace_back(block.safeGetByPosition(i).column); diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index edf4c8d1d9a..4bf24e6ae98 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -58,7 +58,7 @@ public: return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone()); } - const IDictionarySource * getSource() const override { return source_ptr.get(); } + DictionarySourcePtr getSource() const override { return source_ptr; } const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } diff --git a/src/Dictionaries/ExecutableDictionarySource.cpp b/src/Dictionaries/ExecutableDictionarySource.cpp index c09993c2a84..7a3550e7284 100644 --- a/src/Dictionaries/ExecutableDictionarySource.cpp +++ b/src/Dictionaries/ExecutableDictionarySource.cpp @@ -1,10 +1,16 @@ #include "ExecutableDictionarySource.h" +#include + +#include + #include -#include +#include +#include #include #include +#include #include #include @@ -27,15 +33,46 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; } +namespace +{ + + void updateCommandIfNeeded(String & command, bool execute_direct, ContextPtr context) + { + if (!execute_direct) + return; + + auto global_context = context->getGlobalContext(); + auto user_scripts_path = global_context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; + + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); + + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); + + command = std::move(script_path); + } + +} + ExecutableDictionarySource::ExecutableDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_) : log(&Poco::Logger::get("ExecutableDictionarySource")) , dict_struct(dict_struct_) , configuration(configuration_) - , sample_block{sample_block_} + , sample_block(sample_block_) + , coordinator(std::move(coordinator_)) , context(context_) { /// Remove keys from sample_block for implicit_key dictionary because @@ -58,6 +95,7 @@ ExecutableDictionarySource::ExecutableDictionarySource(const ExecutableDictionar , dict_struct(other.dict_struct) , 
configuration(other.configuration) , sample_block(other.sample_block) + , coordinator(other.coordinator) , context(Context::createCopy(other.context)) { } @@ -69,11 +107,11 @@ Pipe ExecutableDictionarySource::loadAll() LOG_TRACE(log, "loadAll {}", toString()); - ShellCommand::Config config(configuration.command); - auto process = ShellCommand::execute(config); + const auto & coordinator_configuration = coordinator->getConfiguration(); + auto command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process))); - return pipe; + return coordinator->createPipe(command, configuration.command_arguments, sample_block, context); } Pipe ExecutableDictionarySource::loadUpdatedAll() @@ -82,17 +120,32 @@ Pipe ExecutableDictionarySource::loadUpdatedAll() throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutableDictionarySource with implicit_key does not support loadUpdatedAll method"); time_t new_update_time = time(nullptr); - SCOPE_EXIT(update_time = new_update_time); - std::string command_with_update_field = configuration.command; + const auto & coordinator_configuration = coordinator->getConfiguration(); + auto command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); + + auto command_arguments = configuration.command_arguments; + if (update_time) - command_with_update_field += " " + configuration.update_field + " " + DB::toString(LocalDateTime(update_time - configuration.update_lag)); + { + auto update_difference = DB::toString(LocalDateTime(update_time - configuration.update_lag)); - LOG_TRACE(log, "loadUpdatedAll {}", command_with_update_field); - ShellCommand::Config config(command_with_update_field); - auto process = ShellCommand::execute(config); - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process))); - return pipe; + if (coordinator_configuration.execute_direct) + { + command_arguments.emplace_back(configuration.update_field); + command_arguments.emplace_back(std::move(update_difference)); + } + else + { + command += ' ' + configuration.update_field + ' ' + update_difference; + } + } + + update_time = new_update_time; + + LOG_TRACE(log, "loadUpdatedAll {}", command); + return coordinator->createPipe(command, command_arguments, sample_block, context); } Pipe ExecutableDictionarySource::loadIds(const std::vector & ids) @@ -113,27 +166,17 @@ Pipe ExecutableDictionarySource::loadKeys(const Columns & key_columns, const std Pipe ExecutableDictionarySource::getStreamForBlock(const Block & block) { - ShellCommand::Config config(configuration.command); - auto process = ShellCommand::execute(config); - auto * process_in = &process->in; + const auto & coordinator_configuration = coordinator->getConfiguration(); + String command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); - ShellCommandSource::SendDataTask task = {[process_in, block, this]() - { - auto & out = *process_in; + auto source = std::make_shared(block); + auto shell_input_pipe = Pipe(std::move(source)); - if (configuration.send_chunk_header) - { - writeText(block.rows(), out); - writeChar('\n', out); - } + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); - auto output_format = context->getOutputFormat(configuration.format, out, block.cloneEmpty()); - formatBlock(output_format, block); - out.close(); 
- }}; - std::vector tasks = {std::move(task)}; - - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), std::move(tasks))); + auto pipe = coordinator->createPipe(command, configuration.command_arguments, std::move(shell_input_pipes), sample_block, context); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); @@ -158,7 +201,7 @@ bool ExecutableDictionarySource::hasUpdateField() const DictionarySourcePtr ExecutableDictionarySource::clone() const { - return std::make_unique(*this); + return std::make_shared(*this); } std::string ExecutableDictionarySource::toString() const @@ -189,17 +232,40 @@ void registerDictionarySourceExecutable(DictionarySourceFactory & factory) std::string settings_config_prefix = config_prefix + ".executable"; + bool execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false); + std::string command_value = config.getString(settings_config_prefix + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } + ExecutableDictionarySource::Configuration configuration { - .command = config.getString(settings_config_prefix + ".command"), - .format = config.getString(settings_config_prefix + ".format"), + .command = std::move(command_value), + .command_arguments = std::move(command_arguments), .update_field = config.getString(settings_config_prefix + ".update_field", ""), .update_lag = config.getUInt64(settings_config_prefix + ".update_lag", 1), .implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false), - .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false) }; - return std::make_unique(dict_struct, configuration, sample_block, context); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = config.getString(settings_config_prefix + ".format"), + .command_termination_timeout_seconds = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), + .command_read_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_read_timeout", 10000), + .command_write_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_write_timeout", 10000), + .is_executable_pool = false, + .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false), + .execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false) + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_unique(dict_struct, configuration, sample_block, std::move(coordinator), context); }; factory.registerSource("executable", create_table_source); diff --git a/src/Dictionaries/ExecutableDictionarySource.h b/src/Dictionaries/ExecutableDictionarySource.h index a7ffc8bebcb..6c5d2de3714 100644 --- a/src/Dictionaries/ExecutableDictionarySource.h +++ b/src/Dictionaries/ExecutableDictionarySource.h @@ -7,6 +7,7 @@ #include #include +#include namespace DB @@ -20,20 +21,19 @@ public: struct Configuration { std::string command; - std::string format; + std::vector command_arguments; std::string update_field; UInt64 update_lag; /// Implicit key means that the source script will return only values, /// and the correspondence to the requested keys is determined implicitly - by the order 
of rows in the result. bool implicit_key; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header; }; ExecutableDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_); ExecutableDictionarySource(const ExecutableDictionarySource & other); @@ -69,6 +69,7 @@ private: const DictionaryStructure dict_struct; const Configuration configuration; Block sample_block; + std::shared_ptr coordinator; ContextPtr context; }; diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp index dce2ce94b93..48ddeed7fa6 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.cpp +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -1,14 +1,20 @@ #include "ExecutablePoolDictionarySource.h" +#include + +#include + #include -#include +#include +#include #include +#include +#include +#include #include #include -#include -#include #include #include @@ -23,20 +29,19 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int DICTIONARY_ACCESS_DENIED; extern const int UNSUPPORTED_METHOD; - extern const int TIMEOUT_EXCEEDED; } ExecutablePoolDictionarySource::ExecutablePoolDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_) : dict_struct(dict_struct_) , configuration(configuration_) , sample_block(sample_block_) + , coordinator(std::move(coordinator_)) , context(context_) - /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. - , process_pool(std::make_shared(configuration.pool_size == 0 ? 
std::numeric_limits::max() : configuration.pool_size)) , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { /// Remove keys from sample_block for implicit_key dictionary because @@ -59,8 +64,8 @@ ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(const ExecutableP : dict_struct(other.dict_struct) , configuration(other.configuration) , sample_block(other.sample_block) + , coordinator(other.coordinator) , context(Context::createCopy(other.context)) - , process_pool(std::make_shared(configuration.pool_size)) , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { } @@ -93,41 +98,47 @@ Pipe ExecutablePoolDictionarySource::loadKeys(const Columns & key_columns, const Pipe ExecutablePoolDictionarySource::getStreamForBlock(const Block & block) { - std::unique_ptr process; - bool result = process_pool->tryBorrowObject(process, [this]() + String command = configuration.command; + const auto & coordinator_configuration = coordinator->getConfiguration(); + + if (coordinator_configuration.execute_direct) { - ShellCommand::Config config(configuration.command); - config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, configuration.command_termination_timeout }; - auto shell_command = ShellCommand::execute(config); - return shell_command; - }, configuration.max_command_execution_time * 10000); + auto global_context = context->getGlobalContext(); + auto user_scripts_path = global_context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - configuration.max_command_execution_time); + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); - size_t rows_to_read = block.rows(); - auto * process_in = &process->in; - ShellCommandSource::SendDataTask task = [process_in, block, this]() mutable - { - auto & out = *process_in; + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); - if (configuration.send_chunk_header) - { - writeText(block.rows(), out); - writeChar('\n', out); - } + command = std::move(script_path); + } - auto output_format = context->getOutputFormat(configuration.format, out, block.cloneEmpty()); - formatBlock(output_format, block); - }; - std::vector tasks = {std::move(task)}; + auto source = std::make_shared(block); + auto shell_input_pipe = Pipe(std::move(source)); ShellCommandSourceConfiguration command_configuration; command_configuration.read_fixed_number_of_rows = true; - command_configuration.number_of_rows_to_read = rows_to_read; - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), std::move(tasks), command_configuration, process_pool)); + command_configuration.number_of_rows_to_read = block.rows(); + + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); + + auto pipe = coordinator->createPipe( + command, + configuration.command_arguments, + std::move(shell_input_pipes), + sample_block, + context, + command_configuration); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); @@ -152,12 +163,13 @@ bool 
ExecutablePoolDictionarySource::hasUpdateField() const DictionarySourcePtr ExecutablePoolDictionarySource::clone() const { - return std::make_unique(*this); + return std::make_shared(*this); } std::string ExecutablePoolDictionarySource::toString() const { - return "ExecutablePool size: " + std::to_string(configuration.pool_size) + " command: " + configuration.command; + size_t pool_size = coordinator->getConfiguration().pool_size; + return "ExecutablePool size: " + std::to_string(pool_size) + " command: " + configuration.command; } void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) @@ -189,18 +201,40 @@ void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) max_command_execution_time = max_execution_time_seconds; + bool execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false); + std::string command_value = config.getString(settings_config_prefix + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } + ExecutablePoolDictionarySource::Configuration configuration { - .command = config.getString(settings_config_prefix + ".command"), - .format = config.getString(settings_config_prefix + ".format"), - .pool_size = config.getUInt64(settings_config_prefix + ".size"), - .command_termination_timeout = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), - .max_command_execution_time = max_command_execution_time, + .command = std::move(command_value), + .command_arguments = std::move(command_arguments), .implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false), - .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false) }; - return std::make_unique(dict_struct, configuration, sample_block, context); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = config.getString(settings_config_prefix + ".format"), + .command_termination_timeout_seconds = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), + .command_read_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_read_timeout", 10000), + .command_write_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_write_timeout", 10000), + .pool_size = config.getUInt64(settings_config_prefix + ".pool_size", 16), + .max_command_execution_time_seconds = max_command_execution_time, + .is_executable_pool = true, + .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false), + .execute_direct = execute_direct + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_unique(dict_struct, configuration, sample_block, std::move(coordinator), context); }; factory.registerSource("executable_pool", create_table_source); diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.h b/src/Dictionaries/ExecutablePoolDictionarySource.h index 51215b6311b..b9b3b8efb1b 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.h +++ b/src/Dictionaries/ExecutablePoolDictionarySource.h @@ -28,21 +28,15 @@ public: struct Configuration { String command; - String format; - size_t pool_size; - size_t command_termination_timeout; 
- size_t max_command_execution_time; - /// Implicit key means that the source script will return only values, - /// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result. + std::vector command_arguments; bool implicit_key; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header; }; ExecutablePoolDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_); ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other); @@ -77,8 +71,8 @@ private: const Configuration configuration; Block sample_block; + std::shared_ptr coordinator; ContextPtr context; - std::shared_ptr process_pool; Poco::Logger * log; }; diff --git a/src/Dictionaries/ExternalQueryBuilder.cpp b/src/Dictionaries/ExternalQueryBuilder.cpp index f513c7b2f61..1701f08fd67 100644 --- a/src/Dictionaries/ExternalQueryBuilder.cpp +++ b/src/Dictionaries/ExternalQueryBuilder.cpp @@ -1,14 +1,23 @@ #include "ExternalQueryBuilder.h" + +#include + #include #include #include -#include -#include "DictionaryStructure.h" -#include "writeParenthesisedString.h" +#include namespace DB { + +static inline void writeParenthesisedString(const String & s, WriteBuffer & buf) +{ + writeChar('(', buf); + writeString(s, buf); + writeChar(')', buf); +} + namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; diff --git a/src/Dictionaries/FileDictionarySource.h b/src/Dictionaries/FileDictionarySource.h index c8e37986b2f..8fe2d87d8b9 100644 --- a/src/Dictionaries/FileDictionarySource.h +++ b/src/Dictionaries/FileDictionarySource.h @@ -51,7 +51,7 @@ public: ///Not supported for FileDictionarySource bool hasUpdateField() const override { return false; } - DictionarySourcePtr clone() const override { return std::make_unique(*this); } + DictionarySourcePtr clone() const override { return std::make_shared(*this); } std::string toString() const override; diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index b6c5f10564b..de4ae66300a 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -328,6 +328,8 @@ void FlatDictionary::updateData() Block block; while (executor.pull(block)) { + convertToFullIfSparse(block); + /// We are using this to keep saved data if input stream consists of multiple blocks if (!update_field_loaded_block) update_field_loaded_block = std::make_shared(block.cloneEmpty()); diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index 5c3a1d634d8..308cd72d55b 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -61,7 +61,7 @@ public: return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, configuration, update_field_loaded_block); } - const IDictionarySource * getSource() const override { return source_ptr.get(); } + DictionarySourcePtr getSource() const override { return source_ptr; } const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } diff --git a/src/Dictionaries/HTTPDictionarySource.cpp b/src/Dictionaries/HTTPDictionarySource.cpp index aba6b40f206..308570644d1 100644 --- a/src/Dictionaries/HTTPDictionarySource.cpp +++ b/src/Dictionaries/HTTPDictionarySource.cpp @@ -207,7 +207,7 @@ bool HTTPDictionarySource::hasUpdateField() const DictionarySourcePtr HTTPDictionarySource::clone() const { - return std::make_unique(*this); + return 
std::make_shared(*this); } std::string HTTPDictionarySource::toString() const diff --git a/src/Dictionaries/HTTPDictionarySource.h b/src/Dictionaries/HTTPDictionarySource.h index 35fbabecf2a..ce357814982 100644 --- a/src/Dictionaries/HTTPDictionarySource.h +++ b/src/Dictionaries/HTTPDictionarySource.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include "DictionaryStructure.h" #include "IDictionarySource.h" #include diff --git a/src/Dictionaries/HashedArrayDictionary.h b/src/Dictionaries/HashedArrayDictionary.h index 5ad1efeb056..0d07c43477a 100644 --- a/src/Dictionaries/HashedArrayDictionary.h +++ b/src/Dictionaries/HashedArrayDictionary.h @@ -5,8 +5,6 @@ #include #include -#include - #include #include #include @@ -73,7 +71,7 @@ public: return std::make_shared>(getDictionaryID(), dict_struct, source_ptr->clone(), configuration, update_field_loaded_block); } - const IDictionarySource * getSource() const override { return source_ptr.get(); } + DictionarySourcePtr getSource() const override { return source_ptr; } const DictionaryLifetime & getLifetime() const override { return configuration.lifetime; } diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index 1df152eec38..c48893bf24f 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -376,6 +376,8 @@ void HashedDictionary::updateData() Block block; while (executor.pull(block)) { + convertToFullIfSparse(block); + /// We are using this to keep saved data if input stream consists of multiple blocks if (!update_field_loaded_block) update_field_loaded_block = std::make_shared(block.cloneEmpty()); @@ -589,7 +591,9 @@ void HashedDictionary::loadData() } } else + { updateData(); + } if (configuration.require_nonempty && 0 == element_count) throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 23919c009c5..6f63c5ec546 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -4,8 +4,7 @@ #include #include #include - -#include +#include #include #include @@ -79,7 +78,7 @@ public: return std::make_shared>(getDictionaryID(), dict_struct, source_ptr->clone(), configuration, update_field_loaded_block); } - const IDictionarySource * getSource() const override { return source_ptr.get(); } + DictionarySourcePtr getSource() const override { return source_ptr; } const DictionaryLifetime & getLifetime() const override { return configuration.lifetime; } @@ -124,11 +123,22 @@ private: HashMap, HashMapWithSavedHash>>; + /// Here we use sparse_hash_map with DefaultHash<> for the following reasons: + /// + /// - DefaultHash<> is used for HashMap + /// - DefaultHash<> (from HashTable/Hash.h> works better then std::hash<> + /// in case of sequential set of keys, but with random access to this set, i.e. + /// + /// SELECT number FROM numbers(3000000) ORDER BY rand() + /// + /// And even though std::hash<> works better in some other cases, + /// DefaultHash<> is preferred since the difference for this particular + /// case is significant, i.e. it can be 10x+. 
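For context, the comment above comes down to one choice: which hash functor to plug into google::sparse_hash_map. A minimal, self-contained sketch of that pattern follows, assuming the sparsehash headers are available; the MixHash functor is an illustrative stand-in, not ClickHouse's actual DefaultHash<> from HashTable/Hash.h.

#include <cstdint>
#include <iostream>
#include <sparsehash/sparse_hash_map>

// Stand-in hash with reasonable mixing for sequential keys accessed in random order.
struct MixHash
{
    size_t operator()(uint64_t key) const
    {
        key ^= key >> 33;
        key *= 0xff51afd7ed558ccdULL;
        key ^= key >> 33;
        return static_cast<size_t>(key);
    }
};

int main()
{
    // Same shape as the sparse collection alias below: key type, mapped type, hash functor.
    google::sparse_hash_map<uint64_t, uint64_t, MixHash> map;
    for (uint64_t i = 0; i < 100000; ++i)
        map[i] = i;
    std::cout << map.size() << '\n'; // 100000
    return 0;
}

Per the comment in the diff, DefaultHash<> is kept precisely because it beats std::hash<> on this sequential-keys-random-access workload, sometimes by 10x or more.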
template using CollectionTypeSparse = std::conditional_t< dictionary_key_type == DictionaryKeyType::Simple, - SparseHashMap, - SparseHashMap>; + google::sparse_hash_map>, + google::sparse_hash_map>>; template using CollectionType = std::conditional_t, CollectionTypeNonSparse>; diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index 66e35c8fa12..b1923306003 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -1,16 +1,16 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include - #include #include +#include +#include +#include +#include +#include +#include +#include + namespace DB { @@ -19,7 +19,7 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -struct IDictionary; +class IDictionary; using DictionaryPtr = std::unique_ptr; /** DictionaryKeyType provides IDictionary client information about @@ -47,8 +47,9 @@ enum class DictionarySpecialKeyType /** * Base class for Dictionaries implementation. */ -struct IDictionary : public IExternalLoadable +class IDictionary : public IExternalLoadable { +public: explicit IDictionary(const StorageID & dictionary_id_) : dictionary_id(dictionary_id_) , full_name(dictionary_id.getInternalDictionaryName()) @@ -99,7 +100,7 @@ struct IDictionary : public IExternalLoadable virtual double getLoadFactor() const = 0; - virtual const IDictionarySource * getSource() const = 0; + virtual DictionarySourcePtr getSource() const = 0; virtual const DictionaryStructure & getStructure() const = 0; @@ -200,7 +201,7 @@ struct IDictionary : public IExternalLoadable bool isModified() const override { - const auto * source = getSource(); + const auto source = getSource(); return source && source->isModified(); } diff --git a/src/Dictionaries/IDictionarySource.h b/src/Dictionaries/IDictionarySource.h index 5071b69d2bf..128595b815f 100644 --- a/src/Dictionaries/IDictionarySource.h +++ b/src/Dictionaries/IDictionarySource.h @@ -10,8 +10,7 @@ namespace DB { class IDictionarySource; -using DictionarySourcePtr = std::unique_ptr; -using SharedDictionarySourcePtr = std::shared_ptr; +using DictionarySourcePtr = std::shared_ptr; /** Data-provider interface for external dictionaries, * abstracts out the data source (file, MySQL, ClickHouse, external program, network request et cetera) diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index ed0d8692d21..9f604b5aeb8 100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -56,7 +56,7 @@ public: return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty); } - const IDictionarySource * getSource() const override { return source_ptr.get(); } + DictionarySourcePtr getSource() const override { return source_ptr; } const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } diff --git a/src/Dictionaries/LibraryDictionarySource.cpp b/src/Dictionaries/LibraryDictionarySource.cpp index 42683fb884c..b79ee9be59a 100644 --- a/src/Dictionaries/LibraryDictionarySource.cpp +++ b/src/Dictionaries/LibraryDictionarySource.cpp @@ -129,7 +129,7 @@ Pipe LibraryDictionarySource::loadKeys(const Columns & key_columns, const std::v DictionarySourcePtr LibraryDictionarySource::clone() const { - return std::make_unique(*this); + return std::make_shared(*this); } diff --git a/src/Dictionaries/LibraryDictionarySource.h b/src/Dictionaries/LibraryDictionarySource.h index e1cb01c0a14..9d7590c20ad 100644 --- 
a/src/Dictionaries/LibraryDictionarySource.h +++ b/src/Dictionaries/LibraryDictionarySource.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include "DictionaryStructure.h" #include diff --git a/src/Dictionaries/MongoDBDictionarySource.h b/src/Dictionaries/MongoDBDictionarySource.h index 3625deca9c6..85531f89902 100644 --- a/src/Dictionaries/MongoDBDictionarySource.h +++ b/src/Dictionaries/MongoDBDictionarySource.h @@ -65,7 +65,7 @@ public: ///Not yet supported bool hasUpdateField() const override { return false; } - DictionarySourcePtr clone() const override { return std::make_unique(*this); } + DictionarySourcePtr clone() const override { return std::make_shared(*this); } std::string toString() const override; diff --git a/src/Dictionaries/MySQLDictionarySource.cpp b/src/Dictionaries/MySQLDictionarySource.cpp index f6de6ca0cc1..0bf5cc3cae0 100644 --- a/src/Dictionaries/MySQLDictionarySource.cpp +++ b/src/Dictionaries/MySQLDictionarySource.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include namespace DB @@ -46,13 +48,17 @@ void registerDictionarySourceMysql(DictionarySourceFactory & factory) auto settings_config_prefix = config_prefix + ".mysql"; std::shared_ptr pool; - ExternalDataSourceConfiguration configuration; + StorageMySQLConfiguration configuration; auto named_collection = created_from_ddl ? getExternalDataSourceConfiguration(config, settings_config_prefix, global_context) : std::nullopt; if (named_collection) { - configuration = *named_collection; - std::vector> addresses{std::make_pair(configuration.host, configuration.port)}; - pool = std::make_shared(configuration.database, addresses, configuration.username, configuration.password); + configuration.set(*named_collection); + configuration.addresses = {std::make_pair(configuration.host, configuration.port)}; + MySQLSettings mysql_settings; + const auto & settings = global_context->getSettingsRef(); + mysql_settings.connect_timeout = settings.external_storage_connect_timeout_sec; + mysql_settings.read_write_timeout = settings.external_storage_rw_timeout_sec; + pool = std::make_shared(createMySQLPoolWithFailover(configuration, mysql_settings)); } else { @@ -95,7 +101,7 @@ void registerDictionarySourceMysql(DictionarySourceFactory & factory) # include # include # include -# include +# include # include # include "readInvalidateQuery.h" # include @@ -225,7 +231,7 @@ bool MySQLDictionarySource::hasUpdateField() const DictionarySourcePtr MySQLDictionarySource::clone() const { - return std::make_unique(*this); + return std::make_shared(*this); } std::string MySQLDictionarySource::toString() const diff --git a/src/Dictionaries/MySQLDictionarySource.h b/src/Dictionaries/MySQLDictionarySource.h index 37743ca2f62..90506ad1726 100644 --- a/src/Dictionaries/MySQLDictionarySource.h +++ b/src/Dictionaries/MySQLDictionarySource.h @@ -5,7 +5,7 @@ #include "config_core.h" #if USE_MYSQL -# include +# include # include # include "DictionaryStructure.h" # include "ExternalQueryBuilder.h" diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index 346160c342f..762c136b8e0 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -87,7 +87,7 @@ public: double getLoadFactor() const override { return 1.0; } - const IDictionarySource * getSource() const override { return source_ptr.get(); } + DictionarySourcePtr getSource() const override { return source_ptr; } const DictionaryStructure & getStructure() const override { return dict_struct; } 
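The clone()/getSource() churn across the dictionary sources above all follows from one change in IDictionarySource.h: DictionarySourcePtr is now an alias for std::shared_ptr, so clone() returns make_shared and getSource() hands out an owning copy rather than a raw pointer. A simplified, self-contained sketch of that pattern, with Source/Dictionary as stand-in names for the real interfaces:

#include <iostream>
#include <memory>
#include <mutex>
#include <string>

struct Source
{
    virtual ~Source() = default;
    virtual std::string toString() const = 0;
};

using SourcePtr = std::shared_ptr<Source>;  // previously a unique_ptr alias

struct FileSource : Source
{
    std::string toString() const override { return "file source"; }
};

class Dictionary
{
public:
    explicit Dictionary(SourcePtr source) : source_ptr(std::move(source)) {}

    // Copying the shared_ptr under the mutex keeps the source alive for the caller
    // even if another thread swaps it out afterwards.
    SourcePtr getSource() const
    {
        std::lock_guard<std::mutex> lock(source_mutex);
        return source_ptr;
    }

private:
    mutable std::mutex source_mutex;
    SourcePtr source_ptr;
};

int main()
{
    Dictionary dict(std::make_shared<FileSource>());
    std::cout << dict.getSource()->toString() << '\n';
    return 0;
}

This is why CacheDictionary::getSource in the diff locks source_mutex and returns the shared_ptr itself: a raw const pointer could dangle once the cache dictionary replaces its source from another thread.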
diff --git a/src/Dictionaries/PolygonDictionaryUtils.cpp b/src/Dictionaries/PolygonDictionaryUtils.cpp index fced18a6f88..15267481c0b 100644 --- a/src/Dictionaries/PolygonDictionaryUtils.cpp +++ b/src/Dictionaries/PolygonDictionaryUtils.cpp @@ -151,7 +151,7 @@ void SlabsPolygonIndex::indexBuild(const std::vector & polygons) } } - for (size_t i = 0; i != all_edges.size(); i++) + for (size_t i = 0; i != all_edges.size(); ++i) { size_t l = edge_left[i]; size_t r = edge_right[i]; diff --git a/src/Dictionaries/PostgreSQLDictionarySource.cpp b/src/Dictionaries/PostgreSQLDictionarySource.cpp index c9fb8b86b77..0ac84b35048 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.cpp +++ b/src/Dictionaries/PostgreSQLDictionarySource.cpp @@ -161,7 +161,7 @@ bool PostgreSQLDictionarySource::supportsSelectiveLoad() const DictionarySourcePtr PostgreSQLDictionarySource::clone() const { - return std::make_unique(*this); + return std::make_shared(*this); } diff --git a/src/Dictionaries/PostgreSQLDictionarySource.h b/src/Dictionaries/PostgreSQLDictionarySource.h index 1cde2958107..87a87eac363 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.h +++ b/src/Dictionaries/PostgreSQLDictionarySource.h @@ -7,7 +7,7 @@ #if USE_LIBPQXX #include "ExternalQueryBuilder.h" #include -#include +#include #include #include diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index 1605e2bab81..fca72d5d7cc 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -67,7 +67,7 @@ public: return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, update_field_loaded_block); } - const IDictionarySource * getSource() const override { return source_ptr.get(); } + DictionarySourcePtr getSource() const override { return source_ptr; } const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } diff --git a/src/Dictionaries/RedisDictionarySource.h b/src/Dictionaries/RedisDictionarySource.h index 053094e2303..eff97dede0c 100644 --- a/src/Dictionaries/RedisDictionarySource.h +++ b/src/Dictionaries/RedisDictionarySource.h @@ -76,7 +76,7 @@ namespace ErrorCodes bool hasUpdateField() const override { return false; } - DictionarySourcePtr clone() const override { return std::make_unique(*this); } + DictionarySourcePtr clone() const override { return std::make_shared(*this); } std::string toString() const override; diff --git a/src/Dictionaries/XDBCDictionarySource.cpp b/src/Dictionaries/XDBCDictionarySource.cpp index f827c0cd8d0..ab7cf65eb8b 100644 --- a/src/Dictionaries/XDBCDictionarySource.cpp +++ b/src/Dictionaries/XDBCDictionarySource.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include "DictionarySourceFactory.h" #include "DictionaryStructure.h" @@ -162,7 +162,7 @@ bool XDBCDictionarySource::hasUpdateField() const DictionarySourcePtr XDBCDictionarySource::clone() const { - return std::make_unique(*this); + return std::make_shared(*this); } diff --git a/src/Dictionaries/writeParenthesisedString.cpp b/src/Dictionaries/writeParenthesisedString.cpp deleted file mode 100644 index 5e237aa1e6c..00000000000 --- a/src/Dictionaries/writeParenthesisedString.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "writeParenthesisedString.h" - -namespace DB -{ -void writeParenthesisedString(const String & s, WriteBuffer & buf) -{ - writeChar('(', buf); - writeString(s, buf); - writeChar(')', buf); -} - -} diff --git 
a/src/Dictionaries/writeParenthesisedString.h b/src/Dictionaries/writeParenthesisedString.h deleted file mode 100644 index ec61e944d38..00000000000 --- a/src/Dictionaries/writeParenthesisedString.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ -void writeParenthesisedString(const String & s, WriteBuffer & buf); - - -} diff --git a/src/Disks/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/AzureBlobStorage/AzureBlobStorageAuth.cpp new file mode 100644 index 00000000000..94553ba04e9 --- /dev/null +++ b/src/Disks/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -0,0 +1,145 @@ +#include + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include + +using namespace Azure::Storage::Blobs; + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +struct AzureBlobStorageEndpoint +{ + const String storage_account_url; + const String container_name; + const std::optional container_already_exists; +}; + + +void validateStorageAccountUrl(const String & storage_account_url) +{ + const auto * storage_account_url_pattern_str = R"(http(()|s)://[a-z0-9-.:]+(()|/)[a-z0-9]*(()|/))"; + static const RE2 storage_account_url_pattern(storage_account_url_pattern_str); + + if (!re2::RE2::FullMatch(storage_account_url, storage_account_url_pattern)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Blob Storage URL is not valid, should follow the format: {}, got: {}", storage_account_url_pattern_str, storage_account_url); +} + + +void validateContainerName(const String & container_name) +{ + auto len = container_name.length(); + if (len < 3 || len > 64) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "AzureBlob Storage container name is not valid, should have length between 3 and 64, but has length: {}", len); + + const auto * container_name_pattern_str = R"([a-z][a-z0-9-]+)"; + static const RE2 container_name_pattern(container_name_pattern_str); + + if (!re2::RE2::FullMatch(container_name, container_name_pattern)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "AzureBlob Storage container name is not valid, should follow the format: {}, got: {}", container_name_pattern_str, container_name); +} + + +AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix) +{ + String storage_account_url = config.getString(config_prefix + ".storage_account_url"); + validateStorageAccountUrl(storage_account_url); + String container_name = config.getString(config_prefix + ".container_name", "default-container"); + validateContainerName(container_name); + std::optional container_already_exists {}; + if (config.has(config_prefix + ".container_already_exists")) + container_already_exists = {config.getBool(config_prefix + ".container_already_exists")}; + return {storage_account_url, container_name, container_already_exists}; +} + + +template +std::shared_ptr getClientWithConnectionString(const String & connection_str, const String & container_name) = delete; + + +template<> +std::shared_ptr getClientWithConnectionString( + const String & connection_str, const String & /*container_name*/) +{ + return std::make_shared(BlobServiceClient::CreateFromConnectionString(connection_str)); +} + + +template<> +std::shared_ptr getClientWithConnectionString( + const String & connection_str, const String & container_name) +{ + return std::make_shared(BlobContainerClient::CreateFromConnectionString(connection_str, container_name)); +} + + +template +std::shared_ptr getAzureBlobStorageClientWithAuth( + const String & 
url, const String & container_name, const Poco::Util::AbstractConfiguration & config, const String & config_prefix) +{ + if (config.has(config_prefix + ".connection_string")) + { + String connection_str = config.getString(config_prefix + ".connection_string"); + return getClientWithConnectionString(connection_str, container_name); + } + + if (config.has(config_prefix + ".account_key") && config.has(config_prefix + ".account_name")) + { + auto storage_shared_key_credential = std::make_shared( + config.getString(config_prefix + ".account_name"), + config.getString(config_prefix + ".account_key") + ); + return std::make_shared(url, storage_shared_key_credential); + } + + auto managed_identity_credential = std::make_shared(); + return std::make_shared(url, managed_identity_credential); +} + + +std::shared_ptr getAzureBlobContainerClient( + const Poco::Util::AbstractConfiguration & config, const String & config_prefix) +{ + auto endpoint = processAzureBlobStorageEndpoint(config, config_prefix); + auto container_name = endpoint.container_name; + auto final_url = endpoint.storage_account_url + + (endpoint.storage_account_url.back() == '/' ? "" : "/") + + container_name; + + if (endpoint.container_already_exists.value_or(false)) + return getAzureBlobStorageClientWithAuth(final_url, container_name, config, config_prefix); + + auto blob_service_client = getAzureBlobStorageClientWithAuth(endpoint.storage_account_url, container_name, config, config_prefix); + + if (!endpoint.container_already_exists.has_value()) + { + ListBlobContainersOptions blob_containers_list_options; + blob_containers_list_options.Prefix = container_name; + blob_containers_list_options.PageSizeHint = 1; + auto blob_containers = blob_service_client->ListBlobContainers().BlobContainers; + for (const auto & blob_container : blob_containers) + { + if (blob_container.Name == endpoint.container_name) + return getAzureBlobStorageClientWithAuth(final_url, container_name, config, config_prefix); + } + } + + return std::make_shared( + blob_service_client->CreateBlobContainer(container_name).Value); +} + +} + +#endif diff --git a/src/Disks/AzureBlobStorage/AzureBlobStorageAuth.h b/src/Disks/AzureBlobStorage/AzureBlobStorageAuth.h new file mode 100644 index 00000000000..1cef6105d41 --- /dev/null +++ b/src/Disks/AzureBlobStorage/AzureBlobStorageAuth.h @@ -0,0 +1,20 @@ +#pragma once + +#if !defined(ARCADIA_BUILD) +#include +#endif + +#if USE_AZURE_BLOB_STORAGE + +#include +#include + +namespace DB +{ + +std::shared_ptr getAzureBlobContainerClient( + const Poco::Util::AbstractConfiguration & config, const String & config_prefix); + +} + +#endif diff --git a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp new file mode 100644 index 00000000000..e2ee6ee0153 --- /dev/null +++ b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp @@ -0,0 +1,191 @@ +#include + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int AZURE_BLOB_STORAGE_ERROR; +} + + +DiskAzureBlobStorageSettings::DiskAzureBlobStorageSettings( + UInt64 max_single_part_upload_size_, + UInt64 min_bytes_for_seek_, + int max_single_read_retries_, + int max_single_download_retries_, + int thread_pool_size_) : + max_single_part_upload_size(max_single_part_upload_size_), + min_bytes_for_seek(min_bytes_for_seek_), + max_single_read_retries(max_single_read_retries_), + max_single_download_retries(max_single_download_retries_), + 
thread_pool_size(thread_pool_size_) {} + + +class AzureBlobStoragePathKeeper : public RemoteFSPathKeeper +{ +public: + /// RemoteFSPathKeeper constructed with a placeholder argument for chunk_limit, it is unused in this class + AzureBlobStoragePathKeeper() : RemoteFSPathKeeper(1000) {} + + void addPath(const String & path) override + { + paths.push_back(path); + } + + std::vector paths; +}; + + +DiskAzureBlobStorage::DiskAzureBlobStorage( + const String & name_, + DiskPtr metadata_disk_, + std::shared_ptr blob_container_client_, + SettingsPtr settings_, + GetDiskSettings settings_getter_) : + IDiskRemote(name_, "", metadata_disk_, "DiskAzureBlobStorage", settings_->thread_pool_size), + blob_container_client(blob_container_client_), + current_settings(std::move(settings_)), + settings_getter(settings_getter_) {} + + +std::unique_ptr DiskAzureBlobStorage::readFile( + const String & path, + const ReadSettings & read_settings, + std::optional /*estimated_size*/) const +{ + auto settings = current_settings.get(); + auto metadata = readMeta(path); + + LOG_TEST(log, "Read from file by path: {}", backQuote(metadata_disk->getPath() + path)); + + bool threadpool_read = read_settings.remote_fs_method == RemoteFSReadMethod::threadpool; + + auto reader_impl = std::make_unique( + path, blob_container_client, metadata, settings->max_single_read_retries, + settings->max_single_download_retries, read_settings, threadpool_read); + + if (threadpool_read) + { + auto reader = getThreadPoolReader(); + return std::make_unique(reader, read_settings, std::move(reader_impl)); + } + else + { + auto buf = std::make_unique(std::move(reader_impl)); + return std::make_unique(std::move(buf), current_settings.get()->min_bytes_for_seek); + } +} + + +std::unique_ptr DiskAzureBlobStorage::writeFile( + const String & path, + size_t buf_size, + WriteMode mode) +{ + auto metadata = readOrCreateMetaForWriting(path, mode); + auto blob_path = path + "_" + getRandomASCIIString(8); /// NOTE: path contains the tmp_* prefix in the blob name + + LOG_TRACE(log, "{} to file by path: {}. AzureBlob Storage path: {}", + mode == WriteMode::Rewrite ? 
"Write" : "Append", backQuote(metadata_disk->getPath() + path), blob_path); + + auto buffer = std::make_unique( + blob_container_client, + blob_path, + current_settings.get()->max_single_part_upload_size, + buf_size); + + return std::make_unique>(std::move(buffer), std::move(metadata), blob_path); +} + + +DiskType DiskAzureBlobStorage::getType() const +{ + return DiskType::AzureBlobStorage; +} + + +bool DiskAzureBlobStorage::isRemote() const +{ + return true; +} + + +bool DiskAzureBlobStorage::supportZeroCopyReplication() const +{ + return true; +} + + +bool DiskAzureBlobStorage::checkUniqueId(const String & id) const +{ + Azure::Storage::Blobs::ListBlobsOptions blobs_list_options; + blobs_list_options.Prefix = id; + blobs_list_options.PageSizeHint = 1; + + auto blobs_list_response = blob_container_client->ListBlobs(blobs_list_options); + auto blobs_list = blobs_list_response.Blobs; + + for (const auto & blob : blobs_list) + { + if (id == blob.Name) + return true; + } + + return false; +} + + +void DiskAzureBlobStorage::removeFromRemoteFS(RemoteFSPathKeeperPtr fs_paths_keeper) +{ + auto * paths_keeper = dynamic_cast(fs_paths_keeper.get()); + + if (paths_keeper) + { + for (const auto & path : paths_keeper->paths) + { + try + { + auto delete_info = blob_container_client->DeleteBlob(path); + if (!delete_info.Value.Deleted) + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Failed to delete file in AzureBlob Storage: {}", path); + } + catch (const Azure::Storage::StorageException & e) + { + LOG_INFO(log, "Caught an error while deleting file {} : {}", path, e.Message); + throw; + } + } + } +} + + +RemoteFSPathKeeperPtr DiskAzureBlobStorage::createFSPathKeeper() const +{ + return std::make_shared(); +} + + +void DiskAzureBlobStorage::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String &, const DisksMap &) +{ + auto new_settings = settings_getter(config, "storage_configuration.disks." 
+ name, context); + + current_settings.set(std::move(new_settings)); + + if (AsyncExecutor * exec = dynamic_cast(&getExecutor())) + exec->setMaxThreads(current_settings.get()->thread_pool_size); +} + +} + +#endif diff --git a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h new file mode 100644 index 00000000000..f90ede1add9 --- /dev/null +++ b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h @@ -0,0 +1,86 @@ +#pragma once + +#include + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ + +struct DiskAzureBlobStorageSettings final +{ + DiskAzureBlobStorageSettings( + UInt64 max_single_part_upload_size_, + UInt64 min_bytes_for_seek_, + int max_single_read_retries, + int max_single_download_retries, + int thread_pool_size_); + + size_t max_single_part_upload_size; /// NOTE: on 32-bit machines it will be at most 4GB, but size_t is also used in BufferBase for offset + UInt64 min_bytes_for_seek; + size_t max_single_read_retries; + size_t max_single_download_retries; + size_t thread_pool_size; +}; + + +class DiskAzureBlobStorage final : public IDiskRemote +{ +public: + + using SettingsPtr = std::unique_ptr; + using GetDiskSettings = std::function; + + DiskAzureBlobStorage( + const String & name_, + DiskPtr metadata_disk_, + std::shared_ptr blob_container_client_, + SettingsPtr settings_, + GetDiskSettings settings_getter_); + + std::unique_ptr readFile( + const String & path, + const ReadSettings & settings, + std::optional estimated_size) const override; + + std::unique_ptr writeFile( + const String & path, + size_t buf_size, + WriteMode mode) override; + + DiskType getType() const override; + + bool isRemote() const override; + + bool supportZeroCopyReplication() const override; + + bool checkUniqueId(const String & id) const override; + + void removeFromRemoteFS(RemoteFSPathKeeperPtr fs_paths_keeper) override; + + RemoteFSPathKeeperPtr createFSPathKeeper() const override; + + void applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String &, const DisksMap &) override; + +private: + + /// client used to access the files in the Blob Storage cloud + std::shared_ptr blob_container_client; + + MultiVersion current_settings; + /// Gets disk settings from context. 
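The applyNewSettings override just above pairs a MultiVersion of DiskAzureBlobStorageSettings with the stored settings_getter: on a config reload it builds fresh settings and swaps them in, while in-flight operations keep the snapshot they took at the start. A simplified stand-in for that pattern follows; this MultiVersion is a sketch, not ClickHouse's implementation, and DiskSettings is a pared-down placeholder.

#include <iostream>
#include <memory>
#include <mutex>

struct DiskSettings
{
    size_t max_single_part_upload_size = 100 * 1024 * 1024;
    size_t thread_pool_size = 16;
};

// Readers get an immutable snapshot; a reload publishes a new version without touching old snapshots.
template <typename T>
class MultiVersion
{
public:
    std::shared_ptr<const T> get() const
    {
        std::lock_guard<std::mutex> lock(mutex);
        return current;
    }

    void set(std::unique_ptr<T> value)
    {
        std::lock_guard<std::mutex> lock(mutex);
        current = std::move(value);
    }

private:
    mutable std::mutex mutex;
    std::shared_ptr<const T> current = std::make_shared<T>();
};

int main()
{
    MultiVersion<DiskSettings> current_settings;

    auto snapshot = current_settings.get();                         // taken at the start of an operation

    auto reloaded = std::make_unique<DiskSettings>();
    reloaded->thread_pool_size = 32;
    current_settings.set(std::move(reloaded));                      // applyNewSettings-style swap

    std::cout << current_settings.get()->thread_pool_size << '\n';  // 32
    std::cout << snapshot->thread_pool_size << '\n';                // old snapshot unaffected: 16
    return 0;
}

This matches how readFile/writeFile in the diff call current_settings.get() once and use that snapshot for the whole request, so a concurrent config reload cannot change limits mid-operation.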
+ GetDiskSettings settings_getter; +}; + +} + +#endif diff --git a/src/Disks/AzureBlobStorage/registerDiskAzureBlobStorage.cpp b/src/Disks/AzureBlobStorage/registerDiskAzureBlobStorage.cpp new file mode 100644 index 00000000000..243452353d3 --- /dev/null +++ b/src/Disks/AzureBlobStorage/registerDiskAzureBlobStorage.cpp @@ -0,0 +1,128 @@ +#if !defined(ARCADIA_BUILD) +#include +#endif + +#include + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int PATH_ACCESS_DENIED; +} + +constexpr char test_file[] = "test.txt"; +constexpr char test_str[] = "test"; +constexpr size_t test_str_size = 4; + + +void checkWriteAccess(IDisk & disk) +{ + auto file = disk.writeFile(test_file, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite); + file->write(test_str, test_str_size); +} + + +void checkReadAccess(IDisk & disk) +{ + auto file = disk.readFile(test_file); + String buf(test_str_size, '0'); + file->readStrict(buf.data(), test_str_size); + if (buf != test_str) + throw Exception("No read access to disk", ErrorCodes::PATH_ACCESS_DENIED); +} + + +void checkReadWithOffset(IDisk & disk) +{ + auto file = disk.readFile(test_file); + auto offset = 2; + auto test_size = test_str_size - offset; + String buf(test_size, '0'); + file->seek(offset, 0); + file->readStrict(buf.data(), test_size); + if (buf != test_str + offset) + throw Exception("Failed to read file with offset", ErrorCodes::PATH_ACCESS_DENIED); +} + + +void checkRemoveAccess(IDisk & disk) +{ + disk.removeFile(test_file); +} + + +std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr /*context*/) +{ + return std::make_unique( + config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024), + config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), + config.getInt(config_prefix + ".max_single_read_retries", 3), + config.getInt(config_prefix + ".max_single_download_retries", 3), + config.getInt(config_prefix + ".thread_pool_size", 16) + ); +} + + +void registerDiskAzureBlobStorage(DiskFactory & factory) +{ + auto creator = []( + const String & name, + const Poco::Util::AbstractConfiguration & config, + const String & config_prefix, + ContextPtr context, + const DisksMap & /*map*/) + { + auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); + + std::shared_ptr azure_blob_storage_disk = std::make_shared( + name, + metadata_disk, + getAzureBlobContainerClient(config, config_prefix), + getSettings(config, config_prefix, context), + getSettings + ); + + if (!config.getBool(config_prefix + ".skip_access_check", false)) + { + checkWriteAccess(*azure_blob_storage_disk); + checkReadAccess(*azure_blob_storage_disk); + checkReadWithOffset(*azure_blob_storage_disk); + checkRemoveAccess(*azure_blob_storage_disk); + } + + azure_blob_storage_disk->startup(); + + if (config.getBool(config_prefix + ".cache_enabled", true)) + { + String cache_path = config.getString(config_prefix + ".cache_path", context->getPath() + "disks/" + name + "/cache/"); + azure_blob_storage_disk = wrapWithCache(azure_blob_storage_disk, "azure-blob-storage-cache", cache_path, metadata_path); + } + + return std::make_shared(azure_blob_storage_disk); + }; + factory.registerDiskType("azure_blob_storage", creator); +} + +} + +#else + +namespace DB +{ + +void registerDiskAzureBlobStorage(DiskFactory &) {} + +} + +#endif diff --git a/src/Disks/DiskCacheWrapper.cpp 
b/src/Disks/DiskCacheWrapper.cpp index e1e901f0d45..b09487c17bc 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -91,7 +91,7 @@ DiskCacheWrapper::readFile( if (!cache_file_predicate(path)) return DiskDecorator::readFile(path, settings, size); - LOG_DEBUG(log, "Read file {} from cache", backQuote(path)); + LOG_TEST(log, "Read file {} from cache", backQuote(path)); if (cache_disk->exists(path)) return cache_disk->readFile(path, settings, size); @@ -105,11 +105,11 @@ DiskCacheWrapper::readFile( { /// This thread will responsible for file downloading to cache. metadata->status = DOWNLOADING; - LOG_DEBUG(log, "File {} doesn't exist in cache. Will download it", backQuote(path)); + LOG_TEST(log, "File {} doesn't exist in cache. Will download it", backQuote(path)); } else if (metadata->status == DOWNLOADING) { - LOG_DEBUG(log, "Waiting for file {} download to cache", backQuote(path)); + LOG_TEST(log, "Waiting for file {} download to cache", backQuote(path)); metadata->condition.wait(lock, [metadata] { return metadata->status == DOWNLOADED || metadata->status == ERROR; }); } } @@ -134,7 +134,7 @@ DiskCacheWrapper::readFile( } cache_disk->moveFile(tmp_path, path); - LOG_DEBUG(log, "File {} downloaded to cache", backQuote(path)); + LOG_TEST(log, "File {} downloaded to cache", backQuote(path)); } catch (...) { @@ -163,7 +163,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode if (!cache_file_predicate(path)) return DiskDecorator::writeFile(path, buf_size, mode); - LOG_DEBUG(log, "Write file {} to cache", backQuote(path)); + LOG_TRACE(log, "Write file {} to cache", backQuote(path)); auto dir_path = directoryPath(path); if (!cache_disk->exists(dir_path)) diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index ea8bf719de6..834ed3e0c65 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -253,7 +253,7 @@ void DiskMemory::clearDirectory(const String & path) throw Exception( "Failed to clear directory '" + path + "'. " + iter->first + " is a directory", ErrorCodes::CANNOT_DELETE_DIRECTORY); - files.erase(iter++); + iter = files.erase(iter); } } diff --git a/src/Disks/DiskType.h b/src/Disks/DiskType.h index 42d87b2381f..435f427b05a 100644 --- a/src/Disks/DiskType.h +++ b/src/Disks/DiskType.h @@ -13,6 +13,7 @@ enum class DiskType HDFS, Encrypted, WebServer, + AzureBlobStorage, }; inline String toString(DiskType disk_type) @@ -31,6 +32,8 @@ inline String toString(DiskType disk_type) return "encrypted"; case DiskType::WebServer: return "web"; + case DiskType::AzureBlobStorage: + return "azure_blob_storage"; } __builtin_unreachable(); } diff --git a/src/Disks/HDFS/DiskHDFS.cpp b/src/Disks/HDFS/DiskHDFS.cpp index c116a62a977..41c407c10ee 100644 --- a/src/Disks/HDFS/DiskHDFS.cpp +++ b/src/Disks/HDFS/DiskHDFS.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -74,7 +75,7 @@ std::unique_ptr DiskHDFS::readFile(const String & path, { auto metadata = readMeta(path); - LOG_TRACE(log, + LOG_TEST(log, "Read from file by path: {}. 
Existing HDFS objects: {}", backQuote(metadata_disk->getPath() + path), metadata.remote_fs_objects.size()); @@ -160,17 +161,13 @@ void registerDiskHDFS(DiskFactory & factory) ContextPtr context_, const DisksMap & /*map*/) -> DiskPtr { - fs::path disk = fs::path(context_->getPath()) / "disks" / name; - fs::create_directories(disk); - String uri{config.getString(config_prefix + ".endpoint")}; checkHDFSURL(uri); if (uri.back() != '/') throw Exception(ErrorCodes::BAD_ARGUMENTS, "HDFS path must ends with '/', but '{}' doesn't.", uri); - String metadata_path = context_->getPath() + "disks/" + name + "/"; - auto metadata_disk = std::make_shared(name + "-metadata", metadata_path, 0); + auto metadata_disk = prepareForLocalMetadata(name, config, config_prefix, context_).second; return std::make_shared( name, uri, diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index e920e6fd5b9..848726f957d 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -177,7 +177,7 @@ IDiskRemote::Metadata IDiskRemote::createMeta(const String & path) const void IDiskRemote::removeMeta(const String & path, RemoteFSPathKeeperPtr fs_paths_keeper) { - LOG_DEBUG(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path)); + LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path)); if (!metadata_disk->isFile(path)) throw Exception(ErrorCodes::CANNOT_DELETE_DIRECTORY, "Path '{}' is a directory", path); @@ -464,7 +464,7 @@ bool IDiskRemote::tryReserve(UInt64 bytes) std::lock_guard lock(reservation_mutex); if (bytes == 0) { - LOG_DEBUG(log, "Reserving 0 bytes on remote_fs disk {}", backQuote(name)); + LOG_TRACE(log, "Reserving 0 bytes on remote_fs disk {}", backQuote(name)); ++reservation_count; return true; } @@ -473,7 +473,7 @@ bool IDiskRemote::tryReserve(UInt64 bytes) UInt64 unreserved_space = available_space - std::min(available_space, reserved_bytes); if (unreserved_space >= bytes) { - LOG_DEBUG(log, "Reserving {} on disk {}, having unreserved {}.", + LOG_TRACE(log, "Reserving {} on disk {}, having unreserved {}.", ReadableSize(bytes), backQuote(name), ReadableSize(unreserved_space)); ++reservation_count; reserved_bytes += bytes; diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index c9b8fe81d9f..c6a904020de 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -42,7 +42,7 @@ class IAsynchronousReader; using AsynchronousReaderPtr = std::shared_ptr; -/// Base Disk class for remote FS's, which are not posix-compatible (DiskS3 and DiskHDFS) +/// Base Disk class for remote FS's, which are not posix-compatible (e.g. 
DiskS3, DiskHDFS, DiskBlobStorage) class IDiskRemote : public IDisk { diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp index 23fd353a5f0..c8484e6088d 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp @@ -21,6 +21,8 @@ namespace ProfileEvents extern const Event RemoteFSUnusedPrefetches; extern const Event RemoteFSPrefetchedReads; extern const Event RemoteFSUnprefetchedReads; + extern const Event RemoteFSLazySeeks; + extern const Event RemoteFSSeeksWithReset; extern const Event RemoteFSBuffers; } @@ -152,11 +154,16 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl() CurrentMetrics::Increment metric_increment{CurrentMetrics::AsynchronousReadWait}; Stopwatch watch; { - size = prefetch_future.get(); + auto result = prefetch_future.get(); + size = result.size; + auto offset = result.offset; + assert(offset < size); + if (size) { memory.swap(prefetch_buffer); - set(memory.data(), memory.size()); + size -= offset; + set(memory.data() + offset, size); working_buffer.resize(size); file_offset_of_buffer_end += size; } @@ -168,16 +175,23 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl() else { ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedReads); - size = readInto(memory.data(), memory.size()).get(); + auto result = readInto(memory.data(), memory.size()).get(); + size = result.size; + auto offset = result.offset; + assert(offset < size); if (size) { - set(memory.data(), memory.size()); + size -= offset; + set(memory.data() + offset, size); working_buffer.resize(size); file_offset_of_buffer_end += size; } } + if (file_offset_of_buffer_end != impl->offset()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected equality {} == {}. It's a bug", file_offset_of_buffer_end, impl->offset()); + prefetch_future = {}; return size; } @@ -231,18 +245,22 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence pos = working_buffer.end(); - /// Note: we read in range [file_offset_of_buffer_end, read_until_position). - if (read_until_position && file_offset_of_buffer_end < *read_until_position - && static_cast(file_offset_of_buffer_end) >= getPosition() - && static_cast(file_offset_of_buffer_end) < getPosition() + static_cast(min_bytes_for_seek)) + /** + * Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer. + * Note: we read in range [file_offset_of_buffer_end, read_until_position). + */ + off_t file_offset_before_seek = impl->offset(); + if (impl->initialized() + && read_until_position && file_offset_of_buffer_end < *read_until_position + && static_cast(file_offset_of_buffer_end) > file_offset_before_seek + && static_cast(file_offset_of_buffer_end) < file_offset_before_seek + static_cast(min_bytes_for_seek)) { - /** - * Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer. 
- */ - bytes_to_ignore = file_offset_of_buffer_end - getPosition(); + ProfileEvents::increment(ProfileEvents::RemoteFSLazySeeks); + bytes_to_ignore = file_offset_of_buffer_end - file_offset_before_seek; } else { + ProfileEvents::increment(ProfileEvents::RemoteFSSeeksWithReset); impl->reset(); } diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index c9b6532e76c..1b0cc17cb41 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -1,9 +1,6 @@ #pragma once -#if !defined(ARCADIA_BUILD) #include -#endif - #include #include #include diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 1505f03576c..4db0c9e3c71 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -8,6 +8,10 @@ #include #endif +#if USE_AZURE_BLOB_STORAGE +#include +#endif + #if USE_HDFS #include #endif @@ -30,6 +34,15 @@ SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const S #endif +#if USE_AZURE_BLOB_STORAGE +SeekableReadBufferPtr ReadBufferFromAzureBlobStorageGather::createImplementationBuffer(const String & path, size_t read_until_position_) const +{ + return std::make_unique(blob_container_client, path, max_single_read_retries, + max_single_download_retries, settings.remote_fs_buffer_size, threadpool_read, read_until_position_); +} +#endif + + SeekableReadBufferPtr ReadBufferFromWebServerGather::createImplementationBuffer(const String & path, size_t read_until_position_) const { return std::make_unique(fs::path(uri) / path, context, settings, threadpool_read, read_until_position_); @@ -52,7 +65,7 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(const RemoteMetadata } -size_t ReadBufferFromRemoteFSGather::readInto(char * data, size_t size, size_t offset, size_t ignore) +ReadBufferFromRemoteFSGather::ReadResult ReadBufferFromRemoteFSGather::readInto(char * data, size_t size, size_t offset, size_t ignore) { /** * Set `data` to current working and internal buffers. @@ -60,23 +73,24 @@ size_t ReadBufferFromRemoteFSGather::readInto(char * data, size_t size, size_t o */ set(data, size); - absolute_position = offset; + file_offset_of_buffer_end = offset; bytes_to_ignore = ignore; + if (bytes_to_ignore) + assert(initialized()); auto result = nextImpl(); - bytes_to_ignore = 0; if (result) - return working_buffer.size(); + return {working_buffer.size(), BufferBase::offset()}; - return 0; + return {0, 0}; } void ReadBufferFromRemoteFSGather::initialize() { /// One clickhouse file can be split into multiple files in remote fs. - auto current_buf_offset = absolute_position; + auto current_buf_offset = file_offset_of_buffer_end; for (size_t i = 0; i < metadata.remote_fs_objects.size(); ++i) { const auto & [file_path, size] = metadata.remote_fs_objects[i]; @@ -131,7 +145,6 @@ bool ReadBufferFromRemoteFSGather::nextImpl() return readImpl(); } - bool ReadBufferFromRemoteFSGather::readImpl() { swap(*current_buf); @@ -142,15 +155,26 @@ bool ReadBufferFromRemoteFSGather::readImpl() * we save how many bytes need to be ignored (new_offset - position() bytes). */ if (bytes_to_ignore) + { current_buf->ignore(bytes_to_ignore); + bytes_to_ignore = 0; + } - auto result = current_buf->next(); + bool result = current_buf->hasPendingData(); + if (result) + { + /// bytes_to_ignore already added. 
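To make the trade-off above concrete: a short forward seek that stays within min_bytes_for_seek of what the remote stream will produce next is handled lazily by remembering how many bytes to discard on the next read (counted as RemoteFSLazySeeks), while anything else drops and restarts the remote request (RemoteFSSeeksWithReset). A condensed, illustrative-only sketch of that decision, with the read_until_position and impl->initialized() guards omitted:

    #include <cstddef>
    #include <cstdint>

    struct LazySeekDecision
    {
        bool reset_impl = false;      /// true  -> counted as RemoteFSSeeksWithReset
        size_t bytes_to_ignore = 0;   /// > 0   -> counted as RemoteFSLazySeeks
    };

    LazySeekDecision decideSeek(uint64_t target_offset, uint64_t impl_offset, uint64_t min_bytes_for_seek)
    {
        /// Short forward jump: keep the remote stream alive and skip the gap later,
        /// either inside the prefetch buffer or via current_buf->ignore().
        if (target_offset > impl_offset && target_offset < impl_offset + min_bytes_for_seek)
            return {false, static_cast<size_t>(target_offset - impl_offset)};

        /// Backward seek or a long jump: cheaper to reset and re-issue the read.
        return {true, 0};
    }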
+ file_offset_of_buffer_end += current_buf->available(); + } + else + { + result = current_buf->next(); + if (result) + file_offset_of_buffer_end += current_buf->buffer().size(); + } swap(*current_buf); - if (result) - absolute_position += working_buffer.size(); - return result; } @@ -167,7 +191,6 @@ void ReadBufferFromRemoteFSGather::reset() current_buf.reset(); } - String ReadBufferFromRemoteFSGather::getFileName() const { return canonical_path; diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index 5bc7d4e4819..ddd651f47a1 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -1,13 +1,14 @@ #pragma once -#if !defined(ARCADIA_BUILD) #include -#endif - #include #include #include +#if USE_AZURE_BLOB_STORAGE +#include +#endif + namespace Aws { namespace S3 @@ -36,10 +37,20 @@ public: void setReadUntilPosition(size_t position) override; - size_t readInto(char * data, size_t size, size_t offset, size_t ignore = 0); + struct ReadResult + { + size_t size = 0; + size_t offset = 0; + }; + + ReadResult readInto(char * data, size_t size, size_t offset, size_t ignore = 0); size_t getFileSize() const; + size_t offset() const { return file_offset_of_buffer_end; } + + bool initialized() const { return current_buf != nullptr; } + protected: virtual SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const = 0; @@ -56,8 +67,13 @@ private: size_t current_buf_idx = 0; - size_t absolute_position = 0; + size_t file_offset_of_buffer_end = 0; + /** + * File: |___________________| + * Buffer: |~~~~~~~| + * file_offset_of_buffer_end: ^ + */ size_t bytes_to_ignore = 0; size_t read_until_position = 0; @@ -100,6 +116,40 @@ private: #endif +#if USE_AZURE_BLOB_STORAGE +/// Reads data from AzureBlob Storage using paths stored in metadata. 
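readInto now reports both how many bytes landed in the buffer and how many of them are a skipped prefix, so callers trim the prefix before exposing the window. A sketch of the intended consumption, mirroring the nextImpl() changes earlier in this diff; only ReadResult is taken from the code, the helper names are illustrative:

    #include <cassert>
    #include <cstddef>

    struct ReadResult
    {
        size_t size = 0;    /// bytes placed into the buffer, including the ignored prefix
        size_t offset = 0;  /// leading bytes that belong to a skipped range and must not be exposed
    };

    struct UsableWindow
    {
        char * begin = nullptr;
        size_t length = 0;
    };

    UsableWindow consume(char * data, ReadResult result, size_t & file_offset_of_buffer_end)
    {
        if (result.size == 0)
            return {};

        assert(result.offset < result.size);
        size_t usable = result.size - result.offset;   /// e.g. size = 1048576, offset = 24 -> 1048552 usable bytes
        file_offset_of_buffer_end += usable;           /// now matches the gather buffer's offset()
        return {data + result.offset, usable};
    }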
+class ReadBufferFromAzureBlobStorageGather final : public ReadBufferFromRemoteFSGather +{ +public: + ReadBufferFromAzureBlobStorageGather( + const String & path_, + std::shared_ptr blob_container_client_, + IDiskRemote::Metadata metadata_, + size_t max_single_read_retries_, + size_t max_single_download_retries_, + const ReadSettings & settings_, + bool threadpool_read_ = false) + : ReadBufferFromRemoteFSGather(metadata_, path_) + , blob_container_client(blob_container_client_) + , max_single_read_retries(max_single_read_retries_) + , max_single_download_retries(max_single_download_retries_) + , settings(settings_) + , threadpool_read(threadpool_read_) + { + } + + SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const override; + +private: + std::shared_ptr blob_container_client; + size_t max_single_read_retries; + size_t max_single_download_retries; + ReadSettings settings; + bool threadpool_read; +}; +#endif + + class ReadBufferFromWebServerGather final : public ReadBufferFromRemoteFSGather { public: diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp index 112124d9fd7..c21a55d68ac 100644 --- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp @@ -20,7 +20,7 @@ ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS( off_t ReadIndirectBufferFromRemoteFS::getPosition() { - return impl->absolute_position - available(); + return impl->file_offset_of_buffer_end - available(); } @@ -35,29 +35,29 @@ off_t ReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence) if (whence == SEEK_CUR) { /// If position within current working buffer - shift pos. - if (!working_buffer.empty() && size_t(getPosition() + offset_) < impl->absolute_position) + if (!working_buffer.empty() && size_t(getPosition() + offset_) < impl->file_offset_of_buffer_end) { pos += offset_; return getPosition(); } else { - impl->absolute_position += offset_; + impl->file_offset_of_buffer_end += offset_; } } else if (whence == SEEK_SET) { /// If position within current working buffer - shift pos. 
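The rename from absolute_position to file_offset_of_buffer_end makes the arithmetic in getPosition() and seek() easier to follow. A worked example with purely illustrative numbers:

    #include <cassert>
    #include <cstddef>

    /// Suppose the working buffer currently maps file bytes [4096, 8192):
    /// file_offset_of_buffer_end == 8192 and working_buffer.size() == 4096.
    void seekArithmeticExample()
    {
        const size_t file_offset_of_buffer_end = 8192;
        const size_t working_buffer_size = 4096;
        const size_t still_unread = 1024;   /// available()

        /// getPosition() == file_offset_of_buffer_end - available()
        assert(file_offset_of_buffer_end - still_unread == 7168);

        /// seek(5000, SEEK_SET) falls inside [8192 - 4096, 8192): only the in-buffer
        /// cursor moves, pos = working_buffer.end() - (8192 - 5000), no remote request.
        const size_t in_window_target = 5000;
        assert(in_window_target >= file_offset_of_buffer_end - working_buffer_size
            && in_window_target < file_offset_of_buffer_end);

        /// seek(20000, SEEK_SET) falls outside the window: the buffer records
        /// file_offset_of_buffer_end = 20000, resets impl, and refills on the next read.
    }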
if (!working_buffer.empty() - && size_t(offset_) >= impl->absolute_position - working_buffer.size() - && size_t(offset_) < impl->absolute_position) + && size_t(offset_) >= impl->file_offset_of_buffer_end - working_buffer.size() + && size_t(offset_) < impl->file_offset_of_buffer_end) { - pos = working_buffer.end() - (impl->absolute_position - offset_); + pos = working_buffer.end() - (impl->file_offset_of_buffer_end - offset_); return getPosition(); } else { - impl->absolute_position = offset_; + impl->file_offset_of_buffer_end = offset_; } } else @@ -66,7 +66,7 @@ off_t ReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence) impl->reset(); pos = working_buffer.end(); - return impl->absolute_position; + return impl->file_offset_of_buffer_end; } diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp index 945b2d3eb7e..4be55ff3ecf 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -28,7 +27,7 @@ namespace CurrentMetrics namespace DB { -size_t ThreadPoolRemoteFSReader::RemoteFSFileDescriptor::readInto(char * data, size_t size, size_t offset, size_t ignore) +ReadBufferFromRemoteFSGather::ReadResult ThreadPoolRemoteFSReader::RemoteFSFileDescriptor::readInto(char * data, size_t size, size_t offset, size_t ignore) { return reader->readInto(data, size, offset, ignore); } @@ -44,18 +43,18 @@ std::future ThreadPoolRemoteFSReader::submit(Reques { auto task = std::make_shared>([request] { - setThreadName("ThreadPoolRemoteFSRead"); + setThreadName("VFSRead"); CurrentMetrics::Increment metric_increment{CurrentMetrics::Read}; auto * remote_fs_fd = assert_cast(request.descriptor.get()); Stopwatch watch(CLOCK_MONOTONIC); - auto bytes_read = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore); + auto [bytes_read, offset] = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore); watch.stop(); ProfileEvents::increment(ProfileEvents::RemoteFSReadMicroseconds, watch.elapsedMicroseconds()); ProfileEvents::increment(ProfileEvents::RemoteFSReadBytes, bytes_read); - return bytes_read; + return Result{ .size = bytes_read, .offset = offset }; }); auto future = task->get_future(); diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h index c300162e214..b2d5f11724a 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.h +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h @@ -3,12 +3,12 @@ #include #include #include +#include #include namespace DB { -class ReadBufferFromRemoteFSGather; class ThreadPoolRemoteFSReader : public IAsynchronousReader { @@ -28,9 +28,9 @@ public: struct ThreadPoolRemoteFSReader::RemoteFSFileDescriptor : public IFileDescriptor { public: - RemoteFSFileDescriptor(std::shared_ptr reader_) : reader(reader_) {} + explicit RemoteFSFileDescriptor(std::shared_ptr reader_) : reader(reader_) {} - size_t readInto(char * data, size_t size, size_t offset, size_t ignore = 0); + ReadBufferFromRemoteFSGather::ReadResult readInto(char * data, size_t size, size_t offset, size_t ignore = 0); private: std::shared_ptr reader; diff --git a/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp index 9ff26ae894f..87453440693 100644 --- a/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp @@ -1,6 +1,7 @@ #include "WriteIndirectBufferFromRemoteFS.h" #include +#include 
#include #include @@ -57,6 +58,11 @@ template class WriteIndirectBufferFromRemoteFS; #endif +#if USE_AZURE_BLOB_STORAGE +template +class WriteIndirectBufferFromRemoteFS; +#endif + #if USE_HDFS template class WriteIndirectBufferFromRemoteFS; diff --git a/src/Disks/RemoteDisksCommon.cpp b/src/Disks/RemoteDisksCommon.cpp new file mode 100644 index 00000000000..1402e3f62c8 --- /dev/null +++ b/src/Disks/RemoteDisksCommon.cpp @@ -0,0 +1,43 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +std::shared_ptr wrapWithCache( + std::shared_ptr disk, String cache_name, String cache_path, String metadata_path) +{ + if (metadata_path == cache_path) + throw Exception("Metadata and cache paths should be different: " + metadata_path, ErrorCodes::BAD_ARGUMENTS); + + auto cache_disk = std::make_shared(cache_name, cache_path, 0); + auto cache_file_predicate = [] (const String & path) + { + return path.ends_with("idx") // index files. + || path.ends_with("mrk") || path.ends_with("mrk2") || path.ends_with("mrk3") /// mark files. + || path.ends_with("txt") || path.ends_with("dat"); + }; + + return std::make_shared(disk, cache_disk, cache_file_predicate); +} + + +std::pair prepareForLocalMetadata( + const String & name, + const Poco::Util::AbstractConfiguration & config, + const String & config_prefix, + ContextPtr context) +{ + /// where the metadata files are stored locally + auto metadata_path = config.getString(config_prefix + ".metadata_path", context->getPath() + "disks/" + name + "/"); + fs::create_directories(metadata_path); + auto metadata_disk = std::make_shared(name + "-metadata", metadata_path, 0); + return std::make_pair(metadata_path, metadata_disk); +} + +} diff --git a/src/Disks/RemoteDisksCommon.h b/src/Disks/RemoteDisksCommon.h new file mode 100644 index 00000000000..0d057b44d18 --- /dev/null +++ b/src/Disks/RemoteDisksCommon.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +std::shared_ptr wrapWithCache( + std::shared_ptr disk, String cache_name, String cache_path, String metadata_path); + +std::pair prepareForLocalMetadata( + const String & name, + const Poco::Util::AbstractConfiguration & config, + const String & config_prefix, + ContextPtr context); + +} diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 92d673687c6..201334cbd12 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ #include #include +#include #include #include #include @@ -100,15 +102,6 @@ private: Chunks chunks; }; -String getRandomName() -{ - std::uniform_int_distribution distribution('a', 'z'); - String res(32, ' '); /// The number of bits of entropy should be not less than 128. - for (auto & c : res) - c = distribution(thread_local_rng); - return res; -} - template void throwIfError(Aws::Utils::Outcome & response) { @@ -226,7 +219,7 @@ std::unique_ptr DiskS3::readFile(const String & path, co auto settings = current_settings.get(); auto metadata = readMeta(path); - LOG_TRACE(log, "Read from file by path: {}. Existing S3 objects: {}", + LOG_TEST(log, "Read from file by path: {}. 
Existing S3 objects: {}", backQuote(metadata_disk->getPath() + path), metadata.remote_fs_objects.size()); bool threadpool_read = read_settings.remote_fs_method == RemoteFSReadMethod::threadpool; @@ -254,7 +247,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, auto metadata = readOrCreateMetaForWriting(path, mode); /// Path to store new S3 object. - auto s3_path = getRandomName(); + auto s3_path = getRandomASCIIString(); std::optional object_metadata; if (settings->send_metadata) @@ -362,7 +355,7 @@ void DiskS3::findLastRevision() /// Construct revision number from high to low bits. String revision; revision.reserve(64); - for (int bit = 0; bit < 64; bit++) + for (int bit = 0; bit < 64; ++bit) { auto revision_prefix = revision + "1"; diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index d355d785cea..18ed733ff01 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -168,7 +168,7 @@ private: inline static const String RESTORE_FILE_NAME = "restore"; /// Key has format: ../../r{revision}-{operation} - const re2::RE2 key_regexp {".*/r(\\d+)-(\\w+).*"}; + const re2::RE2 key_regexp {".*/r(\\d+)-(\\w+)$"}; /// Object contains information about schema version. inline static const String SCHEMA_VERSION_OBJECT = ".SCHEMA_VERSION"; diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index f7c1d7537c4..f6824a1b3af 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -18,6 +18,7 @@ #include "ProxyResolverConfiguration.h" #include "Disks/DiskRestartProxy.h" #include "Disks/DiskLocal.h" +#include "Disks/RemoteDisksCommon.h" namespace DB { @@ -176,9 +177,7 @@ void registerDiskS3(DiskFactory & factory) if (uri.key.back() != '/') throw Exception("S3 path must ends with '/', but '" + uri.key + "' doesn't.", ErrorCodes::BAD_ARGUMENTS); - String metadata_path = config.getString(config_prefix + ".metadata_path", context->getPath() + "disks/" + name + "/"); - fs::create_directories(metadata_path); - auto metadata_disk = std::make_shared(name + "-metadata", metadata_path, 0); + auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); std::shared_ptr s3disk = std::make_shared( name, @@ -199,24 +198,10 @@ void registerDiskS3(DiskFactory & factory) s3disk->startup(); - bool cache_enabled = config.getBool(config_prefix + ".cache_enabled", true); - - if (cache_enabled) + if (config.getBool(config_prefix + ".cache_enabled", true)) { String cache_path = config.getString(config_prefix + ".cache_path", context->getPath() + "disks/" + name + "/cache/"); - - if (metadata_path == cache_path) - throw Exception("Metadata and cache path should be different: " + metadata_path, ErrorCodes::BAD_ARGUMENTS); - - auto cache_disk = std::make_shared("s3-cache", cache_path, 0); - auto cache_file_predicate = [] (const String & path) - { - return path.ends_with("idx") // index files. - || path.ends_with("mrk") || path.ends_with("mrk2") || path.ends_with("mrk3") // mark files. 
- || path.ends_with("txt") || path.ends_with("dat"); - }; - - s3disk = std::make_shared(s3disk, cache_disk, cache_file_predicate); + s3disk = wrapWithCache(s3disk, "s3-cache", cache_path, metadata_path); } return std::make_shared(s3disk); diff --git a/src/Disks/registerDisks.cpp b/src/Disks/registerDisks.cpp index 5dd86043d83..88c3fdde1e0 100644 --- a/src/Disks/registerDisks.cpp +++ b/src/Disks/registerDisks.cpp @@ -14,6 +14,10 @@ void registerDiskMemory(DiskFactory & factory); void registerDiskS3(DiskFactory & factory); #endif +#if USE_AZURE_BLOB_STORAGE +void registerDiskAzureBlobStorage(DiskFactory & factory); +#endif + #if USE_SSL void registerDiskEncrypted(DiskFactory & factory); #endif @@ -36,6 +40,10 @@ void registerDisks() registerDiskS3(factory); #endif +#if USE_AZURE_BLOB_STORAGE + registerDiskAzureBlobStorage(factory); +#endif + #if USE_SSL registerDiskEncrypted(factory); #endif diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index ecfa5df8351..bed46a97c1b 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -26,6 +28,7 @@ namespace ErrorCodes extern const int FILE_DOESNT_EXIST; extern const int UNKNOWN_EXCEPTION; extern const int INCORRECT_DATA; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) @@ -427,6 +430,113 @@ void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Blo } } +template +static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) +{ + std::vector> values; + for (auto enumerant : enumerants) + values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); + return std::make_shared>(std::move(values)); +} + +static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) +{ + auto enumerants = enum_schema.getEnumerants(); + if (enumerants.size() < 128) + return getEnumDataTypeFromEnumerants(enumerants); + if (enumerants.size() < 32768) + return getEnumDataTypeFromEnumerants(enumerants); + + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "ClickHouse supports only 8 and 16-but Enums"); +} + +static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) +{ + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_shared(); + case capnp::schema::Type::INT16: + return std::make_shared(); + case capnp::schema::Type::INT32: + return std::make_shared(); + case capnp::schema::Type::INT64: + return std::make_shared(); + case capnp::schema::Type::BOOL: [[fallthrough]]; + case capnp::schema::Type::UINT8: + return std::make_shared(); + case capnp::schema::Type::UINT16: + return std::make_shared(); + case capnp::schema::Type::UINT32: + return std::make_shared(); + case capnp::schema::Type::UINT64: + return std::make_shared(); + case capnp::schema::Type::FLOAT32: + return std::make_shared(); + case capnp::schema::Type::FLOAT64: + return std::make_shared(); + case capnp::schema::Type::DATA: [[fallthrough]]; + case capnp::schema::Type::TEXT: + return std::make_shared(); + case capnp::schema::Type::ENUM: + return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); + case capnp::schema::Type::LIST: + { + auto list_schema = capnp_type.asList(); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType()); + return std::make_shared(nested_type); + 
} + case capnp::schema::Type::STRUCT: + { + auto struct_schema = capnp_type.asStruct(); + + /// Check if it can be Nullable. + if (checkIfStructIsNamedUnion(struct_schema)) + { + auto fields = struct_schema.getUnionFields(); + if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unions are not supported"); + auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType(); + if (value_type.isStruct() || value_type.isList()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Tuples and Lists cannot be inside Nullable"); + + auto nested_type = getDataTypeFromCapnProtoType(value_type); + return std::make_shared(nested_type); + } + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported"); + + /// Treat Struct as Tuple. + DataTypes nested_types; + Names nested_names; + for (auto field : struct_schema.getNonUnionFields()) + { + nested_names.push_back(field.getProto().getName()); + nested_types.push_back(getDataTypeFromCapnProtoType(field.getType())); + } + return std::make_shared(std::move(nested_types), std::move(nested_names)); + } + default: + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } +} + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema) +{ + if (checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported"); + + NamesAndTypesList names_and_types; + for (auto field : schema.getNonUnionFields()) + { + auto name = field.getProto().getName(); + auto type = getDataTypeFromCapnProtoType(field.getType()); + names_and_types.emplace_back(name, type); + } + return names_and_types; +} + } #endif diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoUtils.h index 93ca0a5e616..51c152de17f 100644 --- a/src/Formats/CapnProtoUtils.h +++ b/src/Formats/CapnProtoUtils.h @@ -38,6 +38,7 @@ capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Re void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode); +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema); } #endif diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index d956d9e6bfb..0a7747fc864 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -1,7 +1,16 @@ #include +#include +#include #include +#include +#include +#include #include #include +#include +#include +#include +#include namespace DB { @@ -9,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule) @@ -193,30 +203,145 @@ void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSe } } -String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +template +String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { String result; switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: - readQuotedString(result, buf); + if constexpr (read_string) + 
readQuotedString(result, buf); + else + readQuotedFieldIntoString(result, buf); break; case FormatSettings::EscapingRule::JSON: - readJSONString(result, buf); + if constexpr (read_string) + readJSONString(result, buf); + else + readJSONFieldIntoString(result, buf); break; case FormatSettings::EscapingRule::Raw: readString(result, buf); break; case FormatSettings::EscapingRule::CSV: - readCSVString(result, buf, format_settings.csv); + if constexpr (read_string) + readCSVString(result, buf, format_settings.csv); + else + readCSVField(result, buf, format_settings.csv); break; case FormatSettings::EscapingRule::Escaped: readEscapedString(result, buf); break; default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read string with {} escaping rule", escapingRuleToString(escaping_rule)); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); } return result; } +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule(buf, escaping_rule, format_settings); +} + +String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule(buf, escaping_rule, format_settings); +} + +static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context) +{ + if (!context) + throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression"); + + ParserExpression parser; + Expected expected; + Tokens tokens(field.data, field.data + field.size); + IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); + ASTPtr ast; + + /// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats. + bool parsed = parser.parse(token_iterator, ast, expected); + if (!parsed) + return false; + + try + { + std::pair result = evaluateConstantExpression(ast, context); + type = generalizeDataType(result.second); + return true; + } + catch (...) + { + return false; + } +} + +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Quoted: + { + DataTypePtr type; + bool parsed = evaluateConstantExpressionFromString(field, type, context); + return parsed ? type : nullptr; + } + case FormatSettings::EscapingRule::JSON: + return getDataTypeFromJSONField(field); + case FormatSettings::EscapingRule::CSV: + { + if (field.empty() || field == format_settings.csv.null_representation) + return nullptr; + + if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) + return std::make_shared(); + + DataTypePtr type; + bool parsed; + if (field[0] == '\'' || field[0] == '"') + { + /// Try to evaluate expression inside quotes. + parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context); + /// If it's a number in quotes we determine it as a string. + if (parsed && type && isNumber(removeNullable(type))) + return makeNullable(std::make_shared()); + } + else + parsed = evaluateConstantExpressionFromString(field, type, context); + + /// If we couldn't parse an expression, determine it as a string. + return parsed ? 
type : makeNullable(std::make_shared()); + } + case FormatSettings::EscapingRule::Raw: [[fallthrough]]; + case FormatSettings::EscapingRule::Escaped: + /// TODO: Try to use some heuristics here to determine the type of data. + return field.empty() ? nullptr : makeNullable(std::make_shared()); + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule)); + } +} + +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +{ + DataTypes data_types; + data_types.reserve(fields.size()); + for (const auto & field : fields) + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule, context)); + return data_types; +} + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::CSV: [[fallthrough]]; + case FormatSettings::EscapingRule::Escaped: [[fallthrough]]; + case FormatSettings::EscapingRule::Raw: + return makeNullable(std::make_shared()); + default: + return nullptr; + } +} + } diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index 02f027db74d..10147b29ad6 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -33,5 +34,24 @@ void serializeFieldByEscapingRule( void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); + +/// Try to determine the type of the field written by a specific escaping rule. +/// If cannot, return nullptr. +/// - For Quoted escaping rule we can interpret a single field as a constant +/// expression and get it's type by evaluation this expression. +/// - For JSON escaping rule we can use JSON parser to parse a single field +/// and then convert JSON type of this field to ClickHouse type. +/// - For CSV escaping rule we can do the next: +/// - If the field is an unquoted string, then we could try to evaluate it +/// as a constant expression, and if it fails, treat it as a String. +/// - If the field is a string in quotes, then we can try to evaluate +/// expression inside quotes as a constant expression, and if it fails or +/// the result is a number (we don't parse numbers in quotes) we treat it as a String. 
+/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here) +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 75b096de425..2068de0d01c 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -1,20 +1,18 @@ #include #include -#include -#include #include #include +#include +#include #include #include -#include #include -#include #include +#include +#include #include - -#include -#include +#include namespace DB { @@ -57,6 +55,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; format_settings.csv.delimiter = settings.format_csv_delimiter; + format_settings.csv.tuple_delimiter = settings.format_csv_delimiter; format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.format_csv_null_representation; @@ -114,9 +113,12 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.orc.import_nested = settings.input_format_orc_import_nested; + format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; + format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; + format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) @@ -198,7 +200,6 @@ InputFormatPtr FormatFactory::getInput( return format; } - InputFormatPtr FormatFactory::getInputFormat( const String & name, ReadBuffer & buf, @@ -233,6 +234,18 @@ InputFormatPtr FormatFactory::getInputFormat( return format; } +static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context) +{ + auto * element_id = context->getProcessListElement(); + if (element_id) + { + /// While preparing the query there might have been progress (for example in subscalar subqueries) so add it here + auto current_progress = element_id->getProgressIn(); + Progress read_progress{current_progress.read_rows, current_progress.read_bytes, current_progress.total_rows_to_read}; + format->onProgress(read_progress); + } +} + OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( const String & name, 
WriteBuffer & buf, @@ -261,7 +274,9 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( if (context->hasQueryContext() && settings.log_queries) context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); - return std::make_shared(builder); + auto format = std::make_shared(builder); + addExistingProgressToOutputFormat(format, context); + return format; } return getOutputFormat(name, buf, sample, context, callback, _format_settings); @@ -301,6 +316,8 @@ OutputFormatPtr FormatFactory::getOutputFormat( if (auto * mysql = typeid_cast(format.get())) mysql->setContext(context); + addExistingProgressToOutputFormat(format, context); + return format; } @@ -323,6 +340,32 @@ String FormatFactory::getContentType( return format->getContentType(); } +SchemaReaderPtr FormatFactory::getSchemaReader( + const String & name, + ReadBuffer & buf, + ContextPtr context, + const std::optional & _format_settings) const +{ + const auto & schema_reader_creator = dict.at(name).schema_reader_creator; + if (!schema_reader_creator) + throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + return schema_reader_creator(buf, format_settings, context); +} + +ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( + const String & name, + ContextPtr context, + const std::optional & _format_settings) const +{ + const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; + if (!external_schema_reader_creator) + throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); + + auto format_settings = _format_settings ? 
*_format_settings : getFormatSettings(context); + return external_schema_reader_creator(format_settings); +} void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator) { @@ -356,6 +399,21 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm target = std::move(file_segmentation_engine); } +void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator) +{ + auto & target = dict[name].schema_reader_creator; + if (target) + throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); + target = std::move(schema_reader_creator); +} + +void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator) +{ + auto & target = dict[name].external_schema_reader_creator; + if (target) + throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); + target = std::move(external_schema_reader_creator); +} void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name) { @@ -393,6 +451,23 @@ bool FormatFactory::isOutputFormat(const String & name) const return it != dict.end() && it->second.output_creator; } +bool FormatFactory::checkIfFormatHasSchemaReader(const String & name) +{ + const auto & target = getCreators(name); + return bool(target.schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name) +{ + const auto & target = getCreators(name); + return bool(target.external_schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name) +{ + return checkIfFormatHasSchemaReader(name) || checkIfFormatHasExternalSchemaReader(name); +} + FormatFactory & FormatFactory::instance() { static FormatFactory ret; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index ea285c47996..a62b32da0cc 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include @@ -31,6 +33,11 @@ class IOutputFormat; struct RowInputFormatParams; struct RowOutputFormatParams; +class ISchemaReader; +class IExternalSchemaReader; +using SchemaReaderPtr = std::shared_ptr; +using ExternalSchemaReaderPtr = std::shared_ptr; + using InputFormatPtr = std::shared_ptr; using OutputFormatPtr = std::shared_ptr; @@ -85,11 +92,16 @@ private: /// The checker should return true if parallel parsing should be disabled. 
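For format authors, the new hooks are used in pairs: registerSchemaReader() stores a creator, and getSchemaReader() later builds a reader from a buffer, the format settings, and the context (the argument order follows the getSchemaReader() call above). A hedged sketch of a registration is shown below; "MyFormat" and MyFormatSchemaReader are hypothetical, project includes are omitted as elsewhere in this diff, and it is assumed that ISchemaReader can be constructed from a ReadBuffer and that derived readers implement readSchema() returning the inferred NamesAndTypesList.

    /// Hypothetical schema reader for an imaginary "MyFormat".
    class MyFormatSchemaReader : public ISchemaReader
    {
    public:
        MyFormatSchemaReader(ReadBuffer & in_, const FormatSettings & settings_)
            : ISchemaReader(in_), settings(settings_) {}

        NamesAndTypesList readSchema() override
        {
            /// Read up to settings.max_rows_to_read_for_schema_inference rows,
            /// e.g. with the per-escaping-rule helpers added in this PR, and return
            /// the column names and types that were found.
            return {};
        }

    private:
        FormatSettings settings;
    };

    void registerMyFormatSchemaReader(FormatFactory & factory)
    {
        factory.registerSchemaReader("MyFormat", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
        {
            return std::make_shared<MyFormatSchemaReader>(buf, settings);
        });
    }

    /// Callers that only know the format name can then do:
    ///     auto reader = FormatFactory::instance().getSchemaReader("MyFormat", buf, context);
    ///     NamesAndTypesList columns = reader->readSchema();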
using NonTrivialPrefixAndSuffixChecker = std::function; + using SchemaReaderCreator = std::function; + using ExternalSchemaReaderCreator = std::function; + struct Creators { InputCreator input_creator; OutputCreator output_creator; FileSegmentationEngine file_segmentation_engine; + SchemaReaderCreator schema_reader_creator; + ExternalSchemaReaderCreator external_schema_reader_creator; bool supports_parallel_formatting{false}; bool is_column_oriented{false}; NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker; @@ -138,6 +150,17 @@ public: ContextPtr context, const std::optional & format_settings = std::nullopt) const; + SchemaReaderPtr getSchemaReader( + const String & name, + ReadBuffer & buf, + ContextPtr context, + const std::optional & format_settings = std::nullopt) const; + + ExternalSchemaReaderPtr getExternalSchemaReader( + const String & name, + ContextPtr context, + const std::optional & format_settings = std::nullopt) const; + void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine); void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker); @@ -146,11 +169,19 @@ public: void registerInputFormat(const String & name, InputCreator input_creator); void registerOutputFormat(const String & name, OutputCreator output_creator); + /// Register schema readers for format its name. + void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator); + void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator); + void markOutputFormatSupportsParallelFormatting(const String & name); void markFormatAsColumnOriented(const String & name); bool checkIfFormatIsColumnOriented(const String & name); + bool checkIfFormatHasSchemaReader(const String & name); + bool checkIfFormatHasExternalSchemaReader(const String & name); + bool checkIfFormatHasAnySchemaReader(const String & name); + const FormatsDictionary & getAllFormats() const { return dict; @@ -163,6 +194,7 @@ private: FormatsDictionary dict; const Creators & getCreators(const String & name) const; + }; } diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index a18a20bac7b..6298e959c3e 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -33,6 +33,7 @@ struct FormatSettings bool defaults_for_omitted_fields = true; bool seekable_read = true; + UInt64 max_rows_to_read_for_schema_inference = 100; enum class DateTimeInputFormat { @@ -95,6 +96,7 @@ struct FormatSettings bool input_format_enum_as_number = false; bool input_format_arrays_as_nested_csv = false; String null_representation = "\\N"; + char tuple_delimiter = ','; } csv; struct Custom @@ -200,6 +202,7 @@ struct FormatSettings struct { bool import_nested = false; + int64_t row_batch_size = 100'000; } orc; /// For capnProto format we should determine how to @@ -215,6 +218,11 @@ struct FormatSettings { EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; } capn_proto; + + struct + { + UInt64 number_of_columns = 0; + } msgpack; }; } diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index b55e9f59cc7..c63b8453634 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -1,7 +1,17 @@ #include #include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -26,7 +36,7 @@ 
static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size || number_of_rows < min_rows)) { const auto current_object_size = memory.size() + static_cast(pos - in.position()); - if (current_object_size > 10 * min_chunk_size) + if (min_chunk_size != 0 && current_object_size > 10 * min_chunk_size) throw ParsingException("Size of JSON object is extremely large. Expected not greater than " + std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) + " bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA); @@ -92,6 +102,122 @@ static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer return {loadAtPosition(in, memory, pos), number_of_rows}; } +template +static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in) +{ + Memory memory; + fileSegmentationEngineJSONEachRowImpl(in, memory, 0, 1); + return String(memory.data(), memory.size()); +} + +template +DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) +{ + if (field.isNull()) + return nullptr; + + if (field.isBool()) + return makeNullable(std::make_shared()); + + if (field.isInt64() || field.isUInt64() || field.isDouble()) + return makeNullable(std::make_shared()); + + if (field.isString()) + return makeNullable(std::make_shared()); + + if (field.isArray()) + { + auto array = field.getArray(); + + /// Return nullptr in case of empty array because we cannot determine nested type. + if (array.size() == 0) + return nullptr; + + DataTypes nested_data_types; + /// If this array contains fields with different types we will treat it as Tuple. 
+ bool is_tuple = false; + for (const auto element : array) + { + auto type = getDataTypeFromJSONFieldImpl(element); + if (!type) + return nullptr; + + if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName()) + is_tuple = true; + + nested_data_types.push_back(std::move(type)); + } + + if (is_tuple) + return std::make_shared(nested_data_types); + + return std::make_shared(nested_data_types.back()); + } + + if (field.isObject()) + { + auto object = field.getObject(); + DataTypePtr value_type; + for (const auto key_value_pair : object) + { + auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second); + if (!type) + return nullptr; + + if (value_type && value_type->getName() != type->getName()) + return nullptr; + + value_type = type; + } + return std::make_shared(std::make_shared(), value_type); + } + + throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; +} + +auto getJSONParserAndElement() +{ +#if USE_SIMDJSON + return std::pair(); +#elif USE_RAPIDJSON + return std::pair(); +#else + return std::pair(); +#endif +} + +DataTypePtr getDataTypeFromJSONField(const String & field) +{ + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(field, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); + + return getDataTypeFromJSONFieldImpl(element); +} + +template +static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor) +{ + String line = readJSONEachRowLineIntoStringImpl(in); + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(line, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); + + auto fields = extractor.extract(element); + + DataTypes data_types; + data_types.reserve(fields.size()); + for (const auto & field : fields) + data_types.push_back(getDataTypeFromJSONFieldImpl(field)); + + /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings. + /// Should we try to parse data inside strings somehow in this case? + + return data_types; +} + std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) { return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1); @@ -102,6 +228,60 @@ std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows); } +struct JSONEachRowFieldsExtractor +{ + template + std::vector extract(const Element & element) + { + /// {..., "" : , ...} + auto object = element.getObject(); + std::vector fields; + fields.reserve(object.size()); + column_names.reserve(object.size()); + for (const auto & key_value_pair : object) + { + column_names.emplace_back(key_value_pair.first); + fields.push_back(key_value_pair.second); + } + + return fields; + } + + std::vector column_names; +}; + +std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) +{ + JSONEachRowFieldsExtractor extractor; + auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); + std::unordered_map result; + for (size_t i = 0; i != extractor.column_names.size(); ++i) + result[extractor.column_names[i]] = data_types[i]; + return result; +} + +struct JSONCompactEachRowFieldsExtractor +{ + template + std::vector extract(const Element & element) + { + /// [..., , ...] 
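Putting the rules above together, here is a usage sketch of the new helper. The inferred types follow the branches in getDataTypeFromJSONFieldImpl; the concrete numeric type is written generically because its name is elided in this diff, and everything that is inferred comes back wrapped in Nullable.

    void jsonInferenceExamples()
    {
        DataTypePtr t1 = getDataTypeFromJSONField("\"abc\"");     /// Nullable(String)
        DataTypePtr t2 = getDataTypeFromJSONField("[1, 2, 3]");   /// Array(Nullable(<number>)), same element type everywhere
        DataTypePtr t3 = getDataTypeFromJSONField("[1, \"x\"]");  /// Tuple(Nullable(<number>), Nullable(String)), mixed element types
        DataTypePtr t4 = getDataTypeFromJSONField("{\"a\": 1}");  /// Map(String, Nullable(<number>)), homogeneous object values
        DataTypePtr t5 = getDataTypeFromJSONField("null");        /// nullptr: the type cannot be determined
        DataTypePtr t6 = getDataTypeFromJSONField("[]");          /// nullptr: an empty array gives no nested type
    }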
+ auto array = element.getArray(); + std::vector fields; + fields.reserve(array.size()); + for (size_t i = 0; i != array.size(); ++i) + fields.push_back(array[i]); + return fields; + } +}; + +DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings) +{ + JSONCompactEachRowFieldsExtractor extractor; + return determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); +} + + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) { /// For JSONEachRow we can safely skip whitespace characters diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h index 4a049aa1abd..6f71baa8b40 100644 --- a/src/Formats/JSONEachRowUtils.h +++ b/src/Formats/JSONEachRowUtils.h @@ -11,6 +11,21 @@ namespace DB std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows); + +/// Parse JSON from string and convert it's type to ClickHouse type. Make the result type always Nullable. +/// JSON array with different nested types is treated as Tuple. +/// If cannot convert (for example when field contains null), return nullptr. +DataTypePtr getDataTypeFromJSONField(const String & field); + +/// Read row in JSONEachRow format and try to determine type for each field. +/// Return map {column_name : type}. +/// If cannot determine the type of some field, return nullptr for it. +std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); + +/// Read row in JSONCompactEachRow format and try to determine type for each field. +/// If cannot determine the type of some field, return nullptr for it. 
+DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings); + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings); diff --git a/src/Formats/NativeReader.cpp b/src/Formats/NativeReader.cpp index bf13b7a22c1..645069bfbdf 100644 --- a/src/Formats/NativeReader.cpp +++ b/src/Formats/NativeReader.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -10,6 +11,7 @@ #include #include +#include #include @@ -63,7 +65,7 @@ void NativeReader::resetParser() use_index = false; } -void NativeReader::readData(const IDataType & type, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint, size_t revision) +void NativeReader::readData(const ISerialization & serialization, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint) { ISerialization::DeserializeBinaryBulkSettings settings; settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; @@ -73,21 +75,12 @@ void NativeReader::readData(const IDataType & type, ColumnPtr & column, ReadBuff ISerialization::DeserializeBinaryBulkStatePtr state; - const auto * aggregate_function_data_type = typeid_cast(&type); - if (aggregate_function_data_type && aggregate_function_data_type->isVersioned()) - { - auto version = aggregate_function_data_type->getVersionFromRevision(revision); - aggregate_function_data_type->setVersion(version, /* if_empty */true); - } - - auto serialization = type.getDefaultSerialization(); - - serialization->deserializeBinaryBulkStatePrefix(settings, state); - serialization->deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state, nullptr); + serialization.deserializeBinaryBulkStatePrefix(settings, state); + serialization.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state, nullptr); if (column->size() != rows) - throw Exception("Cannot read all data in NativeBlockInputStream. Rows read: " + toString(column->size()) + ". Rows expected: " + toString(rows) + ".", - ErrorCodes::CANNOT_READ_ALL_DATA); + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, + "Cannot read all data in NativeBlockInputStream. Rows read: {}. Rows expected: {}", column->size(), rows); } @@ -151,6 +144,30 @@ Block NativeReader::read() readBinary(type_name, istr); column.type = data_type_factory.get(type_name); + const auto * aggregate_function_data_type = typeid_cast(column.type.get()); + if (aggregate_function_data_type && aggregate_function_data_type->isVersioned()) + { + auto version = aggregate_function_data_type->getVersionFromRevision(server_revision); + aggregate_function_data_type->setVersion(version, /*if_empty=*/ true); + } + + SerializationPtr serialization; + if (server_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION) + { + auto info = column.type->createSerializationInfo({}); + + UInt8 has_custom; + readBinary(has_custom, istr); + if (has_custom) + info->deserializeFromKindsBinary(istr); + + serialization = column.type->getSerialization(*info); + } + else + { + serialization = column.type->getDefaultSerialization(); + } + if (use_index) { /// Index allows to do more checks. 
@@ -161,11 +178,11 @@ Block NativeReader::read() } /// Data - ColumnPtr read_column = column.type->createColumn(); + ColumnPtr read_column = column.type->createColumn(*serialization); double avg_value_size_hint = avg_value_size_hints.empty() ? 0 : avg_value_size_hints[i]; if (rows) /// If no rows, nothing to read. - readData(*column.type, read_column, istr, rows, avg_value_size_hint, server_revision); + readData(*serialization, read_column, istr, rows, avg_value_size_hint); column.column = std::move(read_column); @@ -175,8 +192,8 @@ Block NativeReader::read() auto & header_column = header.getByName(column.name); if (!header_column.type->equals(*column.type)) { - column.column = recursiveTypeConversion(column.column, column.type, header.getByPosition(i).type); - column.type = header.getByPosition(i).type; + column.column = recursiveTypeConversion(column.column, column.type, header.safeGetByPosition(i).type); + column.type = header.safeGetByPosition(i).type; } } diff --git a/src/Formats/NativeReader.h b/src/Formats/NativeReader.h index 215bfa6812e..1f9eb8b9764 100644 --- a/src/Formats/NativeReader.h +++ b/src/Formats/NativeReader.h @@ -31,7 +31,7 @@ public: IndexForNativeFormat::Blocks::const_iterator index_block_it_, IndexForNativeFormat::Blocks::const_iterator index_block_end_); - static void readData(const IDataType & type, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint, size_t revision); + static void readData(const ISerialization & serialization, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint); Block getHeader() const; diff --git a/src/Formats/NativeWriter.cpp b/src/Formats/NativeWriter.cpp index 1a4cc24a7d9..eb744e130f7 100644 --- a/src/Formats/NativeWriter.cpp +++ b/src/Formats/NativeWriter.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -11,6 +12,8 @@ #include #include +#include +#include #include namespace DB @@ -43,7 +46,7 @@ void NativeWriter::flush() } -static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) +static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) { /** If there are columns-constants - then we materialize them. * (Since the data type does not know how to serialize / deserialize constants.) @@ -55,12 +58,10 @@ static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuf settings.position_independent_encoding = false; settings.low_cardinality_max_dictionary_size = 0; //-V1048 - auto serialization = type.getDefaultSerialization(); - ISerialization::SerializeBinaryBulkStatePtr state; - serialization->serializeBinaryBulkStatePrefix(settings, state); - serialization->serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); - serialization->serializeBinaryBulkStateSuffix(settings, state); + serialization.serializeBinaryBulkStatePrefix(settings, state); + serialization.serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); + serialization.serializeBinaryBulkStateSuffix(settings, state); } @@ -140,9 +141,27 @@ void NativeWriter::write(const Block & block) writeStringBinary(type_name, ostr); + /// Serialization. Dynamic, if client supports it. 
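/// Summary of the block below: for client_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION
/// a UInt8 flag is written right after the type name -- 0 means the default serialization and
/// nothing else follows, 1 means a custom serialization (e.g. sparse) is used and its kinds are
/// written next, so that NativeReader can pick the matching ISerialization on the receiving side.
/// Older clients always get the default serialization, with sparse columns expanded to full
/// columns first via recursiveRemoveSparse.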
+ SerializationPtr serialization; + if (client_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION) + { + auto info = column.column->getSerializationInfo(); + serialization = column.type->getSerialization(*info); + + bool has_custom = info->hasCustomSerialization(); + writeBinary(static_cast(has_custom), ostr); + if (has_custom) + info->serialializeKindBinary(ostr); + } + else + { + serialization = column.type->getDefaultSerialization(); + column.column = recursiveRemoveSparse(column.column); + } + /// Data if (rows) /// Zero items of data is always represented as zero number of bytes. - writeData(*column.type, column.column, ostr, 0, 0); + writeData(*serialization, column.column, ostr, 0, 0); if (index) { diff --git a/src/Formats/ParsedTemplateFormatString.cpp b/src/Formats/ParsedTemplateFormatString.cpp index 4966420f05b..8d1b987d01a 100644 --- a/src/Formats/ParsedTemplateFormatString.cpp +++ b/src/Formats/ParsedTemplateFormatString.cpp @@ -14,14 +14,14 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; } -ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name) +ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes) { ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); String format_string; readStringUntilEOF(format_string, schema_file); try { - parse(format_string, idx_by_name); + parse(format_string, idx_by_name, allow_indexes); } catch (DB::Exception & e) { @@ -33,7 +33,7 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & } -void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name) +void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes) { enum ParserState { @@ -100,6 +100,8 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10); if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno) column_idx = idx_by_name(column_names.back()); + else if (!allow_indexes) + throw Exception(ErrorCodes::INVALID_TEMPLATE_FORMAT, "Indexes instead of names are not allowed"); } format_idx_to_column_idx.emplace_back(column_idx); break; diff --git a/src/Formats/ParsedTemplateFormatString.h b/src/Formats/ParsedTemplateFormatString.h index ba0ebdf5aa8..c5617d0f0ef 100644 --- a/src/Formats/ParsedTemplateFormatString.h +++ b/src/Formats/ParsedTemplateFormatString.h @@ -31,9 +31,9 @@ struct ParsedTemplateFormatString typedef std::function(const String &)> ColumnIdxGetter; ParsedTemplateFormatString() = default; - ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name); + ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); - void parse(const String & format_string, const ColumnIdxGetter & idx_by_name); + void parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); size_t columnsCount() const; diff --git a/src/Formats/ProtobufSchemas.cpp b/src/Formats/ProtobufSchemas.cpp index a6e63b1c256..9f25f830e37 100644 --- a/src/Formats/ProtobufSchemas.cpp +++ b/src/Formats/ProtobufSchemas.cpp @@ -71,6 +71,7 @@ 
ProtobufSchemas::~ProtobufSchemas() = default; const google::protobuf::Descriptor * ProtobufSchemas::getMessageTypeForFormatSchema(const FormatSchemaInfo & info) { + std::lock_guard lock(mutex); auto it = importers.find(info.schemaDirectory()); if (it == importers.end()) it = importers.emplace(info.schemaDirectory(), std::make_unique(info.schemaDirectory())).first; diff --git a/src/Formats/ProtobufSchemas.h b/src/Formats/ProtobufSchemas.h index f911cb2ce4b..0a2eeea9893 100644 --- a/src/Formats/ProtobufSchemas.h +++ b/src/Formats/ProtobufSchemas.h @@ -4,6 +4,7 @@ #if USE_PROTOBUF #include +#include #include #include #include @@ -39,6 +40,7 @@ public: private: class ImporterWithSourceTree; std::unordered_map> importers; + std::mutex mutex; }; } diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 4c1e03578c1..b59db12a16c 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -24,10 +24,12 @@ # include # include # include +# include # include # include # include # include +# include # include # include # include @@ -55,6 +57,7 @@ namespace ErrorCodes extern const int PROTOBUF_FIELD_NOT_REPEATED; extern const int PROTOBUF_BAD_CAST; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } namespace @@ -859,7 +862,7 @@ namespace template void toStringAppend(NumberType value, PaddedPODArray & str) { - WriteBufferFromVector buf{str, WriteBufferFromVector>::AppendModeTag{}}; + WriteBufferFromVector buf{str, AppendModeTag{}}; writeText(value, buf); } @@ -2147,9 +2150,11 @@ namespace std::vector && field_descs_, const FieldDescriptor * parent_field_descriptor_, bool with_length_delimiter_, + std::unique_ptr missing_columns_filler_, const ProtobufReaderOrWriter & reader_or_writer_) : parent_field_descriptor(parent_field_descriptor_) , with_length_delimiter(with_length_delimiter_) + , missing_columns_filler(std::move(missing_columns_filler_)) , should_skip_if_empty(parent_field_descriptor ? shouldSkipZeroOrEmpty(*parent_field_descriptor) : false) , reader(reader_or_writer_.reader) , writer(reader_or_writer_.writer) @@ -2170,8 +2175,6 @@ namespace if (!num_columns_) wrongNumberOfColumns(num_columns_, ">0"); - columns.assign(columns_, columns_ + num_columns_); - std::vector field_columns; for (const FieldInfo & info : field_infos) { @@ -2188,13 +2191,17 @@ namespace if (reader) { - missing_column_indices.resize(num_columns_); - for (size_t column_index : collections::range(num_columns_)) - missing_column_indices[column_index] = column_index; - for (const auto & field_info : field_infos) - for (size_t column_index : field_info.column_indices) - missing_column_indices[column_index] = static_cast(-1); - boost::range::remove_erase(missing_column_indices, static_cast(-1)); + mutable_columns.resize(num_columns_); + for (size_t i : collections::range(num_columns_)) + mutable_columns[i] = columns_[i]->assumeMutable(); + + std::vector column_is_missing; + column_is_missing.resize(num_columns_, true); + for (const FieldInfo & info : field_infos) + for (size_t i : info.column_indices) + column_is_missing[i] = false; + + has_missing_columns = (std::find(column_is_missing.begin(), column_is_missing.end(), true) != column_is_missing.end()); } } @@ -2243,7 +2250,7 @@ namespace { last_field_index = 0; last_field_tag = field_infos[0].field_tag; - size_t old_size = columns.empty() ? 0 : columns[0]->size(); + size_t old_size = mutable_columns.empty() ? 0 : mutable_columns[0]->size(); try { @@ -2268,10 +2275,10 @@ namespace } catch (...) 
{ - for (auto & column : columns) + for (auto & column : mutable_columns) { if (column->size() > old_size) - column->assumeMutableRef().popBack(column->size() - old_size); + column->popBack(column->size() - old_size); } throw; } @@ -2302,10 +2309,9 @@ namespace if (parent_field_descriptor) out << " field " << quoteString(parent_field_descriptor->full_name()) << " (" << parent_field_descriptor->type_name() << ")"; - for (size_t i = 0; i != field_infos.size(); ++i) + for (const auto & field_info : field_infos) { out << "\n"; - const auto & field_info = field_infos[i]; writeIndent(out, indent + 1) << "Columns #"; for (size_t j = 0; j != field_info.column_indices.size(); ++j) { @@ -2342,13 +2348,8 @@ namespace void addDefaultsToMissingColumns(size_t row_num) { - for (size_t column_index : missing_column_indices) - { - auto & column = columns[column_index]; - size_t old_size = column->size(); - if (row_num >= old_size) - column->assumeMutableRef().insertDefault(); - } + if (has_missing_columns) + missing_columns_filler->addDefaults(mutable_columns, row_num); } struct FieldInfo @@ -2374,13 +2375,14 @@ namespace const FieldDescriptor * const parent_field_descriptor; const bool with_length_delimiter; + const std::unique_ptr missing_columns_filler; const bool should_skip_if_empty; ProtobufReader * const reader; ProtobufWriter * const writer; std::vector field_infos; std::unordered_map field_index_by_field_tag; - Columns columns; - std::vector missing_column_indices; + MutableColumns mutable_columns; + bool has_missing_columns = false; int last_field_tag = 0; size_t last_field_index = static_cast(-1); }; @@ -2626,7 +2628,8 @@ namespace with_length_delimiter, /* parent_field_descriptor = */ nullptr, used_column_indices, - /* columns_are_reordered_outside = */ false); + /* columns_are_reordered_outside = */ false, + /* check_nested_while_filling_missing_columns = */ true); if (!message_serializer) { @@ -2813,7 +2816,8 @@ namespace bool with_length_delimiter, const FieldDescriptor * parent_field_descriptor, std::vector & used_column_indices, - bool columns_are_reordered_outside) + bool columns_are_reordered_outside, + bool check_nested_while_filling_missing_columns) { std::vector column_names_sv; column_names_sv.reserve(num_columns); @@ -2828,7 +2832,8 @@ namespace with_length_delimiter, parent_field_descriptor, used_column_indices, - columns_are_reordered_outside); + columns_are_reordered_outside, + check_nested_while_filling_missing_columns); } std::unique_ptr buildMessageSerializerImpl( @@ -2839,7 +2844,8 @@ namespace bool with_length_delimiter, const FieldDescriptor * parent_field_descriptor, std::vector & used_column_indices, - bool columns_are_reordered_outside) + bool columns_are_reordered_outside, + bool check_nested_while_filling_missing_columns) { std::vector field_descs; boost::container::flat_map field_descriptors_in_use; @@ -2962,7 +2968,8 @@ namespace /* with_length_delimiter = */ false, field_descriptor, used_column_indices_in_nested, - /* columns_are_reordered_outside = */ true); + /* columns_are_reordered_outside = */ true, + /* check_nested_while_filling_missing_columns = */ false); /// `columns_are_reordered_outside` is true because column indices are /// going to be transformed and then written to the outer message, @@ -3001,7 +3008,8 @@ namespace /* with_length_delimiter = */ false, field_descriptor, used_column_indices_in_nested, - /* columns_are_reordered_outside = */ true); + /* columns_are_reordered_outside = */ true, + /* check_nested_while_filling_missing_columns = */ false); 
/// `columns_are_reordered_outside` is true because column indices are /// going to be transformed and then written to the outer message, @@ -3010,6 +3018,7 @@ namespace if (nested_message_serializer) { std::vector column_names_used; + column_names_used.reserve(used_column_indices_in_nested.size()); for (size_t i : used_column_indices_in_nested) column_names_used.emplace_back(nested_column_names[i]); auto field_serializer = std::make_unique( @@ -3040,8 +3049,18 @@ namespace if (field_descs.empty()) return nullptr; + std::unique_ptr missing_columns_filler; + if (reader_or_writer.reader) + { + if (check_nested_while_filling_missing_columns) + missing_columns_filler = std::make_unique(num_columns, column_names, data_types); + else + missing_columns_filler = std::make_unique(); + } + return std::make_unique( - std::move(field_descs), parent_field_descriptor, with_length_delimiter, reader_or_writer); + std::move(field_descs), parent_field_descriptor, with_length_delimiter, + std::move(missing_columns_filler), reader_or_writer); } /// Builds a serializer for one-to-one match: @@ -3147,7 +3166,8 @@ namespace /* with_length_delimiter = */ false, &field_descriptor, used_column_indices, - /* columns_are_reordered_outside = */ false); + /* columns_are_reordered_outside = */ false, + /* check_nested_while_filling_missing_columns = */ false); if (!message_serializer) { @@ -3210,8 +3230,105 @@ namespace std::function get_root_desc_function; std::shared_ptr root_serializer_ptr; }; -} + template + DataTypePtr getEnumDataType(const google::protobuf::EnumDescriptor * enum_descriptor) + { + std::vector> values; + for (int i = 0; i != enum_descriptor->value_count(); ++i) + { + const auto * enum_value_descriptor = enum_descriptor->value(i); + values.emplace_back(enum_value_descriptor->name(), enum_value_descriptor->number()); + } + return std::make_shared>(std::move(values)); + } + + NameAndTypePair getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool allow_repeat = true) + { + if (allow_repeat && field_descriptor->is_map()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); + const auto * tuple_type = assert_cast(name_and_type.type.get()); + return {name_and_type.name, std::make_shared(tuple_type->getElements())}; + } + + if (allow_repeat && field_descriptor->is_repeated()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); + return {name_and_type.name, std::make_shared(name_and_type.type)}; + } + + switch (field_descriptor->type()) + { + case FieldTypeId::TYPE_SFIXED32: [[fallthrough]]; + case FieldTypeId::TYPE_SINT32: [[fallthrough]]; + case FieldTypeId::TYPE_INT32: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_SFIXED64: [[fallthrough]]; + case FieldTypeId::TYPE_SINT64: [[fallthrough]]; + case FieldTypeId::TYPE_INT64: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_BOOL: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_FLOAT: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_DOUBLE: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_UINT32: [[fallthrough]]; + case FieldTypeId::TYPE_FIXED32: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_UINT64: [[fallthrough]]; + case FieldTypeId::TYPE_FIXED64: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_BYTES: [[fallthrough]]; + case 
FieldTypeId::TYPE_STRING: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_ENUM: + { + const auto * enum_descriptor = field_descriptor->enum_type(); + if (enum_descriptor->value_count() == 0) + throw Exception("Empty enum field", ErrorCodes::BAD_ARGUMENTS); + int max_abs = std::abs(enum_descriptor->value(0)->number()); + for (int i = 1; i != enum_descriptor->value_count(); ++i) + { + if (std::abs(enum_descriptor->value(i)->number()) > max_abs) + max_abs = std::abs(enum_descriptor->value(i)->number()); + } + if (max_abs < 128) + return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + else if (max_abs < 32768) + return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + else + throw Exception("ClickHouse supports only 8-bit and 16-bit enums", ErrorCodes::BAD_ARGUMENTS); + } + case FieldTypeId::TYPE_GROUP: [[fallthrough]]; + case FieldTypeId::TYPE_MESSAGE: + { + const auto * message_descriptor = field_descriptor->message_type(); + if (message_descriptor->field_count() == 1) + { + const auto * nested_field_descriptor = message_descriptor->field(0); + auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor); + return {field_descriptor->name() + "_" + nested_name_and_type.name, nested_name_and_type.type}; + } + else + { + DataTypes nested_types; + Strings nested_names; + for (int i = 0; i != message_descriptor->field_count(); ++i) + { + auto nested_name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i)); + nested_types.push_back(nested_name_and_type.type); + nested_names.push_back(nested_name_and_type.name); + } + return {field_descriptor->name(), std::make_shared(std::move(nested_types), std::move(nested_names))}; + } + } + } + + __builtin_unreachable(); + } +} std::unique_ptr ProtobufSerializer::create( const Strings & column_names, @@ -3234,5 +3351,14 @@ std::unique_ptr ProtobufSerializer::create( std::vector missing_column_indices; return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter); } + +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor) +{ + NamesAndTypesList schema; + for (int i = 0; i != message_descriptor->field_count(); ++i) + schema.push_back(getNameAndDataTypeFromField(message_descriptor->field(i))); + return schema; +} + } #endif diff --git a/src/Formats/ProtobufSerializer.h b/src/Formats/ProtobufSerializer.h index 3eaca6a18d6..d9bed913517 100644 --- a/src/Formats/ProtobufSerializer.h +++ b/src/Formats/ProtobufSerializer.h @@ -4,6 +4,7 @@ #if USE_PROTOBUF # include +#include namespace google::protobuf { class Descriptor; } @@ -48,5 +49,7 @@ public: ProtobufWriter & writer); }; +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor); + } #endif diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp new file mode 100644 index 00000000000..37067eae64f --- /dev/null +++ b/src/Formats/ReadSchemaUtils.cpp @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int BAD_ARGUMENTS; +} + +ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context) +{ + NamesAndTypesList names_and_types; + if 
(FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) + { + auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings); + try + { + names_and_types = external_schema_reader->readSchema(); + } + catch (const DB::Exception & e) + { + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message()); + } + } + else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) + { + auto read_buf = read_buffer_creator(); + if (read_buf->eof()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name); + + auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *read_buf, context, format_settings); + try + { + names_and_types = schema_reader->readSchema(); + } + catch (const DB::Exception & e) + { + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message()); + } + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name); + + return ColumnsDescription(names_and_types); +} + +DataTypePtr generalizeDataType(DataTypePtr type) +{ + WhichDataType which(type); + + if (which.isNothing()) + return nullptr; + + if (which.isNullable()) + { + const auto * nullable_type = assert_cast(type.get()); + return generalizeDataType(nullable_type->getNestedType()); + } + + if (isNumber(type)) + return makeNullable(std::make_shared()); + + if (which.isArray()) + { + const auto * array_type = assert_cast(type.get()); + auto nested_type = generalizeDataType(array_type->getNestedType()); + return nested_type ? std::make_shared(nested_type) : nullptr; + } + + if (which.isTuple()) + { + const auto * tuple_type = assert_cast(type.get()); + DataTypes nested_types; + for (const auto & element : tuple_type->getElements()) + { + auto nested_type = generalizeDataType(element); + if (!nested_type) + return nullptr; + nested_types.push_back(nested_type); + } + return std::make_shared(std::move(nested_types)); + } + + if (which.isMap()) + { + const auto * map_type = assert_cast(type.get()); + auto key_type = removeNullable(generalizeDataType(map_type->getKeyType())); + auto value_type = generalizeDataType(map_type->getValueType()); + return key_type && value_type ? std::make_shared(key_type, value_type) : nullptr; + } + + if (which.isLowCarnality()) + { + const auto * lc_type = assert_cast(type.get()); + auto nested_type = generalizeDataType(lc_type->getDictionaryType()); + return nested_type ? std::make_shared(nested_type) : nullptr; + } + + return makeNullable(type); +} + +} diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h new file mode 100644 index 00000000000..fb43acc3cd6 --- /dev/null +++ b/src/Formats/ReadSchemaUtils.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Try to determine the schema of the data in specifying format. +/// For formats that have an external schema reader, it will +/// use it and won't create a read buffer. +/// For formats that have a schema reader from the data, +/// read buffer will be created by the provided creator and +/// the schema will be extracted from the data. +/// If format doesn't have any schema reader or a schema reader +/// couldn't determine the schema, an exception will be thrown. 
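/// A minimal usage sketch (illustrative only, not part of the patch; the file name is a made-up
/// example, `context` is assumed to be a ContextPtr available at the call site, and the creator
/// is assumed to return a std::unique_ptr to a ReadBuffer as declared below):
///
///     auto creator = [&] { return std::make_unique<ReadBufferFromFile>("data.jsonl"); };
///     ColumnsDescription columns = readSchemaFromFormat("JSONEachRow", std::nullopt, creator, context);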
+using ReadBufferCreator = std::function()>; +ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context); + +/// Convert type to the most general type: +/// - IntN, UIntN, FloatN, Decimal -> Float64 +/// - Type -> Nullable(type) +/// - Array(Type) -> Array(Nullable(Type)) +/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN)) +/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType)) +/// - LowCardinality(Type) -> LowCardinality(Nullable(Type)) +/// If type is Nothing or one of the nested types is Nothing, return nullptr. +DataTypePtr generalizeDataType(DataTypePtr type); + +} diff --git a/src/Formats/RowInputMissingColumnsFiller.cpp b/src/Formats/RowInputMissingColumnsFiller.cpp new file mode 100644 index 00000000000..ff8f9e19380 --- /dev/null +++ b/src/Formats/RowInputMissingColumnsFiller.cpp @@ -0,0 +1,140 @@ +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller() = default; + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller(const NamesAndTypesList & names_and_types) +{ + std::unordered_map> nested_groups; /// Nested prefix -> column indices. + size_t i = 0; + for (auto it = names_and_types.begin(); it != names_and_types.end(); ++it, ++i) + { + const auto & name_and_type = *it; + if (isArray(name_and_type.type)) + { + auto split = Nested::splitName(name_and_type.name); + if (!split.second.empty()) /// Is it really a column of Nested data structure? + nested_groups[split.first].push_back(i); + } + } + setNestedGroups(std::move(nested_groups), names_and_types.size()); +} + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller(const Names & names, const DataTypes & types) +{ + std::unordered_map> nested_groups; /// Nested prefix -> column indices. + for (size_t i = 0; i != names.size(); ++i) + { + if (isArray(types[i])) + { + auto split = Nested::splitName(names[i]); + if (!split.second.empty()) /// Is it really a column of Nested data structure? + nested_groups[split.first].push_back(i); + } + } + setNestedGroups(std::move(nested_groups), names.size()); +} + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller(size_t count, const std::string_view * names, const DataTypePtr * types) +{ + std::unordered_map> nested_groups; /// Nested prefix -> column indices. + for (size_t i = 0; i != count; ++i) + { + if (isArray(types[i])) + { + auto split = Nested::splitName(names[i]); + if (!split.second.empty()) /// Is it really a column of Nested data structure? 
+ nested_groups[split.first].push_back(i); + } + } + setNestedGroups(std::move(nested_groups), count); +} + +void RowInputMissingColumnsFiller::setNestedGroups(std::unordered_map> && nested_groups, size_t num_columns) +{ + if (!nested_groups.empty()) + { + column_infos.resize(num_columns); + for (auto & nested_group : nested_groups | boost::adaptors::map_values) + { + if (nested_group.size() <= 1) + continue; + auto nested_group_shared = std::make_shared>(std::move(nested_group)); + for (size_t i : *nested_group_shared) + column_infos[i].nested_group = nested_group_shared; + } + } +} + + +void RowInputMissingColumnsFiller::addDefaults(MutableColumns & columns, size_t row_num) const +{ + for (size_t i = 0; i != columns.size(); ++i) + { + auto & column = *columns[i]; + size_t column_size = column.size(); + if (row_num < column_size) + continue; /// The column already has an element in this position, skipping. + + if (row_num > column_size) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Wrong row_number {}, expected either {} or {}", row_num, column_size - 1, column_size); + + if ((i >= column_infos.size()) || !column_infos[i].nested_group) + { + column.insertDefault(); + continue; + } + + const auto & nested_group = *column_infos[i].nested_group; + size_t size_of_array = 0; + for (size_t j : nested_group) + { + const auto & column_j = columns[j]; + size_t column_size_j = column_j->size(); + if (row_num < column_size_j) + { + const auto * column_array = typeid_cast(column_j.get()); + if (!column_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column with Array type is not represented by ColumnArray column: {}", column_j->dumpStructure()); + const auto & offsets = column_array->getOffsets(); + size_of_array = offsets[row_num] - offsets[row_num - 1]; + break; + } + } + + for (size_t j : nested_group) + { + auto & column_j = columns[j]; + size_t column_size_j = column_j->size(); + if (row_num >= column_size_j) + { + if (row_num > column_size_j) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Wrong row_number {}, expected either {} or {}", row_num, column_size_j - 1, column_size_j); + + auto * column_array = typeid_cast(column_j.get()); + if (!column_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column with Array type is not represented by ColumnArray column: {}", column_j->dumpStructure()); + + auto & data = column_array->getData(); + auto & offsets = column_array->getOffsets(); + for (size_t k = 0; k != size_of_array; ++k) + data.insertDefault(); + offsets.push_back(data.size()); + } + } + } +} + +} diff --git a/src/Formats/RowInputMissingColumnsFiller.h b/src/Formats/RowInputMissingColumnsFiller.h new file mode 100644 index 00000000000..0eaefd4e814 --- /dev/null +++ b/src/Formats/RowInputMissingColumnsFiller.h @@ -0,0 +1,40 @@ +#pragma once + +#include + + +namespace DB +{ + +/// Adds default values to columns if they don't have a specified row yet. +/// This class can be useful for implementing IRowInputFormat. +/// For missing columns of nested structure, it creates not columns of empty arrays, +/// but columns of arrays of correct lengths. +class RowInputMissingColumnsFiller +{ +public: + /// Makes a column filler which checks nested structures while adding default values to columns. 
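/// For example (illustrative, not taken from the patch): with columns `n.x Array(UInt32)` and
/// `n.y Array(String)` belonging to the same Nested structure `n`, if a parsed row filled only
/// `n.x` with 3 elements, addDefaults() appends to `n.y` an array of 3 default values instead
/// of an empty array, so the sibling arrays stay aligned.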
+ RowInputMissingColumnsFiller(const NamesAndTypesList & names_and_types); + RowInputMissingColumnsFiller(const Names & names, const DataTypes & types); + RowInputMissingColumnsFiller(size_t count, const std::string_view * names, const DataTypePtr * types); + + /// Default constructor makes a column filler which doesn't check nested structures while + /// adding default values to columns. + RowInputMissingColumnsFiller(); + + /// Adds default values to some columns. + /// For each column the function checks the number of rows and if it's less than (row_num + 1) + /// the function will add a default value to this column. + void addDefaults(MutableColumns & columns, size_t row_num) const; + +private: + void setNestedGroups(std::unordered_map> && nested_groups, size_t num_columns); + + struct ColumnInfo + { + std::shared_ptr> nested_group; + }; + std::vector column_infos; +}; + +} diff --git a/src/Formats/config_formats.h.in b/src/Formats/config_formats.h.in index f6497b4830b..427abc7d1ce 100644 --- a/src/Formats/config_formats.h.in +++ b/src/Formats/config_formats.h.in @@ -10,4 +10,3 @@ #cmakedefine01 USE_ARROW #cmakedefine01 USE_PROTOBUF #cmakedefine01 USE_MSGPACK - diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 7425c6898de..1349c9e3323 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -81,6 +81,28 @@ void registerInputFormatCapnProto(FormatFactory & factory); void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory); void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory); +void registerArrowSchemaReader(FormatFactory & factory); +void registerParquetSchemaReader(FormatFactory & factory); +void registerORCSchemaReader(FormatFactory & factory); +void registerTSVSchemaReader(FormatFactory & factory); +void registerCSVSchemaReader(FormatFactory & factory); +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory); +void registerJSONEachRowSchemaReader(FormatFactory & factory); +void registerNativeSchemaReader(FormatFactory & factory); +void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory); +void registerAvroSchemaReader(FormatFactory & factory); +void registerProtobufSchemaReader(FormatFactory & factory); +void registerLineAsStringSchemaReader(FormatFactory & factory); +void registerJSONAsStringSchemaReader(FormatFactory & factory); +void registerRawBLOBSchemaReader(FormatFactory & factory); +void registerMsgPackSchemaReader(FormatFactory & factory); +void registerCapnProtoSchemaReader(FormatFactory & factory); +void registerCustomSeparatedSchemaReader(FormatFactory & factory); +void registerRegexpSchemaReader(FormatFactory & factory); +void registerTSKVSchemaReader(FormatFactory & factory); +void registerValuesSchemaReader(FormatFactory & factory); +void registerTemplateSchemaReader(FormatFactory & factory); + void registerFormats() { auto & factory = FormatFactory::instance(); @@ -152,6 +174,28 @@ void registerFormats() registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory); registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory); + + registerArrowSchemaReader(factory); + registerParquetSchemaReader(factory); + registerORCSchemaReader(factory); + registerTSVSchemaReader(factory); + registerCSVSchemaReader(factory); + registerJSONCompactEachRowSchemaReader(factory); + registerJSONEachRowSchemaReader(factory); + registerNativeSchemaReader(factory); + registerRowBinaryWithNamesAndTypesSchemaReader(factory); + 
registerAvroSchemaReader(factory); + registerProtobufSchemaReader(factory); + registerLineAsStringSchemaReader(factory); + registerJSONAsStringSchemaReader(factory); + registerRawBLOBSchemaReader(factory); + registerMsgPackSchemaReader(factory); + registerCapnProtoSchemaReader(factory); + registerCustomSeparatedSchemaReader(factory); + registerRegexpSchemaReader(factory); + registerTSKVSchemaReader(factory); + registerValuesSchemaReader(factory); + registerTemplateSchemaReader(factory); } } diff --git a/src/Functions/CRC.cpp b/src/Functions/CRC.cpp index 00aa631c85b..abcf137f2e7 100644 --- a/src/Functions/CRC.cpp +++ b/src/Functions/CRC.cpp @@ -33,7 +33,7 @@ struct CRCImpl static CRCBase base(polynomial); T crc = 0; - for (size_t i = 0; i < size; i++) + for (size_t i = 0; i < size; ++i) crc = base.tab[(crc ^ buf[i]) & 0xff] ^ (crc >> 8); return crc; } diff --git a/src/Functions/CustomWeekTransforms.h b/src/Functions/CustomWeekTransforms.h index 218dcd083eb..5ccb2e06c44 100644 --- a/src/Functions/CustomWeekTransforms.h +++ b/src/Functions/CustomWeekTransforms.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include /// The default mode value to use for the WEEK() function #define DEFAULT_WEEK_MODE 0 diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 15a08c4e76d..08dac9c2ba0 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -3,8 +3,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index af34f27d6b8..4224a74ae8e 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include diff --git a/src/Functions/FunctionFile.cpp b/src/Functions/FunctionFile.cpp index 5a123c9557c..01314b52119 100644 --- a/src/Functions/FunctionFile.cpp +++ b/src/Functions/FunctionFile.cpp @@ -3,10 +3,13 @@ #include #include #include +#include +#include #include #include #include + namespace fs = std::filesystem; namespace DB @@ -16,9 +19,7 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int NOT_IMPLEMENTED; - extern const int INCORRECT_FILE_NAME; extern const int DATABASE_ACCESS_DENIED; - extern const int FILE_DOESNT_EXIST; } /// A function to read file as a string. 
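// Illustrative sketch (not part of the patch): the hunks below rework this function so that
// each requested path is normalized without resolving symlinks and is then required to stay
// inside the user_files directory before the file is read. The helper name below is made up;
// it only mirrors the check used in FunctionFile::executeImpl.
#include <filesystem>

namespace fs = std::filesystem;

static bool isInsideUserFiles(const fs::path & user_files_root, fs::path candidate)
{
    // Relative paths are interpreted against the user_files root, as in the patched code.
    if (candidate.is_relative())
        candidate = user_files_root / candidate;

    // fs::canonical / fs::weakly_canonical are avoided on purpose so that symlinks under
    // user_files keep working; only a lexical normalization is applied.
    candidate = fs::absolute(candidate).lexically_normal();

    // The patched code throws DATABASE_ACCESS_DENIED when this prefix check fails.
    return candidate.string().find(user_files_root.string()) == 0;
}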
@@ -30,15 +31,14 @@ public: explicit FunctionFile(ContextPtr context_) : WithContext(context_) {} String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 1; } - bool isInjective(const ColumnsWithTypeAndName &) const override { return true; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { if (!isString(arguments[0].type)) - throw Exception(getName() + " is only implemented for types String", ErrorCodes::NOT_IMPLEMENTED); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "{} is only implemented for type String", getName()); + return std::make_shared(); } @@ -47,80 +47,50 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const ColumnPtr column = arguments[0].column; - const ColumnString * expected = checkAndGetColumn(column.get()); - if (!expected) + const ColumnString * column_src = checkAndGetColumn(column.get()); + if (!column_src) throw Exception( fmt::format("Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()), ErrorCodes::ILLEGAL_COLUMN); - const ColumnString::Chars & chars = expected->getChars(); - const ColumnString::Offsets & offsets = expected->getOffsets(); - - std::vector checked_filenames(input_rows_count); - auto result = ColumnString::create(); auto & res_chars = result->getChars(); auto & res_offsets = result->getOffsets(); res_offsets.resize(input_rows_count); - size_t source_offset = 0; - size_t result_offset = 0; + fs::path user_files_absolute_path = fs::canonical(fs::path(getContext()->getUserFilesPath())); + std::string user_files_absolute_path_string = user_files_absolute_path.string(); + + // If run in Local mode, no need for path checking. + bool need_check = getContext()->getApplicationType() != Context::ApplicationType::LOCAL; + for (size_t row = 0; row < input_rows_count; ++row) { - const char * filename = reinterpret_cast(&chars[source_offset]); + StringRef filename = column_src->getDataAt(row); + fs::path file_path(filename.data, filename.data + filename.size); - fs::path user_files_absolute_path = fs::canonical(fs::path(getContext()->getUserFilesPath())); - fs::path file_path(filename); if (file_path.is_relative()) file_path = user_files_absolute_path / file_path; - fs::path file_absolute_path = fs::canonical(file_path); - checkReadIsAllowedOrThrow(user_files_absolute_path.string(), file_absolute_path); - checked_filenames[row] = file_absolute_path.string(); + /// Do not use fs::canonical or fs::weakly_canonical. + /// Otherwise it will not allow to work with symlinks in `user_files_path` directory. 
+ file_path = fs::absolute(file_path).lexically_normal(); - if (!fs::exists(file_absolute_path)) - throw Exception(fmt::format("File {} doesn't exist.", file_absolute_path.string()), ErrorCodes::FILE_DOESNT_EXIST); + if (need_check && file_path.string().find(user_files_absolute_path_string) != 0) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "File is not inside {}", user_files_absolute_path.string()); - const auto current_file_size = fs::file_size(file_absolute_path); + ReadBufferFromFile in(file_path); + WriteBufferFromVector out(res_chars, AppendModeTag{}); + copyData(in, out); + out.finalize(); - result_offset += current_file_size + 1; - res_offsets[row] = result_offset; - source_offset = offsets[row]; - } - - res_chars.resize(result_offset); - - size_t prev_offset = 0; - - for (size_t row = 0; row < input_rows_count; ++row) - { - auto file_absolute_path = checked_filenames[row]; - ReadBufferFromFile in(file_absolute_path); - char * res_buf = reinterpret_cast(&res_chars[prev_offset]); - - const size_t file_lenght = res_offsets[row] - prev_offset - 1; - prev_offset = res_offsets[row]; - in.readStrict(res_buf, file_lenght); - res_buf[file_lenght] = '\0'; + res_chars.push_back(0); + res_offsets[row] = res_chars.size(); } return result; } - -private: - - void checkReadIsAllowedOrThrow(const std::string & user_files_absolute_path, const std::string & file_absolute_path) const - { - // If run in Local mode, no need for path checking. - if (getContext()->getApplicationType() != Context::ApplicationType::LOCAL) - if (file_absolute_path.find(user_files_absolute_path) != 0) - throw Exception("File is not inside " + user_files_absolute_path, ErrorCodes::DATABASE_ACCESS_DENIED); - - fs::path fs_path(file_absolute_path); - if (fs::exists(fs_path) && fs::is_directory(fs_path)) - throw Exception("File can't be a directory", ErrorCodes::INCORRECT_FILE_NAME); - } }; diff --git a/src/Functions/FunctionMathUnary.h b/src/Functions/FunctionMathUnary.h index 2d39daac366..d9ca162ba16 100644 --- a/src/Functions/FunctionMathUnary.h +++ b/src/Functions/FunctionMathUnary.h @@ -94,7 +94,7 @@ private: Impl::execute(src_remaining, dst_remaining); if constexpr (is_big_int_v || std::is_same_v) - for (size_t i = 0; i < rows_remaining; i++) + for (size_t i = 0; i < rows_remaining; ++i) dst_data[rows_size + i] = dst_remaining[i]; else memcpy(&dst_data[rows_size], dst_remaining, rows_remaining * sizeof(ReturnType)); diff --git a/src/Functions/FunctionUnixTimestamp64.h b/src/Functions/FunctionUnixTimestamp64.h index 8c507077acd..5248f524a2b 100644 --- a/src/Functions/FunctionUnixTimestamp64.h +++ b/src/Functions/FunctionUnixTimestamp64.h @@ -56,7 +56,7 @@ public: const auto & source_data = typeid_cast &>(col).getData(); - Int32 scale_diff = typeid_cast(*src.type).getScale() - target_scale; + const Int32 scale_diff = typeid_cast(*src.type).getScale() - target_scale; if (scale_diff == 0) { for (size_t i = 0; i < input_rows_count; ++i) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 9238cc81c37..8018fa8e726 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -180,6 +182,7 @@ struct ConvertImpl vec_null_map_to = &col_null_map_to->getData(); } + bool result_is_bool = isBool(result_type); for (size_t i = 0; i < input_rows_count; ++i) { if constexpr (std::is_same_v != std::is_same_v) @@ -266,6 
+269,12 @@ struct ConvertImpl vec_to[i] = static_cast(vec_from[i]); } } + + if constexpr (std::is_same_v) + { + if (result_is_bool) + vec_to[i] = static_cast(vec_to[i]); + } } } @@ -850,11 +859,15 @@ struct ConvertImpl struct ConvertImplGenericToString { - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) { + static_assert(std::is_same_v || std::is_same_v, + "Can be used only to serialize to ColumnString or ColumnFixedString"); + ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column); const auto & col_with_type_and_name = columnGetNested(arguments[0]); @@ -862,27 +875,25 @@ struct ConvertImplGenericToString const IColumn & col_from = *col_with_type_and_name.column; size_t size = col_from.size(); + auto col_to = result_type->createColumn(); - auto col_to = ColumnString::create(); - - ColumnString::Chars & data_to = col_to->getChars(); - ColumnString::Offsets & offsets_to = col_to->getOffsets(); - - data_to.resize(size * 2); /// Using coefficient 2 for initial size is arbitrary. - offsets_to.resize(size); - - WriteBufferFromVector write_buffer(data_to); - - FormatSettings format_settings; - auto serialization = type.getDefaultSerialization(); - for (size_t i = 0; i < size; ++i) { - serialization->serializeText(col_from, i, write_buffer, format_settings); - writeChar(0, write_buffer); - offsets_to[i] = write_buffer.count(); - } + ColumnStringHelpers::WriteHelper write_helper( + assert_cast(*col_to), + size); - write_buffer.finalize(); + auto & write_buffer = write_helper.getWriteBuffer(); + + FormatSettings format_settings; + auto serialization = type.getDefaultSerialization(); + for (size_t i = 0; i < size; ++i) + { + serialization->serializeText(col_from, i, write_buffer, format_settings); + write_helper.rowWritten(); + } + + write_helper.finalize(); + } if (result_type->isNullable() && null_map) return ColumnNullable::create(std::move(col_to), std::move(null_map)); @@ -1006,7 +1017,8 @@ inline bool tryParseImpl(DataTypeUUID::FieldType & x, ReadBuffer & else message_buf << " at begin of string"; - if (isNativeNumber(to_type)) + // Currently there are no functions toIPv{4,6}Or{Null,Zero} + if (isNativeNumber(to_type) && !(to_type.getName() == "IPv4" || to_type.getName() == "IPv6")) message_buf << ". Note: there are to" << to_type.getName() << "OrZero and to" << to_type.getName() << "OrNull functions, which returns zero/NULL instead of throwing exception."; throw Exception(message_buf.str(), ErrorCodes::CANNOT_PARSE_TEXT); @@ -1285,40 +1297,35 @@ template struct ConvertImpl, DataTypeFixedString>, ToDataType, Name, ConvertReturnNullOnErrorTag> : ConvertThroughParsing {}; -/// Generic conversion of any type from String. Used for complex types: Array and Tuple. +/// Generic conversion of any type from String. Used for complex types: Array and Tuple or types with custom serialization. 
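/// The struct below is now parameterized by the source string column type (the static_assert
/// restricts it to ColumnString or ColumnFixedString) and parses each value with the target
/// type's deserializeWholeText, throwing if a value is not fully consumed.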
+template struct ConvertImplGenericFromString { - static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) + static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) { + static_assert(std::is_same_v || std::is_same_v, + "Can be used only to parse from ColumnString or ColumnFixedString"); + const IColumn & col_from = *arguments[0].column; - size_t size = col_from.size(); - const IDataType & data_type_to = *result_type; - - if (const ColumnString * col_from_string = checkAndGetColumn(&col_from)) + if (const StringColumnType * col_from_string = checkAndGetColumn(&col_from)) { auto res = data_type_to.createColumn(); IColumn & column_to = *res; - column_to.reserve(size); - - const ColumnString::Chars & chars = col_from_string->getChars(); - const IColumn::Offsets & offsets = col_from_string->getOffsets(); - - size_t current_offset = 0; + column_to.reserve(input_rows_count); FormatSettings format_settings; auto serialization = data_type_to.getDefaultSerialization(); - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < input_rows_count; ++i) { - ReadBufferFromMemory read_buffer(&chars[current_offset], offsets[i] - current_offset - 1); + const auto & val = col_from_string->getDataAt(i); + ReadBufferFromMemory read_buffer(val.data, val.size); serialization->deserializeWholeText(column_to, read_buffer, format_settings); if (!read_buffer.eof()) throwExceptionForIncompletelyParsedValue(read_buffer, result_type); - - current_offset = offsets[i]; } return res; @@ -1352,6 +1359,18 @@ struct ConvertImpl, T, Name, ConvertDefau } }; +template +struct ConvertImpl +{ + template + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/, + Additions additions [[maybe_unused]] = Additions()) + { + + return arguments[0].column; + } +}; + /** Conversion from FixedString to String. * Cutting sequences of zero bytes from end of strings. @@ -1721,7 +1740,10 @@ private: throw Exception("Wrong UUID conversion", ErrorCodes::CANNOT_CONVERT_TYPE); } else - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count); + { + result_column + = ConvertImpl::execute(arguments, result_type, input_rows_count); + } } else { @@ -1767,7 +1789,7 @@ private: /// Generic conversion of any type to String. if (std::is_same_v) { - return ConvertImplGenericToString::execute(arguments, result_type); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); } else throw Exception("Illegal type " + arguments[0].type->getName() + " of argument of function " + getName(), @@ -2551,6 +2573,7 @@ private: { /// In case when converting to Nullable type, we apply different parsing rule, /// that will not throw an exception but return NULL in case of malformed input. 
+ FunctionPtr function = FunctionConvertFromString::create(); return createFunctionAdaptor(function, from_type); } @@ -2610,6 +2633,37 @@ private: }; } + template + WrapperType createBoolWrapper(const DataTypePtr & from_type, const ToDataType * const to_type, bool requested_result_is_nullable) const + { + if (checkAndGetDataType(from_type.get())) + { + return &ConvertImplGenericFromString::execute; + } + + return createWrapper(from_type, to_type, requested_result_is_nullable); + } + + WrapperType createUInt8ToUInt8Wrapper(const DataTypePtr from_type, const DataTypePtr to_type) const + { + return [from_type, to_type] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) -> ColumnPtr + { + if (isBool(from_type) || !isBool(to_type)) + return arguments.front().column; + + /// Special case when we convert UInt8 column to Bool column. + /// both columns have type UInt8, but we shouldn't use identity wrapper, + /// because Bool column can contain only 0 and 1. + auto res_column = to_type->createColumn(); + const auto & data_from = checkAndGetColumn(arguments[0].column.get())->getData(); + auto & data_to = assert_cast(res_column.get())->getData(); + data_to.resize(data_from.size()); + for (size_t i = 0; i != data_from.size(); ++i) + data_to[i] = static_cast(data_from[i]); + return res_column; + }; + } + static WrapperType createStringWrapper(const DataTypePtr & from_type) { FunctionPtr function = FunctionToString::create(); @@ -2725,10 +2779,7 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t /*input_rows_count*/) - { - return ConvertImplGenericFromString::execute(arguments, result_type); - }; + return &ConvertImplGenericFromString::execute; } else { @@ -2745,10 +2796,7 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t /*input_rows_count*/) - { - return ConvertImplGenericFromString::execute(arguments, result_type); - }; + return &ConvertImplGenericFromString::execute; } const auto * from_type = checkAndGetDataType(from_type_untyped.get()); @@ -2816,10 +2864,7 @@ private: /// Conversion from String through parsing. 
if (checkAndGetDataType(from_type_untyped.get())) { - return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t /*input_rows_count*/) - { - return ConvertImplGenericFromString::execute(arguments, result_type); - }; + return &ConvertImplGenericFromString::execute; } const auto * from_type = checkAndGetDataType(from_type_untyped.get()); @@ -3275,7 +3320,12 @@ private: WrapperType prepareImpl(const DataTypePtr & from_type, const DataTypePtr & to_type, bool requested_result_is_nullable) const { if (from_type->equals(*to_type)) + { + if (isUInt8(from_type)) + return createUInt8ToUInt8Wrapper(from_type, to_type); + return createIdentityWrapper(from_type); + } else if (WhichDataType(from_type).isNothing()) return createNothingWrapper(to_type.get()); @@ -3287,7 +3337,6 @@ private: using ToDataType = typename Types::LeftType; if constexpr ( - std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || @@ -3309,6 +3358,14 @@ private: ret = createWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); return true; } + if constexpr (std::is_same_v) + { + if (isBool(to_type)) + ret = createBoolWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); + else + ret = createWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); + return true; + } if constexpr ( std::is_same_v || std::is_same_v) @@ -3330,6 +3387,38 @@ private: return false; }; + auto make_custom_serialization_wrapper = [&](const auto & types) -> bool + { + using Types = std::decay_t; + using ToDataType = typename Types::RightType; + using FromDataType = typename Types::LeftType; + + if constexpr (WhichDataType(FromDataType::type_id).isStringOrFixedString()) + { + if (to_type->getCustomSerialization()) + { + ret = &ConvertImplGenericFromString::execute; + return true; + } + } + if constexpr (WhichDataType(ToDataType::type_id).isStringOrFixedString()) + { + if (from_type->getCustomSerialization()) + { + ret = [](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + }; + return true; + } + } + + return false; + }; + + if (callOnTwoTypeIndexes(from_type->getTypeId(), to_type->getTypeId(), make_custom_serialization_wrapper)) + return ret; + if (callOnIndexAndDataType(to_type->getTypeId(), make_default_wrapper)) return ret; diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index c52d54f30aa..71597f2b433 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -68,11 +68,12 @@ public: std::shared_ptr getDictionary(const String & dictionary_name) { - auto dict = getContext()->getExternalDictionariesLoader().getDictionary(dictionary_name, getContext()); + auto current_context = getContext(); + auto dict = current_context->getExternalDictionariesLoader().getDictionary(dictionary_name, current_context); if (!access_checked) { - getContext()->checkAccess(AccessType::dictGet, dict->getDatabaseOrNoDatabaseTag(), dict->getDictionaryID().getTableName()); + current_context->checkAccess(AccessType::dictGet, dict->getDatabaseOrNoDatabaseTag(), dict->getDictionaryID().getTableName()); access_checked = true; } @@ -106,8 +107,9 @@ public: if (!attr_name_col) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument of function 
dictGet must be a constant string"); - const auto dictionary_name = dict_name_col->getValue(); - const auto attribute_name = attr_name_col->getValue(); + const auto & dictionary_name = dict_name_col->getValue(); + const auto & attribute_name = attr_name_col->getValue(); + return getDictionary(dictionary_name)->isInjective(attribute_name); } diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index 105643ff82b..d542f023625 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -1249,7 +1249,7 @@ public: { ColumnString & col_str = assert_cast(dest); auto & chars = col_str.getChars(); - WriteBufferFromVector buf(chars, WriteBufferFromVector::AppendModeTag()); + WriteBufferFromVector buf(chars, AppendModeTag()); traverse(element, buf); buf.finalize(); chars.push_back(0); diff --git a/src/Functions/FunctionsLogical.cpp b/src/Functions/FunctionsLogical.cpp index f427deced3a..87a2ecd4c57 100644 --- a/src/Functions/FunctionsLogical.cpp +++ b/src/Functions/FunctionsLogical.cpp @@ -609,7 +609,7 @@ ColumnPtr FunctionAnyArityLogical::executeImpl( ColumnsWithTypeAndName arguments = std::move(args); /// Special implementation for short-circuit arguments. - if (checkShirtCircuitArguments(arguments) != -1) + if (checkShortCircuitArguments(arguments) != -1) return executeShortCircuit(arguments, result_type); ColumnRawPtrs args_in; diff --git a/src/Functions/FunctionsLogical.h b/src/Functions/FunctionsLogical.h index 3ddf7ea84eb..7d4f5489e86 100644 --- a/src/Functions/FunctionsLogical.h +++ b/src/Functions/FunctionsLogical.h @@ -185,7 +185,7 @@ public: if constexpr (!Impl::isSaturable()) { auto * result = nativeBoolCast(b, types[0], values[0]); - for (size_t i = 1; i < types.size(); i++) + for (size_t i = 1; i < types.size(); ++i) result = Impl::apply(b, result, nativeBoolCast(b, types[i], values[i])); return b.CreateSelect(result, b.getInt8(1), b.getInt8(0)); } @@ -194,7 +194,7 @@ public: auto * stop = llvm::BasicBlock::Create(next->getContext(), "", next->getParent()); b.SetInsertPoint(stop); auto * phi = b.CreatePHI(b.getInt8Ty(), values.size()); - for (size_t i = 0; i < types.size(); i++) + for (size_t i = 0; i < types.size(); ++i) { b.SetInsertPoint(next); auto * value = values[i]; diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 27907626971..a6e705bb1af 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -749,7 +749,7 @@ private: { ColumnsWithTypeAndName cols; cols.emplace_back(col_arr.getDataPtr(), nested_type, "tmp"); - return ConvertImplGenericToString::execute(cols, std::make_shared()); + return ConvertImplGenericToString::execute(cols, std::make_shared(), col_arr.size()); } } diff --git a/src/Functions/FunctionsWindow.cpp b/src/Functions/FunctionsTimeWindow.cpp similarity index 93% rename from src/Functions/FunctionsWindow.cpp rename to src/Functions/FunctionsTimeWindow.cpp index a26faac304d..79ce7356ee7 100644 --- a/src/Functions/FunctionsWindow.cpp +++ b/src/Functions/FunctionsTimeWindow.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include namespace DB { @@ -114,9 +114,9 @@ namespace } template <> -struct WindowImpl +struct TimeWindowImpl { - static constexpr auto name = "TUMBLE"; + static constexpr auto name = "tumble"; [[maybe_unused]] static DataTypePtr getReturnType(const ColumnsWithTypeAndName & arguments, const String & function_name) { @@ -211,9 +211,9 @@ struct WindowImpl }; template <> -struct WindowImpl +struct TimeWindowImpl 
{ - static constexpr auto name = "TUMBLE_START"; + static constexpr auto name = "tumbleStart"; static DataTypePtr getReturnType(const ColumnsWithTypeAndName & arguments, const String & function_name) { @@ -231,7 +231,7 @@ struct WindowImpl } else { - return std::static_pointer_cast(WindowImpl::getReturnType(arguments, function_name)) + return std::static_pointer_cast(TimeWindowImpl::getReturnType(arguments, function_name)) ->getElement(0); } } @@ -249,19 +249,19 @@ struct WindowImpl result_column = time_column.column; } else - result_column = WindowImpl::dispatchForColumns(arguments, function_name); + result_column = TimeWindowImpl::dispatchForColumns(arguments, function_name); return executeWindowBound(result_column, 0, function_name); } }; template <> -struct WindowImpl +struct TimeWindowImpl { - static constexpr auto name = "TUMBLE_END"; + static constexpr auto name = "tumbleEnd"; [[maybe_unused]] static DataTypePtr getReturnType(const ColumnsWithTypeAndName & arguments, const String & function_name) { - return WindowImpl::getReturnType(arguments, function_name); + return TimeWindowImpl::getReturnType(arguments, function_name); } [[maybe_unused]] static ColumnPtr dispatchForColumns(const ColumnsWithTypeAndName & arguments, const String& function_name) @@ -277,15 +277,15 @@ struct WindowImpl result_column = time_column.column; } else - result_column = WindowImpl::dispatchForColumns(arguments, function_name); + result_column = TimeWindowImpl::dispatchForColumns(arguments, function_name); return executeWindowBound(result_column, 1, function_name); } }; template <> -struct WindowImpl +struct TimeWindowImpl { - static constexpr auto name = "HOP"; + static constexpr auto name = "hop"; [[maybe_unused]] static DataTypePtr getReturnType(const ColumnsWithTypeAndName & arguments, const String & function_name) { @@ -415,9 +415,9 @@ struct WindowImpl }; template <> -struct WindowImpl +struct TimeWindowImpl { - static constexpr auto name = "WINDOW_ID"; + static constexpr auto name = "windowID"; [[maybe_unused]] static DataTypePtr getReturnType(const ColumnsWithTypeAndName & arguments, const String & function_name) { @@ -547,7 +547,7 @@ struct WindowImpl [[maybe_unused]] static ColumnPtr dispatchForTumbleColumns(const ColumnsWithTypeAndName & arguments, const String & function_name) { - ColumnPtr column = WindowImpl::dispatchForColumns(arguments, function_name); + ColumnPtr column = TimeWindowImpl::dispatchForColumns(arguments, function_name); return executeWindowBound(column, 1, function_name); } @@ -567,9 +567,9 @@ struct WindowImpl }; template <> -struct WindowImpl +struct TimeWindowImpl { - static constexpr auto name = "HOP_START"; + static constexpr auto name = "hopStart"; static DataTypePtr getReturnType(const ColumnsWithTypeAndName & arguments, const String & function_name) { @@ -587,7 +587,7 @@ struct WindowImpl } else { - return std::static_pointer_cast(WindowImpl::getReturnType(arguments, function_name))->getElement(0); + return std::static_pointer_cast(TimeWindowImpl::getReturnType(arguments, function_name))->getElement(0); } } @@ -604,19 +604,19 @@ struct WindowImpl result_column = time_column.column; } else - result_column = WindowImpl::dispatchForColumns(arguments, function_name); + result_column = TimeWindowImpl::dispatchForColumns(arguments, function_name); return executeWindowBound(result_column, 0, function_name); } }; template <> -struct WindowImpl +struct TimeWindowImpl { - static constexpr auto name = "HOP_END"; + static constexpr auto name = "hopEnd"; [[maybe_unused]] static 
DataTypePtr getReturnType(const ColumnsWithTypeAndName & arguments, const String & function_name) { - return WindowImpl::getReturnType(arguments, function_name); + return TimeWindowImpl::getReturnType(arguments, function_name); } [[maybe_unused]] static ColumnPtr dispatchForColumns(const ColumnsWithTypeAndName & arguments, const String & function_name) @@ -632,25 +632,25 @@ struct WindowImpl result_column = time_column.column; } else - result_column = WindowImpl::dispatchForColumns(arguments, function_name); + result_column = TimeWindowImpl::dispatchForColumns(arguments, function_name); return executeWindowBound(result_column, 1, function_name); } }; -template -DataTypePtr FunctionWindow::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const +template +DataTypePtr FunctionTimeWindow::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const { - return WindowImpl::getReturnType(arguments, name); + return TimeWindowImpl::getReturnType(arguments, name); } -template -ColumnPtr FunctionWindow::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const +template +ColumnPtr FunctionTimeWindow::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const { - return WindowImpl::dispatchForColumns(arguments, name); + return TimeWindowImpl::dispatchForColumns(arguments, name); } -void registerFunctionsWindow(FunctionFactory& factory) +void registerFunctionsTimeWindow(FunctionFactory& factory) { factory.registerFunction(); factory.registerFunction(); diff --git a/src/Functions/FunctionsWindow.h b/src/Functions/FunctionsTimeWindow.h similarity index 76% rename from src/Functions/FunctionsWindow.h rename to src/Functions/FunctionsTimeWindow.h index 37acb660751..6e5d79fd062 100644 --- a/src/Functions/FunctionsWindow.h +++ b/src/Functions/FunctionsTimeWindow.h @@ -1,36 +1,36 @@ #pragma once -#include +#include #include #include namespace DB { -/** Window functions: +/** Time window functions: * - * TUMBLE(time_attr, interval [, timezone]) + * tumble(time_attr, interval [, timezone]) * - * TUMBLE_START(window_id) + * tumbleStart(window_id) * - * TUMBLE_START(time_attr, interval [, timezone]) + * tumbleStart(time_attr, interval [, timezone]) * - * TUMBLE_END(window_id) + * tumbleEnd(window_id) * - * TUMBLE_END(time_attr, interval [, timezone]) + * tumbleEnd(time_attr, interval [, timezone]) * - * HOP(time_attr, hop_interval, window_interval [, timezone]) + * hop(time_attr, hop_interval, window_interval [, timezone]) * - * HOP_START(window_id) + * hopStart(window_id) * - * HOP_START(time_attr, hop_interval, window_interval [, timezone]) + * hopStart(time_attr, hop_interval, window_interval [, timezone]) * - * HOP_END(window_id) + * hopEnd(window_id) * - * HOP_END(time_attr, hop_interval, window_interval [, timezone]) + * hopEnd(time_attr, hop_interval, window_interval [, timezone]) * */ -enum WindowFunctionName +enum TimeWindowFunctionName { TUMBLE, TUMBLE_START, @@ -117,8 +117,8 @@ struct ToStartOfTransform; ADD_TIME(Second, 1) #undef ADD_TIME -template -struct WindowImpl +template +struct TimeWindowImpl { static constexpr auto name = "UNKNOWN"; @@ -127,12 +127,12 @@ struct WindowImpl static ColumnPtr dispatchForColumns(const ColumnsWithTypeAndName & arguments, const String & function_name); }; -template -class FunctionWindow : public IFunction +template +class FunctionTimeWindow : public IFunction { public: - static constexpr auto name = WindowImpl::name; - 
static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static constexpr auto name = TimeWindowImpl::name; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } @@ -145,11 +145,11 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override; }; -using FunctionTumble = FunctionWindow; -using FunctionTumbleStart = FunctionWindow; -using FunctionTumbleEnd = FunctionWindow; -using FunctionHop = FunctionWindow; -using FunctionWindowId = FunctionWindow; -using FunctionHopStart = FunctionWindow; -using FunctionHopEnd = FunctionWindow; +using FunctionTumble = FunctionTimeWindow; +using FunctionTumbleStart = FunctionTimeWindow; +using FunctionTumbleEnd = FunctionTimeWindow; +using FunctionHop = FunctionTimeWindow; +using FunctionWindowId = FunctionTimeWindow; +using FunctionHopStart = FunctionTimeWindow; +using FunctionHopEnd = FunctionTimeWindow; } diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index 12f2f50a5f0..cfb4e12a025 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -220,10 +221,15 @@ ColumnPtr IExecutableFunction::executeWithoutLowCardinalityColumns( return res; } -ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, bool dry_run) const +static void convertSparseColumnsToFull(ColumnsWithTypeAndName & args) +{ + for (auto & column : args) + column.column = recursiveRemoveSparse(column.column); +} + +ColumnPtr IExecutableFunction::executeWithoutSparseColumns(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, bool dry_run) const { ColumnPtr result; - if (useDefaultImplementationForLowCardinalityColumns()) { ColumnsWithTypeAndName columns_without_low_cardinality = arguments; @@ -264,6 +270,73 @@ ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, return result; } +ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, bool dry_run) const +{ + if (useDefaultImplementationForSparseColumns()) + { + size_t num_sparse_columns = 0; + size_t num_full_columns = 0; + size_t sparse_column_position = 0; + + for (size_t i = 0; i < arguments.size(); ++i) + { + const auto * column_sparse = checkAndGetColumn(arguments[i].column.get()); + /// In rare case, when sparse column doesn't have default values, + /// it's more convenient to convert it to full before execution of function. + if (column_sparse && column_sparse->getNumberOfDefaults()) + { + sparse_column_position = i; + ++num_sparse_columns; + } + else if (!isColumnConst(*arguments[i].column)) + { + ++num_full_columns; + } + } + + auto columns_without_sparse = arguments; + if (num_sparse_columns == 1 && num_full_columns == 0) + { + auto & arg_with_sparse = columns_without_sparse[sparse_column_position]; + ColumnPtr sparse_offsets; + { + /// New scope to avoid possible mistakes on dangling reference. 
+ const auto & column_sparse = assert_cast(*arg_with_sparse.column); + sparse_offsets = column_sparse.getOffsetsPtr(); + arg_with_sparse.column = column_sparse.getValuesPtr(); + } + + size_t values_size = arg_with_sparse.column->size(); + for (size_t i = 0; i < columns_without_sparse.size(); ++i) + { + if (i == sparse_column_position) + continue; + + columns_without_sparse[i].column = columns_without_sparse[i].column->cloneResized(values_size); + } + + auto res = executeWithoutSparseColumns(columns_without_sparse, result_type, values_size, dry_run); + + if (isColumnConst(*res)) + return res->cloneResized(input_rows_count); + + /// If the default value of the sparse column changed after the function execution, convert the result to a full column. + if (!result_type->supportsSparseSerialization() || !res->isDefaultAt(0)) + { + const auto & offsets_data = assert_cast &>(*sparse_offsets).getData(); + return res->createWithOffsets(offsets_data, (*res)[0], input_rows_count, /*shift=*/ 1); + } + + return ColumnSparse::create(res, sparse_offsets, input_rows_count); + } + + convertSparseColumnsToFull(columns_without_sparse); + return executeWithoutSparseColumns(columns_without_sparse, result_type, input_rows_count, dry_run); + } + + return executeWithoutSparseColumns(arguments, result_type, input_rows_count, dry_run); +} + void IFunctionOverloadResolver::checkNumberOfArguments(size_t number_of_arguments) const { if (isVariadic()) diff --git a/src/Functions/IFunction.h b/src/Functions/IFunction.h index dfa3f00d1cf..8063ad77ad0 100644 --- a/src/Functions/IFunction.h +++ b/src/Functions/IFunction.h @@ -76,6 +76,13 @@ protected: */ virtual bool useDefaultImplementationForLowCardinalityColumns() const { return true; } + /** If the function arguments have a single sparse column and all other arguments are constants, call the function on the nested column. + * Otherwise, convert all sparse columns to ordinary columns. + * If the default value doesn't change after the function execution, the result is returned as a sparse column. + * Otherwise, the result column is converted to a full column. + */ + virtual bool useDefaultImplementationForSparseColumns() const { return true; } + /** Some arguments could remain constant during this implementation. */ virtual ColumnNumbers getArgumentsThatAreAlwaysConstant() const { return {}; } @@ -96,6 +103,8 @@ private: ColumnPtr executeWithoutLowCardinalityColumns( const ColumnsWithTypeAndName & args, const DataTypePtr & result_type, size_t input_rows_count, bool dry_run) const; + ColumnPtr executeWithoutSparseColumns( + const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, bool dry_run) const; }; using ExecutableFunctionPtr = std::shared_ptr; @@ -351,6 +360,13 @@ protected: */ virtual bool useDefaultImplementationForLowCardinalityColumns() const { return true; } + /** If the function arguments have a single sparse column and all other arguments are constants, call the function on the nested column. + * Otherwise, convert all sparse columns to ordinary columns. + * If the default value doesn't change after the function execution, the result is returned as a sparse column. + * Otherwise, the result column is converted to a full column. + */ + virtual bool useDefaultImplementationForSparseColumns() const { return true; } + // /// If it isn't, will convert all ColumnLowCardinality arguments to full columns.
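For illustration only (this sketch is not part of the patch): the doc comment above describes the default sparse-column behaviour, i.e. run the function over the explicit values of a single sparse argument and keep the result sparse only if the default value still maps to a default. Below is a minimal standalone toy model of that idea; ToySparseColumn and applyUnary are invented names, standing in for the real ColumnSparse / getValuesPtr() / createWithOffsets() API used in the IFunction.cpp hunk above.

#include <cmath>
#include <iostream>
#include <vector>

/// Toy model of a sparse column: an implicit default value plus explicit values at given rows.
struct ToySparseColumn
{
    double default_value;
    std::vector<size_t> offsets;  /// rows that hold explicit values
    std::vector<double> values;   /// explicit values, parallel to offsets
    size_t rows;
};

/// Evaluate a unary function over the explicit values only, then decide whether the result
/// may stay sparse (the new default is still a default) or must be expanded to a full column.
template <typename F>
void applyUnary(const ToySparseColumn & col, F f)
{
    const double new_default = f(col.default_value);

    std::vector<double> new_values(col.values.size());
    for (size_t i = 0; i < col.values.size(); ++i)
        new_values[i] = f(col.values[i]);

    if (new_default == 0.0)
    {
        /// Analogue of res->isDefaultAt(0): the result keeps the sparse representation.
        std::cout << "sparse result: " << new_values.size() << " explicit values out of " << col.rows << " rows\n";
    }
    else
    {
        /// Analogue of createWithOffsets(): expand to a full column of col.rows values.
        std::vector<double> full(col.rows, new_default);
        for (size_t i = 0; i < col.offsets.size(); ++i)
            full[col.offsets[i]] = new_values[i];
        std::cout << "full result: " << full.size() << " rows\n";
    }
}

int main()
{
    ToySparseColumn col{0.0, {2, 5}, {10.0, -3.0}, 8};
    applyUnary(col, [](double x) { return std::abs(x); }); /// abs(0) == 0, so the result stays sparse
    applyUnary(col, [](double x) { return x + 1.0; });     /// 0 + 1 != 0, so the result is expanded
}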
virtual bool canBeExecutedOnLowCardinalityDictionary() const { return true; } @@ -404,6 +420,13 @@ public: */ virtual bool useDefaultImplementationForLowCardinalityColumns() const { return true; } + /** If the function arguments have a single sparse column and all other arguments are constants, call the function on the nested column. + * Otherwise, convert all sparse columns to ordinary columns. + * If the default value doesn't change after the function execution, the result is returned as a sparse column. + * Otherwise, the result column is converted to a full column. + */ + virtual bool useDefaultImplementationForSparseColumns() const { return true; } + /// If it isn't, will convert all ColumnLowCardinality arguments to full columns. virtual bool canBeExecutedOnLowCardinalityDictionary() const { return true; } diff --git a/src/Functions/IFunctionAdaptors.h b/src/Functions/IFunctionAdaptors.h index 9bfe010c0d0..ec43087ad66 100644 --- a/src/Functions/IFunctionAdaptors.h +++ b/src/Functions/IFunctionAdaptors.h @@ -29,6 +29,7 @@ protected: bool useDefaultImplementationForNulls() const final { return function->useDefaultImplementationForNulls(); } bool useDefaultImplementationForConstants() const final { return function->useDefaultImplementationForConstants(); } bool useDefaultImplementationForLowCardinalityColumns() const final { return function->useDefaultImplementationForLowCardinalityColumns(); } + bool useDefaultImplementationForSparseColumns() const final { return function->useDefaultImplementationForSparseColumns(); } ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return function->getArgumentsThatAreAlwaysConstant(); } bool canBeExecutedOnDefaultArguments() const override { return function->canBeExecutedOnDefaultArguments(); } @@ -124,6 +125,7 @@ public: bool useDefaultImplementationForNulls() const override { return function->useDefaultImplementationForNulls(); } bool useDefaultImplementationForLowCardinalityColumns() const override { return function->useDefaultImplementationForLowCardinalityColumns(); } + bool useDefaultImplementationForSparseColumns() const override { return function->useDefaultImplementationForSparseColumns(); } bool canBeExecutedOnLowCardinalityDictionary() const override { return function->canBeExecutedOnLowCardinalityDictionary(); } FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const override diff --git a/src/Functions/ReplaceRegexpImpl.h b/src/Functions/ReplaceRegexpImpl.h index 4320f47c424..e6305431d8f 100644 --- a/src/Functions/ReplaceRegexpImpl.h +++ b/src/Functions/ReplaceRegexpImpl.h @@ -32,10 +32,18 @@ template struct ReplaceRegexpImpl { /// Sequence of instructions, describing how to get resulting string. - /// Each element is either: - /// - substitution (in that case first element of pair is their number and second element is empty) - /// - string that need to be inserted (in that case, first element of pair is that string and second element is -1) - using Instructions = std::vector>; + struct Instruction + { + /// If not negative - perform substitution of n-th subpattern from the regexp match. + int substitution_num = -1; + /// Otherwise - paste this string verbatim.
+ std::string literal; + + Instruction(int substitution_num_) : substitution_num(substitution_num_) {} + Instruction(std::string literal_) : literal(std::move(literal_)) {} + }; + + using Instructions = std::vector; static const size_t max_captures = 10; @@ -53,10 +61,10 @@ struct ReplaceRegexpImpl { if (!now.empty()) { - instructions.emplace_back(-1, now); + instructions.emplace_back(now); now = ""; } - instructions.emplace_back(s[i + 1] - '0', String()); + instructions.emplace_back(s[i + 1] - '0'); } else now += s[i + 1]; /// Escaping @@ -68,16 +76,15 @@ struct ReplaceRegexpImpl if (!now.empty()) { - instructions.emplace_back(-1, now); + instructions.emplace_back(now); now = ""; } for (const auto & it : instructions) - if (it.first >= num_captures) - throw Exception( - "Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only " - + toString(num_captures - 1) + " subpatterns", - ErrorCodes::BAD_ARGUMENTS); + if (it.substitution_num >= num_captures) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Invalid replace instruction in replacement string. Id: {}, but regexp has only {} subpatterns", + it.substitution_num, num_captures - 1); return instructions; } @@ -93,56 +100,51 @@ struct ReplaceRegexpImpl { re2_st::StringPiece matches[max_captures]; - size_t start_pos = 0; - bool is_first_match = true; - bool is_start_pos_added_one = false; + size_t copy_pos = 0; + size_t match_pos = 0; - while (start_pos < static_cast(input.length())) + while (match_pos < static_cast(input.length())) { /// If no more replacements possible for current string bool can_finish_current_string = false; - if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures)) + if (searcher.Match(input, match_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures)) { - if (is_start_pos_added_one) - start_pos -= 1; - const auto & match = matches[0]; - size_t bytes_to_copy = (match.data() - input.data()) - start_pos; + size_t bytes_to_copy = (match.data() - input.data()) - copy_pos; /// Copy prefix before matched regexp without modification res_data.resize(res_data.size() + bytes_to_copy); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + copy_pos, bytes_to_copy); res_offset += bytes_to_copy; - start_pos += bytes_to_copy + match.length(); - - /// To avoid infinite loop. 
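For illustration only (this sketch is not part of the patch): the copy_pos/match_pos bookkeeping introduced in this hunk replaces the old is_first_match/is_start_pos_added_one workaround; when a match is empty, match_pos is advanced by one character so the search cannot loop forever, while copy_pos tracks how much of the input has already been copied. A minimal standalone version of the same idea, using plain RE2 instead of the re2_st fork (same Match() signature) and an invented helper name replaceAll:

#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <iostream>
#include <string>

/// Replace every match of `re` in `input` with `replacement` (no subpattern substitutions),
/// stepping over empty matches so the loop cannot get stuck.
std::string replaceAll(const std::string & input, const RE2 & re, const std::string & replacement)
{
    std::string result;
    re2::StringPiece match;
    size_t copy_pos = 0;   /// everything before this offset is already copied to result
    size_t match_pos = 0;  /// where to start searching for the next match

    while (match_pos <= input.size()
        && re.Match(input, match_pos, input.size(), RE2::UNANCHORED, &match, 1))
    {
        const size_t match_start = match.data() - input.data();
        result.append(input, copy_pos, match_start - copy_pos);  /// copy the unchanged prefix
        result += replacement;

        copy_pos = match_start + match.size();
        match_pos = copy_pos;
        if (match.empty())
            ++match_pos;  /// step one character forward to avoid an infinite loop on empty matches
    }

    result.append(input, copy_pos, std::string::npos);  /// copy the suffix after the last match
    return result;
}

int main()
{
    RE2 re("o*");
    /// "o*" also produces empty matches; the loop still terminates.
    std::cout << replaceAll("foo bar", re, "-") << '\n';
}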
- if (is_first_match && match.length() == 0 && !replace_one && input.length() > 1) - { - start_pos += 1; - is_start_pos_added_one = true; - } + copy_pos += bytes_to_copy + match.length(); + match_pos = copy_pos; /// Do substitution instructions for (const auto & it : instructions) { - if (it.first >= 0) + if (it.substitution_num >= 0) { - res_data.resize(res_data.size() + matches[it.first].length()); - memcpy(&res_data[res_offset], matches[it.first].data(), matches[it.first].length()); - res_offset += matches[it.first].length(); + const auto & substitution = matches[it.substitution_num]; + + res_data.resize(res_data.size() + substitution.length()); + memcpy(&res_data[res_offset], substitution.data(), substitution.length()); + res_offset += substitution.length(); } else { - res_data.resize(res_data.size() + it.second.size()); - memcpy(&res_data[res_offset], it.second.data(), it.second.size()); - res_offset += it.second.size(); + const auto & literal = it.literal; + + res_data.resize(res_data.size() + literal.size()); + memcpy(&res_data[res_offset], literal.data(), literal.size()); + res_offset += literal.size(); } } - if (replace_one || (!is_first_match && match.length() == 0)) + if (replace_one) can_finish_current_string = true; - is_first_match = false; + else if (match.length() == 0) + ++match_pos; /// Step one character to avoid infinite loop. } else can_finish_current_string = true; @@ -150,10 +152,11 @@ struct ReplaceRegexpImpl /// If ready, append suffix after match to end of string. if (can_finish_current_string) { - res_data.resize(res_data.size() + input.length() - start_pos); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, input.length() - start_pos); - res_offset += input.length() - start_pos; - start_pos = input.length(); + res_data.resize(res_data.size() + input.length() - copy_pos); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + copy_pos, input.length() - copy_pos); + res_offset += input.length() - copy_pos; + copy_pos = input.length(); + match_pos = copy_pos; } } diff --git a/src/Functions/array/arraySlice.cpp b/src/Functions/array/arraySlice.cpp index d6b50f55563..7a2e97de78a 100644 --- a/src/Functions/array/arraySlice.cpp +++ b/src/Functions/array/arraySlice.cpp @@ -102,7 +102,7 @@ public: { if (!length_column || length_column->onlyNull()) { - return array_column; + return arguments[0].column; } else if (isColumnConst(*length_column)) sink = GatherUtils::sliceFromLeftConstantOffsetBounded(*source, 0, length_column->getInt(0)); diff --git a/src/Functions/array/mapOp.cpp b/src/Functions/array/mapOp.cpp index a5913105146..b928254e454 100644 --- a/src/Functions/array/mapOp.cpp +++ b/src/Functions/array/mapOp.cpp @@ -204,7 +204,7 @@ private: std::map summing_map; - for (size_t i = 0; i < row_count; i++) + for (size_t i = 0; i < row_count; ++i) { [[maybe_unused]] bool first = true; for (auto & arg : args) @@ -222,7 +222,7 @@ private: } Field temp_val; - for (size_t j = 0; j < len; j++) + for (size_t j = 0; j < len; ++j) { KeyType key; if constexpr (std::is_same::value) diff --git a/src/Functions/base64Decode.cpp b/src/Functions/base64Decode.cpp index 66e0969783e..027fef73911 100644 --- a/src/Functions/base64Decode.cpp +++ b/src/Functions/base64Decode.cpp @@ -8,7 +8,7 @@ namespace DB { void registerFunctionBase64Decode(FunctionFactory & factory) { - tb64ini(0, 1); + tb64ini(0, 0); factory.registerFunction>(); /// MysQL compatibility alias. 
diff --git a/src/Functions/base64Encode.cpp b/src/Functions/base64Encode.cpp index 1f181a31144..3e456e50379 100644 --- a/src/Functions/base64Encode.cpp +++ b/src/Functions/base64Encode.cpp @@ -10,7 +10,7 @@ namespace DB { void registerFunctionBase64Encode(FunctionFactory & factory) { - tb64ini(0, 1); + tb64ini(0, 0); factory.registerFunction>(); /// MysQL compatibility alias. diff --git a/src/Functions/dateName.cpp b/src/Functions/dateName.cpp index c8c86060265..c89a7f80dfd 100644 --- a/src/Functions/dateName.cpp +++ b/src/Functions/dateName.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/src/Functions/extractTimeZoneFromFunctionArguments.cpp b/src/Functions/extractTimeZoneFromFunctionArguments.cpp index 50254606510..88e1d664bf0 100644 --- a/src/Functions/extractTimeZoneFromFunctionArguments.cpp +++ b/src/Functions/extractTimeZoneFromFunctionArguments.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Functions/formatDateTime.cpp b/src/Functions/formatDateTime.cpp index 9bb2abcb2c7..9f303b86ad3 100644 --- a/src/Functions/formatDateTime.cpp +++ b/src/Functions/formatDateTime.cpp @@ -14,7 +14,7 @@ #include -#include +#include #include #include diff --git a/src/Functions/formatString.h b/src/Functions/formatString.h index c72e7db9579..419ecf1c773 100644 --- a/src/Functions/formatString.h +++ b/src/Functions/formatString.h @@ -42,7 +42,7 @@ struct FormatImpl static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res) { res = 0; - for (UInt64 pos = l; pos < r; pos++) + for (UInt64 pos = l; pos < r; ++pos) { if (!isNumericASCII(description[pos])) throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS); diff --git a/src/Functions/fuzzBits.cpp b/src/Functions/fuzzBits.cpp index eb2af22d1ab..8b54026724d 100644 --- a/src/Functions/fuzzBits.cpp +++ b/src/Functions/fuzzBits.cpp @@ -18,6 +18,7 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int DECIMAL_OVERFLOW; extern const int ARGUMENT_OUT_OF_BOUND; + extern const int LOGICAL_ERROR; } @@ -142,6 +143,7 @@ public: else if (const ColumnFixedString * col_in_fixed = checkAndGetColumn(col_in_untyped.get())) { const auto n = col_in_fixed->getN(); + const auto col_in_rows = col_in_fixed->size(); auto col_to = ColumnFixedString::create(n); ColumnFixedString::Chars & chars_to = col_to->getChars(); @@ -153,7 +155,16 @@ public: const auto * ptr_in = col_in_fixed->getChars().data(); auto * ptr_to = chars_to.data(); - fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability); + + if (col_in_rows >= input_rows_count) + fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability); + else if (col_in_rows != 1) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "1 != col_in_rows {} < input_rows_count {}", col_in_rows, input_rows_count); + else + for (size_t i = 0; i < input_rows_count; ++i) + fuzzBits(ptr_in, ptr_to + i * n, n, inverse_probability); return col_to; } diff --git a/src/Functions/geoToH3.cpp b/src/Functions/geoToH3.cpp index 93865782c8e..18951d1a03f 100644 --- a/src/Functions/geoToH3.cpp +++ b/src/Functions/geoToH3.cpp @@ -76,7 +76,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const double lon = col_lon->getFloat64(row); const double lat = col_lat->getFloat64(row); diff --git a/src/Functions/geoToS2.cpp b/src/Functions/geoToS2.cpp index 
644e4661412..32d2a1d7a10 100644 --- a/src/Functions/geoToS2.cpp +++ b/src/Functions/geoToS2.cpp @@ -73,7 +73,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const Float64 lon = col_lon->getFloat64(row); const Float64 lat = col_lat->getFloat64(row); diff --git a/src/Functions/h3EdgeAngle.cpp b/src/Functions/h3EdgeAngle.cpp index 68e44e38bb9..5d5ad6cd1d3 100644 --- a/src/Functions/h3EdgeAngle.cpp +++ b/src/Functions/h3EdgeAngle.cpp @@ -58,7 +58,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const int resolution = col_hindex->getUInt(row); if (resolution > MAX_H3_RES) diff --git a/src/Functions/h3EdgeLengthM.cpp b/src/Functions/h3EdgeLengthM.cpp index eb0aab029b7..3eef9be9345 100644 --- a/src/Functions/h3EdgeLengthM.cpp +++ b/src/Functions/h3EdgeLengthM.cpp @@ -63,7 +63,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 resolution = col_hindex->getUInt(row); if (resolution > MAX_H3_RES) diff --git a/src/Functions/h3GetBaseCell.cpp b/src/Functions/h3GetBaseCell.cpp index 1f635fda715..83978919f2c 100644 --- a/src/Functions/h3GetBaseCell.cpp +++ b/src/Functions/h3GetBaseCell.cpp @@ -55,7 +55,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 hindex = col_hindex->getUInt(row); diff --git a/src/Functions/h3GetFaces.cpp b/src/Functions/h3GetFaces.cpp index 5d82c16296c..c0300e7212b 100644 --- a/src/Functions/h3GetFaces.cpp +++ b/src/Functions/h3GetFaces.cpp @@ -64,7 +64,7 @@ public: auto current_offset = 0; std::vector faces; - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { int max_faces = maxFaceCount(data[row]); @@ -73,7 +73,7 @@ public: // function name h3GetFaces (v3.x) changed to getIcosahedronFaces (v4.0.0). 
getIcosahedronFaces(data[row], faces.data()); - for (int i = 0; i < max_faces; i++) + for (int i = 0; i < max_faces; ++i) { // valid icosahedron faces are represented by integers 0-19 if (faces[i] >= 0 && faces[i] <= 19) diff --git a/src/Functions/h3GetResolution.cpp b/src/Functions/h3GetResolution.cpp index cc4a3c7443d..02b634dac89 100644 --- a/src/Functions/h3GetResolution.cpp +++ b/src/Functions/h3GetResolution.cpp @@ -55,7 +55,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 hindex = col_hindex->getUInt(row); diff --git a/src/Functions/h3HexAreaM2.cpp b/src/Functions/h3HexAreaM2.cpp index 6aa8fb31aab..96b301806a5 100644 --- a/src/Functions/h3HexAreaM2.cpp +++ b/src/Functions/h3HexAreaM2.cpp @@ -58,7 +58,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 resolution = col_hindex->getUInt(row); if (resolution > MAX_H3_RES) diff --git a/src/Functions/h3IndexesAreNeighbors.cpp b/src/Functions/h3IndexesAreNeighbors.cpp index f938f7fe784..27eaacad4d6 100644 --- a/src/Functions/h3IndexesAreNeighbors.cpp +++ b/src/Functions/h3IndexesAreNeighbors.cpp @@ -63,7 +63,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 hindex_origin = col_hindex_origin->getUInt(row); const UInt64 hindex_dest = col_hindex_dest->getUInt(row); diff --git a/src/Functions/h3IsPentagon.cpp b/src/Functions/h3IsPentagon.cpp index 039fea39f2a..a6726fe1656 100644 --- a/src/Functions/h3IsPentagon.cpp +++ b/src/Functions/h3IsPentagon.cpp @@ -56,7 +56,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0 ; row < input_rows_count ; row++) + for (size_t row = 0 ; row < input_rows_count ; ++row) { UInt8 res = isPentagon(data[row]); dst_data[row] = res; diff --git a/src/Functions/h3IsResClassIII.cpp b/src/Functions/h3IsResClassIII.cpp index f2f7ae445f2..c6b79d404a4 100644 --- a/src/Functions/h3IsResClassIII.cpp +++ b/src/Functions/h3IsResClassIII.cpp @@ -56,7 +56,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0 ; row < input_rows_count ; row++) + for (size_t row = 0 ; row < input_rows_count ; ++row) { UInt8 res = isResClassIII(data[row]); dst_data[row] = res; diff --git a/src/Functions/h3IsValid.cpp b/src/Functions/h3IsValid.cpp index 891d534375e..aa109eee6b4 100644 --- a/src/Functions/h3IsValid.cpp +++ b/src/Functions/h3IsValid.cpp @@ -55,7 +55,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 hindex = col_hindex->getUInt(row); diff --git a/src/Functions/h3ToChildren.cpp b/src/Functions/h3ToChildren.cpp index 5745838e9cb..56b3dd9a88c 100644 --- a/src/Functions/h3ToChildren.cpp +++ b/src/Functions/h3ToChildren.cpp @@ -76,7 +76,7 @@ public: std::vector hindex_vec; - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 parent_hindex = col_hindex->getUInt(row); const UInt8 child_resolution = col_resolution->getUInt(row); diff --git 
a/src/Functions/h3ToParent.cpp b/src/Functions/h3ToParent.cpp index 76ebea6daf6..fef1b16696f 100644 --- a/src/Functions/h3ToParent.cpp +++ b/src/Functions/h3ToParent.cpp @@ -66,7 +66,7 @@ public: auto & dst_data = dst->getData(); dst_data.resize(input_rows_count); - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 hindex = col_hindex->getUInt(row); const UInt8 resolution = col_resolution->getUInt(row); diff --git a/src/Functions/h3kRing.cpp b/src/Functions/h3kRing.cpp index 1bcb3e1ab6c..9fc6312daa4 100644 --- a/src/Functions/h3kRing.cpp +++ b/src/Functions/h3kRing.cpp @@ -73,7 +73,7 @@ public: std::vector hindex_vec; - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { const H3Index origin_hindex = col_hindex->getUInt(row); const int k = col_k->getInt(row); diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 953aff3568e..6841098ebcf 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -969,7 +969,7 @@ private: static void executeShortCircuitArguments(ColumnsWithTypeAndName & arguments) { - int last_short_circuit_argument_index = checkShirtCircuitArguments(arguments); + int last_short_circuit_argument_index = checkShortCircuitArguments(arguments); if (last_short_circuit_argument_index == -1) return; diff --git a/src/Functions/ignore.cpp b/src/Functions/ignore.cpp index 931ef4a00ed..77c16cf7819 100644 --- a/src/Functions/ignore.cpp +++ b/src/Functions/ignore.cpp @@ -36,6 +36,8 @@ public: /// (in getResultIfAlwaysReturnsConstantAndHasArguments) bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool useDefaultImplementationForSparseColumns() const override { return false; } + String getName() const override { return name; diff --git a/src/Functions/isIPAddressContainedIn.cpp b/src/Functions/isIPAddressContainedIn.cpp index 048fa04adb1..3d2a38ef4c0 100644 --- a/src/Functions/isIPAddressContainedIn.cpp +++ b/src/Functions/isIPAddressContainedIn.cpp @@ -210,7 +210,7 @@ namespace DB ColumnUInt8::MutablePtr col_res = ColumnUInt8::create(input_rows_count); ColumnUInt8::Container & vec_res = col_res->getData(); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { const auto cidr = parseIPWithCIDR(col_cidr.getDataAt(i)); vec_res[i] = isAddressInRange(addr, cidr) ? 1 : 0; @@ -227,7 +227,7 @@ namespace DB ColumnUInt8::MutablePtr col_res = ColumnUInt8::create(input_rows_count); ColumnUInt8::Container & vec_res = col_res->getData(); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { const auto addr = IPAddressVariant(col_addr.getDataAt(i)); vec_res[i] = isAddressInRange(addr, cidr) ? 1 : 0; @@ -241,7 +241,7 @@ namespace DB ColumnUInt8::MutablePtr col_res = ColumnUInt8::create(input_rows_count); ColumnUInt8::Container & vec_res = col_res->getData(); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { const auto addr = IPAddressVariant(col_addr.getDataAt(i)); const auto cidr = parseIPWithCIDR(col_cidr.getDataAt(i)); diff --git a/src/Functions/map.cpp b/src/Functions/map.cpp index 03a9da404c2..4e242c4348b 100644 --- a/src/Functions/map.cpp +++ b/src/Functions/map.cpp @@ -310,7 +310,7 @@ public: FunctionLike func_like; - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { size_t element_start_row = row != 0 ? 
column_array.getOffsets()[row-1] : 0; size_t elem_size = column_array.getOffsets()[row]- element_start_row; @@ -457,7 +457,7 @@ public: IColumn::Offset current_offset = 0; - for (size_t row = 0; row < input_rows_count; row++) + for (size_t row = 0; row < input_rows_count; ++row) { size_t element_start_row = row != 0 ? nested_column.getOffsets()[row-1] : 0; size_t element_size = nested_column.getOffsets()[row]- element_start_row; @@ -492,7 +492,7 @@ public: auto res = func_like.executeImpl(new_arguments, result_type, input_rows_count); const auto & container = checkAndGetColumn(res.get())->getData(); - for (size_t row_num = 0; row_num < element_size; row_num++) + for (size_t row_num = 0; row_num < element_size; ++row_num) { if (container[row_num] == 1) { diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index 3e5242d5f9b..070a7c2f05e 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -262,7 +262,7 @@ public: private: static void executeShortCircuitArguments(ColumnsWithTypeAndName & arguments) { - int last_short_circuit_argument_index = checkShirtCircuitArguments(arguments); + int last_short_circuit_argument_index = checkShortCircuitArguments(arguments); if (last_short_circuit_argument_index < 0) return; diff --git a/src/Functions/pointInPolygon.cpp b/src/Functions/pointInPolygon.cpp index 03e46541cdf..c3a9c411cbc 100644 --- a/src/Functions/pointInPolygon.cpp +++ b/src/Functions/pointInPolygon.cpp @@ -139,7 +139,7 @@ public: } else { - for (size_t i = 1; i < arguments.size(); i++) + for (size_t i = 1; i < arguments.size(); ++i) { const auto * array = checkAndGetDataType(arguments[i].get()); if (array == nullptr) diff --git a/src/Functions/polygonArea.cpp b/src/Functions/polygonArea.cpp index 2e38d6c74b9..c4c573490f6 100644 --- a/src/Functions/polygonArea.cpp +++ b/src/Functions/polygonArea.cpp @@ -78,7 +78,7 @@ public: { auto geometries = Converter::convert(arguments[0].column->convertToFullColumnIfConst()); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) res_data.emplace_back(boost::geometry::area(geometries[i])); } } diff --git a/src/Functions/polygonConvexHull.cpp b/src/Functions/polygonConvexHull.cpp index 887a12b8b6a..e8756f11bba 100644 --- a/src/Functions/polygonConvexHull.cpp +++ b/src/Functions/polygonConvexHull.cpp @@ -75,7 +75,7 @@ public: { auto geometries = Converter::convert(arguments[0].column->convertToFullColumnIfConst()); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { Polygon convex_hull{}; boost::geometry::convex_hull(geometries[i], convex_hull); diff --git a/src/Functions/polygonPerimeter.cpp b/src/Functions/polygonPerimeter.cpp index 8291020197a..eedb91a1622 100644 --- a/src/Functions/polygonPerimeter.cpp +++ b/src/Functions/polygonPerimeter.cpp @@ -77,7 +77,7 @@ public: { auto geometries = Converter::convert(arguments[0].column->convertToFullColumnIfConst()); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) res_data.emplace_back(boost::geometry::perimeter(geometries[i])); } } diff --git a/src/Functions/polygonsDistance.cpp b/src/Functions/polygonsDistance.cpp index 8dd88e1c3bd..51c0198b465 100644 --- a/src/Functions/polygonsDistance.cpp +++ b/src/Functions/polygonsDistance.cpp @@ -83,7 +83,7 @@ public: auto first = LeftConverter::convert(arguments[0].column->convertToFullColumnIfConst()); auto second = RightConverter::convert(arguments[1].column->convertToFullColumnIfConst()); - for 
(size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { boost::geometry::correct(first[i]); boost::geometry::correct(second[i]); diff --git a/src/Functions/polygonsEquals.cpp b/src/Functions/polygonsEquals.cpp index da1db43229b..5c572a16d0e 100644 --- a/src/Functions/polygonsEquals.cpp +++ b/src/Functions/polygonsEquals.cpp @@ -82,7 +82,7 @@ public: auto first = LeftConverter::convert(arguments[0].column->convertToFullColumnIfConst()); auto second = RightConverter::convert(arguments[1].column->convertToFullColumnIfConst()); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { boost::geometry::correct(first[i]); boost::geometry::correct(second[i]); diff --git a/src/Functions/polygonsSymDifference.cpp b/src/Functions/polygonsSymDifference.cpp index 8ef0142072a..4f718760124 100644 --- a/src/Functions/polygonsSymDifference.cpp +++ b/src/Functions/polygonsSymDifference.cpp @@ -81,7 +81,7 @@ public: auto second = RightConverter::convert(arguments[1].column->convertToFullColumnIfConst()); /// NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign) - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { boost::geometry::correct(first[i]); boost::geometry::correct(second[i]); diff --git a/src/Functions/polygonsUnion.cpp b/src/Functions/polygonsUnion.cpp index 770aa14ac52..e0c6f208c91 100644 --- a/src/Functions/polygonsUnion.cpp +++ b/src/Functions/polygonsUnion.cpp @@ -82,7 +82,7 @@ public: /// We are not interested in some pitfalls in third-party libraries /// NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign) - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { /// Orient the polygons correctly. 
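For illustration only (this sketch is not part of the patch): these polygon functions call boost::geometry::correct() before computing unions, areas or distances because Boost.Geometry's default polygon model expects closed, clockwise-oriented outer rings, and algorithms such as area() quietly return negative values on mis-oriented input. A standalone example with illustrative type aliases:

#include <iostream>
#include <boost/geometry.hpp>
#include <boost/geometry/io/wkt/wkt.hpp>
#include <boost/geometry/geometries/point_xy.hpp>
#include <boost/geometry/geometries/polygon.hpp>

namespace bg = boost::geometry;
using PointXY = bg::model::d2::point_xy<double>;
using PolygonXY = bg::model::polygon<PointXY>;  /// defaults to clockwise, closed rings

int main()
{
    PolygonXY poly;
    /// The ring below is counter-clockwise, i.e. wrongly oriented for this polygon type.
    bg::read_wkt("POLYGON((0 0,5 0,5 5,0 5,0 0))", poly);

    std::cout << bg::area(poly) << '\n';  /// -25: wrong orientation yields a negative area
    bg::correct(poly);                    /// reverses the ring (and closes open rings)
    std::cout << bg::area(poly) << '\n';  /// 25
}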
boost::geometry::correct(first[i]); diff --git a/src/Functions/polygonsWithin.cpp b/src/Functions/polygonsWithin.cpp index 66e5b4e6e17..0412c9a656d 100644 --- a/src/Functions/polygonsWithin.cpp +++ b/src/Functions/polygonsWithin.cpp @@ -85,7 +85,7 @@ public: auto second = RightConverter::convert(arguments[1].column->convertToFullColumnIfConst()); /// NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign) - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { boost::geometry::correct(first[i]); boost::geometry::correct(second[i]); diff --git a/src/Functions/readWkt.cpp b/src/Functions/readWkt.cpp index c3ae6516e0f..b8d0d20acb3 100644 --- a/src/Functions/readWkt.cpp +++ b/src/Functions/readWkt.cpp @@ -55,7 +55,7 @@ public: Serializer serializer; Geometry geometry; - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { const auto & str = column_string->getDataAt(i).toString(); boost::geometry::read_wkt(str, geometry); diff --git a/src/Functions/registerFunctions.cpp b/src/Functions/registerFunctions.cpp index 30f52e4ce37..3f84d97b465 100644 --- a/src/Functions/registerFunctions.cpp +++ b/src/Functions/registerFunctions.cpp @@ -61,7 +61,7 @@ void registerFunctionValidateNestedArraySizes(FunctionFactory & factory); void registerFunctionsSnowflake(FunctionFactory & factory); void registerFunctionTid(FunctionFactory & factory); void registerFunctionLogTrace(FunctionFactory & factory); -void registerFunctionsWindow(FunctionFactory &); +void registerFunctionsTimeWindow(FunctionFactory &); #if USE_NLP void registerFunctionLanguageDetectUTF8(FunctionFactory &); @@ -129,7 +129,7 @@ void registerFunctions() registerFunctionsStringHash(factory); registerFunctionValidateNestedArraySizes(factory); registerFunctionsSnowflake(factory); - registerFunctionsWindow(factory); + registerFunctionsTimeWindow(factory); #if USE_NLP registerFunctionLanguageDetectUTF8(factory); diff --git a/src/Functions/reinterpretAs.cpp b/src/Functions/reinterpretAs.cpp index f237b158fe5..ad357c74402 100644 --- a/src/Functions/reinterpretAs.cpp +++ b/src/Functions/reinterpretAs.cpp @@ -24,6 +24,7 @@ #include + namespace DB { namespace ErrorCodes @@ -174,16 +175,14 @@ public: const auto & offsets_from = col_from->getOffsets(); size_t size = offsets_from.size(); auto & vec_res = col_res->getData(); - vec_res.resize(size); + vec_res.resize_fill(size); size_t offset = 0; for (size_t i = 0; i < size; ++i) { - ToFieldType value{}; - memcpy(&value, + memcpy(&vec_res[i], &data_from[offset], std::min(static_cast(sizeof(ToFieldType)), offsets_from[i] - offset - 1)); - vec_res[i] = value; offset = offsets_from[i]; } @@ -201,15 +200,18 @@ public: size_t step = col_from_fixed->getN(); size_t size = data_from.size() / step; auto & vec_res = col_res->getData(); - vec_res.resize(size); size_t offset = 0; size_t copy_size = std::min(step, sizeof(ToFieldType)); + + if (sizeof(ToFieldType) <= step) + vec_res.resize(size); + else + vec_res.resize_fill(size); + for (size_t i = 0; i < size; ++i) { - ToFieldType value{}; - memcpy(&value, &data_from[offset], copy_size); - vec_res[i] = value; + memcpy(&vec_res[i], &data_from[offset], copy_size); offset += step; } @@ -288,7 +290,7 @@ private: { StringRef data = src.getDataAt(i); - std::memcpy(&data_to[offset], data.data, std::min(n, data.size)); + memcpy(&data_to[offset], data.data, std::min(n, data.size)); offset += n; } } @@ -347,10 +349,13 @@ private: using To = typename ToContainer::value_type; size_t size = 
from.size(); - to.resize_fill(size); - static constexpr size_t copy_size = std::min(sizeof(From), sizeof(To)); + if (sizeof(To) <= sizeof(From)) + to.resize(size); + else + to.resize_fill(size); + for (size_t i = 0; i < size; ++i) memcpy(static_cast(&to[i]), static_cast(&from[i]), copy_size); } diff --git a/src/Functions/s2CapContains.cpp b/src/Functions/s2CapContains.cpp index c3ebbf0d251..100b028646c 100644 --- a/src/Functions/s2CapContains.cpp +++ b/src/Functions/s2CapContains.cpp @@ -91,7 +91,7 @@ public: auto & dst_data = dst->getData(); dst_data.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row=0 ; row < input_rows_count; ++row) { const auto center = S2CellId(col_center->getUInt(row)); const Float64 degrees = col_degrees->getFloat64(row); diff --git a/src/Functions/s2CapUnion.cpp b/src/Functions/s2CapUnion.cpp index 2328db4cb52..263163963af 100644 --- a/src/Functions/s2CapUnion.cpp +++ b/src/Functions/s2CapUnion.cpp @@ -95,7 +95,7 @@ public: auto & vec_res_radius = col_res_radius->getData(); vec_res_radius.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 first_center = col_center1->getUInt(row); const Float64 first_radius = col_radius1->getFloat64(row); diff --git a/src/Functions/s2CellsIntersect.cpp b/src/Functions/s2CellsIntersect.cpp index d7801afe0d0..f8273a1fcca 100644 --- a/src/Functions/s2CellsIntersect.cpp +++ b/src/Functions/s2CellsIntersect.cpp @@ -72,7 +72,7 @@ public: auto & dst_data = dst->getData(); dst_data.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 id_first = col_id_first->getInt(row); const UInt64 id_second = col_id_second->getInt(row); diff --git a/src/Functions/s2GetNeighbors.cpp b/src/Functions/s2GetNeighbors.cpp index 99c1395f3cd..c0b2e634e6f 100644 --- a/src/Functions/s2GetNeighbors.cpp +++ b/src/Functions/s2GetNeighbors.cpp @@ -72,7 +72,7 @@ public: dst_offsets.resize(input_rows_count); size_t current_offset = 0; - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const UInt64 id = col_id->getUInt(row); diff --git a/src/Functions/s2RectAdd.cpp b/src/Functions/s2RectAdd.cpp index 9a6fcd25e5a..f7c39b2a6b1 100644 --- a/src/Functions/s2RectAdd.cpp +++ b/src/Functions/s2RectAdd.cpp @@ -77,7 +77,7 @@ public: auto & vec_res_second = col_res_second->getData(); vec_res_second.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const auto lo = S2CellId(col_lo->getUInt(row)); const auto hi = S2CellId(col_hi->getUInt(row)); diff --git a/src/Functions/s2RectContains.cpp b/src/Functions/s2RectContains.cpp index 11db27e68ca..90ced5450bc 100644 --- a/src/Functions/s2RectContains.cpp +++ b/src/Functions/s2RectContains.cpp @@ -70,7 +70,7 @@ public: auto & dst_data = dst->getData(); dst_data.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const auto lo = S2CellId(col_lo->getUInt(row)); const auto hi = S2CellId(col_hi->getUInt(row)); diff --git a/src/Functions/s2RectIntersection.cpp b/src/Functions/s2RectIntersection.cpp index a8a4536c9e7..b108cc1b64f 100644 --- a/src/Functions/s2RectIntersection.cpp +++ 
b/src/Functions/s2RectIntersection.cpp @@ -81,7 +81,7 @@ public: auto & vec_res_second = col_res_second->getData(); vec_res_second.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const auto lo1 = S2CellId(col_lo1->getUInt(row)); const auto hi1 = S2CellId(col_hi1->getUInt(row)); diff --git a/src/Functions/s2RectUnion.cpp b/src/Functions/s2RectUnion.cpp index f187c068345..bd40a747a09 100644 --- a/src/Functions/s2RectUnion.cpp +++ b/src/Functions/s2RectUnion.cpp @@ -79,7 +79,7 @@ public: auto & vec_res_second = col_res_second->getData(); vec_res_second.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const auto lo1 = S2CellId(col_lo1->getUInt(row)); const auto hi1 = S2CellId(col_hi1->getUInt(row)); diff --git a/src/Functions/s2ToGeo.cpp b/src/Functions/s2ToGeo.cpp index 032fdbfe323..03a67d49e45 100644 --- a/src/Functions/s2ToGeo.cpp +++ b/src/Functions/s2ToGeo.cpp @@ -78,7 +78,7 @@ public: auto & latitude = col_latitude->getData(); latitude.reserve(input_rows_count); - for (const auto row : collections::range(0, input_rows_count)) + for (size_t row = 0; row < input_rows_count; ++row) { const auto id = S2CellId(col_id->getUInt(row)); diff --git a/src/Functions/serverConstants.cpp b/src/Functions/serverConstants.cpp index 87d5f955e88..e277c906c1b 100644 --- a/src/Functions/serverConstants.cpp +++ b/src/Functions/serverConstants.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #if defined(OS_LINUX) # include diff --git a/src/Functions/svg.cpp b/src/Functions/svg.cpp index b3a89c0393c..e1d48ffc061 100644 --- a/src/Functions/svg.cpp +++ b/src/Functions/svg.cpp @@ -79,7 +79,7 @@ public: auto figures = Converter::convert(arguments[0].column->convertToFullColumnIfConst()); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { std::stringstream str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM boost::geometry::correct(figures[i]); diff --git a/src/Functions/throwIf.cpp b/src/Functions/throwIf.cpp index d499f1f492f..7533e30c9b9 100644 --- a/src/Functions/throwIf.cpp +++ b/src/Functions/throwIf.cpp @@ -48,36 +48,53 @@ public: const size_t number_of_arguments = arguments.size(); if (number_of_arguments < 1 || number_of_arguments > 2) - throw Exception{"Number of arguments for function " + getName() + " doesn't match: passed " - + toString(number_of_arguments) + ", should be 1 or 2", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH}; + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1 or 2", + getName(), + toString(number_of_arguments)); if (!isNativeNumber(arguments[0])) - throw Exception{"Argument for function " + getName() + " must be number", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Argument for function {} must be number", + getName()); if (number_of_arguments > 1 && !isString(arguments[1])) - throw Exception{"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument of function {}", + arguments[1]->getName(), + getName()); return std::make_shared(); } - bool useDefaultImplementationForConstants() const override { return true; } + bool 
useDefaultImplementationForConstants() const override { return false; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + /** Prevent constant folding for FunctionThrowIf because for short circuit evaluation + * it is unsafe to evaluate this function during DAG analysis. + */ + bool isSuitableForConstantFolding() const override { return false; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { + if (input_rows_count == 0) + return result_type->createColumn(); + std::optional custom_message; if (arguments.size() == 2) { - const auto * msg_column = checkAndGetColumnConst(arguments[1].column.get()); - if (!msg_column) - throw Exception{"Second argument for function " + getName() + " must be constant String", ErrorCodes::ILLEGAL_COLUMN}; - custom_message = msg_column->getValue(); + const auto * message_column = checkAndGetColumnConst(arguments[1].column.get()); + if (!message_column) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Second argument for function {} must be constant String", + getName()); + + custom_message = message_column->getValue(); } - const auto * in = arguments.front().column.get(); + auto first_argument_column = arguments.front().column; + const auto * in = first_argument_column.get(); ColumnPtr res; if (!((res = execute(in, custom_message)) @@ -90,7 +107,9 @@ public: || (res = execute(in, custom_message)) || (res = execute(in, custom_message)) || (res = execute(in, custom_message)))) + { throw Exception{"Illegal column " + in->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN}; + } return res; } @@ -98,15 +117,22 @@ public: template ColumnPtr execute(const IColumn * in_untyped, const std::optional & message) const { - if (const auto in = checkAndGetColumn>(in_untyped)) + const auto * in = checkAndGetColumn>(in_untyped); + + if (!in) + in = checkAndGetColumnConstData>(in_untyped); + + if (in) { const auto & in_data = in->getData(); if (!memoryIsZero(in_data.data(), in_data.size() * sizeof(in_data[0]))) - throw Exception{message.value_or("Value passed to '" + getName() + "' function is non zero"), - ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO}; + { + throw Exception(ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO, + message.value_or("Value passed to '" + getName() + "' function is non zero")); + } /// We return non constant to avoid constant folding. 
- return ColumnUInt8::create(in_data.size(), 0); + return ColumnUInt8::create(in_data.size(), 0); } return nullptr; diff --git a/src/Functions/timezoneOf.cpp b/src/Functions/timezoneOf.cpp index a6556bdb800..03c9e27a3a8 100644 --- a/src/Functions/timezoneOf.cpp +++ b/src/Functions/timezoneOf.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include diff --git a/src/Functions/toStartOfInterval.cpp b/src/Functions/toStartOfInterval.cpp index ecc3b80f088..f8ea44851b6 100644 --- a/src/Functions/toStartOfInterval.cpp +++ b/src/Functions/toStartOfInterval.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/src/Functions/today.cpp b/src/Functions/today.cpp index fb9fd945239..fe63197d127 100644 --- a/src/Functions/today.cpp +++ b/src/Functions/today.cpp @@ -1,4 +1,4 @@ -#include +#include #include diff --git a/src/Functions/wkt.cpp b/src/Functions/wkt.cpp index 8fbb8f59d33..732441eeef2 100644 --- a/src/Functions/wkt.cpp +++ b/src/Functions/wkt.cpp @@ -49,7 +49,7 @@ public: auto figures = Converter::convert(arguments[0].column->convertToFullColumnIfConst()); - for (size_t i = 0; i < input_rows_count; i++) + for (size_t i = 0; i < input_rows_count; ++i) { std::stringstream str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM str << boost::geometry::wkt(figures[i]); diff --git a/src/Functions/yesterday.cpp b/src/Functions/yesterday.cpp index f792f885472..364d4721b34 100644 --- a/src/Functions/yesterday.cpp +++ b/src/Functions/yesterday.cpp @@ -1,4 +1,4 @@ -#include +#include #include diff --git a/src/IO/AIO.cpp b/src/IO/AIO.cpp index 777d9bbbc7f..97e5a470463 100644 --- a/src/IO/AIO.cpp +++ b/src/IO/AIO.cpp @@ -95,7 +95,7 @@ int io_destroy(int ctx) int io_submit(int ctx, long nr, struct iocb * iocbpp[]) { - for (long i = 0; i < nr; i++) + for (long i = 0; i < nr; ++i) { struct aiocb * iocb = &iocbpp[i]->aio; diff --git a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp index b2be45471c8..a27c9035c61 100644 --- a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp +++ b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp @@ -69,7 +69,8 @@ bool AsynchronousReadBufferFromFileDescriptor::nextImpl() { Stopwatch watch; CurrentMetrics::Increment metric_increment{CurrentMetrics::AsynchronousReadWait}; - size = prefetch_future.get(); + auto result = prefetch_future.get(); + size = result.size; ProfileEvents::increment(ProfileEvents::AsynchronousReadWaitMicroseconds, watch.elapsedMicroseconds()); } @@ -90,7 +91,7 @@ bool AsynchronousReadBufferFromFileDescriptor::nextImpl() { /// No pending request. Do synchronous read. - auto size = readInto(memory.data(), memory.size()).get(); + auto [size, _] = readInto(memory.data(), memory.size()).get(); file_offset_of_buffer_end += size; if (size) @@ -201,4 +202,3 @@ void AsynchronousReadBufferFromFileDescriptor::rewind() } } - diff --git a/src/IO/AsynchronousReader.h b/src/IO/AsynchronousReader.h index e4a81623205..e79e72f3bec 100644 --- a/src/IO/AsynchronousReader.h +++ b/src/IO/AsynchronousReader.h @@ -49,10 +49,18 @@ public: size_t ignore = 0; }; - /// Less than requested amount of data can be returned. - /// If size is zero - the file has ended. - /// (for example, EINTR must be handled by implementation automatically) - using Result = size_t; + struct Result + { + /// size + /// Less than requested amount of data can be returned. + /// If size is zero - the file has ended. 
+ /// (for example, EINTR must be handled by implementation automatically) + size_t size = 0; + + /// offset + /// Optional. Useful when implementation needs to do ignore(). + size_t offset = 0; + }; /// Submit request and obtain a handle. This method don't perform any waits. /// If this method did not throw, the caller must wait for the result with 'wait' method diff --git a/src/IO/LimitReadBuffer.cpp b/src/IO/LimitReadBuffer.cpp index 9daffa3a1d3..30914f9b798 100644 --- a/src/IO/LimitReadBuffer.cpp +++ b/src/IO/LimitReadBuffer.cpp @@ -29,7 +29,8 @@ bool LimitReadBuffer::nextImpl() if (!in->next()) { - working_buffer = in->buffer(); + /// Clearing the buffer with existing data. + set(in->position(), 0); return false; } diff --git a/src/IO/Lz4DeflatingWriteBuffer.cpp b/src/IO/Lz4DeflatingWriteBuffer.cpp index 5d9c5d40e6f..da954b13df9 100644 --- a/src/IO/Lz4DeflatingWriteBuffer.cpp +++ b/src/IO/Lz4DeflatingWriteBuffer.cpp @@ -54,14 +54,19 @@ void Lz4DeflatingWriteBuffer::nextImpl() in_data = reinterpret_cast(working_buffer.begin()); in_capacity = offset(); + out_capacity = out->buffer().end() - out->position(); + out_data = reinterpret_cast(out->position()); + try { if (first_time) { - out->nextIfAtEnd(); - - out_data = reinterpret_cast(out->position()); - out_capacity = out->buffer().end() - out->position(); + if (out_capacity < LZ4F_HEADER_SIZE_MAX) + { + out->next(); + out_capacity = out->buffer().end() - out->position(); + out_data = reinterpret_cast(out->position()); + } /// write frame header and check for errors size_t header_size = LZ4F_compressBegin(ctx, out_data, out_capacity, &kPrefs); @@ -74,24 +79,29 @@ void Lz4DeflatingWriteBuffer::nextImpl() out_capacity -= header_size; out->position() = out->buffer().end() - out_capacity; + out_data = reinterpret_cast(out->position()); + first_time = false; } do { /// Ensure that there is enough space for compressed block of minimal size - if (out_capacity < LZ4F_compressBound(0, &kPrefs)) + size_t min_compressed_block_size = LZ4F_compressBound(1, &kPrefs); + if (out_capacity < min_compressed_block_size) { out->next(); out_capacity = out->buffer().end() - out->position(); + out_data = reinterpret_cast(out->position()); } - out_data = reinterpret_cast(out->position()); - /// LZ4F_compressUpdate compresses whole input buffer at once so we need to shink it manually size_t cur_buffer_size = in_capacity; - while (out_capacity < LZ4F_compressBound(cur_buffer_size, &kPrefs)) - cur_buffer_size /= 2; + if (out_capacity >= min_compressed_block_size) /// We cannot shrink the input buffer if it's already too small. + { + while (out_capacity < LZ4F_compressBound(cur_buffer_size, &kPrefs)) + cur_buffer_size /= 2; + } size_t compressed_size = LZ4F_compressUpdate(ctx, out_data, out_capacity, in_data, cur_buffer_size, nullptr); @@ -101,11 +111,12 @@ void Lz4DeflatingWriteBuffer::nextImpl() "LZ4 failed to encode stream. 
LZ4F version: {}", LZ4F_VERSION); - out_capacity -= compressed_size; in_capacity -= cur_buffer_size; - in_data = reinterpret_cast(working_buffer.end() - in_capacity); + + out_capacity -= compressed_size; out->position() = out->buffer().end() - out_capacity; + out_data = reinterpret_cast(out->position()); } while (in_capacity > 0); } @@ -120,14 +131,16 @@ void Lz4DeflatingWriteBuffer::finalizeBefore() { next(); + out_capacity = out->buffer().end() - out->position(); + out_data = reinterpret_cast(out->position()); + if (out_capacity < LZ4F_compressBound(0, &kPrefs)) { out->next(); out_capacity = out->buffer().end() - out->position(); + out_data = reinterpret_cast(out->position()); } - out_data = reinterpret_cast(out->position()); - /// compression end size_t end_size = LZ4F_compressEnd(ctx, out_data, out_capacity, nullptr); @@ -139,6 +152,7 @@ void Lz4DeflatingWriteBuffer::finalizeBefore() out_capacity -= end_size; out->position() = out->buffer().end() - out_capacity; + out_data = reinterpret_cast(out->position()); } void Lz4DeflatingWriteBuffer::finalizeAfter() diff --git a/src/IO/ReadBufferFromAzureBlobStorage.cpp b/src/IO/ReadBufferFromAzureBlobStorage.cpp new file mode 100644 index 00000000000..0ce6db97437 --- /dev/null +++ b/src/IO/ReadBufferFromAzureBlobStorage.cpp @@ -0,0 +1,173 @@ +#if !defined(ARCADIA_BUILD) +#include +#endif + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_SEEK_THROUGH_FILE; + extern const int SEEK_POSITION_OUT_OF_BOUND; + extern const int RECEIVED_EMPTY_DATA; + extern const int LOGICAL_ERROR; +} + + +ReadBufferFromAzureBlobStorage::ReadBufferFromAzureBlobStorage( + std::shared_ptr blob_container_client_, + const String & path_, + size_t max_single_read_retries_, + size_t max_single_download_retries_, + size_t tmp_buffer_size_, + bool use_external_buffer_, + size_t read_until_position_) + : SeekableReadBuffer(nullptr, 0) + , blob_container_client(blob_container_client_) + , path(path_) + , max_single_read_retries(max_single_read_retries_) + , max_single_download_retries(max_single_download_retries_) + , tmp_buffer_size(tmp_buffer_size_) + , use_external_buffer(use_external_buffer_) + , read_until_position(read_until_position_) +{ + if (!use_external_buffer) + { + tmp_buffer.resize(tmp_buffer_size); + data_ptr = tmp_buffer.data(); + data_capacity = tmp_buffer_size; + } +} + + +bool ReadBufferFromAzureBlobStorage::nextImpl() +{ + if (read_until_position) + { + if (read_until_position == offset) + return false; + + if (read_until_position < offset) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); + } + + if (!initialized) + initialize(); + + if (use_external_buffer) + { + data_ptr = internal_buffer.begin(); + data_capacity = internal_buffer.size(); + } + + size_t to_read_bytes = std::min(static_cast(total_size - offset), data_capacity); + size_t bytes_read = 0; + + size_t sleep_time_with_backoff_milliseconds = 100; + for (size_t i = 0; i < max_single_read_retries; ++i) + { + try + { + bytes_read = data_stream->ReadToCount(reinterpret_cast(data_ptr), to_read_bytes); + break; + } + catch (const Azure::Storage::StorageException & e) + { + LOG_INFO(log, "Exception caught during Azure Read for file {} at attempt {}: {}", path, i, e.Message); + if (i + 1 == max_single_read_retries) + throw; + + sleepForMilliseconds(sleep_time_with_backoff_milliseconds); + sleep_time_with_backoff_milliseconds *= 2; + 
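// A generic sketch of the retry loop used in nextImpl() here and in initialize()
// below: retry the operation up to max_retries times, doubling the sleep between
// attempts. retryWithBackoff and readOnce are illustrative names, not ClickHouse APIs.
#include <chrono>
#include <cstddef>
#include <thread>

template <typename F>
auto retryWithBackoff(F && func, size_t max_retries, std::chrono::milliseconds initial_delay)
{
    auto delay = initial_delay;
    for (size_t attempt = 0;; ++attempt)
    {
        try
        {
            return func();
        }
        catch (...)
        {
            if (attempt + 1 >= max_retries)
                throw;                      // out of attempts: propagate the error
            std::this_thread::sleep_for(delay);
            delay *= 2;                     // exponential backoff, as in the code above
        }
    }
}

/// Usage: auto bytes = retryWithBackoff([&] { return readOnce(); }, 4, std::chrono::milliseconds(100));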
initialized = false; + initialize(); + } + } + + if (bytes_read == 0) + return false; + + BufferBase::set(data_ptr, bytes_read, 0); + offset += bytes_read; + + return true; +} + + +off_t ReadBufferFromAzureBlobStorage::seek(off_t offset_, int whence) +{ + if (initialized) + throw Exception("Seek is allowed only before first read attempt from the buffer.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + + if (whence != SEEK_SET) + throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + + if (offset_ < 0) + throw Exception("Seek position is out of bounds. Offset: " + std::to_string(offset_), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); + + offset = offset_; + + return offset; +} + + +off_t ReadBufferFromAzureBlobStorage::getPosition() +{ + return offset - available(); +} + + +void ReadBufferFromAzureBlobStorage::initialize() +{ + if (initialized) + return; + + Azure::Storage::Blobs::DownloadBlobOptions download_options; + + Azure::Nullable length {}; + if (read_until_position != 0) + length = {static_cast(read_until_position - offset)}; + + download_options.Range = {static_cast(offset), length}; + + blob_client = std::make_unique(blob_container_client->GetBlobClient(path)); + + size_t sleep_time_with_backoff_milliseconds = 100; + for (size_t i = 0; i < max_single_download_retries; ++i) + { + try + { + auto download_response = blob_client->Download(download_options); + data_stream = std::move(download_response.Value.BodyStream); + break; + } + catch (const Azure::Core::RequestFailedException & e) + { + LOG_INFO(log, "Exception caught during Azure Download for file {} at offset {} at attempt {} : {}", path, offset, i + 1, e.Message); + if (i + 1 == max_single_download_retries) + throw; + + sleepForMilliseconds(sleep_time_with_backoff_milliseconds); + sleep_time_with_backoff_milliseconds *= 2; + } + } + + if (data_stream == nullptr) + throw Exception(ErrorCodes::RECEIVED_EMPTY_DATA, "Null data stream obtained while downloading file {} from Blob Storage", path); + + total_size = data_stream->Length() + offset; + + initialized = true; +} + +} + +#endif diff --git a/src/IO/ReadBufferFromAzureBlobStorage.h b/src/IO/ReadBufferFromAzureBlobStorage.h new file mode 100644 index 00000000000..53749ad3199 --- /dev/null +++ b/src/IO/ReadBufferFromAzureBlobStorage.h @@ -0,0 +1,63 @@ +#pragma once + +#if !defined(ARCADIA_BUILD) +#include +#endif + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include + +namespace DB +{ + +class ReadBufferFromAzureBlobStorage : public SeekableReadBuffer +{ +public: + + explicit ReadBufferFromAzureBlobStorage( + std::shared_ptr blob_container_client_, + const String & path_, + size_t max_single_read_retries_, + size_t max_single_download_retries_, + size_t tmp_buffer_size_, + bool use_external_buffer_ = false, + size_t read_until_position_ = 0 + ); + + off_t seek(off_t off, int whence) override; + off_t getPosition() override; + + bool nextImpl() override; + +private: + + void initialize(); + + std::unique_ptr data_stream; + std::shared_ptr blob_container_client; + std::unique_ptr blob_client; + + const String path; + size_t max_single_read_retries; + size_t max_single_download_retries; + std::vector tmp_buffer; + size_t tmp_buffer_size; + bool use_external_buffer; + off_t read_until_position = 0; + + off_t offset = 0; + size_t total_size; + bool initialized = false; + char * data_ptr; + size_t data_capacity; + + Poco::Logger * log = &Poco::Logger::get("ReadBufferFromAzureBlobStorage"); +}; + +} + +#endif diff --git 
a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 53d2067780e..f01640cb95b 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -235,12 +235,13 @@ std::unique_ptr ReadBufferFromS3::initialize() throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); req.SetRange(fmt::format("bytes={}-{}", offset, read_until_position - 1)); - LOG_DEBUG(log, "Read S3 object. Bucket: {}, Key: {}, Range: {}-{}", bucket, key, offset, read_until_position - 1); + LOG_TEST(log, "Read S3 object. Bucket: {}, Key: {}, Range: {}-{}", bucket, key, offset, read_until_position - 1); } else { - req.SetRange(fmt::format("bytes={}-", offset)); - LOG_DEBUG(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, offset); + if (offset) + req.SetRange(fmt::format("bytes={}-", offset)); + LOG_TEST(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, offset); } Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req); diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index b0a6838b81e..48811a41edd 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -702,6 +702,25 @@ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & set readCSVStringInto(s, buf, settings); } +void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings) +{ + s.clear(); + bool add_quote = false; + char quote = '\''; + + if (!buf.eof() && (*buf.position() == '\'' || *buf.position() == '"')) + { + quote = *buf.position(); + s.push_back(quote); + add_quote = true; + } + + readCSVStringInto(s, buf, settings); + + if (add_quote) + s.push_back(quote); +} + template void readCSVStringInto>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); @@ -1212,6 +1231,19 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } } +// Use PeekableReadBuffer to copy field to string after parsing. +template +static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc parse_func) +{ + PeekableReadBuffer peekable_buf(buf); + peekable_buf.setCheckpoint(); + parse_func(peekable_buf); + peekable_buf.makeContinuousMemoryFromCheckpointToPos(); + auto * end = peekable_buf.position(); + peekable_buf.rollbackToCheckpoint(); + s.append(peekable_buf.position(), end); + peekable_buf.position() = end; +} template static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf) @@ -1266,7 +1298,11 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) /// - Number: integer, float, decimal. if (*buf.position() == '\'') - readQuotedString(s, buf); + { + s.push_back('\''); + readQuotedStringInto(s, buf); + s.push_back('\''); + } else if (*buf.position() == '[') readQuotedFieldInBrackets<'[', ']'>(s, buf); else if (*buf.position() == '(') @@ -1290,18 +1326,19 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) else { /// It's an integer, float or decimal. They all can be parsed as float. - /// Use PeekableReadBuffer to copy field to string after parsing. 
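// Sketch of the checkpoint/rollback idea behind readParsedValueIntoString() above:
// run a parser over the buffer, then copy exactly the bytes it consumed into the
// output string. A simple cursor over std::string stands in for PeekableReadBuffer;
// all names here are illustrative.
#include <cctype>
#include <cstddef>
#include <functional>
#include <string>

struct Cursor
{
    const std::string & data;
    size_t pos = 0;
};

void copyParsedValueIntoString(std::string & s, Cursor & buf, const std::function<void(Cursor &)> & parse_func)
{
    size_t checkpoint = buf.pos;                          /// setCheckpoint()
    parse_func(buf);                                      /// parsing advances the cursor
    s.append(buf.data, checkpoint, buf.pos - checkpoint); /// rollback + append the consumed bytes
}

/// Example parser: consume a run of digits (a crude stand-in for readFloatText).
void parseDigits(Cursor & buf)
{
    while (buf.pos < buf.data.size() && std::isdigit(static_cast<unsigned char>(buf.data[buf.pos])))
        ++buf.pos;
}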
- PeekableReadBuffer peekable_buf(buf); - peekable_buf.setCheckpoint(); - Float64 tmp; - readFloatText(tmp, peekable_buf); - peekable_buf.makeContinuousMemoryFromCheckpointToPos(); - auto * end = peekable_buf.position(); - peekable_buf.rollbackToCheckpoint(); - s.append(peekable_buf.position(), end); - peekable_buf.position() = end; + auto parse_func = [](ReadBuffer & in) + { + Float64 tmp; + readFloatText(tmp, in); + }; + readParsedValueIntoString(s, buf, parse_func); } } +void readJSONFieldIntoString(String & s, ReadBuffer & buf) +{ + auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); }; + readParsedValueIntoString(s, buf, parse_func); +} } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index c48306cf6d3..6d1023947a5 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -8,9 +8,9 @@ #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -563,6 +563,8 @@ void readStringUntilWhitespace(String & s, ReadBuffer & buf); */ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +/// Differ from readCSVString in that it doesn't remove quotes around field if any. +void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); /// Read and append result to array of characters. template @@ -899,13 +901,8 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re { /// Unix timestamp with subsecond precision, already scaled to integer. /// For disambiguation we support only time since 2001-09-09 01:46:40 UTC and less than 30 000 years in future. - - for (size_t i = 0; i < scale; ++i) - { - components.fractional *= 10; - components.fractional += components.whole % 10; - components.whole /= 10; - } + components.fractional = components.whole % common::exp10_i32(scale); + components.whole = components.whole / common::exp10_i32(scale); } datetime64 = DecimalUtils::decimalFromComponents(components, scale); @@ -1386,4 +1383,7 @@ struct PcgDeserializer void readQuotedFieldIntoString(String & s, ReadBuffer & buf); +void readJSONFieldIntoString(String & s, ReadBuffer & buf); + } + diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index df1d443d5a0..11c4e99c353 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -427,7 +427,7 @@ namespace detail LOG_ERROR(log, "HTTP request to `{}` failed at try {}/{} with bytes read: {}/{}. " "Error: {}. (Current backoff wait is {}/{} ms)", - uri.toString(), i, settings.http_max_tries, + uri.toString(), i + 1, settings.http_max_tries, getOffset(), read_range.end ? toString(*read_range.end) : "unknown", e.displayText(), milliseconds_to_wait, settings.http_retry_max_backoff_ms); diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp index 68bdbc9cf86..25b03d66097 100644 --- a/src/IO/S3/PocoHTTPClient.cpp +++ b/src/IO/S3/PocoHTTPClient.cpp @@ -119,7 +119,7 @@ void PocoHTTPClient::makeRequestInternal( Poco::Logger * log = &Poco::Logger::get("AWSClient"); auto uri = request.GetUri().GetURIString(); - LOG_DEBUG(log, "Make request to: {}", uri); + LOG_TEST(log, "Make request to: {}", uri); enum class S3MetricType { @@ -251,7 +251,7 @@ void PocoHTTPClient::makeRequestInternal( if (request.GetContentBody()) { - LOG_TRACE(log, "Writing request body."); + LOG_TEST(log, "Writing request body."); if (attempt > 0) /// rewind content body buffer. 
{ @@ -259,24 +259,24 @@ void PocoHTTPClient::makeRequestInternal( request.GetContentBody()->seekg(0); } auto size = Poco::StreamCopier::copyStream(*request.GetContentBody(), request_body_stream); - LOG_DEBUG(log, "Written {} bytes to request body", size); + LOG_TEST(log, "Written {} bytes to request body", size); } - LOG_TRACE(log, "Receiving response..."); + LOG_TEST(log, "Receiving response..."); auto & response_body_stream = session->receiveResponse(poco_response); watch.stop(); ProfileEvents::increment(select_metric(S3MetricType::Microseconds), watch.elapsedMicroseconds()); int status_code = static_cast(poco_response.getStatus()); - LOG_DEBUG(log, "Response status: {}, {}", status_code, poco_response.getReason()); + LOG_TEST(log, "Response status: {}, {}", status_code, poco_response.getReason()); if (poco_response.getStatus() == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) { auto location = poco_response.get("location"); remote_host_filter.checkURL(Poco::URI(location)); uri = location; - LOG_DEBUG(log, "Redirecting request to new location: {}", location); + LOG_TEST(log, "Redirecting request to new location: {}", location); ProfileEvents::increment(select_metric(S3MetricType::Redirects)); @@ -292,7 +292,7 @@ void PocoHTTPClient::makeRequestInternal( response->AddHeader(header_name, header_value); headers_ss << header_name << ": " << header_value << "; "; } - LOG_DEBUG(log, "Received headers: {}", headers_ss.str()); + LOG_TEST(log, "Received headers: {}", headers_ss.str()); if (status_code == 429 || status_code == 503) { // API throttling diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 41b2b1f059a..432dc443300 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -51,8 +51,8 @@ const std::pair & convertLogLevel(Aws::U {Aws::Utils::Logging::LogLevel::Error, {DB::LogsLevel::error, Poco::Message::PRIO_ERROR}}, {Aws::Utils::Logging::LogLevel::Warn, {DB::LogsLevel::warning, Poco::Message::PRIO_WARNING}}, {Aws::Utils::Logging::LogLevel::Info, {DB::LogsLevel::information, Poco::Message::PRIO_INFORMATION}}, - {Aws::Utils::Logging::LogLevel::Debug, {DB::LogsLevel::debug, Poco::Message::PRIO_DEBUG}}, - {Aws::Utils::Logging::LogLevel::Trace, {DB::LogsLevel::trace, Poco::Message::PRIO_TRACE}}, + {Aws::Utils::Logging::LogLevel::Debug, {DB::LogsLevel::debug, Poco::Message::PRIO_TEST}}, + {Aws::Utils::Logging::LogLevel::Trace, {DB::LogsLevel::trace, Poco::Message::PRIO_TEST}}, }; return mapping.at(log_level); } diff --git a/src/IO/SynchronousReader.cpp b/src/IO/SynchronousReader.cpp index 599299ddad4..4414da28d28 100644 --- a/src/IO/SynchronousReader.cpp +++ b/src/IO/SynchronousReader.cpp @@ -82,10 +82,9 @@ std::future SynchronousReader::submit(Request reque watch.stop(); ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds()); - return bytes_read; + return Result{ .size = bytes_read, .offset = 0}; + }); } } - - diff --git a/src/IO/ThreadPoolReader.cpp b/src/IO/ThreadPoolReader.cpp index 32bc13ecb75..63bc8fe7c49 100644 --- a/src/IO/ThreadPoolReader.cpp +++ b/src/IO/ThreadPoolReader.cpp @@ -117,7 +117,7 @@ std::future ThreadPoolReader::submit(Request reques if (!res) { /// The file has ended. 
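// Sketch of the submit() pattern in ThreadPoolReader: if the read can be served
// immediately (the data is already in the page cache), fulfil the promise on the
// calling thread; otherwise do the read asynchronously. ReadResult and the
// can_serve_immediately flag are simplified stand-ins for the real types.
#include <cstddef>
#include <future>

struct ReadResult
{
    size_t size = 0;    /// 0 bytes means end of file
    size_t offset = 0;
};

std::future<ReadResult> submitRead(bool can_serve_immediately, size_t cached_bytes)
{
    if (can_serve_immediately)
    {
        std::promise<ReadResult> promise;
        auto future = promise.get_future();
        promise.set_value({cached_bytes, 0});   /// fast path: no thread switch at all
        return future;
    }

    /// Slow path: perform the read on another thread.
    return std::async(std::launch::async, []
    {
        /// ... the actual pread() would happen here ...
        return ReadResult{};
    });
}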
- promise.set_value(0); + promise.set_value({0, 0}); watch.stop(); ProfileEvents::increment(ProfileEvents::ThreadPoolReaderPageCacheHitElapsedMicroseconds, watch.elapsedMicroseconds()); @@ -176,7 +176,7 @@ std::future ThreadPoolReader::submit(Request reques ProfileEvents::increment(ProfileEvents::ThreadPoolReaderPageCacheHitElapsedMicroseconds, watch.elapsedMicroseconds()); ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds()); - promise.set_value(bytes_read); + promise.set_value({bytes_read, 0}); return future; } } @@ -219,7 +219,7 @@ std::future ThreadPoolReader::submit(Request reques ProfileEvents::increment(ProfileEvents::ThreadPoolReaderPageCacheMissElapsedMicroseconds, watch.elapsedMicroseconds()); ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds()); - return bytes_read; + return Result{ .size = bytes_read, .offset = 0 }; }); auto future = task->get_future(); diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 9ceed533855..4d7f300a504 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include diff --git a/src/IO/WriteBufferFromAzureBlobStorage.cpp b/src/IO/WriteBufferFromAzureBlobStorage.cpp new file mode 100644 index 00000000000..88882fcef65 --- /dev/null +++ b/src/IO/WriteBufferFromAzureBlobStorage.cpp @@ -0,0 +1,81 @@ +#if !defined(ARCADIA_BUILD) +#include +#endif + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include + + +namespace DB +{ + +WriteBufferFromAzureBlobStorage::WriteBufferFromAzureBlobStorage( + std::shared_ptr blob_container_client_, + const String & blob_path_, + size_t max_single_part_upload_size_, + size_t buf_size_) : + BufferWithOwnMemory(buf_size_, nullptr, 0), + blob_container_client(blob_container_client_), + max_single_part_upload_size(max_single_part_upload_size_), + blob_path(blob_path_) {} + + +WriteBufferFromAzureBlobStorage::~WriteBufferFromAzureBlobStorage() +{ + finalize(); +} + +void WriteBufferFromAzureBlobStorage::finalizeImpl() +{ + const size_t max_tries = 3; + for (size_t i = 0; i < max_tries; ++i) + { + try + { + next(); + break; + } + catch (const Azure::Core::RequestFailedException & e) + { + if (i == max_tries - 1) + throw; + LOG_INFO(&Poco::Logger::get("WriteBufferFromAzureBlobStorage"), + "Exception caught during finalizing azure storage write at attempt {}: {}", i + 1, e.Message); + } + } +} + +void WriteBufferFromAzureBlobStorage::nextImpl() +{ + if (!offset()) + return; + + auto * buffer_begin = working_buffer.begin(); + auto len = offset(); + auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); + + size_t read = 0; + std::vector block_ids; + while (read < len) + { + auto part_len = std::min(len - read, max_single_part_upload_size); + + auto block_id = getRandomASCIIString(64); + block_ids.push_back(block_id); + + Azure::Core::IO::MemoryBodyStream tmp_buffer(reinterpret_cast(buffer_begin + read), part_len); + block_blob_client.StageBlock(block_id, tmp_buffer); + + read += part_len; + } + + block_blob_client.CommitBlockList(block_ids); +} + +} + +#endif diff --git a/src/IO/WriteBufferFromAzureBlobStorage.h b/src/IO/WriteBufferFromAzureBlobStorage.h new file mode 100644 index 00000000000..cbbfb577a91 --- /dev/null +++ b/src/IO/WriteBufferFromAzureBlobStorage.h @@ -0,0 +1,44 @@ +#pragma once + +#if !defined(ARCADIA_BUILD) +#include +#endif + +#if USE_AZURE_BLOB_STORAGE + +#include + +#include +#include +#include +#include + + 
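// Sketch of the staged upload loop in WriteBufferFromAzureBlobStorage::nextImpl()
// above: split the buffer into chunks of at most max_part_size, "stage" each chunk
// under a generated block id, and return the id list for a single commit call.
// The stage_block callback stands in for BlockBlobClient::StageBlock; names and
// the sequential block ids are assumptions made for the example.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

std::vector<std::string> uploadInBlocks(
    const char * data, size_t len, size_t max_part_size,
    const std::function<void(const std::string &, const char *, size_t)> & stage_block)
{
    std::vector<std::string> block_ids;
    size_t uploaded = 0;
    while (uploaded < len)
    {
        size_t part_len = std::min(len - uploaded, max_part_size);
        std::string block_id = "block_" + std::to_string(block_ids.size()); /// real code uses a random 64-char id
        stage_block(block_id, data + uploaded, part_len);
        block_ids.push_back(block_id);
        uploaded += part_len;
    }
    return block_ids; /// caller commits the whole list at once (CommitBlockList)
}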
+namespace DB +{ + +class WriteBufferFromAzureBlobStorage : public BufferWithOwnMemory +{ +public: + + explicit WriteBufferFromAzureBlobStorage( + std::shared_ptr blob_container_client_, + const String & blob_path_, + size_t max_single_part_upload_size_, + size_t buf_size_); + + ~WriteBufferFromAzureBlobStorage() override; + + void nextImpl() override; + +private: + void finalizeImpl() override; + + std::shared_ptr blob_container_client; + size_t max_single_part_upload_size; + const String blob_path; +}; + +} + +#endif diff --git a/src/IO/WriteBufferFromVector.h b/src/IO/WriteBufferFromVector.h index ddb8d87ca4a..23ae3a70ef3 100644 --- a/src/IO/WriteBufferFromVector.h +++ b/src/IO/WriteBufferFromVector.h @@ -13,6 +13,8 @@ namespace ErrorCodes extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER; } +struct AppendModeTag {}; + /** Writes data to existing std::vector or similar type. When not enough space, it doubles vector size. * * In destructor, vector is cut to the size of written data. @@ -35,7 +37,6 @@ public: } /// Append to vector instead of rewrite. - struct AppendModeTag {}; WriteBufferFromVector(VectorType & vector_, AppendModeTag) : WriteBuffer(nullptr, 0), vector(vector_) { diff --git a/src/IO/WriteHelpers.cpp b/src/IO/WriteHelpers.cpp index 61bfc281050..b41f621e0b9 100644 --- a/src/IO/WriteHelpers.cpp +++ b/src/IO/WriteHelpers.cpp @@ -68,8 +68,13 @@ void writeException(const Exception & e, WriteBuffer & buf, bool with_stack_trac template static inline void writeProbablyQuotedStringImpl(const StringRef & s, WriteBuffer & buf, F && write_quoted_string) { - if (isValidIdentifier(std::string_view{s})) + if (isValidIdentifier(std::string_view{s}) + /// This are valid identifiers but are problematic if present unquoted in SQL query. + && !(s.size == strlen("distinct") && 0 == strncasecmp(s.data, "distinct", strlen("distinct"))) + && !(s.size == strlen("all") && 0 == strncasecmp(s.data, "all", strlen("all")))) + { writeString(s, buf); + } else write_quoted_string(s, buf); } diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index 5498e1c90f3..ca2c202014c 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -9,9 +9,9 @@ #include -#include -#include -#include +#include +#include +#include #include #include #include diff --git a/src/IO/ZlibDeflatingWriteBuffer.cpp b/src/IO/ZlibDeflatingWriteBuffer.cpp index 29afa4b160c..c265791e38a 100644 --- a/src/IO/ZlibDeflatingWriteBuffer.cpp +++ b/src/IO/ZlibDeflatingWriteBuffer.cpp @@ -77,7 +77,14 @@ void ZlibDeflatingWriteBuffer::nextImpl() ZlibDeflatingWriteBuffer::~ZlibDeflatingWriteBuffer() { - finalize(); + try + { + finalize(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } void ZlibDeflatingWriteBuffer::finalizeBefore() diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index 274a4ecc2f2..3b05d8c76b6 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -151,7 +151,18 @@ ReturnType parseDateTimeBestEffortImpl( { num_digits = readDigits(digits, sizeof(digits), in); - if (num_digits == 10 && !year && !has_time) + if (num_digits == 13 && !year && !has_time) + { + /// This is unix timestamp with millisecond. + readDecimalNumber<10>(res, digits); + if (fractional) + { + fractional->digits = 3; + readDecimalNumber<3>(fractional->value, digits + 10); + } + return ReturnType(true); + } + else if (num_digits == 10 && !year && !has_time) { /// This is unix timestamp. 
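// Sketch of the 13-digit branch added to parseDateTimeBestEffortImpl() above:
// the first 10 digits are a unix timestamp in seconds, the last 3 are milliseconds.
// Plain standard C++ with no ClickHouse types; input validation is omitted.
#include <cstdint>
#include <string>

struct ParsedTimestamp
{
    int64_t seconds = 0;
    int64_t milliseconds = 0;
};

ParsedTimestamp parseMillisecondTimestamp(const std::string & digits)   /// expects exactly 13 digits
{
    ParsedTimestamp result;
    result.seconds = std::stoll(digits.substr(0, 10));
    result.milliseconds = std::stoll(digits.substr(10, 3));
    return result;
}

/// parseMillisecondTimestamp("1640995200123") -> {1640995200, 123}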
readDecimalNumber<10>(res, digits); diff --git a/src/IO/tests/gtest_DateTimeToString.cpp b/src/IO/tests/gtest_DateTimeToString.cpp index c30c8943944..2d878fdd548 100644 --- a/src/IO/tests/gtest_DateTimeToString.cpp +++ b/src/IO/tests/gtest_DateTimeToString.cpp @@ -1,6 +1,6 @@ #include -#include +#include #include #include diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index efea8e9d0f7..ae5ce117c61 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -573,6 +574,14 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const } } +bool Aggregator::hasSparseArguments(AggregateFunctionInstruction * aggregate_instructions) +{ + for (auto * inst = aggregate_instructions; inst->that; ++inst) + if (inst->has_sparse_arguments) + return true; + return false; +} + /** It's interesting - if you remove `noinline`, then gcc for some reason will inline this function, and the performance decreases (~ 10%). * (Probably because after the inline of this function, more internal functions no longer be inlined.) * Inline does not make sense, since the inner loop is entirely inside this function. @@ -592,7 +601,7 @@ void NO_INLINE Aggregator::executeImpl( if (!no_more_keys) { #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) + if (compiled_aggregate_functions_holder && !hasSparseArguments(aggregate_instructions)) { executeImplBatch(method, state, aggregates_pool, rows, aggregate_instructions, overflow_row); } @@ -644,7 +653,7 @@ void NO_INLINE Aggregator::executeImplBatch( } } - if (!has_arrays) + if (!has_arrays && !hasSparseArguments(aggregate_instructions)) { for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) { @@ -770,6 +779,8 @@ void NO_INLINE Aggregator::executeImplBatch( if (inst->offsets) inst->batch_that->addBatchArray(rows, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); + else if (inst->has_sparse_arguments) + inst->batch_that->addBatchSparse(places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool); else inst->batch_that->addBatch(rows, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool); } @@ -835,6 +846,8 @@ void NO_INLINE Aggregator::executeWithoutKeyImpl( if (inst->offsets) inst->batch_that->addBatchSinglePlace( inst->offsets[static_cast(rows - 1)], res + inst->state_offset, inst->batch_arguments, arena); + else if (inst->has_sparse_arguments) + inst->batch_that->addBatchSparseSinglePlace(res + inst->state_offset, inst->batch_arguments, arena); else inst->batch_that->addBatchSinglePlace(rows, res + inst->state_offset, inst->batch_arguments, arena); } @@ -870,19 +883,30 @@ void Aggregator::prepareAggregateInstructions(Columns columns, AggregateColumns for (size_t i = 0; i < params.aggregates_size; ++i) { + bool allow_sparse_arguments = aggregate_columns[i].size() == 1; + bool has_sparse_arguments = false; + for (size_t j = 0; j < aggregate_columns[i].size(); ++j) { materialized_columns.push_back(columns.at(params.aggregates[i].arguments[j])->convertToFullColumnIfConst()); aggregate_columns[i][j] = materialized_columns.back().get(); - auto column_no_lc = recursiveRemoveLowCardinality(aggregate_columns[i][j]->getPtr()); - if (column_no_lc.get() != aggregate_columns[i][j]) + auto full_column = allow_sparse_arguments + ? 
aggregate_columns[i][j]->getPtr() + : recursiveRemoveSparse(aggregate_columns[i][j]->getPtr()); + + full_column = recursiveRemoveLowCardinality(full_column); + if (full_column.get() != aggregate_columns[i][j]) { - materialized_columns.emplace_back(std::move(column_no_lc)); + materialized_columns.emplace_back(std::move(full_column)); aggregate_columns[i][j] = materialized_columns.back().get(); } + + if (aggregate_columns[i][j]->isSparse()) + has_sparse_arguments = true; } + aggregate_functions_instructions[i].has_sparse_arguments = has_sparse_arguments; aggregate_functions_instructions[i].arguments = aggregate_columns[i].data(); aggregate_functions_instructions[i].state_offset = offsets_of_aggregate_states[i]; @@ -942,7 +966,7 @@ bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedData /// Remember the columns we will work with for (size_t i = 0; i < params.keys_size; ++i) { - materialized_columns.push_back(columns.at(params.keys[i])->convertToFullColumnIfConst()); + materialized_columns.push_back(recursiveRemoveSparse(columns.at(params.keys[i]))->convertToFullColumnIfConst()); key_columns[i] = materialized_columns.back().get(); if (!result.isLowCardinality()) diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index 3c53769e128..c79c2c5ef64 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -1062,6 +1062,7 @@ private: const IAggregateFunction * batch_that{}; const IColumn ** batch_arguments{}; const UInt64 * offsets{}; + bool has_sparse_arguments = false; }; using AggregateFunctionInstructions = std::vector; @@ -1317,6 +1318,8 @@ private: AggregatedDataVariants & data_variants, Columns & key_columns, size_t key_row, MutableColumns & final_key_columns) const; + + static bool hasSparseArguments(AggregateFunctionInstruction * aggregate_instructions); }; diff --git a/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.cpp b/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.cpp index 70a58971d3f..383ca3db6f4 100644 --- a/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.cpp +++ b/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.cpp @@ -157,7 +157,12 @@ void ArithmeticOperationsInAgrFuncMatcher::visit(const ASTFunction & func, ASTPt void ArithmeticOperationsInAgrFuncMatcher::visit(ASTPtr & ast, Data & data) { if (const auto * function_node = ast->as()) + { + if (function_node->is_window_function) + return; + visit(*function_node, ast, data); + } } bool ArithmeticOperationsInAgrFuncMatcher::needChildVisit(const ASTPtr & node, const ASTPtr &) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 121f7c4153f..d1c5fbebbc7 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -69,12 +69,10 @@ static std::unique_ptr openFileIfExists(const std::stri AsynchronousMetrics::AsynchronousMetrics( ContextPtr global_context_, int update_period_seconds, - std::shared_ptr> servers_to_start_before_tables_, - std::shared_ptr> servers_) + const ProtocolServerMetricsFunc & protocol_server_metrics_func_) : WithContext(global_context_) , update_period(update_period_seconds) - , servers_to_start_before_tables(servers_to_start_before_tables_) - , servers(servers_) + , protocol_server_metrics_func(protocol_server_metrics_func_) , log(&Poco::Logger::get("AsynchronousMetrics")) { #if defined(OS_LINUX) @@ -238,7 +236,7 @@ void AsynchronousMetrics::start() thread = std::make_unique([this] { run(); }); } 
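// Sketch of the AsynchronousMetrics refactoring above: instead of holding shared
// pointers to the server lists, the class receives a callback that reports the
// current per-protocol thread counts. The Metrics class below is a simplified
// stand-in used only to show the shape of the dependency injection.
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct ProtocolServerMetrics
{
    std::string port_name;
    size_t current_threads = 0;
};

using ProtocolServerMetricsFunc = std::function<std::vector<ProtocolServerMetrics>()>;

class Metrics
{
public:
    explicit Metrics(ProtocolServerMetricsFunc func) : metrics_func(std::move(func)) {}

    std::unordered_map<std::string, double> update() const
    {
        std::unordered_map<std::string, double> values;
        for (const auto & metric : metrics_func())      /// the caller decides which servers exist
            values[metric.port_name] = static_cast<double>(metric.current_threads);
        return values;
    }

private:
    ProtocolServerMetricsFunc metrics_func;
};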
-AsynchronousMetrics::~AsynchronousMetrics() +void AsynchronousMetrics::stop() { try { @@ -249,7 +247,10 @@ AsynchronousMetrics::~AsynchronousMetrics() wait_cond.notify_one(); if (thread) + { thread->join(); + thread.reset(); + } } catch (...) { @@ -257,6 +258,11 @@ AsynchronousMetrics::~AsynchronousMetrics() } } +AsynchronousMetrics::~AsynchronousMetrics() +{ + stop(); +} + AsynchronousMetricValues AsynchronousMetrics::getValues() const { @@ -1381,22 +1387,11 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti return it->second; }; - if (servers_to_start_before_tables) + const auto server_metrics = protocol_server_metrics_func(); + for (const auto & server_metric : server_metrics) { - for (const auto & server : *servers_to_start_before_tables) - { - if (const auto * name = get_metric_name(server.getPortName())) - new_values[name] = server.currentThreads(); - } - } - - if (servers) - { - for (const auto & server : *servers) - { - if (const auto * name = get_metric_name(server.getPortName())) - new_values[name] = server.currentThreads(); - } + if (const auto * name = get_metric_name(server_metric.port_name)) + new_values[name] = server_metric.current_threads; } } diff --git a/src/Interpreters/AsynchronousMetrics.h b/src/Interpreters/AsynchronousMetrics.h index 7a5c2d638d7..3c7581ce1a3 100644 --- a/src/Interpreters/AsynchronousMetrics.h +++ b/src/Interpreters/AsynchronousMetrics.h @@ -30,6 +30,11 @@ class ReadBuffer; using AsynchronousMetricValue = double; using AsynchronousMetricValues = std::unordered_map; +struct ProtocolServerMetrics +{ + String port_name; + size_t current_threads; +}; /** Periodically (by default, each minute, starting at 30 seconds offset) * calculates and updates some metrics, @@ -41,24 +46,25 @@ using AsynchronousMetricValues = std::unordered_map()>; AsynchronousMetrics( ContextPtr global_context_, int update_period_seconds, - std::shared_ptr> servers_to_start_before_tables_, - std::shared_ptr> servers_); + const ProtocolServerMetricsFunc & protocol_server_metrics_func_); ~AsynchronousMetrics(); /// Separate method allows to initialize the `servers` variable beforehand. void start(); + void stop(); + /// Returns copy of all values. AsynchronousMetricValues getValues() const; private: const std::chrono::seconds update_period; - std::shared_ptr> servers_to_start_before_tables{nullptr}; - std::shared_ptr> servers{nullptr}; + ProtocolServerMetricsFunc protocol_server_metrics_func; mutable std::mutex mutex; std::condition_variable wait_cond; diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index 30d0dd4cece..b7b6b84439b 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -320,13 +320,29 @@ void Clusters::updateClusters(const Poco::Util::AbstractConfiguration & new_conf if (old_config) { for (const auto & key : deleted_keys) - impl.erase(key); + { + if (!automatic_clusters.contains(key)) + impl.erase(key); + } } else - impl.clear(); + { + if (!automatic_clusters.empty()) + std::erase_if(impl, [this](const auto & e) { return automatic_clusters.contains(e.first); }); + else + impl.clear(); + } + for (const auto & key : new_config_keys) { + if (new_config.has(config_prefix + "." 
+ key + ".discovery")) + { + /// Handled in ClusterDiscovery + automatic_clusters.insert(key); + continue; + } + if (key.find('.') != String::npos) throw Exception("Cluster names with dots are not supported: '" + key + "'", ErrorCodes::SYNTAX_ERROR); diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index a64e17264b1..3773dadaf13 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -6,6 +6,8 @@ #include #include +#include +#include namespace Poco { @@ -295,12 +297,15 @@ public: void updateClusters(const Poco::Util::AbstractConfiguration & new_config, const Settings & settings, const String & config_prefix, Poco::Util::AbstractConfiguration * old_config = nullptr); -public: using Impl = std::map; Impl getContainer() const; protected: + + /// setup outside of this class, stored to prevent deleting from impl on config update + std::unordered_set automatic_clusters; + Impl impl; mutable std::mutex mutex; }; diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp new file mode 100644 index 00000000000..8b68ba02504 --- /dev/null +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -0,0 +1,479 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace +{ + +fs::path getShardsListPath(const String & zk_root) +{ + return fs::path(zk_root + "/shards"); +} + +} + +/* + * Holds boolean flags for fixed set of keys. + * Flags can be concurrently set from different threads, and consumer can wait for it. + */ +template +class ClusterDiscovery::ConcurrentFlags +{ +public: + template + ConcurrentFlags(It begin, It end) + { + for (auto it = begin; it != end; ++it) + flags.emplace(*it, false); + } + + void set(const T & key) + { + auto it = flags.find(key); + if (it == flags.end()) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Unknown value '{}'", key); + it->second = true; + any_need_update = true; + cv.notify_one(); + } + + /// waits unit at least one flag is set + /// caller should handle all set flags (or set it again manually) + /// note: keys of returen map should not be changed! + /// @param finished - output parameter indicates that stop() was called + std::unordered_map & wait(std::chrono::milliseconds timeout, bool & finished) + { + std::unique_lock lk(mu); + cv.wait_for(lk, timeout, [this]() -> bool { return any_need_update || stop_flag; }); + finished = stop_flag; + + /// all set flags expected to be handled by caller + any_need_update = false; + return flags; + } + + void stop() + { + std::unique_lock lk(mu); + stop_flag = true; + cv.notify_one(); + } + +private: + std::condition_variable cv; + std::mutex mu; + + /// flag indicates that update is required + std::unordered_map flags; + std::atomic_bool any_need_update = true; + bool stop_flag = false; +}; + +ClusterDiscovery::ClusterDiscovery( + const Poco::Util::AbstractConfiguration & config, + ContextPtr context_, + const String & config_prefix) + : context(Context::createCopy(context_)) + , current_node_name(toString(ServerUUID::get())) + , log(&Poco::Logger::get("ClusterDiscovery")) +{ + LOG_DEBUG(log, "Cluster discovery is enabled"); + + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(config_prefix, config_keys); + + for (const auto & key : config_keys) + { + String prefix = config_prefix + "." 
+ key + ".discovery"; + if (!config.has(prefix)) + continue; + + clusters_info.emplace( + key, + ClusterInfo( + /* name_= */ key, + /* zk_root_= */ config.getString(prefix + ".path"), + /* port= */ context->getTCPPort(), + /* secure= */ config.getBool(prefix + ".secure", false), + /* shard_id= */ config.getUInt(prefix + ".shard", 0) + ) + ); + } + clusters_to_update = std::make_shared(config_keys.begin(), config_keys.end()); +} + +/// List node in zookeper for cluster +Strings ClusterDiscovery::getNodeNames(zkutil::ZooKeeperPtr & zk, + const String & zk_root, + const String & cluster_name, + int * version, + bool set_callback) +{ + auto watch_callback = [cluster_name, clusters_to_update=clusters_to_update](auto) { clusters_to_update->set(cluster_name); }; + + Coordination::Stat stat; + Strings nodes = zk->getChildrenWatch(getShardsListPath(zk_root), &stat, set_callback ? watch_callback : Coordination::WatchCallback{}); + if (version) + *version = stat.cversion; + return nodes; +} + +/// Reads node information from specified zookeeper nodes +/// On error returns empty result +ClusterDiscovery::NodesInfo ClusterDiscovery::getNodes(zkutil::ZooKeeperPtr & zk, const String & zk_root, const Strings & node_uuids) +{ + NodesInfo result; + for (const auto & node_uuid : node_uuids) + { + String payload; + bool ok = zk->tryGet(getShardsListPath(zk_root) / node_uuid, payload) && + NodeInfo::parse(payload, result[node_uuid]); + if (!ok) + { + LOG_WARNING(log, "Can't get data from node '{}' in '{}'", node_uuid, zk_root); + return {}; + } + } + return result; +} + +/// Checks if cluster nodes set is changed. +/// Returns true if update required. +/// It performs only shallow check (set of nodes' uuids). +/// So, if node's hostname are changed, then cluster won't be updated. +bool ClusterDiscovery::needUpdate(const Strings & node_uuids, const NodesInfo & nodes) +{ + bool has_difference = node_uuids.size() != nodes.size() || + std::any_of(node_uuids.begin(), node_uuids.end(), [&nodes] (auto u) { return !nodes.contains(u); }); + { + /// Just to log updated nodes, suboptimal, but should be ok for expected update sizes + std::set new_names(node_uuids.begin(), node_uuids.end()); + std::set old_names; + for (const auto & [name, _] : nodes) + old_names.emplace(name); + + auto format_cluster_update = [](const std::set & s1, const std::set & s2) + { + std::vector diff; + std::set_difference(s1.begin(), s1.end(), s2.begin(), s2.end(), std::back_inserter(diff)); + + constexpr size_t max_to_show = 3; + size_t sz = diff.size(); + bool need_crop = sz > max_to_show; + if (need_crop) + diff.resize(max_to_show); + + if (sz == 0) + return fmt::format("{} nodes", sz); + return fmt::format("{} node{} [{}{}]", sz, sz != 1 ? "s" : "", fmt::join(diff, ", "), need_crop ? ",..." 
: ""); + }; + + LOG_DEBUG(log, "Cluster update: added {}, removed {}", + format_cluster_update(new_names, old_names), + format_cluster_update(old_names, new_names)); + } + return has_difference; +} + +ClusterPtr ClusterDiscovery::makeCluster(const ClusterInfo & cluster_info) +{ + std::vector> shards; + { + std::map replica_adresses; + + for (const auto & [_, node] : cluster_info.nodes_info) + { + if (cluster_info.current_node.secure != node.secure) + { + LOG_WARNING(log, "Node '{}' in cluster '{}' has different 'secure' value, skipping it", node.address, cluster_info.name); + continue; + } + replica_adresses[node.shard_id].emplace_back(node.address); + } + + shards.reserve(replica_adresses.size()); + for (auto & [_, replicas] : replica_adresses) + shards.emplace_back(std::move(replicas)); + } + + bool secure = cluster_info.current_node.secure; + auto cluster = std::make_shared( + context->getSettings(), + shards, + /* username= */ context->getUserName(), + /* password= */ "", + /* clickhouse_port= */ secure ? context->getTCPPortSecure().value_or(DBMS_DEFAULT_SECURE_PORT) : context->getTCPPort(), + /* treat_local_as_remote= */ false, + /* treat_local_port_as_remote= */ context->getApplicationType() == Context::ApplicationType::LOCAL, + /* secure= */ secure); + return cluster; +} + +/// Reads data from zookeeper and tries to update cluster. +/// Returns true on success (or no update required). +bool ClusterDiscovery::updateCluster(ClusterInfo & cluster_info) +{ + LOG_DEBUG(log, "Updating cluster '{}'", cluster_info.name); + + auto zk = context->getZooKeeper(); + + int start_version; + Strings node_uuids = getNodeNames(zk, cluster_info.zk_root, cluster_info.name, &start_version, false); + auto & nodes_info = cluster_info.nodes_info; + + if (std::find(node_uuids.begin(), node_uuids.end(), current_node_name) == node_uuids.end()) + { + LOG_ERROR(log, "Can't find current node in cluster '{}', will register again", cluster_info.name); + registerInZk(zk, cluster_info); + nodes_info.clear(); + return false; + } + + if (!needUpdate(node_uuids, nodes_info)) + { + LOG_DEBUG(log, "No update required for cluster '{}'", cluster_info.name); + return true; + } + + nodes_info = getNodes(zk, cluster_info.zk_root, node_uuids); + if (nodes_info.empty()) + { + LOG_WARNING(log, "Can't get nodes info for '{}'", cluster_info.name); + return false; + } + + int current_version; + getNodeNames(zk, cluster_info.zk_root, cluster_info.name, ¤t_version, true); + + if (current_version != start_version) + { + LOG_DEBUG(log, "Cluster '{}' configuration changed during update", cluster_info.name); + nodes_info.clear(); + return false; + } + + LOG_DEBUG(log, "Updating system.clusters record for '{}' with {} nodes", cluster_info.name, cluster_info.nodes_info.size()); + + auto cluster = makeCluster(cluster_info); + context->setCluster(cluster_info.name, cluster); + return true; +} + +void ClusterDiscovery::registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info) +{ + LOG_DEBUG(log, "Registering current node {} in cluster {}", current_node_name, info.name); + + String node_path = getShardsListPath(info.zk_root) / current_node_name; + zk->createAncestors(node_path); + + zk->createOrUpdate(node_path, info.current_node.serialize(), zkutil::CreateMode::Ephemeral); + LOG_DEBUG(log, "Current node {} registered in cluster {}", current_node_name, info.name); +} + +void ClusterDiscovery::initialUpdate() +{ + auto zk = context->getZooKeeper(); + for (auto & [_, info] : clusters_info) + { + registerInZk(zk, info); + if 
(!updateCluster(info)) + { + LOG_WARNING(log, "Error on initial cluster '{}' update, will retry in background", info.name); + clusters_to_update->set(info.name); + } + } +} + +void ClusterDiscovery::start() +{ + if (clusters_info.empty()) + { + LOG_DEBUG(log, "No defined clusters for discovery"); + return; + } + + try + { + initialUpdate(); + } + catch (...) + { + tryLogCurrentException(log, "Caught exception in cluster discovery initialization"); + } + + using namespace std::chrono_literals; + constexpr static std::chrono::milliseconds DEFAULT_BACKOFF_TIMEOUT = 10ms; + + LOG_DEBUG(log, "Starting working thread"); + main_thread = ThreadFromGlobalPool([this] + { + std::chrono::milliseconds backoff_timeout = DEFAULT_BACKOFF_TIMEOUT; + + bool finish = false; + while (!finish) + { + try + { + finish = runMainThread([&backoff_timeout] { backoff_timeout = DEFAULT_BACKOFF_TIMEOUT; }); + } + catch (...) + { + /* + * it can be zk error (will take new session) or other retriable error, + * should not stop discovery forever + */ + tryLogCurrentException(log, "Caught exception in cluster discovery runMainThread"); + } + std::this_thread::sleep_for(backoff_timeout); + backoff_timeout = std::min(backoff_timeout * 2, std::chrono::milliseconds(3min)); + } + }); +} + +/// Returns `true` on graceful shutdown (no restart required) +bool ClusterDiscovery::runMainThread(std::function up_to_date_callback) +{ + setThreadName("ClusterDiscover"); + LOG_DEBUG(log, "Worker thread started"); + + using namespace std::chrono_literals; + + constexpr auto force_update_interval = 2min; + bool finished = false; + while (!finished) + { + bool all_up_to_date = true; + auto & clusters = clusters_to_update->wait(5s, finished); + for (auto & [cluster_name, need_update] : clusters) + { + auto cluster_info_it = clusters_info.find(cluster_name); + if (cluster_info_it == clusters_info.end()) + { + LOG_ERROR(log, "Unknown cluster '{}'", cluster_name); + continue; + } + auto & cluster_info = cluster_info_it->second; + + if (!need_update.exchange(false)) + { + /// force updating periodically + bool force_update = cluster_info.watch.elapsedSeconds() > std::chrono::seconds(force_update_interval).count(); + if (!force_update) + continue; + } + + if (updateCluster(cluster_info)) + { + cluster_info.watch.restart(); + LOG_DEBUG(log, "Cluster '{}' updated successfully", cluster_name); + } + else + { + all_up_to_date = false; + /// no need to trigger convar, will retry after timeout in `wait` + need_update = true; + LOG_WARNING(log, "Cluster '{}' wasn't updated, will retry", cluster_name); + } + } + + if (all_up_to_date) + { + up_to_date_callback(); + } + } + LOG_DEBUG(log, "Worker thread stopped"); + return finished; +} + +void ClusterDiscovery::shutdown() +{ + LOG_DEBUG(log, "Shutting down"); + clusters_to_update->stop(); + + if (main_thread.joinable()) + main_thread.join(); +} + +ClusterDiscovery::~ClusterDiscovery() +{ + ClusterDiscovery::shutdown(); +} + +bool ClusterDiscovery::NodeInfo::parse(const String & data, NodeInfo & result) +{ + try + { + Poco::JSON::Parser parser; + auto json = parser.parse(data).extract(); + + size_t ver = json->optValue("version", data_ver); + if (ver == data_ver) + { + result.address = json->getValue("address"); + result.secure = json->optValue("secure", false); + result.shard_id = json->optValue("shard_id", 0); + } + else + { + LOG_ERROR( + &Poco::Logger::get("ClusterDiscovery"), + "Unsupported version '{}' of data in zk node '{}'", + ver, data.size() < 1024 ? 
data : "[data too long]"); + } + } + catch (Poco::Exception & e) + { + LOG_WARNING( + &Poco::Logger::get("ClusterDiscovery"), + "Can't parse '{}' from node: {}", + data.size() < 1024 ? data : "[data too long]", e.displayText()); + return false; + } + return true; +} + +String ClusterDiscovery::NodeInfo::serialize() const +{ + Poco::JSON::Object json; + json.set("version", data_ver); + json.set("address", address); + json.set("shard_id", shard_id); + + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + return oss.str(); +} + +} diff --git a/src/Interpreters/ClusterDiscovery.h b/src/Interpreters/ClusterDiscovery.h new file mode 100644 index 00000000000..2098652c069 --- /dev/null +++ b/src/Interpreters/ClusterDiscovery.h @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace DB +{ + +/* + * Discover cluster nodes. + * + * Each node adds ephemernal node into specified path in zookeeper (each cluster have own path). + * Also node subscribed for updates for these paths, and at each child node chanhe cluster updated. + * When node goes down ephemernal node are destroyed, cluster configuration is updated on other node and gone node is removed from cluster. + */ +class ClusterDiscovery +{ + +public: + ClusterDiscovery( + const Poco::Util::AbstractConfiguration & config, + ContextPtr context_, + const String & config_prefix = "remote_servers"); + + void start(); + + ~ClusterDiscovery(); + +private: + struct NodeInfo + { + /// versioning for format of data stored in zk + static constexpr size_t data_ver = 1; + + /// host:port + String address; + /// is secure tcp port user + bool secure = false; + /// shard number + size_t shard_id = 0; + + NodeInfo() = default; + explicit NodeInfo(const String & address_, bool secure_, size_t shard_id_) + : address(address_) + , secure(secure_) + , shard_id(shard_id_) + {} + + static bool parse(const String & data, NodeInfo & result); + String serialize() const; + }; + + // node uuid -> address ("host:port") + using NodesInfo = std::unordered_map; + + struct ClusterInfo + { + const String name; + const String zk_root; + NodesInfo nodes_info; + + /// Track last update time + Stopwatch watch; + + NodeInfo current_node; + + explicit ClusterInfo(const String & name_, const String & zk_root_, UInt16 port, bool secure, size_t shard_id) + : name(name_) + , zk_root(zk_root_) + , current_node(getFQDNOrHostName() + ":" + toString(port), secure, shard_id) + { + } + }; + + void initialUpdate(); + + void registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); + + Strings getNodeNames(zkutil::ZooKeeperPtr & zk, + const String & zk_root, + const String & cluster_name, + int * version = nullptr, + bool set_callback = true); + + NodesInfo getNodes(zkutil::ZooKeeperPtr & zk, const String & zk_root, const Strings & node_uuids); + + ClusterPtr makeCluster(const ClusterInfo & cluster_info); + + bool needUpdate(const Strings & node_uuids, const NodesInfo & nodes); + bool updateCluster(ClusterInfo & cluster_info); + + bool runMainThread(std::function up_to_date_callback); + void shutdown(); + + /// cluster name -> cluster info (zk root, set of nodes) + std::unordered_map clusters_info; + + ContextMutablePtr context; + + String current_node_name; + + template class ConcurrentFlags; + using UpdateFlags = ConcurrentFlags; + + /// Cluster names to update. 
+ /// The `shared_ptr` is used because it's passed to watch callback. + /// It prevents accessing to invalid object after ClusterDiscovery is destroyed. + std::shared_ptr clusters_to_update; + + ThreadFromGlobalPool main_thread; + + Poco::Logger * log; +}; + +} diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h index 0a77b7b6035..4cf85870e48 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.h +++ b/src/Interpreters/ClusterProxy/executeQuery.h @@ -34,7 +34,8 @@ class IStreamFactory; /// - optimize_skip_unused_shards_nesting /// /// @return new Context with adjusted settings -ContextMutablePtr updateSettingsForCluster(const Cluster & cluster, ContextPtr context, const Settings & settings, Poco::Logger * log = nullptr); +ContextMutablePtr updateSettingsForCluster( + const Cluster & cluster, ContextPtr context, const Settings & settings, Poco::Logger * log = nullptr); /// Execute a distributed query, creating a vector of BlockInputStreams, from which the result can be read. /// `stream_factory` object encapsulates the logic of creating streams for a different type of query diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index db1d6a37877..14b0f65072a 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -86,6 +86,7 @@ #include #include #include +#include #include @@ -254,6 +255,7 @@ struct ContextSharedPart std::shared_ptr clusters; ConfigurationPtr clusters_config; /// Stores updated configs mutable std::mutex clusters_mutex; /// Guards clusters and clusters_config + std::unique_ptr cluster_discovery; std::shared_ptr async_insert_queue; std::map server_ports; @@ -2195,11 +2197,22 @@ std::shared_ptr Context::getClusters() const return shared->clusters; } +void Context::startClusterDiscovery() +{ + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->start(); +} + /// On repeating calls updates existing clusters and adds new clusters, doesn't delete old clusters -void Context::setClustersConfig(const ConfigurationPtr & config, const String & config_name) +void Context::setClustersConfig(const ConfigurationPtr & config, bool enable_discovery, const String & config_name) { std::lock_guard lock(shared->clusters_mutex); + if (config->getBool("allow_experimental_cluster_discovery", false) && enable_discovery && !shared->cluster_discovery) + { + shared->cluster_discovery = std::make_unique(*config, getGlobalContext()); + } /// Do not update clusters if this part of config wasn't changed. 
if (shared->clusters && isSameConfiguration(*config, *shared->clusters_config, config_name)) @@ -2209,7 +2222,7 @@ void Context::setClustersConfig(const ConfigurationPtr & config, const String & shared->clusters_config = config; if (!shared->clusters) - shared->clusters = std::make_unique(*shared->clusters_config, settings, config_name); + shared->clusters = std::make_shared(*shared->clusters_config, settings, config_name); else shared->clusters->updateClusters(*shared->clusters_config, settings, config_name, old_clusters_config); } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 823bc028c15..6b0a4671efb 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -750,7 +750,10 @@ public: std::shared_ptr getClusters() const; std::shared_ptr getCluster(const std::string & cluster_name) const; std::shared_ptr tryGetCluster(const std::string & cluster_name) const; - void setClustersConfig(const ConfigurationPtr & config, const String & config_name = "remote_servers"); + void setClustersConfig(const ConfigurationPtr & config, bool enable_discovery = false, const String & config_name = "remote_servers"); + + void startClusterDiscovery(); + /// Sets custom cluster, but doesn't update configuration void setCluster(const String & cluster_name, const std::shared_ptr & cluster); void reloadClusterConfig() const; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 27bb4906f1a..ee5dc4deebb 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -1189,7 +1189,7 @@ void DDLWorker::runMainThread() } catch (...) { - tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); + tryLogCurrentException(log, "Unexpected error, will try to restart main thread"); reset_state(); sleepForSeconds(5); } diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 9b343bec055..c195cb93c5e 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -115,82 +115,62 @@ bool checkPositionalArguments(ASTPtr & argument, const ASTSelectQuery * select_q } } - /// In case of expression/function (order by 1+2 and 2*x1, greatest(1, 2)) replace - /// positions only if all literals are numbers, otherwise it is not positional. - bool positional = true; + const auto * ast_literal = typeid_cast(argument.get()); + if (!ast_literal) + return false; - /// Case when GROUP BY element is position. 
- if (const auto * ast_literal = typeid_cast(argument.get())) + auto which = ast_literal->value.getType(); + if (which != Field::Types::UInt64) + return false; + + auto pos = ast_literal->value.get(); + if (!pos || pos > columns.size()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Positional argument out of bounds: {} (exprected in range [1, {}]", + pos, columns.size()); + + const auto & column = columns[--pos]; + if (typeid_cast(column.get())) { - auto which = ast_literal->value.getType(); - if (which == Field::Types::UInt64) + argument = column->clone(); + } + else if (typeid_cast(column.get())) + { + std::function throw_if_aggregate_function = [&](ASTPtr node) { - auto pos = ast_literal->value.get(); - if (pos > 0 && pos <= columns.size()) + if (const auto * function = typeid_cast(node.get())) { - const auto & column = columns[--pos]; - if (typeid_cast(column.get())) + auto is_aggregate_function = AggregateFunctionFactory::instance().isAggregateFunctionName(function->name); + if (is_aggregate_function) { - argument = column->clone(); - } - else if (typeid_cast(column.get())) - { - std::function throw_if_aggregate_function = [&](ASTPtr node) - { - if (const auto * function = typeid_cast(node.get())) - { - auto is_aggregate_function = AggregateFunctionFactory::instance().isAggregateFunctionName(function->name); - if (is_aggregate_function) - { - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal value (aggregate function) for positional argument in {}", - ASTSelectQuery::expressionToString(expression)); - } - else - { - if (function->arguments) - { - for (const auto & arg : function->arguments->children) - throw_if_aggregate_function(arg); - } - } - } - }; - - if (expression == ASTSelectQuery::Expression::GROUP_BY) - throw_if_aggregate_function(column); - - argument = column->clone(); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal value (aggregate function) for positional argument in {}", + ASTSelectQuery::expressionToString(expression)); } else { - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal value for positional argument in {}", - ASTSelectQuery::expressionToString(expression)); + if (function->arguments) + { + for (const auto & arg : function->arguments->children) + throw_if_aggregate_function(arg); + } } } - else if (pos > columns.size() || !pos) - { - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Positional argument out of bounds: {} (exprected in range [1, {}]", - pos, columns.size()); - } - } - else - positional = false; - } - else if (const auto * ast_function = typeid_cast(argument.get())) - { - if (ast_function->arguments) - { - for (auto & arg : ast_function->arguments->children) - positional &= checkPositionalArguments(arg, select_query, expression); - } + }; + + if (expression == ASTSelectQuery::Expression::GROUP_BY) + throw_if_aggregate_function(column); + + argument = column->clone(); } else - positional = false; + { + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal value for positional argument in {}", + ASTSelectQuery::expressionToString(expression)); + } - return positional; + return true; } void replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * select_query, ASTSelectQuery::Expression expression) @@ -1901,7 +1881,7 @@ std::string ExpressionAnalysisResult::dump() const if (!selected_columns.empty()) { ss << "selected_columns "; - for (size_t i = 0; i < selected_columns.size(); i++) + for (size_t i = 0; i < selected_columns.size(); ++i) { if (i > 0) { diff 
--git a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp index 2de7b4b7846..b266746642f 100644 --- a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp +++ b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp @@ -1,5 +1,7 @@ #include "ExternalUserDefinedExecutableFunctionsLoader.h" +#include + #include #include @@ -54,29 +56,44 @@ ExternalLoader::LoadablePtr ExternalUserDefinedExecutableFunctionsLoader::create throw Exception(ErrorCodes::FUNCTION_ALREADY_EXISTS, "The aggregate function '{}' already exists", name); String type = config.getString(key_in_config + ".type"); - UserDefinedExecutableFunctionType function_type; + + bool is_executable_pool = false; if (type == "executable") - function_type = UserDefinedExecutableFunctionType::executable; + is_executable_pool = false; else if (type == "executable_pool") - function_type = UserDefinedExecutableFunctionType::executable_pool; + is_executable_pool = true; else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong user defined function type expected 'executable' or 'executable_pool' actual {}", - function_type); + type); + + bool execute_direct = config.getBool(key_in_config + ".execute_direct", true); + + String command_value = config.getString(key_in_config + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } - String command = config.getString(key_in_config + ".command"); String format = config.getString(key_in_config + ".format"); DataTypePtr result_type = DataTypeFactory::instance().get(config.getString(key_in_config + ".return_type")); bool send_chunk_header = config.getBool(key_in_config + ".send_chunk_header", false); + size_t command_termination_timeout_seconds = config.getUInt64(key_in_config + ".command_termination_timeout", 10); + size_t command_read_timeout_milliseconds = config.getUInt64(key_in_config + ".command_read_timeout", 10000); + size_t command_write_timeout_milliseconds = config.getUInt64(key_in_config + ".command_write_timeout", 10000); size_t pool_size = 0; - size_t command_termination_timeout = 0; size_t max_command_execution_time = 0; - if (function_type == UserDefinedExecutableFunctionType::executable_pool) + + if (is_executable_pool) { pool_size = config.getUInt64(key_in_config + ".pool_size", 16); - command_termination_timeout = config.getUInt64(key_in_config + ".command_termination_timeout", 10); max_command_execution_time = config.getUInt64(key_in_config + ".max_command_execution_time", 10); size_t max_execution_time_seconds = static_cast(getContext()->getSettings().max_execution_time.totalSeconds()); @@ -106,19 +123,28 @@ ExternalLoader::LoadablePtr ExternalUserDefinedExecutableFunctionsLoader::create UserDefinedExecutableFunctionConfiguration function_configuration { - .type = function_type, .name = std::move(name), //-V1030 - .script_path = std::move(command), //-V1030 - .format = std::move(format), //-V1030 + .command = std::move(command_value), //-V1030 + .command_arguments = std::move(command_arguments), //-V1030 .argument_types = std::move(argument_types), //-V1030 .result_type = std::move(result_type), //-V1030 - .pool_size = pool_size, - .command_termination_timeout = command_termination_timeout, - .max_command_execution_time = max_command_execution_time, - .send_chunk_header 
= send_chunk_header }; - return std::make_shared(function_configuration, lifetime); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = std::move(format), //-V1030 + .command_termination_timeout_seconds = command_termination_timeout_seconds, + .command_read_timeout_milliseconds = command_read_timeout_milliseconds, + .command_write_timeout_milliseconds = command_write_timeout_milliseconds, + .pool_size = pool_size, + .max_command_execution_time_seconds = max_command_execution_time, + .is_executable_pool = is_executable_pool, + .send_chunk_header = send_chunk_header, + .execute_direct = execute_direct + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_shared(function_configuration, std::move(coordinator), lifetime); } } diff --git a/src/Interpreters/FillingRow.cpp b/src/Interpreters/FillingRow.cpp index df99c0d11ed..94f185a44cc 100644 --- a/src/Interpreters/FillingRow.cpp +++ b/src/Interpreters/FillingRow.cpp @@ -1,5 +1,4 @@ #include -#include #include diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 6ba9e7505f2..82d8356b7c7 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -268,11 +268,9 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s LOG_TRACE(log, "Joining on: {}", fmt::join(log_text, " | ")); } - JoinCommon::removeLowCardinalityInplace(right_table_keys); - + JoinCommon::convertToFullColumnsInplace(right_table_keys); initRightBlockStructure(data->sample_block); - JoinCommon::createMissedColumns(sample_block_with_columns_to_add); if (nullable_right_side) diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 5f7c54e427f..2475d437acb 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -48,10 +48,15 @@ BlockIO InterpreterAlterQuery::execute() FunctionNameNormalizer().visit(query_ptr.get()); const auto & alter = query_ptr->as(); if (alter.alter_object == ASTAlterQuery::AlterObjectType::DATABASE) + { return executeToDatabase(alter); + } else if (alter.alter_object == ASTAlterQuery::AlterObjectType::TABLE || alter.alter_object == ASTAlterQuery::AlterObjectType::LIVE_VIEW) + { return executeToTable(alter); + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown alter object type"); } diff --git a/src/Interpreters/InterpreterCreateFunctionQuery.cpp b/src/Interpreters/InterpreterCreateFunctionQuery.cpp index 1c43da07628..2f345f8b237 100644 --- a/src/Interpreters/InterpreterCreateFunctionQuery.cpp +++ b/src/Interpreters/InterpreterCreateFunctionQuery.cpp @@ -57,9 +57,14 @@ BlockIO InterpreterCreateFunctionQuery::execute() void InterpreterCreateFunctionQuery::validateFunction(ASTPtr function, const String & name) { - const auto * args_tuple = function->as()->arguments->children.at(0)->as(); + auto & lambda_function = function->as(); + auto & lambda_function_expression_list = lambda_function.arguments->children; + + const auto & tuple_function_arguments = lambda_function_expression_list.at(0)->as(); + std::unordered_set arguments; - for (const auto & argument : args_tuple->arguments->children) + + for (const auto & argument : tuple_function_arguments.arguments->children) { const auto & argument_name = argument->as()->name(); auto [_, inserted] = arguments.insert(argument_name); @@ -67,7 +72,7 @@ void InterpreterCreateFunctionQuery::validateFunction(ASTPtr function, const Str throw 
Exception(ErrorCodes::UNSUPPORTED_METHOD, "Identifier {} already used as function parameter", argument_name); } - ASTPtr function_body = function->as()->children.at(0)->children.at(1); + ASTPtr function_body = lambda_function_expression_list.at(1); validateFunctionRecursiveness(function_body, name); } @@ -82,5 +87,4 @@ void InterpreterCreateFunctionQuery::validateFunctionRecursiveness(ASTPtr node, validateFunctionRecursiveness(child, function_to_create); } } - } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8f003e75a07..7ddb0c8c26e 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -637,13 +637,14 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti /// Table function without columns list. auto table_function = TableFunctionFactory::instance().get(create.as_table_function, getContext()); properties.columns = table_function->getActualTableStructure(getContext()); - assert(!properties.columns.empty()); } else if (create.is_dictionary) { return {}; } - else + /// We can have queries like "CREATE TABLE ENGINE=" if + /// supports schema inference (will determine table structure in it's constructor). + else if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(create.storage->engine->name)) throw Exception("Incorrect CREATE query: required list of column descriptions or AS section or SELECT.", ErrorCodes::INCORRECT_QUERY); /// Even if query has list of columns, canonicalize it (unfold Nested columns). @@ -1083,7 +1084,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, { const auto & factory = TableFunctionFactory::instance(); auto table_func = factory.get(create.as_table_function, getContext()); - res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns); + /// In case of CREATE AS table_function() query we should use global context + /// in storage creation because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. + res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns, /*use_global_context=*/true); res->renameInMemory({create.getDatabase(), create.getTable(), create.uuid}); } else diff --git a/src/Interpreters/InterpreterDescribeQuery.cpp b/src/Interpreters/InterpreterDescribeQuery.cpp index 638c671c3a3..36ea2949b6a 100644 --- a/src/Interpreters/InterpreterDescribeQuery.cpp +++ b/src/Interpreters/InterpreterDescribeQuery.cpp @@ -150,7 +150,7 @@ BlockIO InterpreterDescribeQuery::execute() res_columns[6]->insertDefault(); res_columns[7]->insert(1u); - }, column.type->getDefaultSerialization(), column.type, nullptr); + }, {column.type->getDefaultSerialization(), column.type, nullptr, nullptr}); } } diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 8c0d3620dd6..a1f83c81a81 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -196,6 +197,9 @@ Chain InterpreterInsertQuery::buildChainImpl( /// We create a pipeline of several streams, into which we will write data. 
Chain out; + /// Keep a reference to the context to make sure it stays alive until the chain is executed and destroyed + out.addInterpreterContext(context_ptr); + /// NOTE: we explicitly ignore bound materialized views when inserting into Kafka Storage. /// Otherwise we'll get duplicates when MV reads same rows again from Kafka. if (table->noPushingToViews() && !no_destination) @@ -371,7 +375,7 @@ BlockIO InterpreterInsertQuery::execute() pipeline = interpreter_watch.buildQueryPipeline(); } - for (size_t i = 0; i < out_streams_size; i++) + for (size_t i = 0; i < out_streams_size; ++i) { auto out = buildChainImpl(table, metadata_snapshot, query_sample_block, nullptr, nullptr); out_chains.emplace_back(std::move(out)); @@ -380,13 +384,6 @@ BlockIO InterpreterInsertQuery::execute() BlockIO res; - res.pipeline.addStorageHolder(table); - if (const auto * mv = dynamic_cast(table.get())) - { - if (auto inner_table = mv->tryGetTargetTable()) - res.pipeline.addStorageHolder(inner_table); - } - /// What type of query: INSERT or INSERT SELECT or INSERT WATCH? if (is_distributed_insert_select) { @@ -406,6 +403,13 @@ BlockIO InterpreterInsertQuery::execute() return std::make_shared(in_header, actions); }); + /// We need to convert Sparse columns to full, because the destination storage + /// may not support them or may have different settings for applying Sparse serialization. + pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr + { + return std::make_shared(in_header); + }); + size_t num_select_threads = pipeline.getNumThreads(); size_t num_insert_threads = std::max_element(out_chains.begin(), out_chains.end(), [&](const auto &a, const auto &b) { @@ -445,6 +449,13 @@ BlockIO InterpreterInsertQuery::execute() } } + res.pipeline.addStorageHolder(table); + if (const auto * mv = dynamic_cast(table.get())) + { + if (auto inner_table = mv->tryGetTargetTable()) + res.pipeline.addStorageHolder(inner_table); + } + return res; } @@ -455,7 +466,7 @@ StorageID InterpreterInsertQuery::getDatabaseTable() const } -void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, ContextPtr context_) const +void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, ContextPtr context_) { elem.query_kind = "Insert"; const auto & insert_table = context_->getInsertionTable(); @@ -466,4 +477,9 @@ void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, cons } } +void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, ContextPtr context_) const +{ + extendQueryLogElemImpl(elem, context_); +} + } diff --git a/src/Interpreters/InterpreterInsertQuery.h b/src/Interpreters/InterpreterInsertQuery.h index e5733a8c28b..93de92a0680 100644 --- a/src/Interpreters/InterpreterInsertQuery.h +++ b/src/Interpreters/InterpreterInsertQuery.h @@ -40,6 +40,7 @@ public: ThreadStatus * thread_status = nullptr, std::atomic_uint64_t * elapsed_counter_ms = nullptr); + static void extendQueryLogElemImpl(QueryLogElement & elem, ContextPtr context_); void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, ContextPtr context_) const override; StoragePtr getTable(ASTInsertQuery & query); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index c8f48f2ed1f..5c0322ac1d9 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -113,8 +113,10 @@ String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions,
co select_ast->setExpression(ASTSelectQuery::Expression::SELECT, std::make_shared()); auto expr_list = select_ast->select(); - // The first column is our filter expression. - expr_list->children.push_back(row_policy_filter); + /// The first column is our filter expression. + /// The row_policy_filter should be cloned because it may be changed by TreeRewriter, + /// which can make it an invalid expression, although it may be valid in the whole select. + expr_list->children.push_back(row_policy_filter->clone()); /// Keep columns that are required after the filter actions. for (const auto & column_str : prerequisite_columns) @@ -386,7 +388,9 @@ InterpreterSelectQuery::InterpreterSelectQuery( query.setFinal(); /// Save scalar sub queries's results in the query context - if (!options.only_analyze && context->hasQueryContext()) + /// But discard them if the Storage has been modified + /// In an ideal situation we would only discard the scalars affected by the storage change + if (!options.only_analyze && context->hasQueryContext() && !context->getViewSource()) for (const auto & it : syntax_analyzer_result->getScalars()) context->getQueryContext()->addScalar(it.first, it.second); diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 960fddccb8c..b39ededaa91 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -469,7 +469,7 @@ void InterpreterSystemQuery::restoreReplica() { getContext()->checkAccess(AccessType::SYSTEM_RESTORE_REPLICA, table_id); - const zkutil::ZooKeeperPtr& zookeeper = getContext()->getZooKeeper(); + const zkutil::ZooKeeperPtr & zookeeper = getContext()->getZooKeeper(); if (zookeeper->expired()) throw Exception(ErrorCodes::NO_ZOOKEEPER, diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index 6f4fef46886..7f22386f54b 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -534,8 +534,9 @@ MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & right if (right_sample_block.getByName(right_key).type->lowCardinality()) lowcard_right_keys.push_back(right_key); } - JoinCommon::removeLowCardinalityInplace(right_table_keys); - JoinCommon::removeLowCardinalityInplace(right_sample_block, key_names_right); + + JoinCommon::convertToFullColumnsInplace(right_table_keys); + JoinCommon::convertToFullColumnsInplace(right_sample_block, key_names_right); const NameSet required_right_keys = table_join->requiredRightKeys(); for (const auto & column : right_table_keys) @@ -664,7 +665,7 @@ bool MergeJoin::saveRightBlock(Block && block) Block MergeJoin::modifyRightBlock(const Block & src_block) const { Block block = materializeBlock(src_block); - JoinCommon::removeLowCardinalityInplace(block, table_join->getOnlyClause().key_names_right); + JoinCommon::convertToFullColumnsInplace(block, table_join->getOnlyClause().key_names_right); return block; } @@ -706,7 +707,7 @@ void MergeJoin::joinBlock(Block & block, ExtraBlockPtr & not_processed) lowcard_keys.push_back(column_name); } - JoinCommon::removeLowCardinalityInplace(block, key_names_left, false); + JoinCommon::convertToFullColumnsInplace(block, key_names_left, false); sortBlock(block, left_sort_description); diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index d2bd47d0908..ac8dcce35d0 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -421,6 +421,7 @@ ASTPtr
MutationsInterpreter::prepare(bool dry_run) NameSet updated_columns; bool materialize_ttl_recalculate_only = materializeTTLRecalculateOnly(storage); + for (const MutationCommand & command : commands) { if (command.type == MutationCommand::Type::UPDATE @@ -569,6 +570,12 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) stages.emplace_back(context); const auto & column = columns_desc.get(command.column_name); + + if (!column.default_desc.expression) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Cannot materialize column `{}` because it doesn't have default expression", column.name); + auto materialized_column = makeASTFunction( "_CAST", column.default_desc.expression->clone(), std::make_shared(column.type->getName())); @@ -625,7 +632,9 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) dependencies.insert(dependency); } } - else if (metadata_snapshot->hasRowsTTL()) + else if (metadata_snapshot->hasRowsTTL() + || metadata_snapshot->hasAnyRowsWhereTTL() + || metadata_snapshot->hasAnyGroupByTTL()) { for (const auto & column : all_columns) dependencies.emplace(column.name, ColumnDependency::TTL_TARGET); diff --git a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp index 54b87e3bed6..9494c4133ff 100644 --- a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp +++ b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp @@ -20,10 +20,12 @@ #include #include #include +#include #include #include #include #include +#include #include namespace DB @@ -519,6 +521,12 @@ ASTs InterpreterCreateImpl::getRewrittenQueries( rewritten_query->set(rewritten_query->storage, storage); rewritten_query->set(rewritten_query->columns_list, columns); + if (auto override_ast = tryGetTableOverride(mapped_to_database, create_query.table)) + { + const auto & override = override_ast->as(); + applyTableOverrideToCreateQuery(override, rewritten_query.get()); + } + return ASTs{rewritten_query}; } diff --git a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp index 5e18b0de2e0..02af07bc00c 100644 --- a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp +++ b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp @@ -40,7 +40,7 @@ TEST(MySQLCreateRewritten, ColumnsDataType) {"TINYINT", "Int8"}, {"SMALLINT", "Int16"}, {"MEDIUMINT", "Int32"}, {"INT", "Int32"}, {"INTEGER", "Int32"}, {"BIGINT", "Int64"}, {"FLOAT", "Float32"}, {"DOUBLE", "Float64"}, {"VARCHAR(10)", "String"}, {"CHAR(10)", "String"}, {"Date", "Date"}, {"DateTime", "DateTime"}, - {"TIMESTAMP", "DateTime"}, {"BOOLEAN", "Bool"} + {"TIMESTAMP", "DateTime"}, {"BOOLEAN", "Bool"}, {"BIT", "UInt64"} }; for (const auto & [test_type, mapped_type] : test_types) diff --git a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp index 0440c52797c..802bf4e43ce 100644 --- a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp +++ b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp @@ -51,7 +51,7 @@ static bool tryExtractConstValueFromCondition(const ASTPtr & condition, bool & v } } } - else if (function->name == "toUInt8" || function->name == "toInt8") + else if (function->name == "toUInt8" || function->name == "toInt8" || function->name == "identity") { if (const auto * expr_list = function->arguments->as()) { diff --git a/src/Interpreters/PartLog.h b/src/Interpreters/PartLog.h index 1aec850e3dc..b2d18e4d40d 100644 --- a/src/Interpreters/PartLog.h +++ 
b/src/Interpreters/PartLog.h @@ -49,7 +49,6 @@ struct PartLogElement UInt16 error = 0; String exception; - static std::string name() { return "PartLog"; } static NamesAndTypesList getNamesAndTypes(); diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index a4583685a90..e7e52142fc8 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -86,6 +86,20 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as throw Exception("Too many simultaneous queries. Maximum: " + toString(max_size), ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES); } + String query_kind{ast->getQueryKindString()}; + if (!is_unlimited_query) + { + auto amount = getQueryKindAmount(query_kind); + if (max_insert_queries_amount && query_kind == "Insert" && amount >= max_insert_queries_amount) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous insert queries. Maximum: {}, current: {}", + max_insert_queries_amount, amount); + if (max_select_queries_amount && query_kind == "Select" && amount >= max_select_queries_amount) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous select queries. Maximum: {}, current: {}", + max_select_queries_amount, amount); + } + { /** * `max_size` check above is controlled by `max_concurrent_queries` server setting and is a "hard" limit for how many @@ -176,7 +190,9 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as } auto process_it = processes.emplace(processes.end(), - query_context, query_, client_info, priorities.insert(settings.priority)); + query_context, query_, client_info, priorities.insert(settings.priority), query_kind); + + increaseQueryKindAmount(query_kind); res = std::make_shared(*this, process_it); @@ -242,6 +258,7 @@ ProcessListEntry::~ProcessListEntry() String user = it->getClientInfo().current_user; String query_id = it->getClientInfo().current_query_id; + String query_kind = it->query_kind; const QueryStatus * process_list_element_ptr = &*it; @@ -273,6 +290,9 @@ ProcessListEntry::~ProcessListEntry() LOG_ERROR(&Poco::Logger::get("ProcessList"), "Logical error: cannot find query by query_id and pointer to ProcessListElement in ProcessListForUser"); std::terminate(); } + + parent.decreaseQueryKindAmount(query_kind); + parent.have_space.notify_all(); /// If there are no more queries for the user, then we will reset memory tracker and network throttler. 
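// Illustrative sketch (not part of the patch above): a minimal, standalone approximation of the
// per-query-kind bookkeeping that ProcessList gains with max_insert_queries_amount and
// max_select_queries_amount. The names below (QueryKindLimiter, admit, release) are hypothetical;
// the real logic lives in ProcessList::increaseQueryKindAmount / decreaseQueryKindAmount and is
// guarded by the ProcessList mutex.
#include <cstdint>
#include <mutex>
#include <stdexcept>
#include <string>
#include <unordered_map>

class QueryKindLimiter
{
public:
    QueryKindLimiter(uint64_t max_inserts_, uint64_t max_selects_)
        : max_inserts(max_inserts_), max_selects(max_selects_) {}

    /// Throws if admitting one more query of `kind` would exceed its limit (0 means "no limit").
    void admit(const std::string & kind)
    {
        std::lock_guard<std::mutex> lock(mutex);
        uint64_t & amount = current[kind];
        if (kind == "Insert" && max_inserts && amount >= max_inserts)
            throw std::runtime_error("Too many simultaneous insert queries");
        if (kind == "Select" && max_selects && amount >= max_selects)
            throw std::runtime_error("Too many simultaneous select queries");
        ++amount;
    }

    /// Called when the query finishes (the ProcessListEntry destructor in the patch).
    void release(const std::string & kind)
    {
        std::lock_guard<std::mutex> lock(mutex);
        auto it = current.find(kind);
        if (it == current.end() || it->second == 0)
            throw std::logic_error("release() without a matching admit() for kind " + kind);
        --it->second;
    }

private:
    std::mutex mutex;
    std::unordered_map<std::string, uint64_t> current;
    const uint64_t max_inserts;
    const uint64_t max_selects;
};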
@@ -286,11 +306,12 @@ ProcessListEntry::~ProcessListEntry() QueryStatus::QueryStatus( - ContextPtr context_, const String & query_, const ClientInfo & client_info_, QueryPriorities::Handle && priority_handle_) + ContextPtr context_, const String & query_, const ClientInfo & client_info_, QueryPriorities::Handle && priority_handle_, const String & query_kind_) : WithContext(context_) , query(query_) , client_info(client_info_) , priority_handle(std::move(priority_handle_)) + , query_kind(query_kind_) { auto settings = getContext()->getSettings(); limits.max_execution_time = settings.max_execution_time; @@ -411,9 +432,8 @@ QueryStatusInfo QueryStatus::getInfo(bool get_thread_list, bool get_profile_even res.read_bytes = progress_in.read_bytes; res.total_rows = progress_in.total_rows_to_read; - /// TODO: Use written_rows and written_bytes when real time progress is implemented - res.written_rows = progress_out.read_rows; - res.written_bytes = progress_out.read_bytes; + res.written_rows = progress_out.written_rows; + res.written_bytes = progress_out.written_bytes; if (thread_group) { @@ -485,4 +505,33 @@ ProcessList::UserInfo ProcessList::getUserInfo(bool get_profile_events) const return per_user_infos; } +void ProcessList::increaseQueryKindAmount(const String & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + if (found == query_kind_amounts.end()) + query_kind_amounts[query_kind] = 1; + else + found->second += 1; +} + +void ProcessList::decreaseQueryKindAmount(const String & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + /// TODO: we could just rebuild the map, as we have saved all query_kind. + if (found == query_kind_amounts.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong query kind amount: decrease before increase on '{}'", query_kind); + else if (found->second == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong query kind amount: decrease to negative on '{}'", query_kind, found->second); + else + found->second -= 1; + +} +ProcessList::QueryAmount ProcessList::getQueryKindAmount(const String & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + if (found == query_kind_amounts.end()) + return 0; + return found->second; +} + } diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 9c826bde061..ada24c03275 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -94,7 +94,7 @@ protected: ExecutionSpeedLimits limits; OverflowMode overflow_mode; - QueryPriorities::Handle priority_handle; + QueryPriorities::Handle priority_handle = nullptr; std::atomic is_killed { false }; @@ -118,13 +118,17 @@ protected: ProcessListForUser * user_process_list = nullptr; + String query_kind; + public: QueryStatus( ContextPtr context_, const String & query_, const ClientInfo & client_info_, - QueryPriorities::Handle && priority_handle_); + QueryPriorities::Handle && priority_handle_, + const String & query_kind_ + ); ~QueryStatus(); @@ -256,6 +260,7 @@ class ProcessList public: using Element = QueryStatus; using Entry = ProcessListEntry; + using QueryAmount = UInt64; /// list, for iterators not to invalidate. NOTE: could replace with cyclic buffer, but not worth. using Container = std::list; @@ -265,6 +270,8 @@ public: /// User -> queries using UserToQueries = std::unordered_map; + using QueryKindToAmount = std::unordered_map; + protected: friend class ProcessListEntry; @@ -287,6 +294,19 @@ protected: /// Call under lock. Finds process with specified current_user and current_query_id. 
QueryStatus * tryGetProcessListElement(const String & current_query_id, const String & current_user); + /// limit for insert. 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + size_t max_insert_queries_amount = 0; + + /// limit for select. 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + size_t max_select_queries_amount = 0; + + /// amount of queries by query kind. + QueryKindToAmount query_kind_amounts; + + void increaseQueryKindAmount(const String & query_kind); + void decreaseQueryKindAmount(const String & query_kind); + QueryAmount getQueryKindAmount(const String & query_kind); + public: using EntryPtr = std::shared_ptr; @@ -312,6 +332,18 @@ public: max_size = max_size_; } + void setMaxInsertQueriesAmount(size_t max_insert_queries_amount_) + { + std::lock_guard lock(mutex); + max_insert_queries_amount = max_insert_queries_amount_; + } + + void setMaxSelectQueriesAmount(size_t max_select_queries_amount_) + { + std::lock_guard lock(mutex); + max_select_queries_amount = max_select_queries_amount_; + } + /// Try call cancel() for input and output streams of query with specified id and user CancellationCode sendCancelToQuery(const String & current_query_id, const String & current_user, bool kill = false); diff --git a/src/Interpreters/QueryViewsLog.cpp b/src/Interpreters/QueryViewsLog.cpp index 2c0f1ecd878..c0703d77691 100644 --- a/src/Interpreters/QueryViewsLog.cpp +++ b/src/Interpreters/QueryViewsLog.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Interpreters/RewriteAnyFunctionVisitor.cpp b/src/Interpreters/RewriteAnyFunctionVisitor.cpp index eed6368ae54..5eb14aa4252 100644 --- a/src/Interpreters/RewriteAnyFunctionVisitor.cpp +++ b/src/Interpreters/RewriteAnyFunctionVisitor.cpp @@ -63,7 +63,12 @@ bool extractIdentifiers(const ASTFunction & func, std::unordered_set & void RewriteAnyFunctionMatcher::visit(ASTPtr & ast, Data & data) { if (auto * func = ast->as()) + { + if (func->is_window_function) + return; + visit(*func, ast, data); + } } void RewriteAnyFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, Data & data) diff --git a/src/Interpreters/RewriteSumIfFunctionVisitor.cpp b/src/Interpreters/RewriteSumIfFunctionVisitor.cpp index 7b322ca1585..7f725c1d8a5 100644 --- a/src/Interpreters/RewriteSumIfFunctionVisitor.cpp +++ b/src/Interpreters/RewriteSumIfFunctionVisitor.cpp @@ -10,7 +10,12 @@ namespace DB void RewriteSumIfFunctionMatcher::visit(ASTPtr & ast, Data & data) { if (auto * func = ast->as()) + { + if (func->is_window_function) + return; + visit(*func, ast, data); + } } void RewriteSumIfFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, Data &) diff --git a/src/Interpreters/SelectQueryOptions.h b/src/Interpreters/SelectQueryOptions.h index 709ecdc239c..bc95a940c18 100644 --- a/src/Interpreters/SelectQueryOptions.h +++ b/src/Interpreters/SelectQueryOptions.h @@ -41,6 +41,9 @@ struct SelectQueryOptions /// It is needed because lazy normal projections require special planning in FetchColumns stage, such as adding WHERE transform. /// It is also used to avoid adding aggregating step when aggregate projection is chosen. bool is_projection_query = false; + /// This flag is needed for projection description. + /// Otherwise, keys for GROUP BY may be removed as constants. + bool ignore_ast_optimizations = false; bool ignore_alias = false; bool is_internal = false; bool is_subquery = false; // non-subquery can also have subquery_depth > 0, e.g. 
insert select @@ -120,6 +123,12 @@ struct SelectQueryOptions return *this; } + SelectQueryOptions & ignoreASTOptimizationsAlias(bool value = true) + { + ignore_ast_optimizations = value; + return *this; + } + SelectQueryOptions & setInternal(bool value = false) { is_internal = value; diff --git a/src/Interpreters/Session.cpp b/src/Interpreters/Session.cpp index c26d8b52049..2af9a2b6bbc 100644 --- a/src/Interpreters/Session.cpp +++ b/src/Interpreters/Session.cpp @@ -311,7 +311,7 @@ void Session::authenticate(const Credentials & credentials_, const Poco::Net::So try { - user_id = global_context->getAccessControl().login(credentials_, address.host()); + user_id = global_context->getAccessControl().authenticate(credentials_, address.host()); LOG_DEBUG(log, "{} Authenticated with global context as user {}", toString(auth_id), user_id ? toString(*user_id) : ""); } diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp index 90fc0f6c0d0..0ccaae9a795 100644 --- a/src/Interpreters/Set.cpp +++ b/src/Interpreters/Set.cpp @@ -177,7 +177,7 @@ bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns) /// Remember the columns we will work with for (size_t i = 0; i < keys_size; ++i) { - materialized_columns.emplace_back(columns.at(i).column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality()); + materialized_columns.emplace_back(columns.at(i).column->convertToFullIfNeeded()); key_columns.emplace_back(materialized_columns.back().get()); } diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index fc6aa15a1e8..b3720b89eaa 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -24,12 +24,6 @@ # include #endif -namespace ProfileEvents -{ - extern const Event InsertedRows; - extern const Event InsertedBytes; -} - /// Implement some methods of ThreadStatus and CurrentThread here to avoid extra linking dependencies in clickhouse_common_io /// TODO It doesn't make sense. @@ -447,9 +441,8 @@ void ThreadStatus::logToQueryThreadLog(QueryThreadLog & thread_log, const String elem.read_rows = progress_in.read_rows.load(std::memory_order_relaxed); elem.read_bytes = progress_in.read_bytes.load(std::memory_order_relaxed); - /// TODO: Use written_rows and written_bytes when run time progress is implemented - elem.written_rows = progress_out.read_rows.load(std::memory_order_relaxed); - elem.written_bytes = progress_out.read_bytes.load(std::memory_order_relaxed); + elem.written_rows = progress_out.written_rows.load(std::memory_order_relaxed); + elem.written_bytes = progress_out.written_bytes.load(std::memory_order_relaxed); elem.memory_usage = memory_tracker.get(); elem.peak_memory_usage = memory_tracker.getPeak(); @@ -520,8 +513,8 @@ void ThreadStatus::logToQueryViewsLog(const ViewRuntimeData & vinfo) auto events = std::make_shared(performance_counters.getPartiallyAtomicSnapshot()); element.read_rows = progress_in.read_rows.load(std::memory_order_relaxed); element.read_bytes = progress_in.read_bytes.load(std::memory_order_relaxed); - element.written_rows = (*events)[ProfileEvents::InsertedRows]; - element.written_bytes = (*events)[ProfileEvents::InsertedBytes]; + element.written_rows = progress_out.written_rows.load(std::memory_order_relaxed); + element.written_bytes = progress_out.written_bytes.load(std::memory_order_relaxed); element.peak_memory_usage = memory_tracker.getPeak() > 0 ? 
memory_tracker.getPeak() : 0; if (query_context_ptr->getSettingsRef().log_profile_events != 0) { diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 5e355cd52af..64b25ca9777 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -410,12 +410,19 @@ void optimizeDuplicateDistinct(ASTSelectQuery & select) /// has a single argument and not an aggregate functions. void optimizeMonotonousFunctionsInOrderBy(ASTSelectQuery * select_query, ContextPtr context, const TablesWithColumns & tables_with_columns, - const Names & sorting_key_columns) + const TreeRewriterResult & result) { auto order_by = select_query->orderBy(); if (!order_by) return; + /// Do not apply optimization for Distributed and Merge storages, + /// because we can't get the sorting key of their undelying tables + /// and we can break the matching of the sorting key for `read_in_order` + /// optimization by removing monotonous functions from the prefix of key. + if (result.is_remote_storage || (result.storage && result.storage->getName() == "Merge")) + return; + for (const auto & child : order_by->children) { auto * order_by_element = child->as(); @@ -438,6 +445,8 @@ void optimizeMonotonousFunctionsInOrderBy(ASTSelectQuery * select_query, Context } } + auto sorting_key_columns = result.metadata_snapshot ? result.metadata_snapshot->getSortingKeyColumns() : Names{}; + bool is_sorting_key_prefix = true; for (size_t i = 0; i < order_by->children.size(); ++i) { @@ -735,8 +744,6 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, && result.storage->supportsSubcolumns() && result.metadata_snapshot) optimizeFunctionsToSubcolumns(query, result.metadata_snapshot); - optimizeIf(query, result.aliases, settings.optimize_if_chain_to_multiif); - /// Move arithmetic operations out of aggregation functions if (settings.optimize_arithmetic_operations_in_aggregate_functions) optimizeAggregationFunctions(query); @@ -802,8 +809,7 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, /// Replace monotonous functions with its argument if (settings.optimize_monotonous_functions_in_order_by) - optimizeMonotonousFunctionsInOrderBy(select_query, context, tables_with_columns, - result.metadata_snapshot ? result.metadata_snapshot->getSortingKeyColumns() : Names{}); + optimizeMonotonousFunctionsInOrderBy(select_query, context, tables_with_columns, result); /// Remove duplicate items from ORDER BY. /// Execute it after all order by optimizations, diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 6b3a50d88e2..0285bdf333c 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -957,7 +957,7 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select unknown_required_source_columns.erase(column_name); if (!required.count(column_name)) - source_columns.erase(it++); + it = source_columns.erase(it); else ++it; } @@ -973,7 +973,7 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select if (column) { source_columns.push_back(*column); - unknown_required_source_columns.erase(it++); + it = unknown_required_source_columns.erase(it); } else ++it; @@ -1120,8 +1120,10 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( /// Push the predicate expression down to subqueries. The optimization should be applied to both initial and secondary queries. 
result.rewrite_subqueries = PredicateExpressionsOptimizer(getContext(), tables_with_columns, settings).optimize(*select_query); + TreeOptimizer::optimizeIf(query, result.aliases, settings.optimize_if_chain_to_multiif); + /// Only apply AST optimization for initial queries. - if (getContext()->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + if (getContext()->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY && !select_options.ignore_ast_optimizations) TreeOptimizer::apply(query, result, tables_with_columns, getContext()); /// array_join_alias_to_name, array_join_result_to_source. diff --git a/src/Interpreters/UserDefinedExecutableFunction.cpp b/src/Interpreters/UserDefinedExecutableFunction.cpp index d57978d0fd6..e5a852b0e75 100644 --- a/src/Interpreters/UserDefinedExecutableFunction.cpp +++ b/src/Interpreters/UserDefinedExecutableFunction.cpp @@ -13,14 +13,12 @@ namespace DB UserDefinedExecutableFunction::UserDefinedExecutableFunction( const UserDefinedExecutableFunctionConfiguration & configuration_, - const ExternalLoadableLifetime & lifetime_, - std::shared_ptr process_pool_) + std::shared_ptr coordinator_, + const ExternalLoadableLifetime & lifetime_) : configuration(configuration_) + , coordinator(std::move(coordinator_)) , lifetime(lifetime_) - , process_pool(process_pool_) { - if (!process_pool && configuration.type == UserDefinedExecutableFunctionType::executable_pool) - process_pool = std::make_shared(configuration.pool_size == 0 ? std::numeric_limits::max() : configuration.pool_size); } }; diff --git a/src/Interpreters/UserDefinedExecutableFunction.h b/src/Interpreters/UserDefinedExecutableFunction.h index 1cb1de47578..a4fad8ceb7b 100644 --- a/src/Interpreters/UserDefinedExecutableFunction.h +++ b/src/Interpreters/UserDefinedExecutableFunction.h @@ -10,26 +10,13 @@ namespace DB { -enum class UserDefinedExecutableFunctionType -{ - executable, - executable_pool -}; - struct UserDefinedExecutableFunctionConfiguration { - UserDefinedExecutableFunctionType type = UserDefinedExecutableFunctionType::executable; std::string name; - std::string script_path; - std::string format; + std::string command; + std::vector command_arguments; std::vector argument_types; DataTypePtr result_type; - /// Pool settings - size_t pool_size = 0; - size_t command_termination_timeout = 0; - size_t max_command_execution_time = 0; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header = false; }; class UserDefinedExecutableFunction final : public IExternalLoadable @@ -38,8 +25,8 @@ public: UserDefinedExecutableFunction( const UserDefinedExecutableFunctionConfiguration & configuration_, - const ExternalLoadableLifetime & lifetime_, - std::shared_ptr process_pool_ = nullptr); + std::shared_ptr coordinator_, + const ExternalLoadableLifetime & lifetime_); const ExternalLoadableLifetime & getLifetime() const override { @@ -63,7 +50,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(configuration, lifetime, process_pool); + return std::make_shared(configuration, coordinator, lifetime); } const UserDefinedExecutableFunctionConfiguration & getConfiguration() const @@ -71,9 +58,9 @@ public: return configuration; } - std::shared_ptr getProcessPool() const + std::shared_ptr getCoordinator() const { - return process_pool; + return coordinator; } std::shared_ptr shared_from_this() @@ -87,13 +74,9 @@ public: } private: - UserDefinedExecutableFunction(const UserDefinedExecutableFunctionConfiguration & configuration_, - 
std::shared_ptr process_pool_, - const ExternalLoadableLifetime & lifetime_); - UserDefinedExecutableFunctionConfiguration configuration; + std::shared_ptr coordinator; ExternalLoadableLifetime lifetime; - std::shared_ptr process_pool; }; } diff --git a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp index 4cb3e034b01..0cffd61eaf6 100644 --- a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp +++ b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp @@ -1,8 +1,13 @@ #include "UserDefinedExecutableFunctionFactory.h" +#include + +#include + #include #include +#include #include #include @@ -19,7 +24,6 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int TIMEOUT_EXCEEDED; } class UserDefinedFunction final : public IFunction @@ -52,10 +56,36 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { + auto coordinator = executable_function->getCoordinator(); + const auto & coordinator_configuration = coordinator->getConfiguration(); const auto & configuration = executable_function->getConfiguration(); + + String command = configuration.command; + + if (coordinator_configuration.execute_direct) + { + auto user_scripts_path = context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; + + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); + + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); + + command = std::move(script_path); + } + + size_t argument_size = arguments.size(); auto arguments_copy = arguments; - for (size_t i = 0; i < arguments.size(); ++i) + for (size_t i = 0; i < argument_size; ++i) { auto & column_with_type = arguments_copy[i]; column_with_type.column = column_with_type.column->convertToFullColumnIfConst(); @@ -71,53 +101,33 @@ public: column_with_type = column_to_cast; } - std::unique_ptr process = getProcess(); - ColumnWithTypeAndName result(result_type, "result"); Block result_block({result}); Block arguments_block(arguments_copy); - auto * process_in = &process->in; - - auto process_pool = executable_function->getProcessPool(); - bool is_executable_pool_function = (process_pool != nullptr); + auto source = std::make_shared(std::move(arguments_block)); + auto shell_input_pipe = Pipe(std::move(source)); ShellCommandSourceConfiguration shell_command_source_configuration; - if (is_executable_pool_function) + if (coordinator->getConfiguration().is_executable_pool) { shell_command_source_configuration.read_fixed_number_of_rows = true; shell_command_source_configuration.number_of_rows_to_read = input_rows_count; } - ShellCommandSource::SendDataTask task = {[process_in, arguments_block, &configuration, is_executable_pool_function, this]() - { - auto & out = *process_in; + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); - if (configuration.send_chunk_header) - { - writeText(arguments_block.rows(), out); - writeChar('\n', out); - } - - auto output_format = context->getOutputFormat(configuration.format, out, arguments_block.cloneEmpty()); - formatBlock(output_format, arguments_block); 
- if (!is_executable_pool_function) - out.close(); - }}; - std::vector tasks = {std::move(task)}; - - Pipe pipe(std::make_unique( + Pipe pipe = coordinator->createPipe( + command, + configuration.command_arguments, + std::move(shell_input_pipes), + result_block, context, - configuration.format, - result_block.cloneEmpty(), - std::move(process), - std::move(tasks), - shell_command_source_configuration, - process_pool)); + shell_command_source_configuration); QueryPipeline pipeline(std::move(pipe)); - PullingPipelineExecutor executor(pipeline); auto result_column = result_type->createColumn(); @@ -143,36 +153,6 @@ public: private: - std::unique_ptr getProcess() const - { - auto process_pool = executable_function->getProcessPool(); - auto executable_function_configuration = executable_function->getConfiguration(); - - std::unique_ptr process; - bool is_executable_pool_function = (process_pool != nullptr); - if (is_executable_pool_function) - { - bool result = process_pool->tryBorrowObject(process, [&]() - { - ShellCommand::Config process_config(executable_function_configuration.script_path); - process_config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, executable_function_configuration.command_termination_timeout }; - auto shell_command = ShellCommand::execute(process_config); - return shell_command; - }, executable_function_configuration.max_command_execution_time * 1000); - - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - executable_function_configuration.max_command_execution_time); - } - else - { - process = ShellCommand::execute(executable_function_configuration.script_path); - } - - return process; - } - ExternalUserDefinedExecutableFunctionsLoader::UserDefinedExecutableFunctionPtr executable_function; ContextPtr context; }; diff --git a/src/Interpreters/applyTableOverride.cpp b/src/Interpreters/applyTableOverride.cpp new file mode 100644 index 00000000000..e614e58b06b --- /dev/null +++ b/src/Interpreters/applyTableOverride.cpp @@ -0,0 +1,125 @@ +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +void applyTableOverrideToCreateQuery(const ASTTableOverride & override, ASTCreateQuery * create_query) +{ + if (auto * columns = override.columns) + { + if (!create_query->columns_list) + create_query->set(create_query->columns_list, std::make_shared()); + if (columns->columns) + { + for (const auto & override_column_ast : columns->columns->children) + { + auto * override_column = override_column_ast->as(); + if (!override_column) + continue; + if (!create_query->columns_list->columns) + create_query->columns_list->set(create_query->columns_list->columns, std::make_shared()); + auto & dest_children = create_query->columns_list->columns->children; + auto exists = std::find_if(dest_children.begin(), dest_children.end(), [&](ASTPtr node) -> bool + { + return node->as()->name == override_column->name; + }); + /// For columns, only allow adding ALIAS (non-physical) for now. + /// TODO: This logic should instead be handled by validation that is + /// executed from InterpreterCreateQuery / InterpreterAlterQuery. 
+ if (exists == dest_children.end()) + { + if (override_column->default_specifier == "ALIAS") + dest_children.emplace_back(override_column_ast); + } + else + dest_children[exists - dest_children.begin()] = override_column_ast; + } + } + if (columns->indices) + { + for (const auto & override_index_ast : columns->indices->children) + { + auto * override_index = override_index_ast->as(); + if (!override_index) + continue; + if (!create_query->columns_list->indices) + create_query->columns_list->set(create_query->columns_list->indices, std::make_shared()); + auto & dest_children = create_query->columns_list->indices->children; + auto exists = std::find_if(dest_children.begin(), dest_children.end(), [&](ASTPtr node) -> bool + { + return node->as()->name == override_index->name; + }); + if (exists == dest_children.end()) + dest_children.emplace_back(override_index_ast); + else + dest_children[exists - dest_children.begin()] = override_index_ast; + } + } + if (columns->constraints) + { + for (const auto & override_constraint_ast : columns->constraints->children) + { + auto * override_constraint = override_constraint_ast->as(); + if (!override_constraint) + continue; + if (!create_query->columns_list->constraints) + create_query->columns_list->set(create_query->columns_list->constraints, std::make_shared()); + auto & dest_children = create_query->columns_list->constraints->children; + auto exists = std::find_if(dest_children.begin(), dest_children.end(), [&](ASTPtr node) -> bool + { + return node->as()->name == override_constraint->name; + }); + if (exists == dest_children.end()) + dest_children.emplace_back(override_constraint_ast); + else + dest_children[exists - dest_children.begin()] = override_constraint_ast; + } + } + if (columns->projections) + { + for (const auto & override_projection_ast : columns->projections->children) + { + auto * override_projection = override_projection_ast->as(); + if (!override_projection) + continue; + if (!create_query->columns_list->projections) + create_query->columns_list->set(create_query->columns_list->projections, std::make_shared()); + auto & dest_children = create_query->columns_list->projections->children; + auto exists = std::find_if(dest_children.begin(), dest_children.end(), [&](ASTPtr node) -> bool + { + return node->as()->name == override_projection->name; + }); + if (exists == dest_children.end()) + dest_children.emplace_back(override_projection_ast); + else + dest_children[exists - dest_children.begin()] = override_projection_ast; + } + } + } + if (auto * storage = override.storage) + { + if (!create_query->storage) + create_query->set(create_query->storage, std::make_shared()); + if (storage->partition_by) + create_query->storage->set(create_query->storage->partition_by, storage->partition_by->clone()); + if (storage->primary_key) + create_query->storage->set(create_query->storage->primary_key, storage->primary_key->clone()); + if (storage->order_by) + create_query->storage->set(create_query->storage->order_by, storage->order_by->clone()); + if (storage->sample_by) + create_query->storage->set(create_query->storage->sample_by, storage->sample_by->clone()); + if (storage->ttl_table) + create_query->storage->set(create_query->storage->ttl_table, storage->ttl_table->clone()); + // No support for overriding ENGINE and SETTINGS + } + +} + +} diff --git a/src/Interpreters/applyTableOverride.h b/src/Interpreters/applyTableOverride.h new file mode 100644 index 00000000000..1c51c3b8506 --- /dev/null +++ b/src/Interpreters/applyTableOverride.h @@ -0,0 
+1,16 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class ASTTableOverride; +class ASTCreateQuery; +class ASTIndentifier; + +void applyTableOverrideToCreateQuery(const ASTTableOverride & override, ASTCreateQuery * create_query); + +} diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index fa8e77e4f4c..d0f15a4c595 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index b69bbcc6332..7dcfc4b95b3 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -556,9 +556,14 @@ static std::tuple executeQueryImpl( auto * insert_query = ast->as(); - if (insert_query && insert_query->table_id) - /// Resolve database before trying to use async insert feature - to properly hash the query. - insert_query->table_id = context->resolveStorageID(insert_query->table_id); + /// Resolve database before trying to use async insert feature - to properly hash the query. + if (insert_query) + { + if (insert_query->table_id) + insert_query->table_id = context->resolveStorageID(insert_query->table_id); + else if (auto table = insert_query->getTable(); !table.empty()) + insert_query->table_id = context->resolveStorageID(StorageID{insert_query->getDatabase(), table}); + } if (insert_query && insert_query->select) { @@ -579,8 +584,14 @@ static std::tuple executeQueryImpl( } } else + { /// reset Input callbacks if query is not INSERT SELECT context->resetInputCallbacks(); + } + + StreamLocalLimits limits; + std::shared_ptr quota; + std::unique_ptr interpreter; auto * queue = context->getAsynchronousInsertQueue(); const bool async_insert = queue @@ -591,65 +602,71 @@ static std::tuple executeQueryImpl( { queue->push(ast, context); - BlockIO io; if (settings.wait_for_async_insert) { auto timeout = settings.wait_for_async_insert_timeout.totalMilliseconds(); auto query_id = context->getCurrentQueryId(); auto source = std::make_shared(query_id, timeout, *queue); - io.pipeline = QueryPipeline(Pipe(std::move(source))); + res.pipeline = QueryPipeline(Pipe(std::move(source))); } - return std::make_tuple(ast, std::move(io)); - } - - auto interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal)); - - std::shared_ptr quota; - if (!interpreter->ignoreQuota()) - { quota = context->getQuota(); if (quota) { - if (ast->as() || ast->as()) - { - quota->used(QuotaType::QUERY_SELECTS, 1); - } - else if (ast->as()) - { - quota->used(QuotaType::QUERY_INSERTS, 1); - } + quota->used(QuotaType::QUERY_INSERTS, 1); quota->used(QuotaType::QUERIES, 1); - quota->checkExceeded(QuotaType::ERRORS); } - } - StreamLocalLimits limits; - if (!interpreter->ignoreLimits()) - { - limits.mode = LimitsMode::LIMITS_CURRENT; //-V1048 - limits.size_limits = SizeLimits(settings.max_result_rows, settings.max_result_bytes, settings.result_overflow_mode); - } - - { - std::unique_ptr span; - if (context->query_trace_context.trace_id != UUID()) - { - auto raw_interpreter_ptr = interpreter.get(); - std::string class_name(abi::__cxa_demangle(typeid(*raw_interpreter_ptr).name(), nullptr, nullptr, nullptr)); - span = std::make_unique(class_name + "::execute()"); - } - res = interpreter->execute(); - } - - QueryPipeline & pipeline = res.pipeline; - - if (const auto * insert_interpreter = typeid_cast(&*interpreter)) - { - /// Save insertion 
table (not table function). TODO: support remote() table function. - auto table_id = insert_interpreter->getDatabaseTable(); + const auto & table_id = insert_query->table_id; if (!table_id.empty()) - context->setInsertionTable(std::move(table_id)); + context->setInsertionTable(table_id); + } + else + { + interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal)); + + if (!interpreter->ignoreQuota()) + { + quota = context->getQuota(); + if (quota) + { + if (ast->as() || ast->as()) + { + quota->used(QuotaType::QUERY_SELECTS, 1); + } + else if (ast->as()) + { + quota->used(QuotaType::QUERY_INSERTS, 1); + } + quota->used(QuotaType::QUERIES, 1); + quota->checkExceeded(QuotaType::ERRORS); + } + } + + if (!interpreter->ignoreLimits()) + { + limits.mode = LimitsMode::LIMITS_CURRENT; //-V1048 + limits.size_limits = SizeLimits(settings.max_result_rows, settings.max_result_bytes, settings.result_overflow_mode); + } + + { + std::unique_ptr span; + if (context->query_trace_context.trace_id != UUID()) + { + auto * raw_interpreter_ptr = interpreter.get(); + std::string class_name(abi::__cxa_demangle(typeid(*raw_interpreter_ptr).name(), nullptr, nullptr, nullptr)); + span = std::make_unique(class_name + "::execute()"); + } + res = interpreter->execute(); + } + + if (const auto * insert_interpreter = typeid_cast(&*interpreter)) + { + /// Save insertion table (not table function). TODO: support remote() table function. + auto table_id = insert_interpreter->getDatabaseTable(); + if (!table_id.empty()) + context->setInsertionTable(std::move(table_id)); + } } if (process_list_entry) @@ -663,6 +680,8 @@ static std::tuple executeQueryImpl( /// Hold element of process list till end of query execution. res.process_list_entry = process_list_entry; + auto & pipeline = res.pipeline; + if (pipeline.pulling() || pipeline.completed()) { /// Limits on the result, the quota on the result, and also callback for progress. 
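// Condensed sketch of the control flow introduced by the hunk above (annotation only,
// not part of the patch; identifiers are the ones used in the code):
//
//   if (async_insert)
//   {
//       queue->push(ast, context);                 // no interpreter is constructed at all
//       if (settings.wait_for_async_insert)
//           res.pipeline = QueryPipeline(Pipe(std::move(source)));   // wait for the queue flush
//       quota->used(QuotaType::QUERY_INSERTS, 1);  // quotas are charged directly
//       context->setInsertionTable(insert_query->table_id);
//   }
//   else
//   {
//       interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal));
//       /* quota and limit checks as before */
//       res = interpreter->execute();
//   }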
@@ -712,7 +731,10 @@ static std::tuple executeQueryImpl( elem.query_views = info.views; } - interpreter->extendQueryLogElem(elem, ast, context, query_database, query_table); + if (async_insert) + InterpreterInsertQuery::extendQueryLogElemImpl(elem, context); + else if (interpreter) + interpreter->extendQueryLogElem(elem, ast, context, query_database, query_table); if (settings.log_query_settings) elem.query_settings = std::make_shared(context->getSettingsRef()); @@ -819,8 +841,8 @@ static std::tuple executeQueryImpl( else /// will be used only for ordinary INSERT queries { auto progress_out = process_list_elem->getProgressOut(); - elem.result_rows = progress_out.read_rows; - elem.result_bytes = progress_out.read_bytes; + elem.result_rows = progress_out.written_rows; + elem.result_bytes = progress_out.written_rows; } if (elem.read_rows != 0) diff --git a/src/Interpreters/getTableExpressions.cpp b/src/Interpreters/getTableExpressions.cpp index d82c7fc1332..830f0ea4411 100644 --- a/src/Interpreters/getTableExpressions.cpp +++ b/src/Interpreters/getTableExpressions.cpp @@ -16,7 +16,7 @@ NameSet removeDuplicateColumns(NamesAndTypesList & columns) if (names.emplace(it->name).second) ++it; else - columns.erase(it++); + it = columns.erase(it); } return names; } diff --git a/src/Interpreters/getTableOverride.cpp b/src/Interpreters/getTableOverride.cpp new file mode 100644 index 00000000000..903d9e80836 --- /dev/null +++ b/src/Interpreters/getTableOverride.cpp @@ -0,0 +1,27 @@ +#include "getTableOverride.h" + +#include +#include +#include +#include + +namespace DB +{ + +ASTPtr tryGetTableOverride(const String & mapped_database, const String & table) +{ + if (auto database_ptr = DatabaseCatalog::instance().tryGetDatabase(mapped_database)) + { + auto create_query = database_ptr->getCreateDatabaseQuery(); + if (auto * create_database_query = create_query->as()) + { + if (create_database_query->table_overrides) + { + return create_database_query->table_overrides->tryGetTableOverride(table); + } + } + } + return nullptr; +} + +} diff --git a/src/Interpreters/getTableOverride.h b/src/Interpreters/getTableOverride.h new file mode 100644 index 00000000000..1a0a15e6fe2 --- /dev/null +++ b/src/Interpreters/getTableOverride.h @@ -0,0 +1,8 @@ +#pragma once +#include +#include + +namespace DB +{ +ASTPtr tryGetTableOverride(const String & mapped_database, const String & table); +} diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index bf20bef6992..ca55fde0740 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -133,8 +134,11 @@ DataTypePtr convertTypeToNullable(const DataTypePtr & type) /// Convert column to nullable. If column LowCardinality or Const, convert nested column. /// Returns nullptr if conversion cannot be performed. 
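/// Note: as of the change below, a sparse column is expanded with recursiveRemoveSparse
/// before the nullability conversion is attempted.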
-static ColumnPtr tryConvertColumnToNullable(const ColumnPtr & col) +static ColumnPtr tryConvertColumnToNullable(ColumnPtr col) { + if (col->isSparse()) + col = recursiveRemoveSparse(col); + if (isColumnNullable(*col) || col->canBeInsideNullable()) return makeNullable(col); @@ -225,7 +229,13 @@ void removeColumnNullability(ColumnWithTypeAndName & column) if (column.column && column.column->isNullable()) { + column.column = column.column->convertToFullColumnIfConst(); const auto * nullable_col = checkAndGetColumn(*column.column); + if (!nullable_col) + { + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' is expected to be nullable", column.dumpStructure()); + } + MutableColumnPtr mutable_column = nullable_col->getNestedColumn().cloneEmpty(); insertFromNullableOrDefault(mutable_column, nullable_col); column.column = std::move(mutable_column); @@ -291,7 +301,7 @@ ColumnRawPtrs materializeColumnsInplace(Block & block, const Names & names) for (const auto & column_name : names) { auto & column = block.getByName(column_name).column; - column = recursiveRemoveLowCardinality(column->convertToFullColumnIfConst()); + column = recursiveRemoveLowCardinality(recursiveRemoveSparse(column->convertToFullColumnIfConst())); ptrs.push_back(column.get()); } @@ -316,7 +326,8 @@ ColumnRawPtrMap materializeColumnsInplaceMap(Block & block, const Names & names) ColumnPtr materializeColumn(const Block & block, const String & column_name) { const auto & src_column = block.getByName(column_name).column; - return recursiveRemoveLowCardinality(src_column->convertToFullColumnIfConst()); + return recursiveRemoveLowCardinality( + recursiveRemoveSparse(src_column->convertToFullColumnIfConst())); } Columns materializeColumns(const Block & block, const Names & names) @@ -343,22 +354,22 @@ ColumnRawPtrs getRawPointers(const Columns & columns) return ptrs; } -void removeLowCardinalityInplace(Block & block) +void convertToFullColumnsInplace(Block & block) { for (size_t i = 0; i < block.columns(); ++i) { auto & col = block.getByPosition(i); - col.column = recursiveRemoveLowCardinality(col.column); + col.column = recursiveRemoveLowCardinality(recursiveRemoveSparse(col.column)); col.type = recursiveRemoveLowCardinality(col.type); } } -void removeLowCardinalityInplace(Block & block, const Names & names, bool change_type) +void convertToFullColumnsInplace(Block & block, const Names & names, bool change_type) { for (const String & column_name : names) { auto & col = block.getByName(column_name); - col.column = recursiveRemoveLowCardinality(col.column); + col.column = recursiveRemoveLowCardinality(recursiveRemoveSparse(col.column)); if (change_type) col.type = recursiveRemoveLowCardinality(col.type); } @@ -395,6 +406,9 @@ ColumnRawPtrs extractKeysForJoin(const Block & block_keys, const Names & key_nam /// We will join only keys, where all components are not NULL. 
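/// With this patch the key columns may also arrive in sparse serialization; in that
/// case the values column of the ColumnSparse is used as the join key (see below).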
if (const auto * nullable = checkAndGetColumn(*key_columns[i])) key_columns[i] = &nullable->getNestedColumn(); + + if (const auto * sparse = checkAndGetColumn(*key_columns[i])) + key_columns[i] = &sparse->getValuesColumn(); } return key_columns; diff --git a/src/Interpreters/join_common.h b/src/Interpreters/join_common.h index d3d2a442e41..3e5a22f33bf 100644 --- a/src/Interpreters/join_common.h +++ b/src/Interpreters/join_common.h @@ -72,8 +72,8 @@ Columns materializeColumns(const Block & block, const Names & names); ColumnRawPtrs materializeColumnsInplace(Block & block, const Names & names); ColumnRawPtrMap materializeColumnsInplaceMap(Block & block, const Names & names); ColumnRawPtrs getRawPointers(const Columns & columns); -void removeLowCardinalityInplace(Block & block); -void removeLowCardinalityInplace(Block & block, const Names & names, bool change_type = true); +void convertToFullColumnsInplace(Block & block); +void convertToFullColumnsInplace(Block & block, const Names & names, bool change_type = true); void restoreLowCardinalityInplace(Block & block, const Names & lowcard_keys); ColumnRawPtrs extractKeysForJoin(const Block & block_keys, const Names & key_names_right); diff --git a/src/Interpreters/tests/gtest_table_overrides.cpp b/src/Interpreters/tests/gtest_table_overrides.cpp new file mode 100644 index 00000000000..779bc7a53a4 --- /dev/null +++ b/src/Interpreters/tests/gtest_table_overrides.cpp @@ -0,0 +1,88 @@ +#include +#include +#include +#include + +#include +#include + +namespace +{ +using namespace DB; +using namespace std::literals; +} + + +struct TableOverrideTestCase +{ + String create_database_query; + String create_table_query; + String expected_create_table_query; +}; + +std::ostream & operator<<(std::ostream & ostr, const TableOverrideTestCase & test_case) +{ + return ostr << "database: " << test_case.create_database_query << ", table: " << test_case.create_table_query + << ", expected: " << test_case.expected_create_table_query; +} + +class TableOverrideTest : public ::testing::TestWithParam +{}; + +TEST_P(TableOverrideTest, applyOverrides) +{ + const auto & [database_query, table_query, expected_query] = GetParam(); + ParserCreateQuery parser; + ASTPtr database_ast; + ASSERT_NO_THROW(database_ast = parseQuery(parser, database_query, 0, 0)); + auto * database = database_ast->as(); + ASSERT_NE(nullptr, database); + ASTPtr table_ast; + ASSERT_NO_THROW(table_ast = parseQuery(parser, table_query, 0, 0)); + auto * table = table_ast->as(); + ASSERT_NE(nullptr, table); + auto table_name = table->table->as()->name(); + if (database->table_overrides) + { + auto override_ast = database->table_overrides->tryGetTableOverride(table_name); + ASSERT_NE(nullptr, override_ast); + auto * override_table_ast = override_ast->as(); + ASSERT_NE(nullptr, override_table_ast); + applyTableOverrideToCreateQuery(*override_table_ast, table); + } + EXPECT_EQ(expected_query, serializeAST(*table)); +} + +INSTANTIATE_TEST_SUITE_P(ApplyTableOverrides, TableOverrideTest, + ::testing::ValuesIn(std::initializer_list{ + { + "CREATE DATABASE db", + "CREATE TABLE db.t (id Int64) ENGINE=Log", + "CREATE TABLE db.t (`id` Int64) ENGINE = Log" + }, + { + "CREATE DATABASE db TABLE OVERRIDE t (PARTITION BY tuple())", + "CREATE TABLE db.t (id Int64) ENGINE=MergeTree", + "CREATE TABLE db.t (`id` Int64) ENGINE = MergeTree PARTITION BY tuple()" + }, + { + "CREATE DATABASE db TABLE OVERRIDE t (COLUMNS (id UInt64 CODEC(Delta), shard UInt8 ALIAS modulo(id, 16)) PARTITION BY shard)", + "CREATE TABLE db.t (id Int64) 
ENGINE=MergeTree", + "CREATE TABLE db.t (`id` UInt64 CODEC(Delta), `shard` UInt8 ALIAS id % 16) ENGINE = MergeTree PARTITION BY shard" + }, + { + "CREATE DATABASE db TABLE OVERRIDE a (PARTITION BY modulo(id, 3)), TABLE OVERRIDE b (PARTITION BY modulo(id, 5))", + "CREATE TABLE db.a (id Int64) ENGINE=MergeTree", + "CREATE TABLE db.a (`id` Int64) ENGINE = MergeTree PARTITION BY id % 3" + }, + { + "CREATE DATABASE db TABLE OVERRIDE a (PARTITION BY modulo(id, 3)), TABLE OVERRIDE b (PARTITION BY modulo(id, 5))", + "CREATE TABLE db.b (id Int64) ENGINE=MergeTree", + "CREATE TABLE db.b (`id` Int64) ENGINE = MergeTree PARTITION BY id % 5" + }, + { + "CREATE DATABASE db TABLE OVERRIDE `tbl` (PARTITION BY toYYYYMM(created))", + "CREATE TABLE db.tbl (id Int64, created DateTime) ENGINE=Foo", + "CREATE TABLE db.tbl (`id` Int64, `created` DateTime) ENGINE = Foo PARTITION BY toYYYYMM(created)", + } +})); diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp index facc7e728c9..e61a0f55142 100644 --- a/src/Parsers/ASTCreateQuery.cpp +++ b/src/Parsers/ASTCreateQuery.cpp @@ -200,6 +200,8 @@ ASTPtr ASTCreateQuery::clone() const res->set(res->select, select->clone()); if (tables) res->set(res->tables, tables->clone()); + if (table_overrides) + res->set(res->table_overrides, table_overrides->clone()); if (dictionary) { @@ -240,6 +242,12 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat if (storage) storage->formatImpl(settings, state, frame); + if (table_overrides) + { + settings.ostr << settings.nl_or_ws; + table_overrides->formatImpl(settings, state, frame); + } + if (comment) { settings.ostr << (settings.hilite ? hilite_keyword : "") << settings.nl_or_ws << "COMMENT " << (settings.hilite ? hilite_none : ""); @@ -351,7 +359,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat if (as_table_function) { - if (columns_list) + if (columns_list && !columns_list->empty()) { frame.expression_list_always_start_on_new_line = true; settings.ostr << (settings.one_line ? " (" : "\n("); @@ -367,7 +375,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat frame.expression_list_always_start_on_new_line = true; - if (columns_list && !as_table_function) + if (columns_list && !columns_list->empty() && !as_table_function) { settings.ostr << (settings.one_line ? " (" : "\n("); FormatStateStacked frame_nested = frame; @@ -419,8 +427,11 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat if (select) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " AS" << settings.nl_or_ws << (settings.hilite ? hilite_none : ""); + settings.ostr << (settings.hilite ? hilite_keyword : "") << " AS" + << (comment ? "(" : "") + << settings.nl_or_ws << (settings.hilite ? hilite_none : ""); select->formatImpl(settings, state, frame); + settings.ostr << (comment ? 
")" : ""); } if (tables) diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index 2516b1d0728..2e35731acad 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace DB @@ -49,6 +50,12 @@ public: ASTPtr clone() const override; void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override; + + bool empty() + { + return (!columns || columns->children.empty()) && (!indices || indices->children.empty()) && (!constraints || constraints->children.empty()) + && (!projections || projections->children.empty()); + } }; @@ -79,6 +86,8 @@ public: ASTSelectWithUnionQuery * select = nullptr; IAST * comment = nullptr; + ASTTableOverrideList * table_overrides = nullptr; /// For CREATE DATABASE with engines that automatically create tables + bool is_dictionary{false}; /// CREATE DICTIONARY ASTExpressionList * dictionary_attributes_list = nullptr; /// attributes of ASTDictionary * dictionary = nullptr; /// dictionary definition (layout, primary key, etc.) diff --git a/src/Parsers/ASTTableOverrides.cpp b/src/Parsers/ASTTableOverrides.cpp new file mode 100644 index 00000000000..d2625bf19b4 --- /dev/null +++ b/src/Parsers/ASTTableOverrides.cpp @@ -0,0 +1,137 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +ASTPtr ASTTableOverride::clone() const +{ + auto res = std::make_shared(*this); + res->children.clear(); + res->table_name = table_name; + if (columns) + res->set(res->columns, columns->clone()); + if (storage) + res->set(res->storage, storage->clone()); + return res; +} + +void ASTTableOverride::formatImpl(const FormatSettings & settings_, FormatState & state, FormatStateStacked frame) const +{ + FormatSettings settings = settings_; + settings.always_quote_identifiers = true; + String nl_or_nothing = settings.one_line ? "" : "\n"; + String nl_or_ws = settings.one_line ? " " : "\n"; + String hl_keyword = settings.hilite ? hilite_keyword : ""; + String hl_none = settings.hilite ? hilite_none : ""; + + settings.ostr << hl_keyword << "TABLE OVERRIDE " << hl_none; + ASTIdentifier(table_name).formatImpl(settings, state, frame); + if (!columns && (!storage || storage->children.empty())) + return; + auto override_frame = frame; + ++override_frame.indent; + settings.ostr << nl_or_ws << '(' << nl_or_nothing; + String indent_str = settings.one_line ? "" : String(4 * override_frame.indent, ' '); + size_t override_elems = 0; + if (columns) + { + FormatStateStacked columns_frame = override_frame; + columns_frame.expression_list_always_start_on_new_line = true; + settings.ostr << indent_str << hl_keyword << "COLUMNS" << hl_none << nl_or_ws << indent_str << "("; + columns->formatImpl(settings, state, columns_frame); + settings.ostr << nl_or_nothing << indent_str << ")"; + ++override_elems; + } + if (storage) + { + const auto & format_storage_elem = [&](IAST * elem, const String & elem_name) + { + if (elem) + { + settings.ostr << (override_elems++ ? 
nl_or_ws : "") + << indent_str + << hl_keyword << elem_name << hl_none << ' '; + elem->formatImpl(settings, state, override_frame); + } + }; + format_storage_elem(storage->partition_by, "PARTITION BY"); + format_storage_elem(storage->primary_key, "PRIMARY KEY"); + format_storage_elem(storage->order_by, "ORDER BY"); + format_storage_elem(storage->sample_by, "SAMPLE BY"); + format_storage_elem(storage->ttl_table, "TTL"); + } + + settings.ostr << nl_or_nothing << ')'; +} + +ASTPtr ASTTableOverrideList::clone() const +{ + auto res = std::make_shared(*this); + res->cloneChildren(); + return res; +} + +ASTPtr ASTTableOverrideList::tryGetTableOverride(const String & name) const +{ + auto it = positions.find(name); + if (it == positions.end()) + return nullptr; + return children[it->second]; +} + +void ASTTableOverrideList::setTableOverride(const String & name, const ASTPtr ast) +{ + auto it = positions.find(name); + if (it == positions.end()) + { + positions[name] = children.size(); + children.emplace_back(ast); + } + else + { + children[it->second] = ast; + } +} + +void ASTTableOverrideList::removeTableOverride(const String & name) +{ + if (hasOverride(name)) + { + size_t pos = positions[name]; + children.erase(children.begin() + pos); + positions.erase(name); + for (auto & pr : positions) + if (pr.second > pos) + --pr.second; + } +} + +bool ASTTableOverrideList::hasOverride(const String & name) const +{ + return positions.count(name); +} + +void ASTTableOverrideList::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const +{ + if (frame.expression_list_prepend_whitespace) + settings.ostr << ' '; + + for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it) + { + if (it != children.begin()) + { + settings.ostr << (settings.one_line ? 
", " : ",\n"); + } + + (*it)->formatImpl(settings, state, frame); + } +} + +} diff --git a/src/Parsers/ASTTableOverrides.h b/src/Parsers/ASTTableOverrides.h new file mode 100644 index 00000000000..62e96b16b01 --- /dev/null +++ b/src/Parsers/ASTTableOverrides.h @@ -0,0 +1,51 @@ +#pragma once + +#include + +#include + + +namespace DB +{ + +class ASTColumns; +class ASTCreateQuery; +class ASTIdentifier; +class ASTStorage; + +/// Storage and column overrides for a single table, for example: +/// +/// TABLE OVERRIDE `foo` PARTITION BY toYYYYMM(`createtime`) +/// +class ASTTableOverride : public IAST +{ +public: + String table_name; + ASTColumns * columns = nullptr; + ASTStorage * storage = nullptr; + String getID(char) const override { return "TableOverride " + table_name; } + ASTPtr clone() const override; + void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; +}; + +/// List of table overrides, for example: +/// +/// TABLE OVERRIDE `foo` (PARTITION BY toYYYYMM(`createtime`)), +/// TABLE OVERRIDE `bar` (SAMPLE BY `id`) +/// +class ASTTableOverrideList : public IAST +{ +public: + String getID(char) const override { return "TableOverrideList"; } + ASTPtr clone() const override; + void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; + void setTableOverride(const String & name, const ASTPtr ast); + void removeTableOverride(const String & name); + ASTPtr tryGetTableOverride(const String & name) const; + bool hasOverride(const String & name) const; + +private: + std::map positions; +}; + +} diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index e9ec7b43a21..584c2a32afd 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -35,7 +34,6 @@ #include #include -#include #include "ASTColumnsMatcher.h" #include @@ -1935,15 +1933,21 @@ bool ParserColumnsTransformers::parseImpl(Pos & pos, ASTPtr & node, Expected & e { if (const auto * func = lambda->as(); func && func->name == "lambda") { + if (func->arguments->children.size() != 2) + throw Exception(ErrorCodes::SYNTAX_ERROR, "lambda requires two arguments"); + const auto * lambda_args_tuple = func->arguments->children.at(0)->as(); + if (!lambda_args_tuple || lambda_args_tuple->name != "tuple") + throw Exception(ErrorCodes::SYNTAX_ERROR, "First argument of lambda must be a tuple"); + const ASTs & lambda_arg_asts = lambda_args_tuple->arguments->children; if (lambda_arg_asts.size() != 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "APPLY column transformer can only accept lambda with one argument"); + throw Exception(ErrorCodes::SYNTAX_ERROR, "APPLY column transformer can only accept lambda with one argument"); if (auto opt_arg_name = tryGetIdentifierName(lambda_arg_asts[0]); opt_arg_name) lambda_arg = *opt_arg_name; else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "lambda argument declarations must be identifiers"); + throw Exception(ErrorCodes::SYNTAX_ERROR, "lambda argument declarations must be identifiers"); } else { diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 485df2d3662..680d3f6031b 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -402,7 +402,7 @@ bool ParserVariableArityOperatorList::parseImpl(Pos & pos, ASTPtr & node, Expect bool ParserBetweenExpression::parseImpl(Pos 
& pos, ASTPtr & node, Expected & expected) { /// For the expression (subject [NOT] BETWEEN left AND right) - /// create an AST the same as for (subject> = left AND subject <= right). + /// create an AST the same as for (subject >= left AND subject <= right). ParserKeyword s_not("NOT"); ParserKeyword s_between("BETWEEN"); diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index dd224db1c5a..7f47e1efb49 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -1,22 +1,23 @@ -#include +#include +#include +#include +#include #include #include #include -#include -#include -#include #include -#include +#include #include +#include +#include #include #include -#include -#include -#include #include #include #include -#include +#include +#include +#include namespace DB @@ -556,34 +557,43 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe } } } + /** Create queries without list of columns: + * - CREATE|ATTACH TABLE ... AS ... + * - CREATE|ATTACH TABLE ... ENGINE = engine + */ else { storage_p.parse(pos, storage, expected); - if (!s_as.ignore(pos, expected)) - return false; - - if (!select_p.parse(pos, select, expected)) /// AS SELECT ... + /// CREATE|ATTACH TABLE ... AS ... + if (s_as.ignore(pos, expected)) { - /// ENGINE can not be specified for table functions. - if (storage || !table_function_p.parse(pos, as_table_function, expected)) + if (!select_p.parse(pos, select, expected)) /// AS SELECT ... { - /// AS [db.]table - if (!name_p.parse(pos, as_table, expected)) - return false; - - if (s_dot.ignore(pos, expected)) + /// ENGINE can not be specified for table functions. + if (storage || !table_function_p.parse(pos, as_table_function, expected)) { - as_database = as_table; + /// AS [db.]table if (!name_p.parse(pos, as_table, expected)) return false; - } - /// Optional - ENGINE can be specified. - if (!storage) - storage_p.parse(pos, storage, expected); + if (s_dot.ignore(pos, expected)) + { + as_database = as_table; + if (!name_p.parse(pos, as_table, expected)) + return false; + } + + /// Optional - ENGINE can be specified. 
+ if (!storage) + storage_p.parse(pos, storage, expected); + } } } + else if (!storage) + { + return false; + } } auto comment = parseComment(pos, expected); @@ -746,6 +756,7 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e if (!select_p.parse(pos, select, expected)) return false; + auto comment = parseComment(pos, expected); auto query = std::make_shared(); node = query; @@ -780,6 +791,9 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e if (live_view_periodic_refresh) query->live_view_periodic_refresh.emplace(live_view_periodic_refresh->as().value.safeGet()); + if (comment) + query->set(query->comment, comment); + return true; } @@ -932,6 +946,141 @@ bool ParserCreateWindowViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & return true; } +bool ParserTableOverrideDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + ParserKeyword s_table_override("TABLE OVERRIDE"); + ParserIdentifier table_name_p; + ParserToken lparen_p(TokenType::OpeningRoundBracket); + ParserToken rparen_p(TokenType::ClosingRoundBracket); + ParserTablePropertiesDeclarationList table_properties_p; + ParserExpression expression_p; + ParserTTLExpressionList parser_ttl_list; + ParserKeyword s_columns("COLUMNS"); + ParserKeyword s_partition_by("PARTITION BY"); + ParserKeyword s_primary_key("PRIMARY KEY"); + ParserKeyword s_order_by("ORDER BY"); + ParserKeyword s_sample_by("SAMPLE BY"); + ParserKeyword s_ttl("TTL"); + ASTPtr table_name; + ASTPtr columns; + ASTPtr partition_by; + ASTPtr primary_key; + ASTPtr order_by; + ASTPtr sample_by; + ASTPtr ttl_table; + + if (!s_table_override.ignore(pos, expected)) + return false; + + if (!table_name_p.parse(pos, table_name, expected)) + return false; + + if (!lparen_p.ignore(pos, expected)) + return false; + + while (true) + { + if (!columns && s_columns.ignore(pos, expected)) + { + if (!lparen_p.ignore(pos, expected)) + return false; + if (!table_properties_p.parse(pos, columns, expected)) + return false; + if (!rparen_p.ignore(pos, expected)) + return false; + } + + + if (!partition_by && s_partition_by.ignore(pos, expected)) + { + if (expression_p.parse(pos, partition_by, expected)) + continue; + else + return false; + } + + if (!primary_key && s_primary_key.ignore(pos, expected)) + { + if (expression_p.parse(pos, primary_key, expected)) + continue; + else + return false; + } + + if (!order_by && s_order_by.ignore(pos, expected)) + { + if (expression_p.parse(pos, order_by, expected)) + continue; + else + return false; + } + + if (!sample_by && s_sample_by.ignore(pos, expected)) + { + if (expression_p.parse(pos, sample_by, expected)) + continue; + else + return false; + } + + if (!ttl_table && s_ttl.ignore(pos, expected)) + { + if (parser_ttl_list.parse(pos, ttl_table, expected)) + continue; + else + return false; + } + + break; + } + + if (!rparen_p.ignore(pos, expected)) + return false; + + auto storage = std::make_shared(); + storage->set(storage->partition_by, partition_by); + storage->set(storage->primary_key, primary_key); + storage->set(storage->order_by, order_by); + storage->set(storage->sample_by, sample_by); + storage->set(storage->ttl_table, ttl_table); + + auto res = std::make_shared(); + res->table_name = table_name->as()->name(); + res->set(res->storage, storage); + if (columns) + res->set(res->columns, columns); + + node = res; + + return true; +} + +bool ParserTableOverridesDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + 
ParserTableOverrideDeclaration table_override_p; + ParserToken s_comma(TokenType::Comma); + auto res = std::make_shared(); + auto parse_element = [&] + { + ASTPtr element; + if (!table_override_p.parse(pos, element, expected)) + return false; + auto * table_override = element->as(); + if (!table_override) + return false; + res->setTableOverride(table_override->table_name, element); + return true; + }; + + if (!ParserList::parseUtil(pos, expected, parse_element, s_comma, true)) + return false; + + if (!res->children.empty()) + node = res; + + return true; +} + bool ParserCreateDatabaseQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserKeyword s_create("CREATE"); @@ -940,9 +1089,11 @@ bool ParserCreateDatabaseQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e ParserKeyword s_if_not_exists("IF NOT EXISTS"); ParserStorage storage_p; ParserIdentifier name_p(true); + ParserTableOverridesDeclarationList table_overrides_p; ASTPtr database; ASTPtr storage; + ASTPtr table_overrides; UUID uuid = UUIDHelpers::Nil; String cluster_str; @@ -984,6 +1135,9 @@ bool ParserCreateDatabaseQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e storage_p.parse(pos, storage, expected); auto comment = parseComment(pos, expected); + if (!table_overrides_p.parse(pos, table_overrides, expected)) + return false; + auto query = std::make_shared(); node = query; @@ -1000,6 +1154,8 @@ bool ParserCreateDatabaseQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e query->set(query->storage, storage); if (comment) query->set(query->comment, comment); + if (table_overrides && !table_overrides->children.empty()) + query->set(query->table_overrides, table_overrides); return true; } diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index e1acaf486d8..33aafb40d83 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -361,6 +361,8 @@ protected: * Or: * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] AS ENGINE = engine SELECT ... 
* + * Or (for engines that supports schema inference): + * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] ENGINE = engine */ class ParserCreateTableQuery : public IParserBase { @@ -385,6 +387,20 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +class ParserTableOverrideDeclaration : public IParserBase +{ +protected: + const char * getName() const override { return "table override declaration"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + +class ParserTableOverridesDeclarationList : public IParserBase +{ +protected: + const char * getName() const override { return "table overrides declaration list"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + /// CREATE|ATTACH DATABASE db [ENGINE = engine] class ParserCreateDatabaseQuery : public IParserBase { diff --git a/src/Parsers/tests/gtest_Parser.cpp b/src/Parsers/tests/gtest_Parser.cpp index 5ebea834a91..fac79de4c5b 100644 --- a/src/Parsers/tests/gtest_Parser.cpp +++ b/src/Parsers/tests/gtest_Parser.cpp @@ -1,11 +1,14 @@ -#include +#include +#include +#include +#include +#include #include #include - +#include #include -#include #include -#include +#include #include @@ -25,7 +28,7 @@ struct ParserTestCase std::ostream & operator<<(std::ostream & ostr, const std::shared_ptr parser) { - return ostr << "Praser: " << parser->getName(); + return ostr << "Parser: " << parser->getName(); } std::ostream & operator<<(std::ostream & ostr, const ParserTestCase & test_case) @@ -55,7 +58,6 @@ TEST_P(ParserTest, parseQuery) } } - INSTANTIATE_TEST_SUITE_P(ParserOptimizeQuery, ParserTest, ::testing::Combine( ::testing::Values(std::make_shared()), @@ -144,7 +146,7 @@ INSTANTIATE_TEST_SUITE_P(ParserAlterCommand_MODIFY_COMMENT, ParserTest, INSTANTIATE_TEST_SUITE_P(ParserCreateQuery_DICTIONARY_WITH_COMMENT, ParserTest, ::testing::Combine( - ::testing::Values(std::make_shared()), + ::testing::Values(std::make_shared()), ::testing::ValuesIn(std::initializer_list{ { R"sql(CREATE DICTIONARY 2024_dictionary_with_comment @@ -170,3 +172,57 @@ LAYOUT(FLAT()) COMMENT 'Test dictionary with comment')sql" }} ))); + +INSTANTIATE_TEST_SUITE_P(ParserCreateDatabaseQuery, ParserTest, + ::testing::Combine( + ::testing::Values(std::make_shared()), + ::testing::ValuesIn(std::initializer_list{ + { + "CREATE DATABASE db ENGINE=MaterializeMySQL('addr:port', 'db', 'user', 'pw')", + "CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')" + }, + { + "CREATE DATABASE db ENGINE=MaterializeMySQL('addr:port', 'db', 'user', 'pw') TABLE OVERRIDE `tbl`\n(PARTITION BY toYYYYMM(created))", + "CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE `tbl`\n(\n PARTITION BY toYYYYMM(`created`)\n)" + }, + { + "CREATE DATABASE db ENGINE=Foo TABLE OVERRIDE `tbl` (), TABLE OVERRIDE a (COLUMNS (_created DateTime MATERIALIZED now())), TABLE OVERRIDE b (PARTITION BY rand())", + "CREATE DATABASE db\nENGINE = Foo\nTABLE OVERRIDE `tbl`,\nTABLE OVERRIDE `a`\n(\n COLUMNS\n (\n `_created` DateTime MATERIALIZED now()\n )\n),\nTABLE OVERRIDE `b`\n(\n PARTITION BY rand()\n)" + }, + { + "CREATE DATABASE db ENGINE=MaterializeMySQL('addr:port', 'db', 'user', 'pw') TABLE OVERRIDE tbl (COLUMNS (id UUID) PARTITION BY toYYYYMM(created))", + "CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE `tbl`\n(\n COLUMNS\n (\n `id` UUID\n )\n PARTITION BY toYYYYMM(`created`)\n)" + }, + { 
+ "CREATE DATABASE db TABLE OVERRIDE tbl (COLUMNS (INDEX foo foo TYPE minmax GRANULARITY 1) PARTITION BY if(_staged = 1, 'staging', toYYYYMM(created)))", + "CREATE DATABASE db\nTABLE OVERRIDE `tbl`\n(\n COLUMNS\n (\n INDEX foo `foo` TYPE minmax GRANULARITY 1\n )\n PARTITION BY if(`_staged` = 1, 'staging', toYYYYMM(`created`))\n)" + }, + { + "CREATE DATABASE db TABLE OVERRIDE t1 (TTL inserted + INTERVAL 1 MONTH DELETE), TABLE OVERRIDE t2 (TTL `inserted` + INTERVAL 2 MONTH DELETE)", + "CREATE DATABASE db\nTABLE OVERRIDE `t1`\n(\n TTL `inserted` + toIntervalMonth(1)\n),\nTABLE OVERRIDE `t2`\n(\n TTL `inserted` + toIntervalMonth(2)\n)" + }, + { + "CREATE DATABASE db ENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw') SETTINGS allows_query_when_mysql_lost = 1 TABLE OVERRIDE tab3 (COLUMNS (_staged UInt8 MATERIALIZED 1) PARTITION BY (c3) TTL c3 + INTERVAL 10 minute), TABLE OVERRIDE tab5 (PARTITION BY (c3) TTL c3 + INTERVAL 10 minute)", + "CREATE DATABASE db\nENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw')\nSETTINGS allows_query_when_mysql_lost = 1\nTABLE OVERRIDE `tab3`\n(\n COLUMNS\n (\n `_staged` UInt8 MATERIALIZED 1\n )\n PARTITION BY `c3`\n TTL `c3` + toIntervalMinute(10)\n),\nTABLE OVERRIDE `tab5`\n(\n PARTITION BY `c3`\n TTL `c3` + toIntervalMinute(10)\n)" + }, + { + "CREATE DATABASE db TABLE OVERRIDE tbl (PARTITION BY toYYYYMM(created) COLUMNS (created DateTime CODEC(Delta)))", + "CREATE DATABASE db\nTABLE OVERRIDE `tbl`\n(\n COLUMNS\n (\n `created` DateTime CODEC(Delta)\n )\n PARTITION BY toYYYYMM(`created`)\n)" + }, + { + "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1", + "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1" + }, + { + "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2", + "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2" + }, + { + "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2 TABLE OVERRIDE a (ORDER BY (id, version))", + "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE `a`\n(\n ORDER BY (`id`, `version`)\n)" + }, + { + "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2 COMMENT 'db comment' TABLE OVERRIDE a (ORDER BY (id, version))", + "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE `a`\n(\n ORDER BY (`id`, `version`)\n)\nCOMMENT 'db comment'" + } +}))); diff --git a/src/Processors/Chunk.cpp b/src/Processors/Chunk.cpp index c1c98e2931c..8c7c09abf01 100644 --- a/src/Processors/Chunk.cpp +++ b/src/Processors/Chunk.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB { @@ -185,4 +186,14 @@ const ChunkMissingValues::RowsBitMask & ChunkMissingValues::getDefaultsBitmask(s return none; } +void convertToFullIfSparse(Chunk & chunk) +{ + size_t num_rows = chunk.getNumRows(); + auto columns = chunk.detachColumns(); + for (auto & column : columns) + column = recursiveRemoveSparse(column); + + chunk.setColumns(std::move(columns), num_rows); +} + } diff --git a/src/Processors/Chunk.h b/src/Processors/Chunk.h index c26180453c7..e70ba57a267 100644 --- a/src/Processors/Chunk.h +++ b/src/Processors/Chunk.h @@ -131,4 +131,10 @@ private: RowsMaskByColumnId rows_mask_by_column_id; }; +/// Converts all columns to full serialization in chunk. +/// It's needed, when you have to access to the internals of the column, +/// or when you need to perform operation with two columns +/// and their structure must be equal (e.g. compareAt). 
+void convertToFullIfSparse(Chunk & chunk); + } diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index 12f2bd8b75b..0b1fe5dedf6 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -26,7 +26,7 @@ public: /// During pipeline execution new processors can appear. They will be added to existing set. /// /// Explicit graph representation is built in constructor. Throws if graph is not correct. - explicit PipelineExecutor(Processors & processors, QueryStatus * elem = nullptr); + explicit PipelineExecutor(Processors & processors, QueryStatus * elem); ~PipelineExecutor(); /// Execute pipeline in multiple threads. Must be called once. diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp new file mode 100644 index 00000000000..096e39a2893 --- /dev/null +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -0,0 +1,160 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; +} + +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +{ +} + +NamesAndTypesList IRowSchemaReader::readSchema() +{ + DataTypes data_types = readRowAndGetDataTypes(); + for (size_t row = 1; row < max_rows_to_read; ++row) + { + DataTypes new_data_types = readRowAndGetDataTypes(); + if (new_data_types.empty()) + /// We reached eof. + break; + + if (new_data_types.size() != data_types.size()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Rows have different amount of values"); + + for (size_t i = 0; i != data_types.size(); ++i) + { + /// We couldn't determine the type of this column in a new row, just skip it. + if (!new_data_types[i]) + continue; + + /// If we couldn't determine the type of column yet, just set the new type. + if (!data_types[i]) + data_types[i] = new_data_types[i]; + /// If the new type and the previous type for this column are different, + /// we will use default type if we have it or throw an exception. + else if (data_types[i]->getName() != new_data_types[i]->getName()) + { + if (default_type) + data_types[i] = default_type; + else + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_data_types[i]->getName(), i + 1, row, data_types[i]->getName()); + } + } + } + + /// Check that we read at list one column. + if (data_types.empty()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data"); + + /// If column names weren't set, use default names 'c1', 'c2', ... + if (column_names.empty()) + { + column_names.reserve(data_types.size()); + for (size_t i = 0; i != data_types.size(); ++i) + column_names.push_back("c" + std::to_string(i + 1)); + } + /// If column names were set, check that the number of names match the number of types. + else if (column_names.size() != data_types.size()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The number of column names {} differs with the number of types {}", column_names.size(), data_types.size()); + + NamesAndTypesList result; + for (size_t i = 0; i != data_types.size(); ++i) + { + /// Check that we could determine the type of this column. 
+ if (!data_types[i]) + { + if (!default_type) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " + "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", + max_rows_to_read); + + data_types[i] = default_type; + } + result.emplace_back(column_names[i], data_types[i]); + } + + return result; +} + +IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +{ +} + +NamesAndTypesList IRowWithNamesSchemaReader::readSchema() +{ + auto names_and_types = readRowAndGetNamesAndDataTypes(); + for (size_t row = 1; row < max_rows_to_read; ++row) + { + auto new_names_and_types = readRowAndGetNamesAndDataTypes(); + if (new_names_and_types.empty()) + /// We reached eof. + break; + + for (const auto & [name, new_type] : new_names_and_types) + { + auto it = names_and_types.find(name); + /// If we didn't see this column before, just add it. + if (it == names_and_types.end()) + { + names_and_types[name] = new_type; + continue; + } + + auto & type = it->second; + /// If we couldn't determine the type of column yet, just set the new type. + if (!type) + type = new_type; + /// If the new type and the previous type for this column are different, + /// we will use default type if we have it or throw an exception. + else if (new_type && type->getName() != new_type->getName()) + { + if (default_type) + type = default_type; + else + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", type->getName(), name, row, new_type->getName()); + } + } + } + + /// Check that we read at list one column. + if (names_and_types.empty()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data"); + + NamesAndTypesList result; + for (auto & [name, type] : names_and_types) + { + /// Check that we could determine the type of this column. + if (!type) + { + if (!default_type) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " + "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", + max_rows_to_read); + + type = default_type; + } + result.emplace_back(name, type); + } + + return result; +} + +} diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h new file mode 100644 index 00000000000..67a8eb88d61 --- /dev/null +++ b/src/Processors/Formats/ISchemaReader.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +/// Base class for schema inference for the data in some specific format. +/// It reads some data from read buffer and try to determine the schema +/// from read data. +class ISchemaReader +{ +public: + ISchemaReader(ReadBuffer & in_) : in(in_) {} + + virtual NamesAndTypesList readSchema() = 0; + + virtual ~ISchemaReader() = default; + +protected: + ReadBuffer & in; +}; + +/// Base class for schema inference for formats that read data row by row. 
+/// It reads data row by row (up to max_rows_to_read), determines types of columns +/// for each row and compare them with types from the previous rows. If some column +/// contains values with different types in different rows, the default type will be +/// used for this column or the exception will be thrown (if default type is not set). +class IRowSchemaReader : public ISchemaReader +{ +public: + IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + NamesAndTypesList readSchema() override; + +protected: + /// Read one row and determine types of columns in it. + /// Return types in the same order in which the values were in the row. + /// If it's impossible to determine the type for some column, return nullptr for it. + /// Return empty list if can't read more data. + virtual DataTypes readRowAndGetDataTypes() = 0; + + void setColumnNames(const std::vector & names) { column_names = names; } + +private: + size_t max_rows_to_read; + DataTypePtr default_type; + std::vector column_names; +}; + +/// Base class for schema inference for formats that read data row by row and each +/// row contains column names and values (ex: JSONEachRow, TSKV). +/// Differ from IRowSchemaReader in that after reading a row we get +/// a map {column_name : type} and some columns may be missed in a single row +/// (in this case we will use types from the previous rows for missed columns). +class IRowWithNamesSchemaReader : public ISchemaReader +{ +public: + IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + NamesAndTypesList readSchema() override; + +protected: + /// Read one row and determine types of columns in it. + /// Return map {column_name : type}. + /// If it's impossible to determine the type for some column, return nullptr for it. + /// Return empty map is can't read more data. + virtual std::unordered_map readRowAndGetNamesAndDataTypes() = 0; + +private: + size_t max_rows_to_read; + DataTypePtr default_type; +}; + +/// Base class for schema inference for formats that don't need any data to +/// determine the schema: formats with constant schema (ex: JSONAsString, LineAsString) +/// and formats that use external format schema (ex: Protobuf, CapnProto). 
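/// Schema readers are wired into formats through FormatFactory; see
/// registerArrowSchemaReader further below for an example of registerSchemaReader().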
+class IExternalSchemaReader +{ +public: + virtual NamesAndTypesList readSchema() = 0; + + virtual ~IExternalSchemaReader() = default; +}; + +} diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index b1f9eaa59a1..4af2c651c39 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -37,6 +37,9 @@ Chunk ArrowBlockInputFormat::generate() if (!stream_reader) prepareReader(); + if (is_stopped) + return {}; + batch_result = stream_reader->Next(); if (batch_result.ok() && !(*batch_result)) return res; @@ -46,6 +49,9 @@ Chunk ArrowBlockInputFormat::generate() if (!file_reader) prepareReader(); + if (is_stopped) + return {}; + if (record_batch_current >= record_batch_total) return res; @@ -79,27 +85,38 @@ void ArrowBlockInputFormat::resetParser() record_batch_current = 0; } +static std::shared_ptr createStreamReader(ReadBuffer & in) +{ + auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique(in)); + if (!stream_reader_status.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Error while opening a table: {}", stream_reader_status.status().ToString()); + return *stream_reader_status; +} + +static std::shared_ptr createFileReader(ReadBuffer & in, const FormatSettings & format_settings, std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return nullptr; + + auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file)); + if (!file_reader_status.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Error while opening a table: {}", file_reader_status.status().ToString()); + return *file_reader_status; +} + + void ArrowBlockInputFormat::prepareReader() { - std::shared_ptr schema; - if (stream) - { - auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique(*in)); - if (!stream_reader_status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Error while opening a table: {}", stream_reader_status.status().ToString()); - stream_reader = *stream_reader_status; - schema = stream_reader->schema(); - } + stream_reader = createStreamReader(*in); else { - auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(asArrowFile(*in, format_settings)); - if (!file_reader_status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Error while opening a table: {}", file_reader_status.status().ToString()); - file_reader = *file_reader_status; - schema = file_reader->schema(); + file_reader = createFileReader(*in, format_settings, is_stopped); + if (!file_reader) + return; } arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Arrow", format_settings.arrow.import_nested); @@ -112,6 +129,27 @@ void ArrowBlockInputFormat::prepareReader() record_batch_current = 0; } +ArrowSchemaReader::ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_) + : ISchemaReader(in_), stream(stream_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ArrowSchemaReader::readSchema() +{ + std::shared_ptr schema; + + if (stream) + schema = createStreamReader(in)->schema(); + else + { + std::atomic is_stopped = 0; + schema = createFileReader(in, format_settings, is_stopped)->schema(); + } + + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? 
"ArrowStream" : "Arrow"); + return header.getNamesAndTypesList(); +} + void registerInputFormatArrow(FormatFactory & factory) { factory.registerInputFormat( @@ -135,6 +173,20 @@ void registerInputFormatArrow(FormatFactory & factory) }); } +void registerArrowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Arrow", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + factory.registerSchemaReader( + "ArrowStream", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + });} } #else @@ -144,6 +196,8 @@ class FormatFactory; void registerInputFormatArrow(FormatFactory &) { } + +void registerArrowSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h index 44e18e3f852..62cbf949fc2 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h @@ -4,6 +4,7 @@ #if USE_ARROW #include +#include #include namespace arrow { class RecordBatchReader; } @@ -27,6 +28,11 @@ public: private: Chunk generate() override; + void onCancel() override + { + is_stopped = 1; + } + // Whether to use ArrowStream format bool stream; // This field is only used for ArrowStream format @@ -42,6 +48,20 @@ private: const FormatSettings format_settings; void prepareReader(); + + std::atomic is_stopped{0}; +}; + +class ArrowSchemaReader : public ISchemaReader +{ +public: + ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + bool stream; + const FormatSettings format_settings; }; } diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp index 148faabf352..86d278397c2 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp @@ -140,7 +140,7 @@ arrow::Status ArrowInputStreamFromReadBuffer::Close() return arrow::Status(); } -std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings) +std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings, std::atomic & is_cancelled) { if (auto * fd_in = dynamic_cast(&in)) { @@ -160,7 +160,7 @@ std::shared_ptr asArrowFile(ReadBuffer & in, const std::string file_data; { WriteBufferFromString file_buffer(file_data); - copyData(in, file_buffer); + copyData(in, file_buffer, is_cancelled); } return std::make_shared(arrow::Buffer::FromString(std::move(file_data))); diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.h b/src/Processors/Formats/Impl/ArrowBufferedStreams.h index 29c869e4152..d649c52557f 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.h +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.h @@ -86,7 +86,7 @@ private: ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowInputStreamFromReadBuffer); }; -std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings); +std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings, std::atomic & is_cancelled); } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index dea753b14e3..aa181ea0b8b 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -14,7 +14,7 @@ #include 
#include #include -#include +#include #include #include #include @@ -59,11 +59,12 @@ namespace DB namespace ErrorCodes { + extern const int BAD_ARGUMENTS; + extern const int DUPLICATE_COLUMN; + extern const int THERE_IS_NO_COLUMN; + extern const int UNKNOWN_EXCEPTION; extern const int UNKNOWN_TYPE; extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE; - extern const int THERE_IS_NO_COLUMN; - extern const int BAD_ARGUMENTS; - extern const int UNKNOWN_EXCEPTION; } @@ -238,10 +239,8 @@ static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr -static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) { - const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); - auto internal_type = std::make_shared>(arrow_decimal_type->precision(), arrow_decimal_type->scale()); auto internal_column = internal_type->createColumn(); auto & column = assert_cast &>(*internal_column); auto & column_data = column.getData(); @@ -258,6 +257,21 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr +static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +{ + const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); + size_t precision = arrow_decimal_type->precision(); + auto internal_type = createDecimal(precision, arrow_decimal_type->scale()); + if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); +} + /// Creates a null bytemap from arrow's null bitmap static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr & arrow_column) { @@ -327,12 +341,13 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( const std::string & column_name, const std::string & format_name, bool is_nullable, - std::unordered_map> & dictionary_values) + std::unordered_map> & dictionary_values, + bool read_ints_as_dates) { if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) { - auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates); auto nullmap_column = readByteMapFromArrowColumn(arrow_column); auto nullable_type = std::make_shared(std::move(nested_column.type)); auto nullable_column = ColumnNullable::create(std::move(nested_column.column), std::move(nullmap_column)); @@ -357,25 +372,27 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::UINT16: { auto column = readColumnWithNumericData(arrow_column, column_name); - column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::UINT32: { auto column = readColumnWithNumericData(arrow_column, column_name); - 
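readColumnWithDecimalData above now chooses the narrowest ClickHouse decimal that can hold the Arrow column's precision instead of always using the widest type. A small illustrative sketch of that dispatch; the 9/18/38 thresholds follow the usual Decimal32/64/128 precision limits, and decimalTypeForPrecision is a made-up name rather than the real function.

#include <cstdint>
#include <iostream>
#include <string>

/// Pick the narrowest decimal storage for a given precision, mirroring the
/// Decimal32/64/128/256 ladder used in the conversion above.
static std::string decimalTypeForPrecision(uint32_t precision, uint32_t scale)
{
    std::string storage;
    if (precision <= 9)
        storage = "Decimal32";
    else if (precision <= 18)
        storage = "Decimal64";
    else if (precision <= 38)
        storage = "Decimal128";
    else
        storage = "Decimal256";
    return storage + "(" + std::to_string(precision) + ", " + std::to_string(scale) + ")";
}

int main()
{
    std::cout << decimalTypeForPrecision(10, 2) << '\n';   /// Decimal64(10, 2)
}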
column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::TIMESTAMP: return readColumnWithTimestampData(arrow_column, column_name); case arrow::Type::DECIMAL128: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::DECIMAL256: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::MAP: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -387,7 +404,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::LIST: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(std::move(nested_column.column), std::move(offsets_column)); auto array_type = std::make_shared(nested_column.type); @@ -412,7 +429,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values); + auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -435,7 +452,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); /// We should convert read column to ColumnUnique. 
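The dictionary branch above turns Arrow dictionary-encoded chunks into LowCardinality: the dictionary is read once and the per-chunk indexes are mapped onto it. The sketch below shows the dictionary-encoded layout itself (unique values plus per-row indexes) in standalone form; the names are invented and no ClickHouse types are involved.

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

/// Dictionary-encode a column: unique values plus a per-row index into them,
/// which is the shape the LowCardinality conversion above consumes.
struct DictionaryColumn
{
    std::vector<std::string> dictionary;
    std::vector<uint32_t> indexes;
};

static DictionaryColumn dictionaryEncode(const std::vector<std::string> & values)
{
    DictionaryColumn result;
    std::unordered_map<std::string, uint32_t> positions;
    for (const auto & value : values)
    {
        auto [it, inserted] = positions.emplace(value, static_cast<uint32_t>(result.dictionary.size()));
        if (inserted)
            result.dictionary.push_back(value);
        result.indexes.push_back(it->second);
    }
    return result;
}

int main()
{
    auto column = dictionaryEncode({"a", "b", "a", "c", "b"});
    std::cout << column.dictionary.size() << " unique values, "
              << column.indexes.size() << " rows\n";   /// 3 unique values, 5 rows
}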
auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); @@ -482,7 +499,7 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) { ColumnsWithTypeAndName sample_columns; for (const auto & field : schema.fields()) @@ -492,24 +509,21 @@ static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::stri std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, field->type(), &array_builder); checkStatus(status, field->name(), format_name); + std::shared_ptr arrow_array; status = array_builder->Finish(&arrow_array); checkStatus(status, field->name(), format_name); + arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values); + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); + sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); } -ArrowColumnToCHColumn::ArrowColumnToCHColumn( - const arrow::Schema & schema, const std::string & format_name_, bool import_nested_) - : header(arrowSchemaToCHHeader(schema, format_name_)), format_name(format_name_), import_nested(import_nested_) -{ -} - ArrowColumnToCHColumn::ArrowColumnToCHColumn( const Block & header_, const std::string & format_name_, bool import_nested_) : header(header_), format_name(format_name_), import_nested(import_nested_) @@ -519,9 +533,11 @@ ArrowColumnToCHColumn::ArrowColumnToCHColumn( void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr & table) { NameToColumnPtr name_to_column_ptr; - for (const auto& column_name : table->ColumnNames()) + for (const auto & column_name : table->ColumnNames()) { std::shared_ptr arrow_column = table->GetColumnByName(column_name); + if (!arrow_column) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Column '{}' is duplicated", column_name); name_to_column_ptr[column_name] = arrow_column; } @@ -550,7 +566,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (!nested_tables.contains(nested_table_name)) { std::shared_ptr arrow_column = name_to_column_ptr[nested_table_name]; - ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values)}; + ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; Block block(cols); nested_tables[nested_table_name] = std::make_shared(Nested::flatten(block)); } @@ -570,7 +586,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (read_from_nested) column = nested_tables[nested_table_name]->getByName(header_column.name); else - column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values); + column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); try { diff --git 
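arrowTableToCHChunk above now reports a DUPLICATE_COLUMN error when the Arrow table contains two columns with the same name (GetColumnByName returns null in that case). A tiny sketch of the same guard over a plain list of names, with a hypothetical indexColumns helper.

#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

/// Index columns by name and reject duplicates up front, the condition the
/// new DUPLICATE_COLUMN check above guards against.
static std::unordered_map<std::string, size_t> indexColumns(const std::vector<std::string> & names)
{
    std::unordered_map<std::string, size_t> name_to_position;
    for (size_t i = 0; i < names.size(); ++i)
        if (!name_to_position.emplace(names[i], i).second)
            throw std::runtime_error("Column '" + names[i] + "' is duplicated");
    return name_to_position;
}

int main()
{
    try
    {
        indexColumns({"id", "value", "id"});
    }
    catch (const std::exception & e)
    {
        std::cout << e.what() << '\n';   /// Column 'id' is duplicated
    }
}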
a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 46976093f0b..58f8f1536b5 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -23,16 +23,14 @@ public: ArrowColumnToCHColumn(const Block & header_, const std::string & format_name_, bool import_nested_); - /// Constructor that create header by arrow schema. It will be useful for inserting - /// data from file without knowing table structure. - ArrowColumnToCHColumn(const arrow::Schema & schema, const std::string & format_name, bool import_nested_); - void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table); void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr); + static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name); + private: - const Block header; + const Block & header; const std::string format_name; bool import_nested; diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 1f806d47c45..a372df41344 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -68,6 +68,7 @@ namespace DB { + namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -280,7 +281,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node for (size_t n = decoder.arrayStart(); n != 0; n = decoder.arrayNext()) { total += n; - for (size_t i = 0; i < n; i++) + for (size_t i = 0; i < n; ++i) { nested_deserialize(nested_column, decoder); } @@ -344,7 +345,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node if (target.isString()) { std::vector symbols; - for (size_t i = 0; i < root_node->names(); i++) + for (size_t i = 0; i < root_node->names(); ++i) { symbols.push_back(root_node->nameAt(i)); } @@ -359,7 +360,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node { const auto & enum_type = dynamic_cast(*target_type); Row symbol_mapping; - for (size_t i = 0; i < root_node->names(); i++) + for (size_t i = 0; i < root_node->names(); ++i) { symbol_mapping.push_back(enum_type.castToValue(root_node->nameAt(i))); } @@ -443,11 +444,19 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) case avro::AVRO_UNION: { std::vector union_skip_fns; - for (size_t i = 0; i < root_node->leaves(); i++) + for (size_t i = 0; i < root_node->leaves(); ++i) { union_skip_fns.push_back(createSkipFn(root_node->leafAt(i))); } - return [union_skip_fns](avro::Decoder & decoder) { union_skip_fns[decoder.decodeUnionIndex()](decoder); }; + return [union_skip_fns](avro::Decoder & decoder) + { + auto index = decoder.decodeUnionIndex(); + if (index >= union_skip_fns.size()) + { + throw Exception("Union index out of boundary", ErrorCodes::INCORRECT_DATA); + } + union_skip_fns[index](decoder); + }; } case avro::AVRO_NULL: return [](avro::Decoder & decoder) { decoder.decodeNull(); }; @@ -476,7 +485,7 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) case avro::AVRO_RECORD: { std::vector field_skip_fns; - for (size_t i = 0; i < root_node->leaves(); i++) + for (size_t i = 0; i < root_node->leaves(); ++i) { field_skip_fns.push_back(createSkipFn(root_node->leafAt(i))); } @@ -806,6 +815,92 @@ const AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(Sc return it->second; } 
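The AVRO_UNION skip function above now validates the decoded union index before indexing into the vector of per-branch skip functions, so corrupted input raises INCORRECT_DATA instead of reading out of bounds. A standalone sketch of that bounds-checked dispatch; dispatchUnion is an invented name.

#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>

/// Dispatch on a union discriminator read from untrusted input, validating it
/// against the number of known branches first.
static void dispatchUnion(const std::vector<std::function<void()>> & branches, size_t index)
{
    if (index >= branches.size())
        throw std::runtime_error("Union index out of boundary");
    branches[index]();
}

int main()
{
    std::vector<std::function<void()>> branches = {
        [] { std::cout << "skip null\n"; },
        [] { std::cout << "skip long\n"; },
    };
    dispatchUnion(branches, 1);      /// valid index
    try
    {
        dispatchUnion(branches, 7);  /// corrupted input
    }
    catch (const std::exception & e)
    {
        std::cout << e.what() << '\n';
    }
}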
+AvroSchemaReader::AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_) + : ISchemaReader(in_), confluent(confluent_), format_settings(format_settings_) +{ +} + +NamesAndTypesList AvroSchemaReader::readSchema() +{ + avro::NodePtr root_node; + if (confluent) + { + UInt32 schema_id = readConfluentSchemaId(in); + root_node = getConfluentSchemaRegistry(format_settings)->getSchema(schema_id).root(); + } + else + { + auto file_reader_ptr = std::make_unique(std::make_unique(in)); + root_node = file_reader_ptr->dataSchema().root(); + } + + if (root_node->type() != avro::Type::AVRO_RECORD) + throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH); + + NamesAndTypesList names_and_types; + for (size_t i = 0; i != root_node->leaves(); ++i) + names_and_types.emplace_back(root_node->nameAt(i), avroNodeToDataType(root_node->leafAt(i))); + + return names_and_types; +} + +DataTypePtr AvroSchemaReader::avroNodeToDataType(avro::NodePtr node) +{ + switch (node->type()) + { + case avro::Type::AVRO_INT: + return {std::make_shared()}; + case avro::Type::AVRO_LONG: + return std::make_shared(); + case avro::Type::AVRO_BOOL: + return std::make_shared(); + case avro::Type::AVRO_FLOAT: + return std::make_shared(); + case avro::Type::AVRO_DOUBLE: + return std::make_shared(); + case avro::Type::AVRO_STRING: + return std::make_shared(); + case avro::Type::AVRO_BYTES: + return std::make_shared(); + case avro::Type::AVRO_ENUM: + { + if (node->names() < 128) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + else if (node->names() < 32768) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse supports only 8 and 16-bit Enum."); + } + case avro::Type::AVRO_FIXED: + return std::make_shared(node->fixedSize()); + case avro::Type::AVRO_ARRAY: + return std::make_shared(avroNodeToDataType(node->leafAt(0))); + case avro::Type::AVRO_NULL: + return std::make_shared(); + case avro::Type::AVRO_UNION: + if (node->leaves() == 2 && (node->leafAt(0)->type() == avro::Type::AVRO_NULL || node->leafAt(1)->type() == avro::Type::AVRO_NULL)) + { + size_t nested_leaf_index = node->leafAt(0)->type() == avro::Type::AVRO_NULL ? 
1 : 0; + return makeNullable(avroNodeToDataType(node->leafAt(nested_leaf_index))); + } + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro type UNION is not supported for inserting."); + case avro::Type::AVRO_SYMBOLIC: + return avroNodeToDataType(avro::resolveSymbol(node)); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro column {} is not supported for inserting."); + } +} + void registerInputFormatAvro(FormatFactory & factory) { factory.registerInputFormat("Avro", []( @@ -827,6 +922,21 @@ void registerInputFormatAvro(FormatFactory & factory) }); } +void registerAvroSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + + factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + }); + +} + + } #else @@ -837,6 +947,8 @@ class FormatFactory; void registerInputFormatAvro(FormatFactory &) { } + +void registerAvroSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.h b/src/Processors/Formats/Impl/AvroRowInputFormat.h index 2de11178e96..46e571d87ec 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -22,6 +23,12 @@ namespace DB { + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + class AvroDeserializer { public: @@ -81,7 +88,12 @@ private: action.execute(columns, decoder, ext); break; case Union: - actions[decoder.decodeUnionIndex()].execute(columns, decoder, ext); + auto index = decoder.decodeUnionIndex(); + if (index >= actions.size()) + { + throw Exception("Union index out of boundary", ErrorCodes::INCORRECT_DATA); + } + actions[index].execute(columns, decoder, ext); break; } } @@ -149,6 +161,20 @@ private: FormatSettings format_settings; }; +class AvroSchemaReader : public ISchemaReader +{ +public: + AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + DataTypePtr avroNodeToDataType(avro::NodePtr node); + + bool confluent; + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp index 0506c539c0f..b356967a544 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp @@ -5,7 +5,6 @@ #include #include - namespace DB { @@ -15,11 +14,23 @@ namespace ErrorCodes } BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_) + : RowInputFormatWithNamesAndTypes( + std::move(header), + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, format_settings_)) { } -std::vector BinaryRowInputFormat::readHeaderRow() + +BinaryFormatReader::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_) +{ +} + +std::vector BinaryFormatReader::readHeaderRow() { std::vector fields; String field; @@ -31,13 +42,13 @@ 
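avroNodeToDataType above maps Avro schema nodes onto ClickHouse types: enums become Enum8 or Enum16 depending on how many symbols they have, and a two-branch union with null becomes Nullable of the other branch. A rough sketch of those two rules with type names as plain strings; the helper names are made up.

#include <iostream>
#include <stdexcept>
#include <string>

/// Enum width is chosen from the symbol count, as in the AVRO_ENUM case above.
static std::string chTypeForAvroEnum(size_t symbol_count)
{
    if (symbol_count < 128)
        return "Enum8";
    if (symbol_count < 32768)
        return "Enum16";
    throw std::runtime_error("ClickHouse supports only 8 and 16-bit Enum.");
}

/// Only a two-branch union where one branch is null maps cleanly, to Nullable(T).
static std::string chTypeForAvroUnion(const std::string & first, const std::string & second)
{
    if (first == "null")
        return "Nullable(" + second + ")";
    if (second == "null")
        return "Nullable(" + first + ")";
    throw std::runtime_error("Avro type UNION is not supported for inserting.");
}

int main()
{
    std::cout << chTypeForAvroEnum(300) << '\n';              /// Enum16
    std::cout << chTypeForAvroUnion("null", "Int64") << '\n'; /// Nullable(Int64)
}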
std::vector BinaryRowInputFormat::readHeaderRow() return fields; } -std::vector BinaryRowInputFormat::readNames() +std::vector BinaryFormatReader::readNames() { readVarUInt(read_columns, *in); return readHeaderRow(); } -std::vector BinaryRowInputFormat::readTypes() +std::vector BinaryFormatReader::readTypes() { auto types = readHeaderRow(); for (const auto & type_name : types) @@ -45,31 +56,37 @@ std::vector BinaryRowInputFormat::readTypes() return types; } -bool BinaryRowInputFormat::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/) +bool BinaryFormatReader::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/) { serialization->deserializeBinary(column, *in); return true; } -void BinaryRowInputFormat::skipHeaderRow() +void BinaryFormatReader::skipHeaderRow() { String tmp; for (size_t i = 0; i < read_columns; ++i) readStringBinary(tmp, *in); } -void BinaryRowInputFormat::skipNames() +void BinaryFormatReader::skipNames() { readVarUInt(read_columns, *in); skipHeaderRow(); } -void BinaryRowInputFormat::skipTypes() +void BinaryFormatReader::skipTypes() { + if (read_columns == 0) + { + /// It's possible only when with_names = false and with_types = true + readVarUInt(read_columns, *in); + } + skipHeaderRow(); } -void BinaryRowInputFormat::skipField(size_t file_column) +void BinaryFormatReader::skipField(size_t file_column) { if (file_column >= read_data_types.size()) throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD, "Cannot skip unknown field in RowBinaryWithNames format, because it's type is unknown"); @@ -77,6 +94,11 @@ void BinaryRowInputFormat::skipField(size_t file_column) read_data_types[file_column]->getDefaultSerialization()->deserializeBinary(field, *in); } +BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_) +{ +} + void registerInputFormatRowBinary(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) @@ -94,4 +116,13 @@ void registerInputFormatRowBinary(FormatFactory & factory) registerWithNamesAndTypes("RowBinary", register_func); } +void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} + + } diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.h b/src/Processors/Formats/Impl/BinaryRowInputFormat.h index 61d6df77522..d98e75bf621 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.h +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.h @@ -1,15 +1,19 @@ #pragma once #include -#include #include +#include namespace DB { -class ReadBuffer; +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} +class ReadBuffer; /** A stream for inputting data in a binary line-by-line format. */ @@ -24,9 +28,15 @@ public: /// in this format we cannot provide any DiagnosticInfo, because here we have /// just binary data. 
std::string getDiagnosticInfo() override { return {}; } +}; + +class BinaryFormatReader : public FormatWithNamesAndTypesReader +{ +public: + BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_); -private: bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; + void skipField(size_t file_column) override; void skipNames() override; @@ -37,9 +47,24 @@ private: std::vector readTypes() override; std::vector readHeaderRow(); +private: /// Data types read from input data. DataTypes read_data_types; - UInt64 read_columns = 0; + UInt64 read_columns; +}; + +class BinaryWithNamesAndTypesSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + DataTypes readRowAndGetDataTypes() override + { + throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetDataTypes is not implemented"}; + } + + BinaryFormatReader reader; }; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9de2b908b1e..735a549d0a6 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -5,13 +5,16 @@ #include #include #include +#include +#include #include #include -#include +#include +#include + namespace DB { - namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -26,7 +29,14 @@ CSVRowInputFormat::CSVRowInputFormat( bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_) + : RowInputFormatWithNamesAndTypes( + header_, + in_, + params_, + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, format_settings_)) { const String bad_delimiters = " \t\"'.UL"; if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos) @@ -36,6 +46,11 @@ CSVRowInputFormat::CSVRowInputFormat( ErrorCodes::BAD_ARGUMENTS); } +void CSVRowInputFormat::syncAfterError() +{ + skipToNextLineOrEOF(*in); +} + static void skipEndOfLine(ReadBuffer & in) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) @@ -52,8 +67,10 @@ static void skipEndOfLine(ReadBuffer & in) if (!in.eof() && *in.position() == '\n') ++in.position(); else - throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." - " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA); + throw Exception( + "Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." + " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", + ErrorCodes::INCORRECT_DATA); } else if (!in.eof()) throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA); @@ -62,32 +79,38 @@ static void skipEndOfLine(ReadBuffer & in) /// Skip `whitespace` symbols allowed in CSV. 
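BinaryWithNamesAndTypesSchemaReader above can take the column list straight from the header that RowBinaryWithNames(AndTypes) already carries: a varint column count followed by that many length-prefixed strings (names, and optionally type names). Below is a standalone sketch of decoding such a header from a byte buffer, assuming the usual LEB128 varint encoding; readVarUInt and readHeaderRow here are local stand-ins, not the ClickHouse functions.

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

/// LEB128 varint, the encoding used for the column count and string lengths.
static uint64_t readVarUInt(const unsigned char *& pos, const unsigned char * end)
{
    uint64_t value = 0;
    for (int shift = 0; shift < 64; shift += 7)
    {
        if (pos == end)
            throw std::runtime_error("Unexpected end of data");
        unsigned char byte = *pos++;
        value |= static_cast<uint64_t>(byte & 0x7F) << shift;
        if (!(byte & 0x80))
            return value;
    }
    throw std::runtime_error("Varint is too long");
}

/// One header row: `columns` length-prefixed strings.
static std::vector<std::string> readHeaderRow(const unsigned char *& pos, const unsigned char * end, uint64_t columns)
{
    std::vector<std::string> fields;
    for (uint64_t i = 0; i < columns; ++i)
    {
        uint64_t length = readVarUInt(pos, end);
        if (static_cast<uint64_t>(end - pos) < length)
            throw std::runtime_error("Unexpected end of data");
        fields.emplace_back(reinterpret_cast<const char *>(pos), length);
        pos += length;
    }
    return fields;
}

int main()
{
    const unsigned char data[] = {0x02, 0x02, 'i', 'd', 0x04, 'n', 'a', 'm', 'e'};
    const unsigned char * pos = data;
    const unsigned char * end = data + sizeof(data);
    uint64_t columns = readVarUInt(pos, end);
    for (const auto & name : readHeaderRow(pos, end, columns))
        std::cout << name << '\n';   /// id, name
}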
static inline void skipWhitespacesAndTabs(ReadBuffer & in) { - while (!in.eof() - && (*in.position() == ' ' - || *in.position() == '\t')) + while (!in.eof() && (*in.position() == ' ' || *in.position() == '\t')) ++in.position(); } -void CSVRowInputFormat::skipFieldDelimiter() +CSVFormatReader::CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_) +{ +} + +void CSVFormatReader::skipFieldDelimiter() { skipWhitespacesAndTabs(*in); assertChar(format_settings.csv.delimiter, *in); } -String CSVRowInputFormat::readFieldIntoString() +template +String CSVFormatReader::readCSVFieldIntoString() { skipWhitespacesAndTabs(*in); String field; - readCSVString(field, *in, format_settings.csv); + if constexpr (read_string) + readCSVString(field, *in, format_settings.csv); + else + readCSVField(field, *in, format_settings.csv); return field; } -void CSVRowInputFormat::skipField() +void CSVFormatReader::skipField() { - readFieldIntoString(); + readCSVFieldIntoString(); } -void CSVRowInputFormat::skipRowEndDelimiter() +void CSVFormatReader::skipRowEndDelimiter() { skipWhitespacesAndTabs(*in); @@ -105,33 +128,32 @@ void CSVRowInputFormat::skipRowEndDelimiter() skipEndOfLine(*in); } -void CSVRowInputFormat::skipHeaderRow() +void CSVFormatReader::skipHeaderRow() { do { skipField(); skipWhitespacesAndTabs(*in); - } - while (checkChar(format_settings.csv.delimiter, *in)); + } while (checkChar(format_settings.csv.delimiter, *in)); skipRowEndDelimiter(); } -std::vector CSVRowInputFormat::readHeaderRow() +template +std::vector CSVFormatReader::readRowImpl() { std::vector fields; do { - fields.push_back(readFieldIntoString()); + fields.push_back(readCSVFieldIntoString()); skipWhitespacesAndTabs(*in); - } - while (checkChar(format_settings.csv.delimiter, *in)); + } while (checkChar(format_settings.csv.delimiter, *in)); skipRowEndDelimiter(); return fields; } -bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { const char delimiter = format_settings.csv.delimiter; @@ -144,7 +166,8 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { if (*in->position() == '\n' || *in->position() == '\r') { - out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected." + out << "ERROR: Line feed found where delimiter (" << delimiter + << ") is expected." 
" It's like your file has less columns than expected.\n" "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; } @@ -160,7 +183,7 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) return true; } -bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { skipWhitespacesAndTabs(*in); @@ -191,23 +214,21 @@ bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return true; } -void CSVRowInputFormat::syncAfterError() -{ - skipToNextLineOrEOF(*in); -} - -bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) +bool CSVFormatReader::readField( + IColumn & column, + const DataTypePtr & type, + const SerializationPtr & serialization, + bool is_last_file_column, + const String & /*column_name*/) { skipWhitespacesAndTabs(*in); const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter; - const bool at_last_column_line_end = is_last_file_column - && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); + const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); /// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default /// only one empty or NULL column will be expected - if (format_settings.csv.empty_as_default - && (at_delimiter || at_last_column_line_end)) + if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end)) { /// Treat empty unquoted column value as default value, if /// specified in the settings. 
Tuple columns might seem @@ -231,6 +252,31 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co } } + +CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_setting_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV)) + , reader(in_, format_setting_) + , context(context_) +{ +} + + +DataTypes CSVSchemaReader::readRowAndGetDataTypes() +{ + if (in.eof()) + return {}; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context); +} + + void registerInputFormatCSV(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) @@ -326,4 +372,17 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory) registerWithNamesAndTypes("CSV", register_func); } +void registerCSVSchemaReader(FormatFactory & factory) +{ + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, with_names, with_types, settings, context); + }); + }; + + registerWithNamesAndTypes("CSV", register_func); +} + } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index d7c557b58d8..d723647595e 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -5,6 +5,7 @@ #include #include +#include #include @@ -28,6 +29,12 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; +}; + +class CSVFormatReader : public FormatWithNamesAndTypesReader +{ +public: + CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_); bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; @@ -42,17 +49,34 @@ private: void skipField(size_t /*file_column*/) override { skipField(); } void skipField(); - void skipHeaderRow() ; + void skipHeaderRow(); void skipNames() override { skipHeaderRow(); } void skipTypes() override { skipHeaderRow(); } void skipFieldDelimiter() override; void skipRowEndDelimiter() override; - std::vector readHeaderRow(); std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } + std::vector readHeaderRow() { return readRowImpl(); } + std::vector readRow() { return readRowImpl(); } - String readFieldIntoString(); + template + std::vector readRowImpl(); + + template + String readCSVFieldIntoString(); +}; + +class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + CSVFormatReader reader; + ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 4d000bb1f35..311f4742335 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ 
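CSVSchemaReader above infers column types by reading whole rows of raw CSV fields and handing them to determineDataTypesByEscapingRule. The sketch below is a much-simplified standalone version of that idea: split one line on the delimiter and guess Int64, Float64 or String per field. Quoting and escaping are ignored, and inferFieldType is an invented helper, not the ClickHouse function.

#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

/// Very small stand-in for per-field type inference over raw CSV values.
static std::string inferFieldType(const std::string & field)
{
    if (field.empty())
        return "Nullable(String)";
    char * end = nullptr;
    std::strtoll(field.c_str(), &end, 10);
    if (end && *end == '\0')
        return "Int64";
    std::strtod(field.c_str(), &end);
    if (end && *end == '\0')
        return "Float64";
    return "String";
}

int main()
{
    std::string row = "42,3.14,hello";
    std::stringstream ss(row);
    std::string field;
    std::vector<std::string> types;
    while (std::getline(ss, field, ','))
        types.push_back(inferFieldType(field));
    for (const auto & t : types)
        std::cout << t << '\n';   /// Int64, Float64, String
}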
b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -273,6 +273,7 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension #endif auto root_reader = msg.getRoot(root); + for (size_t i = 0; i != columns.size(); ++i) { auto value = getReaderByColumnName(root_reader, column_names[i]); @@ -282,6 +283,24 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension return true; } +CapnProtoSchemaReader::CapnProtoSchemaReader(const FormatSettings & format_settings_) : format_settings(format_settings_) +{ +} + +NamesAndTypesList CapnProtoSchemaReader::readSchema() +{ + auto schema_info = FormatSchemaInfo( + format_settings.schema.format_schema, + "CapnProto", + true, + format_settings.schema.is_server, + format_settings.schema.format_schema_path); + + auto schema_parser = CapnProtoSchemaParser(); + auto schema = schema_parser.getMessageSchema(schema_info); + return capnProtoSchemaToCHSchema(schema); +} + void registerInputFormatCapnProto(FormatFactory & factory) { factory.registerInputFormat( @@ -293,6 +312,14 @@ void registerInputFormatCapnProto(FormatFactory & factory) }); } +void registerCapnProtoSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("CapnProto", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); +} + } #else @@ -301,6 +328,7 @@ namespace DB { class FormatFactory; void registerInputFormatCapnProto(FormatFactory &) {} + void registerCapnProtoSchemaReader(FormatFactory &) {} } #endif // USE_CAPNP diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h index 4c0f34d70a3..053de14d1a4 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -38,6 +39,17 @@ private: Names column_names; }; +class CapnProtoSchemaReader : public IExternalSchemaReader +{ +public: + explicit CapnProtoSchemaReader(const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + } #endif // USE_CAPNP diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 8cd9d154ae4..d2e0d6e21a9 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -24,16 +24,34 @@ static FormatSettings updateFormatSettings(const FormatSettings & settings) CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( const Block & header_, - ReadBuffer & in_, + ReadBuffer & in_buf_, const Params & params_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, buf, params_, with_names_, with_types_, updateFormatSettings(format_settings_)) - , buf(in_) - , ignore_spaces(ignore_spaces_) - , escaping_rule(format_settings_.custom.escaping_rule) + : CustomSeparatedRowInputFormat( + header_, std::make_unique(in_buf_), params_, with_names_, with_types_, ignore_spaces_, updateFormatSettings(format_settings_)) +{ +} + +CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( + const Block & header_, + std::unique_ptr buf_, + const Params & params_, + bool with_names_, + bool with_types_, + bool ignore_spaces_, + const FormatSettings & format_settings_) + : 
RowInputFormatWithNamesAndTypes( + header_, + *buf_, + params_, + with_names_, + with_types_, + format_settings_, + std::make_unique(*buf_, ignore_spaces_, format_settings_)) + , buf(std::move(buf_)) { /// In case of CustomSeparatedWithNames(AndTypes) formats and enabled setting input_format_with_names_use_header we don't know /// the exact number of columns in data (because it can contain unknown columns). So, if field_delimiter and row_after_delimiter are @@ -48,146 +66,6 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( } } -void CustomSeparatedRowInputFormat::skipPrefixBeforeHeader() -{ - skipSpaces(); - assertString(format_settings.custom.result_before_delimiter, buf); -} - -void CustomSeparatedRowInputFormat::skipRowStartDelimiter() -{ - skipSpaces(); - assertString(format_settings.custom.row_before_delimiter, buf); -} - -void CustomSeparatedRowInputFormat::skipFieldDelimiter() -{ - skipSpaces(); - assertString(format_settings.custom.field_delimiter, buf); -} - -void CustomSeparatedRowInputFormat::skipRowEndDelimiter() -{ - skipSpaces(); - assertString(format_settings.custom.row_after_delimiter, buf); -} - -void CustomSeparatedRowInputFormat::skipRowBetweenDelimiter() -{ - skipSpaces(); - assertString(format_settings.custom.row_between_delimiter, buf); -} - -void CustomSeparatedRowInputFormat::skipField() -{ - skipSpaces(); - skipFieldByEscapingRule(buf, escaping_rule, format_settings); -} - -bool CustomSeparatedRowInputFormat::checkEndOfRow() -{ - PeekableReadBufferCheckpoint checkpoint{buf, true}; - - skipSpaces(); - if (!checkString(format_settings.custom.row_after_delimiter, buf)) - return false; - - skipSpaces(); - - /// At the end of row after row_after_delimiter we expect result_after_delimiter or row_between_delimiter. - - if (checkString(format_settings.custom.row_between_delimiter, buf)) - return true; - - buf.rollbackToCheckpoint(); - skipSpaces(); - buf.ignore(format_settings.custom.row_after_delimiter.size()); - return checkForSuffixImpl(true); -} - -std::vector CustomSeparatedRowInputFormat::readHeaderRow() -{ - std::vector values; - skipRowStartDelimiter(); - do - { - if (!values.empty()) - skipFieldDelimiter(); - skipSpaces(); - values.push_back(readStringByEscapingRule(buf, escaping_rule, format_settings)); - } - while (!checkEndOfRow()); - - skipRowEndDelimiter(); - return values; -} - -void CustomSeparatedRowInputFormat::skipHeaderRow() -{ - size_t columns = getPort().getHeader().columns(); - skipRowStartDelimiter(); - for (size_t i = 0; i != columns; ++i) - { - skipField(); - if (i + 1 != columns) - skipFieldDelimiter(); - } - skipRowEndDelimiter(); -} - -bool CustomSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) -{ - skipSpaces(); - return deserializeFieldByEscapingRule(type, serialization, column, buf, escaping_rule, format_settings); -} - -bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof) -{ - skipSpaces(); - if (format_settings.custom.result_after_delimiter.empty()) - { - if (!check_eof) - return false; - - return buf.eof(); - } - - if (unlikely(checkString(format_settings.custom.result_after_delimiter, buf))) - { - skipSpaces(); - if (!check_eof) - return true; - - if (buf.eof()) - return true; - } - return false; -} - -bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) -{ - PeekableReadBufferCheckpoint checkpoint{buf}; - if (checkForSuffixImpl(false)) - { - if (buf.eof()) - out << "\n"; - else - 
out << " There is some data after suffix\n"; - return false; - } - buf.rollbackToCheckpoint(); - return true; -} - -bool CustomSeparatedRowInputFormat::checkForSuffix() -{ - PeekableReadBufferCheckpoint checkpoint{buf}; - if (checkForSuffixImpl(true)) - return true; - buf.rollbackToCheckpoint(); - return false; -} - bool CustomSeparatedRowInputFormat::allowSyncAfterError() const { @@ -196,37 +74,248 @@ bool CustomSeparatedRowInputFormat::allowSyncAfterError() const void CustomSeparatedRowInputFormat::syncAfterError() { - skipToNextRowOrEof(buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); - end_of_stream = buf.eof(); - /// It can happen that buf.position() is not at the beginning of row + skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); + end_of_stream = buf->eof(); + /// It can happen that buf->position() is not at the beginning of row /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. /// It will cause another parsing error. } -bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_) { - return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces); + buf = std::make_unique(in_); + RowInputFormatWithNamesAndTypes::setReadBuffer(*buf); } -bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +CustomSeparatedFormatReader::CustomSeparatedFormatReader( + PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(buf_, format_settings_), buf(&buf_), ignore_spaces(ignore_spaces_) { - return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.field_delimiter, "delimiter between fields", ignore_spaces); -} - -bool CustomSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) -{ - return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_after_delimiter, "delimiter after last field", ignore_spaces); -} - -bool CustomSeparatedRowInputFormat::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) -{ - return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_between_delimiter, "delimiter between rows", ignore_spaces); } void CustomSeparatedRowInputFormat::resetParser() { RowInputFormatWithNamesAndTypes::resetParser(); - buf.reset(); + buf->reset(); +} + +void CustomSeparatedFormatReader::skipPrefixBeforeHeader() +{ + skipSpaces(); + assertString(format_settings.custom.result_before_delimiter, *buf); +} + +void CustomSeparatedFormatReader::skipRowStartDelimiter() +{ + skipSpaces(); + assertString(format_settings.custom.row_before_delimiter, *buf); +} + +void CustomSeparatedFormatReader::skipFieldDelimiter() +{ + skipSpaces(); + assertString(format_settings.custom.field_delimiter, *buf); +} + +void CustomSeparatedFormatReader::skipRowEndDelimiter() +{ + skipSpaces(); + assertString(format_settings.custom.row_after_delimiter, *buf); +} + +void CustomSeparatedFormatReader::skipRowBetweenDelimiter() +{ + skipSpaces(); + assertString(format_settings.custom.row_between_delimiter, *buf); +} + +void CustomSeparatedFormatReader::skipField() +{ + skipSpaces(); + skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); +} + +bool 
CustomSeparatedFormatReader::checkEndOfRow() +{ + PeekableReadBufferCheckpoint checkpoint{*buf, true}; + + skipSpaces(); + if (!checkString(format_settings.custom.row_after_delimiter, *buf)) + return false; + + skipSpaces(); + + /// At the end of row after row_after_delimiter we expect result_after_delimiter or row_between_delimiter. + + if (checkString(format_settings.custom.row_between_delimiter, *buf)) + return true; + + buf->rollbackToCheckpoint(); + skipSpaces(); + buf->ignore(format_settings.custom.row_after_delimiter.size()); + return checkForSuffixImpl(true); +} + +template +String CustomSeparatedFormatReader::readFieldIntoString(bool is_first) +{ + if (!is_first) + skipFieldDelimiter(); + skipSpaces(); + if constexpr (is_header) + return readStringByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); + else + return readFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); +} + +template +std::vector CustomSeparatedFormatReader::readRowImpl() +{ + std::vector values; + skipRowStartDelimiter(); + + if (columns == 0) + { + do + { + values.push_back(readFieldIntoString(values.empty())); + } while (!checkEndOfRow()); + columns = values.size(); + } + else + { + for (size_t i = 0; i != columns; ++i) + values.push_back(readFieldIntoString(i == 0)); + } + + skipRowEndDelimiter(); + return values; +} + +void CustomSeparatedFormatReader::skipHeaderRow() +{ + skipRowStartDelimiter(); + bool first = true; + do + { + if (!first) + skipFieldDelimiter(); + first = false; + + skipField(); + } + while (!checkEndOfRow()); + + skipRowEndDelimiter(); +} + +bool CustomSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) +{ + skipSpaces(); + return deserializeFieldByEscapingRule(type, serialization, column, *buf, format_settings.custom.escaping_rule, format_settings); +} + +bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof) +{ + skipSpaces(); + if (format_settings.custom.result_after_delimiter.empty()) + { + if (!check_eof) + return false; + + return buf->eof(); + } + + if (unlikely(checkString(format_settings.custom.result_after_delimiter, *buf))) + { + skipSpaces(); + if (!check_eof) + return true; + + if (buf->eof()) + return true; + } + return false; +} + +bool CustomSeparatedFormatReader::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) +{ + PeekableReadBufferCheckpoint checkpoint{*buf}; + if (checkForSuffixImpl(false)) + { + if (buf->eof()) + out << "\n"; + else + out << " There is some data after suffix\n"; + return false; + } + buf->rollbackToCheckpoint(); + return true; +} + +bool CustomSeparatedFormatReader::checkForSuffix() +{ + PeekableReadBufferCheckpoint checkpoint{*buf}; + if (checkForSuffixImpl(true)) + return true; + buf->rollbackToCheckpoint(); + return false; +} + +bool CustomSeparatedFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +{ + return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces); +} + +bool CustomSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +{ + return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.field_delimiter, "delimiter between fields", ignore_spaces); +} + +bool CustomSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +{ + return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_after_delimiter, "delimiter after 
last field", ignore_spaces); +} + +bool CustomSeparatedFormatReader::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) +{ + return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_between_delimiter, "delimiter between rows", ignore_spaces); +} + +void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_) +{ + buf = assert_cast(&in_); + FormatWithNamesAndTypesReader::setReadBuffer(in_); +} + +CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_) + : FormatWithNamesAndTypesSchemaReader( + buf, + format_setting_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule)) + , buf(in_) + , reader(buf, ignore_spaces_, updateFormatSettings(format_setting_)) + , context(context_) +{ +} + +DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (reader.checkForSuffix()) + return {}; + + if (!first_row || with_names || with_types) + reader.skipRowBetweenDelimiter(); + + if (first_row) + first_row = false; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context); } void registerInputFormatCustomSeparated(FormatFactory & factory) @@ -248,4 +337,20 @@ void registerInputFormatCustomSeparated(FormatFactory & factory) } } +void registerCustomSeparatedSchemaReader(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, with_names, with_types, ignore_spaces, settings, context); + }); + }; + + registerWithNamesAndTypes(ignore_spaces ? 
"CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func); + } +} + } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index 00ee28e50cc..d38d5bf0da4 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -19,13 +19,27 @@ public: void resetParser() override; String getName() const override { return "CustomSeparatedRowInputFormat"; } + void setReadBuffer(ReadBuffer & in_) override; private: CustomSeparatedRowInputFormat( const Block & header_, - std::unique_ptr in_, + std::unique_ptr in_buf_, const Params & params_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_); + + bool allowSyncAfterError() const override; + void syncAfterError() override; + + std::unique_ptr buf; + bool ignore_spaces; +}; + +class CustomSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + CustomSeparatedFormatReader(PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_); + using EscapingRule = FormatSettings::EscapingRule; bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -44,9 +58,6 @@ private: bool checkForSuffix() override; - bool allowSyncAfterError() const override; - void syncAfterError() override; - bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; @@ -55,15 +66,41 @@ private: std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - std::vector readHeaderRow(); + std::vector readHeaderRow() {return readRowImpl(); } + + std::vector readRow() { return readRowImpl(); } bool checkEndOfRow(); bool checkForSuffixImpl(bool check_eof); - inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } + inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } + + EscapingRule getEscapingRule() { return format_settings.custom.escaping_rule; } + + void setReadBuffer(ReadBuffer & in_) override; +private: + template + std::vector readRowImpl(); + + template + String readFieldIntoString(bool is_first); + + PeekableReadBuffer * buf; + bool ignore_spaces; + size_t columns = 0; +}; + +class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; PeekableReadBuffer buf; - bool ignore_spaces; - EscapingRule escaping_rule; + CustomSeparatedFormatReader reader; + ContextPtr context; + bool first_row = true; }; } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 1cca53b2f56..56ba975dea1 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -14,8 +14,11 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } -JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) : - IRowInputFormat(header_, buf, std::move(params_)), 
buf(in_) +JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) + : JSONAsStringRowInputFormat(header_, std::make_unique(in_), params_) {} + +JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_) : + IRowInputFormat(header_, *buf_, std::move(params_)), buf(std::move(buf_)) { if (header_.columns() > 1) throw Exception(ErrorCodes::BAD_ARGUMENTS, @@ -31,113 +34,113 @@ JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, Re void JSONAsStringRowInputFormat::resetParser() { IRowInputFormat::resetParser(); - buf.reset(); + buf->reset(); } void JSONAsStringRowInputFormat::readPrefix() { /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it. - skipBOMIfExists(buf); + skipBOMIfExists(*buf); - skipWhitespaceIfAny(buf); - if (!buf.eof() && *buf.position() == '[') + skipWhitespaceIfAny(*buf); + if (!buf->eof() && *buf->position() == '[') { - ++buf.position(); + ++buf->position(); data_in_square_brackets = true; } } void JSONAsStringRowInputFormat::readSuffix() { - skipWhitespaceIfAny(buf); + skipWhitespaceIfAny(*buf); if (data_in_square_brackets) { - assertChar(']', buf); - skipWhitespaceIfAny(buf); + assertChar(']', *buf); + skipWhitespaceIfAny(*buf); } - if (!buf.eof() && *buf.position() == ';') + if (!buf->eof() && *buf->position() == ';') { - ++buf.position(); - skipWhitespaceIfAny(buf); + ++buf->position(); + skipWhitespaceIfAny(*buf); } - assertEOF(buf); + assertEOF(*buf); } void JSONAsStringRowInputFormat::readJSONObject(IColumn & column) { - PeekableReadBufferCheckpoint checkpoint{buf}; + PeekableReadBufferCheckpoint checkpoint{*buf}; size_t balance = 0; bool quotes = false; - if (*buf.position() != '{') + if (*buf->position() != '{') throw Exception("JSON object must begin with '{'.", ErrorCodes::INCORRECT_DATA); - ++buf.position(); + ++buf->position(); ++balance; char * pos; while (balance) { - if (buf.eof()) + if (buf->eof()) throw Exception("Unexpected end of file while parsing JSON object.", ErrorCodes::INCORRECT_DATA); if (quotes) { - pos = find_first_symbols<'"', '\\'>(buf.position(), buf.buffer().end()); - buf.position() = pos; - if (buf.position() == buf.buffer().end()) + pos = find_first_symbols<'"', '\\'>(buf->position(), buf->buffer().end()); + buf->position() = pos; + if (buf->position() == buf->buffer().end()) continue; - if (*buf.position() == '"') + if (*buf->position() == '"') { quotes = false; - ++buf.position(); + ++buf->position(); } - else if (*buf.position() == '\\') + else if (*buf->position() == '\\') { - ++buf.position(); - if (!buf.eof()) + ++buf->position(); + if (!buf->eof()) { - ++buf.position(); + ++buf->position(); } } } else { - pos = find_first_symbols<'"', '{', '}', '\\'>(buf.position(), buf.buffer().end()); - buf.position() = pos; - if (buf.position() == buf.buffer().end()) + pos = find_first_symbols<'"', '{', '}', '\\'>(buf->position(), buf->buffer().end()); + buf->position() = pos; + if (buf->position() == buf->buffer().end()) continue; - if (*buf.position() == '{') + if (*buf->position() == '{') { ++balance; - ++buf.position(); + ++buf->position(); } - else if (*buf.position() == '}') + else if (*buf->position() == '}') { --balance; - ++buf.position(); + ++buf->position(); } - else if (*buf.position() == '\\') + else if (*buf->position() == '\\') { - ++buf.position(); - if (!buf.eof()) + ++buf->position(); + if (!buf->eof()) { - ++buf.position(); + ++buf->position(); 
} } - else if (*buf.position() == '"') + else if (*buf->position() == '"') { quotes = true; - ++buf.position(); + ++buf->position(); } } } - buf.makeContinuousMemoryFromCheckpointToPos(); - char * end = buf.position(); - buf.rollbackToCheckpoint(); - column.insertData(buf.position(), end - buf.position()); - buf.position() = end; + buf->makeContinuousMemoryFromCheckpointToPos(); + char * end = buf->position(); + buf->rollbackToCheckpoint(); + column.insertData(buf->position(), end - buf->position()); + buf->position() = end; } bool JSONAsStringRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) @@ -145,30 +148,36 @@ bool JSONAsStringRowInputFormat::readRow(MutableColumns & columns, RowReadExtens if (!allow_new_rows) return false; - skipWhitespaceIfAny(buf); - if (!buf.eof()) + skipWhitespaceIfAny(*buf); + if (!buf->eof()) { - if (!data_in_square_brackets && *buf.position() == ';') + if (!data_in_square_brackets && *buf->position() == ';') { /// ';' means the end of query, but it cannot be before ']'. return allow_new_rows = false; } - else if (data_in_square_brackets && *buf.position() == ']') + else if (data_in_square_brackets && *buf->position() == ']') { /// ']' means the end of query. return allow_new_rows = false; } } - if (!buf.eof()) + if (!buf->eof()) readJSONObject(*columns[0]); - skipWhitespaceIfAny(buf); - if (!buf.eof() && *buf.position() == ',') - ++buf.position(); - skipWhitespaceIfAny(buf); + skipWhitespaceIfAny(*buf); + if (!buf->eof() && *buf->position() == ',') + ++buf->position(); + skipWhitespaceIfAny(*buf); - return !buf.eof(); + return !buf->eof(); +} + +void JSONAsStringRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + IInputFormat::setReadBuffer(*buf); } void registerInputFormatJSONAsString(FormatFactory & factory) @@ -193,4 +202,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factor factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &) + { + return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h index 96ad60b3fab..ea6e9a1ed2f 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h @@ -1,8 +1,10 @@ #pragma once #include +#include #include #include +#include namespace DB { @@ -20,8 +22,11 @@ public: String getName() const override { return "JSONAsStringRowInputFormat"; } void resetParser() override; + void setReadBuffer(ReadBuffer & in_) override; private: + JSONAsStringRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_); + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; void readPrefix() override; @@ -29,11 +34,20 @@ private: void readJSONObject(IColumn & column); - PeekableReadBuffer buf; + std::unique_ptr buf; /// This flag is needed to know if data is in square brackets. 
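// Sketch (not part of the diff): the brace-balance scan that readJSONObject() performs above
// on a PeekableReadBuffer, reduced to a standalone function over a string_view. It finds the
// length of one JSON object starting at '{', tracking nesting depth, quoted strings and
// backslash escapes exactly as the hunk does; buffer refills and checkpoints are omitted and
// all names are illustrative.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string_view>

size_t scanJSONObject(std::string_view data)
{
    if (data.empty() || data[0] != '{')
        throw std::runtime_error("JSON object must begin with '{'.");

    size_t pos = 1;
    size_t balance = 1;
    bool in_quotes = false;

    while (balance != 0)
    {
        if (pos >= data.size())
            throw std::runtime_error("Unexpected end of data while parsing JSON object.");

        char c = data[pos++];
        if (in_quotes)
        {
            if (c == '\\')
                ++pos;                  /// skip the escaped character
            else if (c == '"')
                in_quotes = false;
        }
        else
        {
            if (c == '{')
                ++balance;
            else if (c == '}')
                --balance;
            else if (c == '"')
                in_quotes = true;
            else if (c == '\\')
                ++pos;
        }
    }
    return pos;                         /// number of bytes that make up the object
}

int main()
{
    std::string_view row = R"({"k": "va{lue", "nested": {"x": 1}} trailing)";
    std::cout << row.substr(0, scanJSONObject(row)) << '\n';   // prints the first object only
}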
bool data_in_square_brackets = false; bool allow_new_rows = true; }; +class JSONAsStringExternalSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"json", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 88fb411ffbd..263702ad20f 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -8,16 +9,13 @@ #include #include #include +#include +#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; -} - - JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( const Block & header_, ReadBuffer & in_, @@ -26,24 +24,40 @@ JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_) - , yield_strings(yield_strings_) + : RowInputFormatWithNamesAndTypes( + header_, + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, yield_strings_, format_settings_)) { } -void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter() +void JSONCompactEachRowRowInputFormat::syncAfterError() +{ + skipToUnescapedNextLineOrEOF(*in); +} + +JSONCompactEachRowFormatReader::JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(in_, format_settings_), yield_strings(yield_strings_) +{ +} + +void JSONCompactEachRowFormatReader::skipRowStartDelimiter() { skipWhitespaceIfAny(*in); assertChar('[', *in); } -void JSONCompactEachRowRowInputFormat::skipFieldDelimiter() +void JSONCompactEachRowFormatReader::skipFieldDelimiter() { skipWhitespaceIfAny(*in); assertChar(',', *in); } -void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() +void JSONCompactEachRowFormatReader::skipRowEndDelimiter() { skipWhitespaceIfAny(*in); assertChar(']', *in); @@ -55,29 +69,18 @@ void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() skipWhitespaceIfAny(*in); } -String JSONCompactEachRowRowInputFormat::readFieldIntoString() +void JSONCompactEachRowFormatReader::skipField() { skipWhitespaceIfAny(*in); - String field; - readJSONString(field, *in); - return field; + skipJSONField(*in, "skipped_field"); } -void JSONCompactEachRowRowInputFormat::skipField(size_t file_column) -{ - skipWhitespaceIfAny(*in); - skipJSONField(*in, column_mapping->names_of_columns[file_column]); -} - -void JSONCompactEachRowRowInputFormat::skipHeaderRow() +void JSONCompactEachRowFormatReader::skipHeaderRow() { skipRowStartDelimiter(); - size_t i = 0; do { - if (i >= column_mapping->names_of_columns.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column names"); - skipField(i++); + skipField(); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -85,13 +88,16 @@ void JSONCompactEachRowRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() +std::vector JSONCompactEachRowFormatReader::readHeaderRow() { skipRowStartDelimiter(); std::vector fields; + String field; do { - 
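// Sketch (not part of the diff): the refactoring applied to JSONCompactEachRow in the hunk
// above (and to TabSeparated further down): the per-field parsing helpers move out of the row
// input format into a separate "format reader", which the input format owns and which the new
// schema reader can instantiate on its own. Class and member names below are illustrative
// stand-ins, not the real ClickHouse interfaces.
#include <memory>
#include <string>
#include <vector>

struct FormatReaderSketch
{
    explicit FormatReaderSketch(bool yield_strings_) : yield_strings(yield_strings_) {}
    std::vector<std::string> readHeaderRow() { return {"c1", "c2"}; }   // placeholder parsing
    bool yieldStrings() const { return yield_strings; }
private:
    bool yield_strings;
};

class RowInputFormatWithReaderSketch
{
public:
    explicit RowInputFormatWithReaderSketch(bool yield_strings)
        : reader(std::make_unique<FormatReaderSketch>(yield_strings)) {}
private:
    std::unique_ptr<FormatReaderSketch> reader;     // used for actual row parsing
};

class SchemaReaderWithReaderSketch
{
public:
    explicit SchemaReaderWithReaderSketch(bool yield_strings) : reader(yield_strings) {}
    std::vector<std::string> readColumnNames() { return reader.readHeaderRow(); }
private:
    FormatReaderSketch reader;                      // same parsing logic, reused for schema inference
};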
fields.push_back(readFieldIntoString()); + skipWhitespaceIfAny(*in); + readJSONString(field, *in); + fields.push_back(field); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -100,18 +106,13 @@ std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() return fields; } -bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) +bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) { skipWhitespaceIfAny(*in); return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings); } -void JSONCompactEachRowRowInputFormat::syncAfterError() -{ - skipToUnescapedNextLineOrEOF(*in); -} - -bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); if (!checkChar('[', *in)) @@ -123,7 +124,7 @@ bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuff return true; } -bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -150,7 +151,7 @@ bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(Wri return true; } -bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); @@ -180,6 +181,20 @@ bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer return true; } +JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, format_settings_.max_rows_to_read_for_schema_inference, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_) +{ +} + +DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() +{ + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings()); +} + void registerInputFormatJSONCompactEachRow(FormatFactory & factory) { for (bool yield_strings : {true, false}) @@ -200,6 +215,21 @@ void registerInputFormatJSONCompactEachRow(FormatFactory & factory) } } +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory) +{ + for (bool json_strings : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, json_strings, settings); + }); + }; + registerWithNamesAndTypes(json_strings ? 
"JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func); + } +} + void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index e01a4f49b30..0551aa8b64e 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -10,6 +11,7 @@ namespace DB class ReadBuffer; + /** A stream for reading data in a bunch of formats: * - JSONCompactEachRow * - JSONCompactEachRowWithNamesAndTypes @@ -34,6 +36,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; +}; + +class JSONCompactEachRowFormatReader : public FormatWithNamesAndTypesReader +{ +public: + JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_); + bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; @@ -45,7 +54,8 @@ private: bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; - void skipField(size_t file_column) override; + void skipField(size_t /*column_index*/) override { skipField(); } + void skipField(); void skipHeaderRow(); void skipNames() override { skipHeaderRow(); } void skipTypes() override { skipHeaderRow(); } @@ -56,9 +66,21 @@ private: std::vector readHeaderRow(); std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - String readFieldIntoString(); + bool yieldStrings() const { return yield_strings; } +private: bool yield_strings; }; +class JSONCompactEachRowRowSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_); + +private: + DataTypes readRowAndGetDataTypes() override; + + JSONCompactEachRowFormatReader reader; +}; + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 28481313974..75beca955b9 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -286,11 +287,7 @@ void JSONEachRowRowInputFormat::readPrefix() skipBOMIfExists(*in); skipWhitespaceIfAny(*in); - if (!in->eof() && *in->position() == '[') - { - ++in->position(); - data_in_square_brackets = true; - } + data_in_square_brackets = checkChar('[', *in); } void JSONEachRowRowInputFormat::readSuffix() @@ -309,6 +306,28 @@ void JSONEachRowRowInputFormat::readSuffix() assertEOF(*in); } +JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings) + : IRowWithNamesSchemaReader(in_, format_settings.max_rows_to_read_for_schema_inference), json_strings(json_strings_) +{ +} + + +std::unordered_map JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + skipWhitespaceIfAny(in); + 
checkChar('[', in); + first_row = false; + } + + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); +} void registerInputFormatJSONEachRow(FormatFactory & factory) { @@ -343,4 +362,17 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory factory.registerNonTrivialPrefixAndSuffixChecker("JSONStringsEachRow", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONEachRowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique(buf, false, settings); + }); + + factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique(buf, true, settings); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 9810f2dc765..323909a7730 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -84,4 +85,16 @@ private: bool yield_strings; }; +class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader +{ +public: + JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings); + +private: + std::unordered_map readRowAndGetNamesAndDataTypes() override; + + bool json_strings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp index 1a05f61d36b..5983f3170e5 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp @@ -72,4 +72,13 @@ void registerInputFormatLineAsString(FormatFactory & factory) return std::make_shared(sample, buf, params); }); } + +void registerLineAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("LineAsString", []( + const FormatSettings &) + { + return std::make_shared(); + }); +} } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h index 1a6c6247558..c4c17c47dbe 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h @@ -1,7 +1,9 @@ #pragma once #include +#include #include +#include namespace DB { @@ -26,4 +28,13 @@ private: void readLineObject(IColumn & column); }; +class LinaAsStringSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"line", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index e34729be928..c56af536e15 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -26,15 +27,20 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int INCORRECT_DATA; + extern const int BAD_ARGUMENTS; + extern const int UNEXPECTED_END_OF_FILE; } MsgPackRowInputFormat::MsgPackRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) - : IRowInputFormat(header_, buf, 
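// Sketch (not part of the diff): the "external schema reader" pattern used by the
// JSONAsString, LineAsString and RawBLOB readers registered in this diff. These readers never
// look at the data: the format always produces one fixed String column ("json", "line",
// "raw_blob"). NamesAndTypesList and the DataType classes are replaced here by illustrative
// std aliases so the sketch builds standalone.
#include <string>
#include <utility>
#include <vector>

using NameAndType = std::pair<std::string, std::string>;   // (column name, type name)
using NamesAndTypes = std::vector<NameAndType>;

struct ExternalSchemaReaderSketch
{
    virtual ~ExternalSchemaReaderSketch() = default;
    virtual NamesAndTypes readSchema() = 0;                 // no ReadBuffer needed
};

struct JSONAsStringSchemaSketch : ExternalSchemaReaderSketch
{
    NamesAndTypes readSchema() override { return {{"json", "String"}}; }
};

struct LineAsStringSchemaSketch : ExternalSchemaReaderSketch
{
    NamesAndTypes readSchema() override { return {{"line", "String"}}; }
};

struct RawBLOBSchemaSketch : ExternalSchemaReaderSketch
{
    NamesAndTypes readSchema() override { return {{"raw_blob", "String"}}; }
};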
std::move(params_)), buf(in_), parser(visitor), data_types(header_.getDataTypes()) {} + : MsgPackRowInputFormat(header_, std::make_unique(in_), params_) {} + +MsgPackRowInputFormat::MsgPackRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_) + : IRowInputFormat(header_, *buf_, std::move(params_)), buf(std::move(buf_)), parser(visitor), data_types(header_.getDataTypes()) {} void MsgPackRowInputFormat::resetParser() { IRowInputFormat::resetParser(); - buf.reset(); + buf->reset(); visitor.reset(); } @@ -325,21 +331,21 @@ void MsgPackVisitor::parse_error(size_t, size_t) // NOLINT bool MsgPackRowInputFormat::readObject() { - if (buf.eof()) + if (buf->eof()) return false; - PeekableReadBufferCheckpoint checkpoint{buf}; + PeekableReadBufferCheckpoint checkpoint{*buf}; size_t offset = 0; - while (!parser.execute(buf.position(), buf.available(), offset)) + while (!parser.execute(buf->position(), buf->available(), offset)) { - buf.position() = buf.buffer().end(); - if (buf.eof()) + buf->position() = buf->buffer().end(); + if (buf->eof()) throw Exception("Unexpected end of file while parsing msgpack object.", ErrorCodes::INCORRECT_DATA); - buf.position() = buf.buffer().end(); - buf.makeContinuousMemoryFromCheckpointToPos(); - buf.rollbackToCheckpoint(); + buf->position() = buf->buffer().end(); + buf->makeContinuousMemoryFromCheckpointToPos(); + buf->rollbackToCheckpoint(); } - buf.position() += offset; + buf->position() += offset; return true; } @@ -363,6 +369,113 @@ bool MsgPackRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & return true; } +void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + IInputFormat::setReadBuffer(in_); +} + +MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) +{ + if (!number_of_columns) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data"); +} + + +msgpack::object_handle MsgPackSchemaReader::readObject() +{ + if (buf.eof()) + throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected eof while parsing msgpack object"); + + PeekableReadBufferCheckpoint checkpoint{buf}; + size_t offset = 0; + bool need_more_data = true; + msgpack::object_handle object_handle; + while (need_more_data) + { + offset = 0; + try + { + object_handle = msgpack::unpack(buf.position(), buf.buffer().end() - buf.position(), offset); + need_more_data = false; + } + catch (msgpack::insufficient_bytes &) + { + buf.position() = buf.buffer().end(); + if (buf.eof()) + throw Exception("Unexpected end of file while parsing msgpack object", ErrorCodes::UNEXPECTED_END_OF_FILE); + buf.position() = buf.buffer().end(); + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + } + } + buf.position() += offset; + return object_handle; +} + +DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) +{ + switch (object.type) + { + case msgpack::type::object_type::POSITIVE_INTEGER: [[fallthrough]]; + case msgpack::type::object_type::NEGATIVE_INTEGER: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT32: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT64: + return makeNullable(std::make_shared()); + case 
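// Sketch (not part of the diff): the ownership change this diff applies to JSONAsString,
// MsgPack and Regexp input formats: the format used to hold a PeekableReadBuffer by value; it
// now owns it through a unique_ptr, a public constructor delegates to a private one that
// takes the owned wrapper, and setReadBuffer() rebuilds the wrapper around a new underlying
// buffer. The *Sketch types are minimal stand-ins for the ClickHouse classes.
#include <memory>

struct ReadBufferSketch { /* underlying source of bytes */ };

struct PeekableReadBufferSketch
{
    explicit PeekableReadBufferSketch(ReadBufferSketch & in_) : in(&in_) {}
    void reset() { /* drop checkpoints, keep position */ }
    ReadBufferSketch * in;
};

class OwningInputFormatSketch
{
public:
    explicit OwningInputFormatSketch(ReadBufferSketch & in_)
        : OwningInputFormatSketch(std::make_unique<PeekableReadBufferSketch>(in_)) {}

    void resetParser() { buf->reset(); }

    /// Re-point the format at a different underlying buffer by rebuilding the peekable wrapper.
    void setReadBuffer(ReadBufferSketch & in_) { buf = std::make_unique<PeekableReadBufferSketch>(in_); }

private:
    explicit OwningInputFormatSketch(std::unique_ptr<PeekableReadBufferSketch> buf_) : buf(std::move(buf_)) {}

    std::unique_ptr<PeekableReadBufferSketch> buf;
};

int main()
{
    ReadBufferSketch first, second;
    OwningInputFormatSketch format(first);
    format.setReadBuffer(second);   // e.g. when the caller switches to another data chunk
    format.resetParser();
}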
msgpack::type::object_type::BOOLEAN: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::BIN: [[fallthrough]]; + case msgpack::type::object_type::STR: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::ARRAY: + { + msgpack::object_array object_array = object.via.array; + if (object_array.size) + { + auto nested_type = getDataType(object_array.ptr[0]); + if (nested_type) + return std::make_shared(getDataType(object_array.ptr[0])); + } + return nullptr; + } + case msgpack::type::object_type::MAP: + { + msgpack::object_map object_map = object.via.map; + if (object_map.size) + { + auto key_type = removeNullable(getDataType(object_map.ptr[0].key)); + auto value_type = getDataType(object_map.ptr[0].val); + if (key_type && value_type) + return std::make_shared(key_type, value_type); + } + return nullptr; + } + case msgpack::type::object_type::NIL: + return nullptr; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack type is not supported"); + } +} + +DataTypes MsgPackSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + DataTypes data_types; + data_types.reserve(number_of_columns); + for (size_t i = 0; i != number_of_columns; ++i) + { + auto object_handle = readObject(); + data_types.push_back(getDataType(object_handle.get())); + } + + return data_types; +} + void registerInputFormatMsgPack(FormatFactory & factory) { factory.registerInputFormat("MsgPack", []( @@ -375,6 +488,14 @@ void registerInputFormatMsgPack(FormatFactory & factory) }); } +void registerMsgPackSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} + } #else @@ -385,6 +506,10 @@ class FormatFactory; void registerInputFormatMsgPack(FormatFactory &) { } + +void registerMsgPackSchemaReader(FormatFactory &) +{ +} } #endif diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h index d2d500a4480..dd5655c80fc 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h @@ -6,6 +6,7 @@ #if USE_MSGPACK #include +#include #include #include #include @@ -61,18 +62,35 @@ public: String getName() const override { return "MagPackRowInputFormat"; } void resetParser() override; + void setReadBuffer(ReadBuffer & in_) override; private: + MsgPackRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_); + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; bool readObject(); - PeekableReadBuffer buf; + std::unique_ptr buf; MsgPackVisitor visitor; msgpack::detail::parse_helper parser; const DataTypes data_types; }; +class MsgPackSchemaReader : public IRowSchemaReader +{ +public: + MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + msgpack::object_handle readObject(); + DataTypePtr getDataType(const msgpack::object & object); + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + UInt64 number_of_columns; +}; + } #endif diff --git a/src/Processors/Formats/Impl/MySQLOutputFormat.cpp b/src/Processors/Formats/Impl/MySQLOutputFormat.cpp index 5033176ca4b..74070252ebb 100644 --- a/src/Processors/Formats/Impl/MySQLOutputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLOutputFormat.cpp @@ -48,7 +48,7 @@ void MySQLOutputFormat::writePrefix() { 
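// Sketch (not part of the diff): the mapping MsgPackSchemaReader::getDataType() performs in
// the hunk above, rewritten to return ClickHouse type names as strings so it can be built
// standalone against msgpack-c (<msgpack.hpp>). The concrete DataType template arguments were
// elided in the hunk, so the Int64 / Float32 / Float64 / UInt8 / String choices below are
// assumptions; an empty string plays the role of the real code's nullptr ("type could not be
// inferred"), and only the first array/map element is inspected, as above.
#include <msgpack.hpp>
#include <stdexcept>
#include <string>

std::string stripNullable(std::string type)
{
    const std::string prefix = "Nullable(";
    if (type.rfind(prefix, 0) == 0)
        type = type.substr(prefix.size(), type.size() - prefix.size() - 1);
    return type;
}

std::string clickHouseTypeFor(const msgpack::object & object)
{
    switch (object.type)
    {
        case msgpack::type::object_type::POSITIVE_INTEGER:
        case msgpack::type::object_type::NEGATIVE_INTEGER:
            return "Nullable(Int64)";
        case msgpack::type::object_type::FLOAT32:
            return "Nullable(Float32)";
        case msgpack::type::object_type::FLOAT64:
            return "Nullable(Float64)";
        case msgpack::type::object_type::BOOLEAN:
            return "Nullable(UInt8)";
        case msgpack::type::object_type::BIN:
        case msgpack::type::object_type::STR:
            return "Nullable(String)";
        case msgpack::type::object_type::ARRAY:
        {
            if (object.via.array.size)
            {
                std::string nested = clickHouseTypeFor(object.via.array.ptr[0]);
                if (!nested.empty())
                    return "Array(" + nested + ")";
            }
            return {};
        }
        case msgpack::type::object_type::MAP:
        {
            if (object.via.map.size)
            {
                std::string key = stripNullable(clickHouseTypeFor(object.via.map.ptr[0].key));
                std::string value = clickHouseTypeFor(object.via.map.ptr[0].val);
                if (!key.empty() && !value.empty())
                    return "Map(" + key + ", " + value + ")";
            }
            return {};
        }
        case msgpack::type::object_type::NIL:
            return {};
        default:
            throw std::runtime_error("Msgpack type is not supported");
    }
}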
packet_endpoint->sendPacket(LengthEncodedNumber(header.columns())); - for (size_t i = 0; i < header.columns(); i++) + for (size_t i = 0; i < header.columns(); ++i) { const auto & column_name = header.getColumnsWithTypeAndName()[i].name; packet_endpoint->sendPacket(getColumnDefinition(column_name, data_types[i]->getTypeId())); @@ -63,7 +63,7 @@ void MySQLOutputFormat::writePrefix() void MySQLOutputFormat::consume(Chunk chunk) { - for (size_t i = 0; i < chunk.getNumRows(); i++) + for (size_t i = 0; i < chunk.getNumRows(); ++i) { ProtocolText::ResultSetRow row_packet(serializations, chunk.getColumns(), i); packet_endpoint->sendPacket(row_packet); diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index 07cf4670981..19e2ede6b65 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -1,8 +1,10 @@ #include #include + #include #include #include +#include #include @@ -82,6 +84,20 @@ private: NativeWriter writer; }; +class NativeSchemaReader : public ISchemaReader +{ +public: + explicit NativeSchemaReader(ReadBuffer & in_) : ISchemaReader(in_) {} + + NamesAndTypesList readSchema() override + { + auto reader = NativeReader(in, 0); + auto block = reader.read(); + return block.getNamesAndTypesList(); + } +}; + + void registerInputFormatNative(FormatFactory & factory) { factory.registerInputFormat("Native", []( @@ -106,4 +122,14 @@ void registerOutputFormatNative(FormatFactory & factory) }); } + +void registerNativeSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr) + { + return std::make_shared(buf); + }); +} + + } diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 8768e2f5f14..9a787e5a614 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include "ArrowBufferedStreams.h" #include "ArrowColumnToCHColumn.h" #include @@ -19,13 +18,6 @@ namespace ErrorCodes extern const int CANNOT_READ_ALL_DATA; } -#define THROW_ARROW_NOT_OK(status) \ - do \ - { \ - if (::arrow::Status _s = (status); !_s.ok()) \ - throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ - } while (false) - ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) : IInputFormat(std::move(header_), in_), format_settings(format_settings_) { @@ -38,37 +30,28 @@ Chunk ORCBlockInputFormat::generate() if (!file_reader) prepareReader(); + if (is_stopped) + return {}; + + std::shared_ptr batch_reader; + auto result = file_reader->NextStripeReader(format_settings.orc.row_batch_size, include_indices); + if (!result.ok()) + throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", result.status().ToString()); + batch_reader = std::move(result).ValueOrDie(); if (!batch_reader) { - arrow::Status reader_status = file_reader->NextStripeReader( - DBMS_DEFAULT_BUFFER_SIZE, include_indices, &batch_reader); - if (!reader_status.ok()) - throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, - "Failed to create batch reader: {}", - reader_status.ToString()); - if (!batch_reader) - return res; + return res; } - std::shared_ptr batch_result; - arrow::Status batch_status = batch_reader->ReadNext(&batch_result); - if (!batch_status.ok()) - throw 
ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, - "Error while reading batch of ORC data: {}", - batch_status.ToString()); + std::shared_ptr table; + arrow::Status table_status = batch_reader->ReadAll(&table); + if (!table_status.ok()) + throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_status.ToString()); - if (!batch_result || !batch_result->num_rows()) + if (!table || !table->num_rows()) return res; - ArrowColumnToCHColumn::NameToColumnPtr name_to_column_ptr; - for (const auto & column_name : column_names) - { - arrow::ArrayVector vec = {batch_result->GetColumnByName(column_name)}; - std::shared_ptr arrow_column = std::make_shared(vec); - name_to_column_ptr[column_name] = arrow_column; - } - arrow_column_to_ch_column->arrowColumnsToCHChunk(res, name_to_column_ptr); - batch_reader.reset(); + arrow_column_to_ch_column->arrowTableToCHChunk(res, table); return res; } @@ -79,7 +62,6 @@ void ORCBlockInputFormat::resetParser() file_reader.reset(); include_indices.clear(); - stripe_current = 0; } static size_t countIndicesForType(std::shared_ptr type) @@ -105,14 +87,34 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return; + + auto result = arrow::adapters::orc::ORCFileReader::Open(std::move(arrow_file), arrow::default_memory_pool()); + if (!result.ok()) + throw Exception(result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); + file_reader = std::move(result).ValueOrDie(); + + auto read_schema_result = file_reader->ReadSchema(); + if (!read_schema_result.ok()) + throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); + schema = std::move(read_schema_result).ValueOrDie(); +} + void ORCBlockInputFormat::prepareReader() { - THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(*in, format_settings), arrow::default_memory_pool(), &file_reader)); - stripe_total = file_reader->NumberOfStripes(); - stripe_current = 0; - std::shared_ptr schema; - THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema)); + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); + if (is_stopped) + return; arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "ORC", format_settings.orc.import_nested); @@ -139,7 +141,21 @@ void ORCBlockInputFormat::prepareReader() } } -void registerInputFormatORC(FormatFactory &factory) +ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ORCSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatORC(FormatFactory & factory) { factory.registerInputFormat( "ORC", @@ -153,6 +169,17 @@ void registerInputFormatORC(FormatFactory &factory) factory.markFormatAsColumnOriented("ORC"); } +void registerORCSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "ORC", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + 
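// Sketch (not part of the diff): the error-handling style the ORC reader switches to above.
// Newer Arrow APIs return arrow::Result<T> instead of filling an output parameter, so the old
// THROW_ARROW_NOT_OK macro is replaced by checking result.ok() and moving the value out with
// ValueOrDie(). The helper and message text below are illustrative, not ClickHouse code.
#include <arrow/result.h>
#include <arrow/status.h>
#include <stdexcept>
#include <string>
#include <utility>

template <typename T>
T unwrapOrThrow(arrow::Result<T> result, const std::string & what)
{
    if (!result.ok())
        throw std::runtime_error(what + ": " + result.status().ToString());
    return std::move(result).ValueOrDie();
}

int main()
{
    arrow::Result<int> good(42);
    int value = unwrapOrThrow(std::move(good), "Failed to read value");

    arrow::Result<int> bad(arrow::Status::Invalid("boom"));
    try
    {
        unwrapOrThrow(std::move(bad), "Failed to read value");
    }
    catch (const std::exception & e)
    {
        /* message ends with "Invalid: boom" */
    }
    return value == 42 ? 0 : 1;
}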
return std::make_shared(buf, settings); + } + ); +} + } #else @@ -162,6 +189,10 @@ namespace DB void registerInputFormatORC(FormatFactory &) { } + + void registerORCSchemaReader(FormatFactory &) + { + } } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index 857ec7937b7..9b55747f552 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_ORC #include +#include #include #include @@ -29,28 +30,40 @@ public: protected: Chunk generate() override; + void onCancel() override + { + is_stopped = 1; + } + private: // TODO: check that this class implements every part of its parent std::unique_ptr file_reader; - std::shared_ptr batch_reader; - std::unique_ptr arrow_column_to_ch_column; std::vector column_names; - int stripe_total = 0; - - int stripe_current = 0; - // indices of columns to read from ORC file std::vector include_indices; const FormatSettings format_settings; void prepareReader(); + + std::atomic is_stopped{0}; +}; + +class ORCSchemaReader : public ISchemaReader +{ +public: + ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; }; } diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 4c8f6ab2c54..651b9545c81 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -87,6 +87,7 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t { return orc::createPrimitiveType(orc::TypeKind::DOUBLE); } + case TypeIndex::Date32: [[fallthrough]]; case TypeIndex::Date: { return orc::createPrimitiveType(orc::TypeKind::DATE); @@ -292,6 +293,7 @@ void ORCBlockOutputFormat::writeColumn( writeNumbers(orc_column, column, null_bytemap, [](const UInt16 & value){ return value; }); break; } + case TypeIndex::Date32: [[fallthrough]]; case TypeIndex::Int32: { writeNumbers(orc_column, column, null_bytemap, [](const Int32 & value){ return value; }); diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 901531d81cf..1d303014d31 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -44,6 +44,9 @@ Chunk ParquetBlockInputFormat::generate() if (!file_reader) prepareReader(); + if (is_stopped) + return {}; + if (row_group_current >= row_group_total) return res; @@ -91,15 +94,30 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return; + THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader)); + THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); +} + void ParquetBlockInputFormat::prepareReader() { - THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(*in, format_settings), arrow::default_memory_pool(), &file_reader)); + std::shared_ptr schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); + if (is_stopped) + return; + row_group_total = 
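// Sketch (not part of the diff): the cancellation hook added to the ORC and Parquet input
// formats above. onCancel() flips an atomic flag, and the potentially slow reader-preparation
// path re-checks the flag after each blocking step and bails out with an empty result instead
// of continuing to parse. Reduced here to a standalone class with illustrative names.
#include <atomic>
#include <optional>
#include <string>

class CancellableReaderSketch
{
public:
    void onCancel() { is_stopped = 1; }

    std::optional<std::string> generate()
    {
        if (!prepared)
            prepare();
        if (is_stopped)
            return std::nullopt;        // query was cancelled: hand back nothing
        return std::string("next batch");
    }

private:
    void prepare()
    {
        // ... open the file, possibly waiting on I/O ...
        if (is_stopped)
            return;                     // give up early if the query was cancelled meanwhile
        // ... read the schema, build column converters ...
        prepared = true;
    }

    bool prepared = false;
    std::atomic<int> is_stopped{0};
};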
file_reader->num_row_groups(); row_group_current = 0; - std::shared_ptr schema; - THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); - arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested); std::unordered_set nested_table_names; @@ -123,7 +141,21 @@ void ParquetBlockInputFormat::prepareReader() } } -void registerInputFormatParquet(FormatFactory &factory) +ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ParquetSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatParquet(FormatFactory & factory) { factory.registerInputFormat( "Parquet", @@ -137,6 +169,17 @@ void registerInputFormatParquet(FormatFactory &factory) factory.markFormatAsColumnOriented("Parquet"); } +void registerParquetSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Parquet", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); +} + } #else @@ -147,6 +190,8 @@ class FormatFactory; void registerInputFormatParquet(FormatFactory &) { } + +void registerParquetSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 472aec66da3..dbc99c08a35 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_PARQUET #include +#include #include namespace parquet::arrow { class FileReader; } @@ -28,6 +29,11 @@ private: void prepareReader(); + void onCancel() override + { + is_stopped = 1; + } + std::unique_ptr file_reader; int row_group_total = 0; // indices of columns to read from Parquet file @@ -35,6 +41,19 @@ private: std::unique_ptr arrow_column_to_ch_column; int row_group_current = 0; const FormatSettings format_settings; + + std::atomic is_stopped{0}; +}; + +class ParquetSchemaReader : public ISchemaReader +{ +public: + ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; }; } diff --git a/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp b/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp index f46488fd0a8..0450051daf8 100644 --- a/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp @@ -21,7 +21,7 @@ void PostgreSQLOutputFormat::writePrefix() std::vector columns; columns.reserve(header.columns()); - for (size_t i = 0; i < header.columns(); i++) + for (size_t i = 0; i < header.columns(); ++i) { const auto & column_name = header.getColumnsWithTypeAndName()[i].name; columns.emplace_back(column_name, data_types[i]->getTypeId()); diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp index 961eb63f4e5..ad65a5f707d 100644 --- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp @@ -160,7 +160,7 @@ void 
PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) Serializations serializations(num_columns); for (size_t i = 0; i < num_columns; ++i) - serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); + serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo()); WidthsPerColumn widths; Widths max_widths; @@ -325,7 +325,7 @@ void PrettyBlockOutputFormat::writeValueWithPadding( { String serialized_value = " "; { - WriteBufferFromString out_serialize(serialized_value, WriteBufferFromString::AppendModeTag()); + WriteBufferFromString out_serialize(serialized_value, AppendModeTag()); serialization.serializeText(column, row_num, out_serialize, format_settings); } diff --git a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp index cf47a26efc7..85b27a6fb57 100644 --- a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp @@ -26,7 +26,7 @@ void PrettySpaceBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind Serializations serializations(num_columns); for (size_t i = 0; i < num_columns; ++i) - serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); + serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo()); WidthsPerColumn widths; Widths max_widths; diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index df7b7102739..66da27e8829 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -73,6 +73,34 @@ void registerInputFormatProtobuf(FormatFactory & factory) } } +ProtobufSchemaReader::ProtobufSchemaReader(const FormatSettings & format_settings) + : schema_info( + format_settings.schema.format_schema, + "Protobuf", + true, + format_settings.schema.is_server, + format_settings.schema.format_schema_path) +{ +} + +NamesAndTypesList ProtobufSchemaReader::readSchema() +{ + const auto * message_descriptor = ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info); + return protobufSchemaToCHSchema(message_descriptor); +} + +void registerProtobufSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("Protobuf", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); + factory.registerExternalSchemaReader("ProtobufSingle", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); +} + } #else @@ -81,6 +109,8 @@ namespace DB { class FormatFactory; void registerInputFormatProtobuf(FormatFactory &) {} + +void registerProtobufSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index 6f465e3f0b8..d7d16d36ddf 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -3,7 +3,9 @@ #include "config_formats.h" #if USE_PROTOBUF +# include # include +# include namespace DB { @@ -42,5 +44,16 @@ private: std::unique_ptr serializer; }; +class ProtobufSchemaReader : public IExternalSchemaReader +{ +public: + explicit ProtobufSchemaReader(const FormatSettings & format_settings); + + NamesAndTypesList readSchema() override; + +private: + FormatSchemaInfo schema_info; +}; + } #endif diff --git 
a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp index 34424fffd34..91b1cc60fae 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp @@ -51,5 +51,14 @@ void registerInputFormatRawBLOB(FormatFactory & factory) }); } +void registerRawBLOBSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("RawBLOB", []( + const FormatSettings &) + { + return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h index 343af9f4068..367ca04f9d8 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include namespace DB @@ -22,5 +24,14 @@ private: bool readRow(MutableColumns & columns, RowReadExtension &) override; }; +class RawBLOBSchemaReader: public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"raw_blob", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 7dd7e6df267..90db6f6f0ec 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -14,13 +14,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -RegexpRowInputFormat::RegexpRowInputFormat( - ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) - : IRowInputFormat(header_, buf, std::move(params_)) - , buf(in_) - , format_settings(format_settings_) - , escaping_rule(format_settings_.regexp.escaping_rule) - , regexp(format_settings_.regexp.regexp) +RegexpFieldExtractor::RegexpFieldExtractor(const FormatSettings & format_settings) : regexp(format_settings.regexp.regexp), skip_unmatched(format_settings.regexp.skip_unmatched) { size_t fields_count = regexp.NumberOfCapturingGroups(); matched_fields.resize(fields_count); @@ -35,45 +29,8 @@ RegexpRowInputFormat::RegexpRowInputFormat( } } - -void RegexpRowInputFormat::resetParser() +bool RegexpFieldExtractor::parseRow(PeekableReadBuffer & buf) { - IRowInputFormat::resetParser(); - buf.reset(); -} - -bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) -{ - const auto & type = getPort().getHeader().getByPosition(index).type; - ReadBuffer field_buf(const_cast(matched_fields[index].data()), matched_fields[index].size(), 0); - try - { - return deserializeFieldByEscapingRule(type, serializations[index], *columns[index], field_buf, escaping_rule, format_settings); - } - catch (Exception & e) - { - e.addMessage("(while reading the value of column " + getPort().getHeader().getByPosition(index).name + ")"); - throw; - } -} - -void RegexpRowInputFormat::readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext) -{ - if (matched_fields.size() != columns.size()) - throw Exception("The number of matched fields in line doesn't match the number of columns.", ErrorCodes::INCORRECT_DATA); - - ext.read_columns.assign(columns.size(), false); - for (size_t columns_index = 0; columns_index < columns.size(); ++columns_index) - { - ext.read_columns[columns_index] = readField(columns_index, columns); - } -} - -bool RegexpRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext) -{ - if (buf.eof()) - return false; - 
PeekableReadBufferCheckpoint checkpoint{buf}; size_t line_size = 0; @@ -89,27 +46,114 @@ bool RegexpRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & buf.rollbackToCheckpoint(); bool match = RE2::FullMatchN(re2::StringPiece(buf.position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); - bool read_line = true; - if (!match) - { - if (!format_settings.regexp.skip_unmatched) - throw Exception("Line \"" + std::string(buf.position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); - read_line = false; - } - - if (read_line) - readFieldsFromMatch(columns, ext); + if (!match && !skip_unmatched) + throw Exception("Line \"" + std::string(buf.position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); buf.position() += line_size; - checkChar('\r', buf); if (!buf.eof() && !checkChar('\n', buf)) throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA); + return match; +} + +RegexpRowInputFormat::RegexpRowInputFormat( + ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : RegexpRowInputFormat(std::make_unique(in_), header_, params_, format_settings_) +{ +} + +RegexpRowInputFormat::RegexpRowInputFormat( + std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : IRowInputFormat(header_, *buf_, std::move(params_)) + , buf(std::move(buf_)) + , format_settings(format_settings_) + , escaping_rule(format_settings_.regexp.escaping_rule) + , field_extractor(RegexpFieldExtractor(format_settings_)) +{ +} + +void RegexpRowInputFormat::resetParser() +{ + IRowInputFormat::resetParser(); + buf->reset(); +} + +bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) +{ + const auto & type = getPort().getHeader().getByPosition(index).type; + auto matched_field = field_extractor.getField(index); + ReadBuffer field_buf(const_cast(matched_field.data()), matched_field.size(), 0); + try + { + return deserializeFieldByEscapingRule(type, serializations[index], *columns[index], field_buf, escaping_rule, format_settings); + } + catch (Exception & e) + { + e.addMessage("(while reading the value of column " + getPort().getHeader().getByPosition(index).name + ")"); + throw; + } +} + +void RegexpRowInputFormat::readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext) +{ + if (field_extractor.getMatchedFieldsSize() != columns.size()) + throw Exception("The number of matched fields in line doesn't match the number of columns.", ErrorCodes::INCORRECT_DATA); + + ext.read_columns.assign(columns.size(), false); + for (size_t columns_index = 0; columns_index < columns.size(); ++columns_index) + { + ext.read_columns[columns_index] = readField(columns_index, columns); + } +} + +bool RegexpRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext) +{ + if (buf->eof()) + return false; + + if (field_extractor.parseRow(*buf)) + readFieldsFromMatch(columns, ext); return true; } +void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + IInputFormat::setReadBuffer(*buf); +} + +RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) + : IRowSchemaReader( + buf, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule)) + , format_settings(format_settings_) + , field_extractor(format_settings) + , buf(in_) + 
, context(context_) +{ +} + +DataTypes RegexpSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + field_extractor.parseRow(buf); + + DataTypes data_types; + data_types.reserve(field_extractor.getMatchedFieldsSize()); + for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i) + { + String field(field_extractor.getField(i)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context)); + } + + return data_types; +} + void registerInputFormatRegexp(FormatFactory & factory) { factory.registerInputFormat("Regexp", []( @@ -161,4 +205,12 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory) factory.registerFileSegmentationEngine("Regexp", &fileSegmentationEngineRegexpImpl); } +void registerRegexpSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, settings, context); + }); +} + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index c54549b8bac..dffd2f82e02 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,29 @@ namespace DB class ReadBuffer; +/// Class for extracting row fields from data by regexp. +class RegexpFieldExtractor +{ +public: + RegexpFieldExtractor(const FormatSettings & format_settings); + + /// Return true if row was successfully parsed and row fields were extracted. + bool parseRow(PeekableReadBuffer & buf); + + re2::StringPiece getField(size_t index) { return matched_fields[index]; } + size_t getMatchedFieldsSize() const { return matched_fields.size(); } + size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); } + +private: + const RE2 regexp; + // The vector of fields extracted from line using regexp. + std::vector matched_fields; + // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). + std::vector re2_arguments; + std::vector re2_arguments_ptrs; + bool skip_unmatched; +}; + /// Regexp input format. /// This format applies regular expression from format_regexp setting for every line of file /// (the lines must be separated by newline character ('\n') or DOS-style newline ("\r\n")). @@ -25,29 +49,42 @@ class ReadBuffer; class RegexpRowInputFormat : public IRowInputFormat { - using EscapingRule = FormatSettings::EscapingRule; public: RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_); String getName() const override { return "RegexpRowInputFormat"; } void resetParser() override; + void setReadBuffer(ReadBuffer & in_) override; private: + RegexpRowInputFormat(std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_); + + using EscapingRule = FormatSettings::EscapingRule; + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; bool readField(size_t index, MutableColumns & columns); void readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext); - PeekableReadBuffer buf; + std::unique_ptr buf; const FormatSettings format_settings; const EscapingRule escaping_rule; + RegexpFieldExtractor field_extractor; +}; - const RE2 regexp; - // The vector of fields extracted from line using regexp. 
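// Sketch (not part of the diff): the extraction scheme RegexpFieldExtractor wraps above: one
// RE2 pattern, one re2::StringPiece per capturing group, and RE2::Arg pointers so that
// RE2::FullMatchN() can fill every group of a line in a single call. The pattern and input
// line are illustrative; building this requires linking against re2.
#include <re2/re2.h>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    const RE2 regexp(R"((\d+)\s+(\w+))");           // illustrative pattern, not from the diff

    size_t fields_count = regexp.NumberOfCapturingGroups();
    std::vector<re2::StringPiece> matched_fields(fields_count);
    std::vector<RE2::Arg> re2_arguments(fields_count);
    std::vector<const RE2::Arg *> re2_arguments_ptrs(fields_count);
    for (size_t i = 0; i != fields_count; ++i)
    {
        re2_arguments[i] = &matched_fields[i];      // each Arg writes into its StringPiece
        re2_arguments_ptrs[i] = &re2_arguments[i];
    }

    std::string line = "42 hello";
    bool match = RE2::FullMatchN(re2::StringPiece(line), regexp,
                                 re2_arguments_ptrs.data(), re2_arguments_ptrs.size());

    if (match)
        for (const auto & field : matched_fields)
            std::cout << field << '\n';
}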
- std::vector matched_fields; - // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). - std::vector re2_arguments; - std::vector re2_arguments_ptrs; +class RegexpSchemaReader : public IRowSchemaReader +{ +public: + RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + using EscapingRule = FormatSettings::EscapingRule; + const FormatSettings format_settings; + RegexpFieldExtractor field_extractor; + PeekableReadBuffer buf; + ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index eef97e15dd5..8a56c2ed5c7 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -1,7 +1,10 @@ #include #include #include +#include #include +#include +#include namespace DB @@ -211,6 +214,59 @@ void TSKVRowInputFormat::resetParser() name_buf.clear(); } +TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowWithNamesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::Escaped)) + , format_settings(format_settings_) +{ +} + +std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + first_row = false; + } + + if (in.eof()) + return {}; + + if (*in.position() == '\n') + { + ++in.position(); + return {}; + } + + std::unordered_map names_and_types; + StringRef name_ref; + String name_tmp; + String value; + do + { + bool has_value = readName(in, name_ref, name_tmp); + if (has_value) + { + readEscapedString(value, in); + names_and_types[String(name_ref)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped); + } + else + { + /// The only thing that can go without value is `tskv` fragment that is ignored. + if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "tskv", 4))) + throw Exception("Found field without value while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA); + } + + } + while (checkChar('\t', in)); + + assertChar('\n', in); + + return names_and_types; +} + void registerInputFormatTSKV(FormatFactory & factory) { factory.registerInputFormat("TSKV", []( @@ -222,5 +278,12 @@ void registerInputFormatTSKV(FormatFactory & factory) return std::make_shared(buf, sample, std::move(params), settings); }); } +void registerTSKVSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.h b/src/Processors/Formats/Impl/TSKVRowInputFormat.h index 7d732bae691..6aef50a0f84 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.h +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -52,4 +53,16 @@ private: /// for row like ..., non-nullable column name=\N, ... 
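// Sketch (not part of the diff): the row shape TSKVSchemaReader::readRowAndGetNamesAndDataTypes()
// walks above: tab-separated name=value pairs terminated by '\n', where a bare "tskv" token
// carries no value and is skipped. Type inference by escaping rule is out of scope here, and
// escaping of '\t' and '=' inside names and values is ignored; the sketch only splits one line
// into (name, value) pairs.
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <string_view>

std::map<std::string, std::string> parseTSKVLine(std::string_view line)
{
    std::map<std::string, std::string> result;
    if (line.empty())
        return result;

    size_t pos = 0;
    while (true)
    {
        size_t tab = line.find('\t', pos);
        std::string_view field = (tab == std::string_view::npos)
            ? line.substr(pos)
            : line.substr(pos, tab - pos);

        size_t eq = field.find('=');
        if (eq == std::string_view::npos)
        {
            if (field != "tskv")        // the "tskv" marker is the only value-less field allowed
                throw std::runtime_error("Found field without value while parsing TSKV format");
        }
        else
        {
            result[std::string(field.substr(0, eq))] = std::string(field.substr(eq + 1));
        }

        if (tab == std::string_view::npos)
            break;
        pos = tab + 1;
    }
    return result;
}

int main()
{
    for (const auto & [name, value] : parseTSKVLine("tskv\tid=42\tmessage=hello"))
        std::cout << name << " -> " << value << '\n';
}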
}; +class TSKVSchemaReader : public IRowWithNamesSchemaReader +{ +public: + TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + std::unordered_map readRowAndGetNamesAndDataTypes() override; + + const FormatSettings format_settings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 1e6d238b202..bb844ec68ea 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -1,13 +1,15 @@ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include namespace DB { @@ -38,40 +40,50 @@ TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( bool with_types_, bool is_raw_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_) + : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_, std::make_unique(in_, format_settings_, is_raw_)) { } -void TabSeparatedRowInputFormat::skipFieldDelimiter() +TabSeparatedFormatReader::TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool is_raw_) + : FormatWithNamesAndTypesReader(in_, format_settings_), is_raw(is_raw_) +{ +} + +void TabSeparatedFormatReader::skipFieldDelimiter() { assertChar('\t', *in); } -void TabSeparatedRowInputFormat::skipRowEndDelimiter() +void TabSeparatedFormatReader::skipRowEndDelimiter() { if (in->eof()) return; - if (unlikely(row_num <= 1)) + if (unlikely(first_row)) + { checkForCarriageReturn(*in); + first_row = false; + } assertChar('\n', *in); } -String TabSeparatedRowInputFormat::readFieldIntoString() +String TabSeparatedFormatReader::readFieldIntoString() { String field; - readEscapedString(field, *in); + if (is_raw) + readString(field, *in); + else + readEscapedString(field, *in); return field; } -void TabSeparatedRowInputFormat::skipField() +void TabSeparatedFormatReader::skipField() { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); + readFieldIntoString(); } -void TabSeparatedRowInputFormat::skipHeaderRow() +void TabSeparatedFormatReader::skipHeaderRow() { do { @@ -82,7 +94,7 @@ void TabSeparatedRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector TabSeparatedRowInputFormat::readHeaderRow() +std::vector TabSeparatedFormatReader::readRow() { std::vector fields; do @@ -95,7 +107,7 @@ std::vector TabSeparatedRowInputFormat::readHeaderRow() return fields; } -bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, +bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t'; @@ -118,6 +130,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } + if (as_nullable) return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization); @@ -125,7 +138,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } -bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool 
TabSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -156,7 +169,7 @@ bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuff return true; } -bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { if (in->eof()) return true; @@ -190,7 +203,7 @@ bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out return true; } -void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type) +void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type) { bool can_be_parsed_as_null = type->isNullable() || type->isLowCardinalityNullable() || format_settings.null_as_default; @@ -218,6 +231,28 @@ void TabSeparatedRowInputFormat::syncAfterError() skipToUnescapedNextLineOrEOF(*in); } +TabSeparatedSchemaReader::TabSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(is_raw_ ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped)) + , reader(in_, format_settings_, is_raw_) +{ +} + +DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (in.eof()) + return {}; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); +} + void registerInputFormatTabSeparated(FormatFactory & factory) { for (bool is_raw : {false, true}) @@ -239,6 +274,23 @@ void registerInputFormatTabSeparated(FormatFactory & factory) } } +void registerTSVSchemaReader(FormatFactory & factory) +{ + for (bool is_raw : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, is_raw, settings); + }); + }; + + registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + registerWithNamesAndTypes(is_raw ? 
"TSVRaw" : "TSV", register_func); + } +} + static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, bool is_raw, size_t min_rows) { bool need_more_data = true; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 6e2e283e792..1f2bfc255b8 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -24,6 +25,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } +}; + +class TabSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings, bool is_raw_); bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -36,18 +44,34 @@ private: void skipFieldDelimiter() override; void skipRowEndDelimiter() override; - std::vector readHeaderRow(); - std::vector readNames() override { return readHeaderRow(); } - std::vector readTypes() override { return readHeaderRow(); } + std::vector readRow(); + std::vector readNames() override { return readRow(); } + std::vector readTypes() override { return readRow(); } String readFieldIntoString(); void checkNullValueForNonNullable(DataTypePtr type) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; - bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } + FormatSettings::EscapingRule getEscapingRule() + { + return is_raw ? 
FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped; + } +private: bool is_raw; + bool first_row = true; +}; + +class TabSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + TabSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings); + +private: + DataTypes readRowAndGetDataTypes() override; + + TabSeparatedFormatReader reader; }; } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp index 5d87f5a0b14..37bd8daa502 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp @@ -22,7 +22,10 @@ void TabSeparatedRowOutputFormat::writeLine(const std::vector & values) { for (size_t i = 0; i < values.size(); ++i) { - writeEscapedString(values[i], out); + if (is_raw) + writeString(values[i], out); + else + writeEscapedString(values[i], out); if (i + 1 == values.size()) writeRowEndDelimiter(); else diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 25162e927ac..06d6ba06bcc 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include namespace DB @@ -12,52 +11,48 @@ namespace DB namespace ErrorCodes { -extern const int ATTEMPT_TO_READ_AFTER_EOF; -extern const int CANNOT_READ_ALL_DATA; -extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; -extern const int CANNOT_PARSE_QUOTED_STRING; -extern const int SYNTAX_ERROR; + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int CANNOT_READ_ALL_DATA; + extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; + extern const int CANNOT_PARSE_QUOTED_STRING; + extern const int SYNTAX_ERROR; } +[[noreturn]] static void throwUnexpectedEof(size_t row_num) +{ + throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". 
" + "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", + ErrorCodes::CANNOT_READ_ALL_DATA); +} -TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, +TemplateRowInputFormat::TemplateRowInputFormat( + const Block & header_, + ReadBuffer & in_, + const Params & params_, + FormatSettings settings_, + bool ignore_spaces_, + ParsedTemplateFormatString format_, + ParsedTemplateFormatString row_format_, + std::string row_between_delimiter_) + : TemplateRowInputFormat( + header_, std::make_unique(in_), params_, settings_, ignore_spaces_, format_, row_format_, row_between_delimiter_) +{ +} + +TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::unique_ptr buf_, const Params & params_, FormatSettings settings_, bool ignore_spaces_, ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_, std::string row_between_delimiter_) - : RowInputFormatWithDiagnosticInfo(header_, buf, params_), buf(in_), data_types(header_.getDataTypes()), + : RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()), settings(std::move(settings_)), ignore_spaces(ignore_spaces_), format(std::move(format_)), row_format(std::move(row_format_)), - default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(std::move(row_between_delimiter_)) + default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_), + format_reader(std::make_unique(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings)) { - /// Validate format string for result set - bool has_data = false; - for (size_t i = 0; i < format.columnsCount(); ++i) - { - if (format.format_idx_to_column_idx[i]) - { - if (*format.format_idx_to_column_idx[i] != 0) - format.throwInvalidFormat("Invalid input part", i); - if (has_data) - format.throwInvalidFormat("${data} can occur only once", i); - if (format.escaping_rules[i] != EscapingRule::None) - format.throwInvalidFormat("${data} must have empty or None deserialization type", i); - has_data = true; - format_data_idx = i; - } - else - { - if (format.escaping_rules[i] == EscapingRule::XML) - format.throwInvalidFormat("XML deserialization is not supported", i); - } - } - /// Validate format string for rows std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (row_format.escaping_rules[i] == EscapingRule::XML) - row_format.throwInvalidFormat("XML deserialization is not supported", i); - if (row_format.format_idx_to_column_idx[i]) { if (header_.columns() <= *row_format.format_idx_to_column_idx[i]) @@ -80,69 +75,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer void TemplateRowInputFormat::readPrefix() { - size_t last_successfully_parsed_idx = 0; - try - { - tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); - } - catch (Exception & e) - { - format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); - } -} - -/// Asserts delimiters and skips fields in prefix or suffix. -/// tryReadPrefixOrSuffix(...) 
is used in checkForSuffix() to avoid throwing an exception after read of each row -/// (most likely false will be returned on first call of checkString(...)) -template -ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) -{ - static constexpr bool throw_exception = std::is_same_v; - - skipSpaces(); - if constexpr (throw_exception) - assertString(format.delimiters[input_part_beg], buf); - else - { - if (likely(!checkString(format.delimiters[input_part_beg], buf))) - return ReturnType(false); - } - - while (input_part_beg < input_part_end) - { - skipSpaces(); - if constexpr (throw_exception) - skipField(format.escaping_rules[input_part_beg]); - else - { - try - { - skipField(format.escaping_rules[input_part_beg]); - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && - e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && - e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) - throw; - /// If it's parsing error, then suffix is not found - return ReturnType(false); - } - } - ++input_part_beg; - - skipSpaces(); - if constexpr (throw_exception) - assertString(format.delimiters[input_part_beg], buf); - else - { - if (likely(!checkString(format.delimiters[input_part_beg], buf))) - return ReturnType(false); - } - } - - if constexpr (!throw_exception) - return ReturnType(true); + format_reader->readPrefix(); } bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra) @@ -151,9 +84,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension if (unlikely(end_of_stream)) return false; - skipSpaces(); - - if (unlikely(checkForSuffix())) + if (unlikely(format_reader->checkForSuffix())) { end_of_stream = true; return false; @@ -162,27 +93,24 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension updateDiagnosticInfo(); if (likely(row_num != 1)) - assertString(row_between_delimiter, buf); + format_reader->skipRowBetweenDelimiter(); extra.read_columns.assign(columns.size(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - skipSpaces(); - assertString(row_format.delimiters[i], buf); - skipSpaces(); + format_reader->skipDelimiter(i); + if (row_format.format_idx_to_column_idx[i]) { size_t col_idx = *row_format.format_idx_to_column_idx[i]; extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i); } else - skipField(row_format.escaping_rules[i]); - + format_reader->skipField(row_format.escaping_rules[i]); } - skipSpaces(); - assertString(row_format.delimiters.back(), buf); + format_reader->skipRowEndDelimiter(); for (const auto & idx : always_default_columns) data_types[idx]->insertDefaultInto(*columns[idx]); @@ -200,70 +128,26 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, row_format.delimiters[file_column + 1].front(); try { - return deserializeFieldByEscapingRule(type, serialization, column, buf, escaping_rule, settings); + return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, settings); } catch (Exception & e) { if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throwUnexpectedEof(); + throwUnexpectedEof(row_num); throw; } } -void TemplateRowInputFormat::skipField(TemplateRowInputFormat::EscapingRule escaping_rule) -{ - try - { - skipFieldByEscapingRule(buf, escaping_rule, settings); - } - catch (Exception & e) - { - if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - 
throwUnexpectedEof(); - throw; - } -} - -/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. -/// Otherwise returns false -bool TemplateRowInputFormat::checkForSuffix() -{ - PeekableReadBufferCheckpoint checkpoint{buf}; - bool suffix_found = false; - size_t last_successfully_parsed_idx = format_data_idx + 1; - try - { - suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && - e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && - e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) - throw; - } - - if (unlikely(suffix_found)) - { - skipSpaces(); - if (buf.eof()) - return true; - } - - buf.rollbackToCheckpoint(); - return false; -} - bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { out << "Suffix does not match: "; - size_t last_successfully_parsed_idx = format_data_idx + 1; - const ReadBuffer::Position row_begin_pos = buf.position(); + size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1; + const ReadBuffer::Position row_begin_pos = buf->position(); bool caught = false; try { - PeekableReadBufferCheckpoint checkpoint{buf, true}; - tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); + PeekableReadBufferCheckpoint checkpoint{*buf, true}; + format_reader->tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); } catch (Exception & e) { @@ -273,12 +157,12 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col if (!caught) { out << " There is some data after suffix (EOF expected, got "; - verbosePrintString(buf.position(), std::min(buf.buffer().end(), buf.position() + 16), out); + verbosePrintString(buf->position(), std::min(buf->buffer().end(), buf->position() + 16), out); out << "). 
"; } out << " Format string (from format_schema): \n" << format.dump() << "\n"; - if (row_begin_pos != buf.position()) + if (row_begin_pos != buf->position()) { /// Pointers to buffer memory were invalidated during checking for suffix out << "\nCannot print more diagnostic info."; @@ -287,15 +171,15 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n"; out << "\nTrying to parse next row, because suffix does not match:\n"; - if (likely(row_num != 1) && !parseDelimiterWithDiagnosticInfo(out, buf, row_between_delimiter, "delimiter between rows", ignore_spaces)) + if (likely(row_num != 1) && !parseDelimiterWithDiagnosticInfo(out, *buf, row_between_delimiter, "delimiter between rows", ignore_spaces)) return false; for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (!parseDelimiterWithDiagnosticInfo(out, buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces)) + if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces)) return false; - skipSpaces(); + format_reader->skipSpaces(); if (row_format.format_idx_to_column_idx[i]) { const auto & header = getPort().getHeader(); @@ -318,7 +202,7 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col } } - return parseDelimiterWithDiagnosticInfo(out, buf, row_format.delimiters.back(), "delimiter after last field", ignore_spaces); + return parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters.back(), "delimiter after last field", ignore_spaces); } bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces) @@ -350,7 +234,7 @@ void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColu if (index) deserializeField(type, serializations[*index], column, file_column); else - skipField(row_format.escaping_rules[file_column]); + format_reader->skipField(row_format.escaping_rules[file_column]); } bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position) @@ -366,25 +250,286 @@ bool TemplateRowInputFormat::allowSyncAfterError() const void TemplateRowInputFormat::syncAfterError() { - skipToNextRowOrEof(buf, row_format.delimiters.back(), row_between_delimiter, ignore_spaces); - end_of_stream = buf.eof(); - /// It can happen that buf.position() is not at the beginning of row + skipToNextRowOrEof(*buf, row_format.delimiters.back(), row_between_delimiter, ignore_spaces); + end_of_stream = buf->eof(); + /// It can happen that buf->position() is not at the beginning of row /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. /// It will cause another parsing error. } -void TemplateRowInputFormat::throwUnexpectedEof() -{ - throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". 
" - "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", - ErrorCodes::CANNOT_READ_ALL_DATA); -} - void TemplateRowInputFormat::resetParser() { RowInputFormatWithDiagnosticInfo::resetParser(); end_of_stream = false; - buf.reset(); + buf->reset(); +} + +void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + IInputFormat::setReadBuffer(*buf); +} + +TemplateFormatReader::TemplateFormatReader( + PeekableReadBuffer & buf_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter_, + const FormatSettings & format_settings_) + : buf(&buf_) + , ignore_spaces(ignore_spaces_) + , format(format_) + , row_format(row_format_) + , row_between_delimiter(row_between_delimiter_) + , format_settings(format_settings_) +{ + /// Validate format string for result set + bool has_data = false; + for (size_t i = 0; i < format.columnsCount(); ++i) + { + if (format.format_idx_to_column_idx[i]) + { + if (*format.format_idx_to_column_idx[i] != 0) + format.throwInvalidFormat("Invalid input part", i); + if (has_data) + format.throwInvalidFormat("${data} can occur only once", i); + if (format.escaping_rules[i] != EscapingRule::None) + format.throwInvalidFormat("${data} must have empty or None deserialization type", i); + has_data = true; + format_data_idx = i; + } + else + { + if (format.escaping_rules[i] == EscapingRule::XML) + format.throwInvalidFormat("XML deserialization is not supported", i); + } + } + + /// Validate format string for rows + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + if (row_format.escaping_rules[i] == EscapingRule::XML) + row_format.throwInvalidFormat("XML deserialization is not supported", i); + } +} + +void TemplateFormatReader::readPrefix() +{ + size_t last_successfully_parsed_idx = 0; + try + { + tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); + } + catch (Exception & e) + { + format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); + } +} + +void TemplateFormatReader::skipField(EscapingRule escaping_rule) +{ + try + { + skipFieldByEscapingRule(*buf, escaping_rule, format_settings); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throwUnexpectedEof(row_num); + throw; + } +} + +/// Asserts delimiters and skips fields in prefix or suffix. +/// tryReadPrefixOrSuffix(...) 
is used in checkForSuffix() to avoid throwing an exception after read of each row +/// (most likely false will be returned on first call of checkString(...)) +template +ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) +{ + static constexpr bool throw_exception = std::is_same_v; + + skipSpaces(); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_beg], *buf); + else + { + if (likely(!checkString(format.delimiters[input_part_beg], *buf))) + return ReturnType(false); + } + + while (input_part_beg < input_part_end) + { + skipSpaces(); + if constexpr (throw_exception) + skipField(format.escaping_rules[input_part_beg]); + else + { + try + { + skipField(format.escaping_rules[input_part_beg]); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; + /// If it's parsing error, then suffix is not found + return ReturnType(false); + } + } + ++input_part_beg; + + skipSpaces(); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_beg], *buf); + else + { + if (likely(!checkString(format.delimiters[input_part_beg], *buf))) + return ReturnType(false); + } + } + + if constexpr (!throw_exception) + return ReturnType(true); +} + +/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. +/// Otherwise returns false +bool TemplateFormatReader::checkForSuffix() +{ + PeekableReadBufferCheckpoint checkpoint{*buf}; + bool suffix_found = false; + size_t last_successfully_parsed_idx = format_data_idx + 1; + try + { + suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; + } + + if (unlikely(suffix_found)) + { + skipSpaces(); + if (buf->eof()) + return true; + } + + buf->rollbackToCheckpoint(); + return false; +} + +void TemplateFormatReader::skipDelimiter(size_t index) +{ + skipSpaces(); + assertString(row_format.delimiters[index], *buf); + skipSpaces(); +} + +void TemplateFormatReader::skipRowEndDelimiter() +{ + ++row_num; + skipSpaces(); + assertString(row_format.delimiters.back(), *buf); + skipSpaces(); +} + +void TemplateFormatReader::skipRowBetweenDelimiter() +{ + skipSpaces(); + assertString(row_between_delimiter, *buf); + skipSpaces(); +} + +TemplateSchemaReader::TemplateSchemaReader( + ReadBuffer & in_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_, + ContextPtr context_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference) + , buf(in_) + , format(format_) + , row_format(row_format_) + , format_settings(format_settings_) + , context(context_) + , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings) +{ + setColumnNames(row_format.column_names); +} + +DataTypes TemplateSchemaReader::readRowAndGetDataTypes() +{ + if (first_row) + format_reader.readPrefix(); + + if (format_reader.checkForSuffix()) + return {}; + + if (first_row) + first_row = false; + else + format_reader.skipRowBetweenDelimiter(); + + DataTypes 
data_types; + data_types.reserve(row_format.columnsCount()); + String field; + for (size_t i = 0; i != row_format.columnsCount(); ++i) + { + format_reader.skipDelimiter(i); + if (row_format.escaping_rules[i] == FormatSettings::EscapingRule::CSV) + format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front(); + + field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context)); + } + + format_reader.skipRowEndDelimiter(); + return data_types; +} + +static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings) +{ + ParsedTemplateFormatString resultset_format; + if (settings.template_settings.resultset_format.empty()) + { + /// Default format string: "${data}" + resultset_format.delimiters.resize(2); + resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + } + else + { + /// Read format string from file + resultset_format = ParsedTemplateFormatString( + FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, + settings.schema.is_server, settings.schema.format_schema_path), + [&](const String & partName) -> std::optional + { + if (partName == "data") + return 0; + throw Exception("Unknown input part " + partName, + ErrorCodes::SYNTAX_ERROR); + }); + } + return resultset_format; +} + +static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes) +{ + return ParsedTemplateFormatString( + FormatSchemaInfo( + settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), + idx_getter, allow_indexes); } void registerInputFormatTemplate(FormatFactory & factory) @@ -397,39 +542,34 @@ void registerInputFormatTemplate(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - ParsedTemplateFormatString resultset_format; - if (settings.template_settings.resultset_format.empty()) + auto idx_getter = [&](const String & colName) -> std::optional { - /// Default format string: "${data}" - resultset_format.delimiters.resize(2); - resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); - resultset_format.format_idx_to_column_idx.emplace_back(0); - resultset_format.column_names.emplace_back("data"); - } - else - { - /// Read format string from file - resultset_format = ParsedTemplateFormatString( - FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, - settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & partName) -> std::optional - { - if (partName == "data") - return 0; - throw Exception("Unknown input part " + partName, - ErrorCodes::SYNTAX_ERROR); - }); - } + return sample.getPositionByName(colName); + }; - ParsedTemplateFormatString row_format = ParsedTemplateFormatString( - FormatSchemaInfo(settings.template_settings.row_format, "Template", false, - settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & colName) -> std::optional - { - return sample.getPositionByName(colName); - }); + return std::make_shared( + sample, + buf, + params, + settings, + ignore_spaces, + 
fillResultSetFormat(settings), + fillRowFormat(settings, idx_getter, true), + settings.template_settings.row_between_delimiter); + }); + } +} - return std::make_shared(sample, buf, params, settings, ignore_spaces, resultset_format, row_format, settings.template_settings.row_between_delimiter); +void registerTemplateSchemaReader(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + size_t index = 0; + auto idx_getter = [&](const String &) -> std::optional { return index++; }; + auto row_format = fillRowFormat(settings, idx_getter, false); + return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context); }); } } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 1663bf3ba02..755ad6cb39b 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -2,15 +2,19 @@ #include #include +#include #include #include #include #include +#include namespace DB { +class TemplateFormatReader; + class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo { using EscapingRule = FormatSettings::EscapingRule; @@ -25,6 +29,11 @@ public: void resetParser() override; private: + TemplateRowInputFormat(const Block & header_, std::unique_ptr buf_, const Params & params_, + FormatSettings settings_, bool ignore_spaces_, + ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_, + std::string row_between_delimiter); + bool readRow(MutableColumns & columns, RowReadExtension & extra) override; void readPrefix() override; @@ -35,20 +44,14 @@ private: bool deserializeField(const DataTypePtr & type, const SerializationPtr & serialization, IColumn & column, size_t file_column); - void skipField(EscapingRule escaping_rule); - inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } - - template - ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); - bool checkForSuffix(); - [[noreturn]] void throwUnexpectedEof(); - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; - PeekableReadBuffer buf; + void setReadBuffer(ReadBuffer & in_) override; + + std::unique_ptr buf; const DataTypes data_types; FormatSettings settings; @@ -56,12 +59,76 @@ private: const ParsedTemplateFormatString format; const ParsedTemplateFormatString row_format; - size_t format_data_idx; bool end_of_stream = false; std::vector always_default_columns; const char default_csv_delimiter; const std::string row_between_delimiter; + + std::unique_ptr format_reader; +}; + +class TemplateFormatReader +{ + using EscapingRule = FormatSettings::EscapingRule; + +public: + TemplateFormatReader( + PeekableReadBuffer & buf_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_); + + void readPrefix(); + void skipField(EscapingRule escaping_rule); + inline void skipSpaces() { if (ignore_spaces) 
skipWhitespaceIfAny(*buf); } + + template + ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); + bool checkForSuffix(); + + void setReadBuffer(PeekableReadBuffer & buf_) { buf = &buf_; } + + void skipDelimiter(size_t index); + void skipRowEndDelimiter(); + void skipRowBetweenDelimiter(); + + size_t getFormatDataIdx() const { return format_data_idx; } + +private: + PeekableReadBuffer * buf; + bool ignore_spaces; + const ParsedTemplateFormatString & format; + const ParsedTemplateFormatString & row_format; + const std::string row_between_delimiter; + const FormatSettings & format_settings; + size_t format_data_idx; + size_t row_num; +}; + +class TemplateSchemaReader : public IRowSchemaReader +{ +public: + TemplateSchemaReader(ReadBuffer & in_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_, + ContextPtr context_); + + DataTypes readRowAndGetDataTypes() override; + +private: + PeekableReadBuffer buf; + const ParsedTemplateFormatString format; + const ParsedTemplateFormatString row_format; + FormatSettings format_settings; + ContextPtr context; + TemplateFormatReader format_reader; + bool first_row = true; }; bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces); diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index adf6d2e8a25..b58be3f5526 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include #include +#include namespace DB { @@ -286,6 +288,50 @@ namespace } } +/// Can be used in fileSegmentationEngine for parallel parsing of Values +static bool skipToNextRow(PeekableReadBuffer * buf, size_t min_chunk_bytes, int balance) +{ + skipWhitespaceIfAny(*buf); + if (buf->eof() || *buf->position() == ';') + return false; + bool quoted = false; + + size_t chunk_begin_buf_count = buf->count(); + while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes)) + { + buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end()); + if (buf->position() == buf->buffer().end()) + continue; + if (*buf->position() == '\\') + { + ++buf->position(); + if (!buf->eof()) + ++buf->position(); + } + else if (*buf->position() == '\'') + { + quoted ^= true; + ++buf->position(); + } + else if (*buf->position() == ')') + { + ++buf->position(); + if (!quoted) + --balance; + } + else if (*buf->position() == '(') + { + ++buf->position(); + if (!quoted) + ++balance; + } + } + + if (!buf->eof() && *buf->position() == ',') + ++buf->position(); + return true; +} + bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx) { const Block & header = getPort().getHeader(); @@ -293,7 +339,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx auto settings = context->getSettingsRef(); /// We need continuous memory containing the expression to use Lexer - skipToNextRow(0, 1); + skipToNextRow(buf.get(), 0, 1); buf->makeContinuousMemoryFromCheckpointToPos(); buf->rollbackToCheckpoint(); @@ -437,50 +483,6 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx return true; } 
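// Editorial note (not part of the patch): the hunk above turns skipToNextRow() from a
// private member of ValuesBlockInputFormat into a static free function taking a
// PeekableReadBuffer *, so that the new ValuesSchemaReader added further down in this
// file can reuse it. The sketch below is a minimal standalone illustration of the
// quote/parenthesis tracking it performs, written against a plain std::string_view;
// the helper name and simplifications are mine, not part of ClickHouse.
#include <cstddef>
#include <string_view>

static size_t findRowEnd(std::string_view s)
{
    bool quoted = false;
    int balance = 1;              // the caller has already consumed the row's opening '('
    for (size_t i = 0; i < s.size(); ++i)
    {
        char c = s[i];
        if (c == '\\')            // an escaped character never affects quoting or balance
            ++i;
        else if (c == '\'')
            quoted = !quoted;     // quotes toggle; brackets inside quoted literals are ignored
        else if (!quoted && c == '(')
            ++balance;
        else if (!quoted && c == ')' && --balance == 0)
            return i + 1;         // position just past the ')' that closes this row
    }
    return s.size();
}
// e.g. findRowEnd("1, 'a)b', (2)), (3)") stops right after "(2))": the ')' inside the
// quoted literal is ignored and the nested "(2)" keeps the balance counter consistent.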
-/// Can be used in fileSegmentationEngine for parallel parsing of Values -bool ValuesBlockInputFormat::skipToNextRow(size_t min_chunk_bytes, int balance) -{ - skipWhitespaceIfAny(*buf); - if (buf->eof() || *buf->position() == ';') - return false; - bool quoted = false; - - size_t chunk_begin_buf_count = buf->count(); - while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes)) - { - buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end()); - if (buf->position() == buf->buffer().end()) - continue; - if (*buf->position() == '\\') - { - ++buf->position(); - if (!buf->eof()) - ++buf->position(); - } - else if (*buf->position() == '\'') - { - quoted ^= true; - ++buf->position(); - } - else if (*buf->position() == ')') - { - ++buf->position(); - if (!quoted) - --balance; - } - else if (*buf->position() == '(') - { - ++buf->position(); - if (!quoted) - ++balance; - } - } - - if (!buf->eof() && *buf->position() == ',') - ++buf->position(); - return true; -} - void ValuesBlockInputFormat::assertDelimiterAfterValue(size_t column_idx) { if (unlikely(!checkDelimiterAfterValue(column_idx))) @@ -559,6 +561,63 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_) +{ +} + +DataTypes ValuesSchemaReader::readRowAndGetDataTypes() +{ + if (first_row) + { + skipBOMIfExists(buf); + first_row = false; + } + + skipWhitespaceIfAny(buf); + if (buf.eof()) + return {}; + + assertChar('(', buf); + PeekableReadBufferCheckpoint checkpoint(buf); + skipToNextRow(&buf, 0, 1); + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + + Tokens tokens(buf.position(), buf.buffer().end()); + IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); + + DataTypes data_types; + bool finish = false; + while (!finish) + { + Expected expected; + ASTPtr ast; + + bool parsed = parser.parse(token_iterator, ast, expected); + /// Consider delimiter after value (',' or ')') as part of expression + parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket; + + if (!parsed) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}", + String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end)); + + std::pair result = evaluateConstantExpression(ast, context); + data_types.push_back(generalizeDataType(result.second)); + + if (token_iterator->type == TokenType::ClosingRoundBracket) + finish = true; + ++token_iterator; + buf.position() = const_cast(token_iterator->begin); + } + + skipWhitespaceIfAny(buf); + if (!buf.eof() && *buf.position() == ',') + ++buf.position(); + + return data_types; +} + void registerInputFormatValues(FormatFactory & factory) { factory.registerInputFormat("Values", []( @@ -571,4 +630,12 @@ void registerInputFormatValues(FormatFactory & factory) }); } +void registerValuesSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, settings, context); + }); +} + } diff --git 
a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index 5bbd4bea5ba..e1521955472 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace DB @@ -68,8 +69,6 @@ private: void readPrefix(); void readSuffix(); - bool skipToNextRow(size_t min_chunk_bytes = 0, int balance = 0); - std::unique_ptr buf; const RowInputFormatParams params; @@ -95,4 +94,18 @@ private: BlockMissingValues block_missing_values; }; +class ValuesSchemaReader : public IRowSchemaReader +{ +public: + ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + ContextPtr context; + ParserExpression parser; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 87fa5ec1c4a..7720b01dc74 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -1,5 +1,7 @@ #include +#include #include +#include #include #include @@ -9,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( @@ -17,8 +20,13 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( const Params & params_, bool with_names_, bool with_types_, - const FormatSettings & format_settings_) - : RowInputFormatWithDiagnosticInfo(header_, in_, params_), format_settings(format_settings_), with_names(with_names_), with_types(with_types_) + const FormatSettings & format_settings_, + std::unique_ptr format_reader_) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_) + , format_settings(format_settings_) + , with_names(with_names_) + , with_types(with_types_) + , format_reader(std::move(format_reader_)) { const auto & sample = getPort().getHeader(); size_t num_columns = sample.columns(); @@ -88,7 +96,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() } /// Skip prefix before names and types. - skipPrefixBeforeHeader(); + format_reader->skipPrefixBeforeHeader(); /// This is a bit of abstraction leakage, but we need it in parallel parsing: /// we check if this InputFormat is working with the "real" beginning of the data. @@ -97,7 +105,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() if (format_settings.with_names_use_header) { std::vector read_columns(data_types.size(), false); - auto column_names = readNames(); + auto column_names = format_reader->readNames(); for (const auto & name : column_names) addInputColumn(name, read_columns); @@ -110,7 +118,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() else { setupAllColumnsByTableSchema(); - skipNames(); + format_reader->skipNames(); } } else if (!column_mapping->is_set) @@ -119,10 +127,10 @@ void RowInputFormatWithNamesAndTypes::readPrefix() if (with_types) { /// Skip delimiter between names and types. 
- skipRowBetweenDelimiter(); + format_reader->skipRowBetweenDelimiter(); if (format_settings.with_types_use_header) { - auto types = readTypes(); + auto types = format_reader->readTypes(); if (types.size() != column_mapping->column_indexes_for_input_fields.size()) throw Exception( ErrorCodes::INCORRECT_DATA, @@ -143,7 +151,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() } } else - skipTypes(); + format_reader->skipTypes(); } } @@ -161,7 +169,7 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE if (unlikely(end_of_stream)) return false; - if (unlikely(checkForSuffix())) + if (unlikely(format_reader->checkForSuffix())) { end_of_stream = true; return false; @@ -170,9 +178,9 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE updateDiagnosticInfo(); if (likely(row_num != 1 || (getCurrentUnitNumber() == 0 && (with_names || with_types)))) - skipRowBetweenDelimiter(); + format_reader->skipRowBetweenDelimiter(); - skipRowStartDelimiter(); + format_reader->skipRowStartDelimiter(); ext.read_columns.resize(data_types.size()); for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) @@ -180,20 +188,20 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); if (column_index) - ext.read_columns[*column_index] = readField( + ext.read_columns[*column_index] = format_reader->readField( *columns[*column_index], data_types[*column_index], serializations[*column_index], is_last_file_column, column_mapping->names_of_columns[file_column]); else - skipField(file_column); + format_reader->skipField(file_column); if (!is_last_file_column) - skipFieldDelimiter(); + format_reader->skipFieldDelimiter(); } - skipRowEndDelimiter(); + format_reader->skipRowEndDelimiter(); insertDefaultsForNotSeenColumns(columns, ext); @@ -218,13 +226,13 @@ void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & ty const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; if (index) { - checkNullValueForNonNullable(type); + format_reader->checkNullValueForNonNullable(type); const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); + format_reader->readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); } else { - skipField(file_column); + format_reader->skipField(file_column); } } @@ -236,13 +244,13 @@ bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu return false; } - if (!tryParseSuffixWithDiagnosticInfo(out)) + if (!format_reader->tryParseSuffixWithDiagnosticInfo(out)) return false; - if (likely(row_num != 1) && !parseRowBetweenDelimiterWithDiagnosticInfo(out)) + if (likely(row_num != 1) && !format_reader->parseRowBetweenDelimiterWithDiagnosticInfo(out)) return false; - if (!parseRowStartWithDiagnosticInfo(out)) + if (!format_reader->parseRowStartWithDiagnosticInfo(out)) return false; for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) @@ -266,22 +274,68 @@ bool 
RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu /// Delimiters if (file_column + 1 != column_mapping->column_indexes_for_input_fields.size()) { - if (!parseFieldDelimiterWithDiagnosticInfo(out)) + if (!format_reader->parseFieldDelimiterWithDiagnosticInfo(out)) return false; } } - return parseRowEndWithDiagnosticInfo(out); + return format_reader->parseRowEndWithDiagnosticInfo(out); } - -void registerFileSegmentationEngineForFormatWithNamesAndTypes( - FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine) +bool RowInputFormatWithNamesAndTypes::isGarbageAfterField(size_t index, ReadBuffer::Position pos) { - factory.registerFileSegmentationEngine(base_format_name, segmentation_engine); - factory.registerFileSegmentationEngine(base_format_name + "WithNames", segmentation_engine); - factory.registerFileSegmentationEngine(base_format_name + "WithNamesAndTypes", segmentation_engine); + return format_reader->isGarbageAfterField(index, pos); } +void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_) +{ + format_reader->setReadBuffer(in_); + IInputFormat::setReadBuffer(in_); +} + +FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader( + ReadBuffer & in_, + size_t max_rows_to_read_, + bool with_names_, + bool with_types_, + FormatWithNamesAndTypesReader * format_reader_, + DataTypePtr default_type_) + : IRowSchemaReader(in_, max_rows_to_read_, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) +{ +} + +NamesAndTypesList FormatWithNamesAndTypesSchemaReader::readSchema() +{ + if (with_names || with_types) + skipBOMIfExists(in); + + format_reader->skipPrefixBeforeHeader(); + + Names names; + if (with_names) + names = format_reader->readNames(); + + if (with_types) + { + format_reader->skipRowBetweenDelimiter(); + std::vector data_type_names = format_reader->readTypes(); + if (data_type_names.size() != names.size()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The number of column names {} differs with the number of types {}", names.size(), data_type_names.size()); + + NamesAndTypesList result; + for (size_t i = 0; i != data_type_names.size(); ++i) + result.emplace_back(names[i], DataTypeFactory::instance().get(data_type_names[i])); + return result; + } + + if (!names.empty()) + setColumnNames(names); + + /// We should determine types by reading rows with data. Use the implementation from IRowSchemaReader. + return IRowSchemaReader::readSchema(); +} } + diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index cd7cd34d7e6..25ffc8d6de2 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -1,12 +1,15 @@ #pragma once #include +#include #include #include namespace DB { +class FormatWithNamesAndTypesReader; + /// Base class for input formats with -WithNames and -WithNamesAndTypes suffixes. /// It accepts 2 parameters in constructor - with_names and with_types and implements /// input format depending on them: @@ -20,7 +23,7 @@ namespace DB /// then reads/skips types. So you can this invariant. 
class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo { -public: +protected: /** with_names - in the first line the header with column names * with_types - in the second line the header with column names */ @@ -28,44 +31,14 @@ public: const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, bool with_types_, const FormatSettings & format_settings_); + bool with_names_, + bool with_types_, + const FormatSettings & format_settings_, + std::unique_ptr format_reader_); void resetParser() override; - -protected: - /// Read single field from input. Return false if there was no real value and we inserted default value. - virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0; - - /// Skip single field, it's used to skip unknown columns. - virtual void skipField(size_t file_column) = 0; - /// Skip the whole row with names. - virtual void skipNames() = 0; - /// Skip the whole row with types. - virtual void skipTypes() = 0; - - /// Skip delimiters, if any. - virtual void skipPrefixBeforeHeader() {} - virtual void skipRowStartDelimiter() {} - virtual void skipFieldDelimiter() {} - virtual void skipRowEndDelimiter() {} - virtual void skipRowBetweenDelimiter() {} - - /// Check suffix. - virtual bool checkForSuffix() { return in->eof(); } - - /// Methods for parsing with diagnostic info. - virtual void checkNullValueForNonNullable(DataTypePtr) {} - virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; } - virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; } - virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;} - virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;} - virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; } - bool isGarbageAfterField(size_t, ReadBuffer::Position) override {return false; } - - /// Read row with names and return the list of them. - virtual std::vector readNames() = 0; - /// Read row with types and return the list of them. - virtual std::vector readTypes() = 0; + bool isGarbageAfterField(size_t index, ReadBuffer::Position pos) override; + void setReadBuffer(ReadBuffer & in_) override; const FormatSettings format_settings; DataTypes data_types; @@ -84,10 +57,90 @@ private: bool with_names; bool with_types; + std::unique_ptr format_reader; std::unordered_map column_indexes_by_names; }; -void registerFileSegmentationEngineForFormatWithNamesAndTypes( - FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine); +/// Base class for parsing data in input formats with -WithNames and -WithNamesAndTypes suffixes. +/// Used for reading/skipping names/types/delimiters in specific format. +class FormatWithNamesAndTypesReader +{ +public: + explicit FormatWithNamesAndTypesReader(ReadBuffer & in_, const FormatSettings & format_settings_) : in(&in_), format_settings(format_settings_) {} + + /// Read single field from input. Return false if there was no real value and we inserted default value. + virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0; + + /// Methods for parsing with diagnostic info. 
+ virtual void checkNullValueForNonNullable(DataTypePtr) {} + virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;} + virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;} + virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool isGarbageAfterField(size_t, ReadBuffer::Position) { return false; } + + /// Read row with names and return the list of them. + virtual std::vector readNames() = 0; + /// Read row with types and return the list of them. + virtual std::vector readTypes() = 0; + + /// Skip single field, it's used to skip unknown columns. + virtual void skipField(size_t file_column) = 0; + /// Skip the whole row with names. + virtual void skipNames() = 0; + /// Skip the whole row with types. + virtual void skipTypes() = 0; + + /// Skip delimiters, if any. + virtual void skipPrefixBeforeHeader() {} + virtual void skipRowStartDelimiter() {} + virtual void skipFieldDelimiter() {} + virtual void skipRowEndDelimiter() {} + virtual void skipRowBetweenDelimiter() {} + + /// Check suffix. + virtual bool checkForSuffix() { return in->eof(); } + + const FormatSettings & getFormatSettings() const { return format_settings; } + + virtual void setReadBuffer(ReadBuffer & in_) { in = &in_; } + + virtual ~FormatWithNamesAndTypesReader() = default; + +protected: + ReadBuffer * in; + const FormatSettings format_settings; +}; + +/// Base class for schema inference for formats with -WithNames and -WithNamesAndTypes suffixes. +/// For formats with -WithNamesAndTypes suffix the schema will be determined by first two rows. +/// For formats with -WithNames suffix the names of columns will be determined by the first row +/// and types of columns by the rows with data. +/// For formats without suffixes default column names will be used +/// and types will be determined by the rows with data. 
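// Editorial note (not part of the patch): a standalone sketch of the header handling that
// the comment above describes and that FormatWithNamesAndTypesSchemaReader::readSchema()
// implements in the .cpp hunk earlier in this diff. It works on pre-split rows of strings
// instead of ReadBuffer/DataTypePtr; the "c1, c2, ..." default names and the
// "inferred from data" placeholder are simplifying assumptions of this sketch only.
#include <string>
#include <utility>
#include <vector>

using Row = std::vector<std::string>;
using SchemaSketch = std::vector<std::pair<std::string, std::string>>;  // column name -> type name

static SchemaSketch inferHeader(const std::vector<Row> & rows, bool with_names, bool with_types)
{
    size_t pos = 0;
    Row names;
    if (with_names)
        names = rows[pos++];                         // -WithNames*: first row carries column names

    auto name_of = [&](size_t i)
    {
        return i < names.size() ? names[i] : "c" + std::to_string(i + 1);
    };

    SchemaSketch result;
    if (with_types)                                  // -WithNamesAndTypes: next row carries type names
    {
        const Row & types = rows[pos];
        for (size_t i = 0; i < types.size(); ++i)
            result.emplace_back(name_of(i), types[i]);
        return result;
    }

    // No explicit type row: types have to be deduced from the data rows
    // (the patch delegates this to IRowSchemaReader::readSchema()).
    for (size_t i = 0; i < rows[pos].size(); ++i)
        result.emplace_back(name_of(i), "inferred from data");
    return result;
}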
+class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader +{ +public: + FormatWithNamesAndTypesSchemaReader( + ReadBuffer & in, + size_t max_rows_to_read_, + bool with_names_, + bool with_types_, + FormatWithNamesAndTypesReader * format_reader_, + DataTypePtr default_type_ = nullptr); + + NamesAndTypesList readSchema() override; + +protected: + virtual DataTypes readRowAndGetDataTypes() override = 0; + + bool with_names; + bool with_types; + +private: + FormatWithNamesAndTypesReader * format_reader; +}; } + diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp index 328c34823a0..d5a35fef7bd 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp @@ -1,7 +1,7 @@ #include #include -#include -#include +#include +#include #include diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithm.h b/src/Processors/Merges/Algorithms/IMergingAlgorithm.h index 5c8d18875e7..c56324c7d9a 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithm.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithm.h @@ -39,6 +39,7 @@ public: void set(Chunk chunk_) { + convertToFullIfSparse(chunk_); chunk = std::move(chunk_); skip_last_row = false; } diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp b/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp index 2480673d65e..fb3ed7f80fc 100644 --- a/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp +++ b/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp @@ -6,16 +6,13 @@ namespace DB { -BuildQueryPipelineSettings BuildQueryPipelineSettings::fromSettings(const Settings & from) +BuildQueryPipelineSettings BuildQueryPipelineSettings::fromContext(ContextPtr from) { BuildQueryPipelineSettings settings; - settings.actions_settings = ExpressionActionsSettings::fromSettings(from, CompileExpressions::yes); + settings.actions_settings = ExpressionActionsSettings::fromSettings(from->getSettingsRef(), CompileExpressions::yes); + settings.process_list_element = from->getProcessListElement(); + settings.progress_callback = from->getProgressCallback(); return settings; } -BuildQueryPipelineSettings BuildQueryPipelineSettings::fromContext(ContextPtr from) -{ - return fromSettings(from->getSettingsRef()); -} - } diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h index c3282d43778..fadbd061fbd 100644 --- a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h +++ b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -8,14 +9,15 @@ namespace DB { struct Settings; +class QueryStatus; struct BuildQueryPipelineSettings { ExpressionActionsSettings actions_settings; + QueryStatus * process_list_element = nullptr; + ProgressCallback progress_callback = nullptr; const ExpressionActionsSettings & getActionsSettings() const { return actions_settings; } - - static BuildQueryPipelineSettings fromSettings(const Settings & from); static BuildQueryPipelineSettings fromContext(ContextPtr from); }; diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index f319e562bfb..a271ef78dfa 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -180,6 +180,9 @@ QueryPipelineBuilderPtr QueryPlan::buildQueryPipeline( for (auto & context : interpreter_context) 
last_pipeline->addInterpreterContext(std::move(context)); + last_pipeline->setProgressCallback(build_pipeline_settings.progress_callback); + last_pipeline->setProcessListElement(build_pipeline_settings.process_list_element); + return last_pipeline; } diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 8fcec03d746..826ef084d87 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -61,8 +61,12 @@ static String formattedAST(const ASTPtr & ast) { if (!ast) return {}; + WriteBufferFromOwnString buf; - formatAST(*ast, buf, false, true); + IAST::FormatSettings ast_format_settings(buf, /*one_line*/ true); + ast_format_settings.hilite = false; + ast_format_settings.always_quote_identifiers = true; + ast->format(ast_format_settings); return buf.str(); } diff --git a/src/Processors/Sinks/SinkToStorage.cpp b/src/Processors/Sinks/SinkToStorage.cpp index 9ec0939f3a8..5f9f9f9b1a1 100644 --- a/src/Processors/Sinks/SinkToStorage.cpp +++ b/src/Processors/Sinks/SinkToStorage.cpp @@ -6,7 +6,7 @@ namespace DB SinkToStorage::SinkToStorage(const Block & header) : ExceptionKeepingTransform(header, header, false) {} -void SinkToStorage::transform(Chunk & chunk) +void SinkToStorage::onConsume(Chunk chunk) { /** Throw an exception if the sizes of arrays - elements of nested data structures doesn't match. * We have to make this assertion before writing to table, because storage engine may assume that they have equal sizes. @@ -16,8 +16,16 @@ void SinkToStorage::transform(Chunk & chunk) Nested::validateArraySizes(getHeader().cloneWithColumns(chunk.getColumns())); consume(chunk.clone()); - if (lastBlockIsDuplicate()) - chunk.clear(); + if (!lastBlockIsDuplicate()) + cur_chunk = std::move(chunk); +} + +SinkToStorage::GenerateResult SinkToStorage::onGenerate() +{ + GenerateResult res; + res.chunk = std::move(cur_chunk); + res.is_done = true; + return res; } } diff --git a/src/Processors/Sinks/SinkToStorage.h b/src/Processors/Sinks/SinkToStorage.h index 01d51940d64..023bbd8b094 100644 --- a/src/Processors/Sinks/SinkToStorage.h +++ b/src/Processors/Sinks/SinkToStorage.h @@ -24,7 +24,10 @@ protected: private: std::vector table_locks; - void transform(Chunk & chunk) override; + void onConsume(Chunk chunk) override; + GenerateResult onGenerate() override; + + Chunk cur_chunk; }; using SinkToStoragePtr = std::shared_ptr; diff --git a/src/Processors/Sources/MySQLSource.cpp b/src/Processors/Sources/MySQLSource.cpp index 8e9cdcfda48..b0cb62340e9 100644 --- a/src/Processors/Sources/MySQLSource.cpp +++ b/src/Processors/Sources/MySQLSource.cpp @@ -2,6 +2,7 @@ #if USE_MYSQL #include +#include #include #include #include @@ -126,7 +127,7 @@ namespace { using ValueType = ExternalResultDescription::ValueType; - void insertValue(const IDataType & data_type, IColumn & column, const ValueType type, const mysqlxx::Value & value, size_t & read_bytes_size) + void insertValue(const IDataType & data_type, IColumn & column, const ValueType type, const mysqlxx::Value & value, size_t & read_bytes_size, enum enum_field_types mysql_type) { switch (type) { @@ -143,9 +144,24 @@ namespace read_bytes_size += 4; break; case ValueType::vtUInt64: - assert_cast(column).insertValue(value.getUInt()); - read_bytes_size += 8; + { + //we don't have enum enum_field_types definition in mysqlxx/Types.h, so we use literal values directly here. 
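The hunk that continues below special-cases MySQL type code 16 (MYSQL_TYPE_BIT): since mysqlxx does not expose the enum, the literal is used, and the value arrives as a big-endian byte string that is folded into a UInt64. A minimal standalone sketch of that byte-folding idea (hypothetical helper name; the real code streams the bytes through a ReadBuffer with readBigEndianStrict):

#include <cstdint>
#include <iostream>
#include <string>

// Fold a big-endian byte string (as MySQL sends BIT values) into an integer.
// Purely illustrative; equivalent in result to reading the bytes big-endian.
static uint64_t decodeBigEndian(const std::string & payload)
{
    uint64_t result = 0;
    for (unsigned char byte : payload)
        result = (result << 8) | byte;
    return result;
}

int main()
{
    const std::string payload{'\x01', '\x02'};      // a BIT(16) value
    std::cout << decodeBigEndian(payload) << '\n';  // prints 258
}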
+ if (static_cast(mysql_type) == 16) + { + size_t n = value.size(); + UInt64 val = 0UL; + ReadBufferFromMemory payload(const_cast(value.data()), n); + MySQLReplication::readBigEndianStrict(payload, reinterpret_cast(&val), n); + assert_cast(column).insertValue(val); + read_bytes_size += n; + } + else + { + assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 8; + } break; + } case ValueType::vtInt8: assert_cast(column).insertValue(value.getInt()); read_bytes_size += 1; @@ -258,12 +274,12 @@ Chunk MySQLSource::generate() { ColumnNullable & column_nullable = assert_cast(*columns[index]); const auto & data_type = assert_cast(*sample.type); - insertValue(*data_type.getNestedType(), column_nullable.getNestedColumn(), description.types[index].first, value, read_bytes_size); + insertValue(*data_type.getNestedType(), column_nullable.getNestedColumn(), description.types[index].first, value, read_bytes_size, row.getFieldType(position_mapping[index])); column_nullable.getNullMapData().emplace_back(false); } else { - insertValue(*sample.type, *columns[index], description.types[index].first, value, read_bytes_size); + insertValue(*sample.type, *columns[index], description.types[index].first, value, read_bytes_size, row.getFieldType(position_mapping[index])); } } else diff --git a/src/Processors/Sources/ShellCommandSource.cpp b/src/Processors/Sources/ShellCommandSource.cpp new file mode 100644 index 00000000000..dc272ace01e --- /dev/null +++ b/src/Processors/Sources/ShellCommandSource.cpp @@ -0,0 +1,586 @@ +#include + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNSUPPORTED_METHOD; + extern const int TIMEOUT_EXCEEDED; + extern const int CANNOT_FCNTL; + extern const int CANNOT_READ_FROM_FILE_DESCRIPTOR; + extern const int CANNOT_POLL; + extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR; +} + +static bool tryMakeFdNonBlocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (-1 == flags) + return false; + if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) + return false; + + return true; +} + +static void makeFdNonBlocking(int fd) +{ + bool result = tryMakeFdNonBlocking(fd); + if (!result) + throwFromErrno("Cannot set non-blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); +} + +static bool tryMakeFdBlocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (-1 == flags) + return false; + + if (-1 == fcntl(fd, F_SETFL, flags & (~O_NONBLOCK))) + return false; + + return true; +} + +static void makeFdBlocking(int fd) +{ + bool result = tryMakeFdBlocking(fd); + if (!result) + throwFromErrno("Cannot set blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); +} + +static bool pollFd(int fd, size_t timeout_milliseconds, int events) +{ + pollfd pfd; + pfd.fd = fd; + pfd.events = events; + pfd.revents = 0; + + Stopwatch watch; + + int res; + + while (true) + { + res = poll(&pfd, 1, timeout_milliseconds); + + if (res < 0) + { + if (errno == EINTR) + { + watch.stop(); + timeout_milliseconds -= watch.elapsedMilliseconds(); + watch.start(); + + continue; + } + else + { + throwFromErrno("Cannot poll", ErrorCodes::CANNOT_POLL); + } + } + else + { + break; + } + } + + return res > 0; +} + +class TimeoutReadBufferFromFileDescriptor : public BufferWithOwnMemory +{ +public: + explicit TimeoutReadBufferFromFileDescriptor(int fd_, size_t timeout_milliseconds_) + : fd(fd_) + , timeout_milliseconds(timeout_milliseconds_) + { + makeFdNonBlocking(fd); + } + + bool nextImpl() override + { + size_t 
bytes_read = 0; + + while (!bytes_read) + { + if (!pollFd(fd, timeout_milliseconds, POLLIN)) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Pipe read timeout exceeded {} milliseconds", timeout_milliseconds); + + ssize_t res = ::read(fd, internal_buffer.begin(), internal_buffer.size()); + + if (-1 == res && errno != EINTR) + throwFromErrno("Cannot read from pipe ", ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR); + + if (res == 0) + break; + + if (res > 0) + bytes_read += res; + } + + if (bytes_read > 0) + { + working_buffer = internal_buffer; + working_buffer.resize(bytes_read); + } + else + { + return false; + } + + return true; + } + + void reset() const + { + makeFdBlocking(fd); + } + + ~TimeoutReadBufferFromFileDescriptor() override + { + tryMakeFdBlocking(fd); + } + +private: + int fd; + size_t timeout_milliseconds; +}; + +class TimeoutWriteBufferFromFileDescriptor : public BufferWithOwnMemory +{ +public: + explicit TimeoutWriteBufferFromFileDescriptor(int fd_, size_t timeout_milliseconds_) + : fd(fd_) + , timeout_milliseconds(timeout_milliseconds_) + { + makeFdNonBlocking(fd); + } + + void nextImpl() override + { + if (!offset()) + return; + + size_t bytes_written = 0; + + while (bytes_written != offset()) + { + if (!pollFd(fd, timeout_milliseconds, POLLOUT)) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Pipe write timeout exceeded {} milliseconds", timeout_milliseconds); + + ssize_t res = ::write(fd, working_buffer.begin() + bytes_written, offset() - bytes_written); + + if ((-1 == res || 0 == res) && errno != EINTR) + throwFromErrno("Cannot write into pipe ", ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); + + if (res > 0) + bytes_written += res; + } + } + + void reset() const + { + makeFdBlocking(fd); + } + + ~TimeoutWriteBufferFromFileDescriptor() override + { + tryMakeFdBlocking(fd); + } + +private: + int fd; + size_t timeout_milliseconds; +}; + +class ShellCommandHolder +{ +public: + using ShellCommandBuilderFunc = std::function()>; + + explicit ShellCommandHolder(ShellCommandBuilderFunc && func_) + : func(std::move(func_)) + {} + + std::unique_ptr buildCommand() + { + if (returned_command) + return std::move(returned_command); + + return func(); + } + + void returnCommand(std::unique_ptr command) + { + returned_command = std::move(command); + } + +private: + std::unique_ptr returned_command; + ShellCommandBuilderFunc func; +}; + +namespace +{ + /** A stream, that get child process and sends data using tasks in background threads. + * For each send data task background thread is created. Send data task must send data to process input pipes. + * ShellCommandPoolSource receives data from process stdout. + * + * If process_pool is passed in constructor then after source is destroyed process is returned to pool. 
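The pollFd helper and the Timeout{Read,Write}BufferFromFileDescriptor classes above wait on the pipe with poll(), retrying when the call is interrupted by a signal and charging the elapsed time against the remaining timeout. A self-contained sketch of the same wait-with-deadline pattern (POSIX only; the function name is illustrative):

#include <poll.h>
#include <unistd.h>
#include <cerrno>
#include <chrono>
#include <stdexcept>

// Wait until `fd` is readable or the deadline passes, retrying poll() on EINTR
// with whatever time is left, as the timeout buffers above do.
static bool waitReadable(int fd, int timeout_ms)
{
    const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms);

    pollfd pfd{};
    pfd.fd = fd;
    pfd.events = POLLIN;

    while (true)
    {
        const auto left = std::chrono::duration_cast<std::chrono::milliseconds>(
            deadline - std::chrono::steady_clock::now()).count();
        const int res = ::poll(&pfd, 1, left > 0 ? static_cast<int>(left) : 0);

        if (res < 0 && errno == EINTR)
            continue;                                    // interrupted by a signal: retry
        if (res < 0)
            throw std::runtime_error("poll failed");
        return res > 0;                                  // true: readable, false: timed out
    }
}

int main()
{
    int fds[2];
    if (::pipe(fds) != 0)
        return 1;
    (void)::write(fds[1], "x", 1);
    return waitReadable(fds[0], 100) ? 0 : 2;            // returns 0: data is ready
}

The real buffers additionally toggle O_NONBLOCK with fcntl so a short read or write never blocks past the poll() decision.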
+ */ + class ShellCommandSource final : public SourceWithProgress + { + public: + + using SendDataTask = std::function; + + ShellCommandSource( + ContextPtr context_, + const std::string & format_, + size_t command_read_timeout_milliseconds, + const Block & sample_block_, + std::unique_ptr && command_, + std::vector && send_data_tasks = {}, + const ShellCommandSourceConfiguration & configuration_ = {}, + std::unique_ptr && command_holder_ = nullptr, + std::shared_ptr process_pool_ = nullptr) + : SourceWithProgress(sample_block_) + , context(context_) + , format(format_) + , sample_block(sample_block_) + , command(std::move(command_)) + , configuration(configuration_) + , timeout_command_out(command->out.getFD(), command_read_timeout_milliseconds) + , command_holder(std::move(command_holder_)) + , process_pool(process_pool_) + { + for (auto && send_data_task : send_data_tasks) + { + send_data_threads.emplace_back([task = std::move(send_data_task), this]() + { + try + { + task(); + } + catch (...) + { + std::lock_guard lock(send_data_lock); + exception_during_send_data = std::current_exception(); + } + }); + } + + size_t max_block_size = configuration.max_block_size; + + if (configuration.read_fixed_number_of_rows) + { + /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, + * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. + */ + auto context_for_reading = Context::createCopy(context); + context_for_reading->setSetting("input_format_parallel_parsing", false); + context = context_for_reading; + + if (configuration.read_number_of_rows_from_process_output) + { + /// Initialize executor in generate + return; + } + + max_block_size = configuration.number_of_rows_to_read; + } + + pipeline = QueryPipeline(Pipe(context->getInputFormat(format, timeout_command_out, sample_block, max_block_size))); + executor = std::make_unique(pipeline); + } + + ~ShellCommandSource() override + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + if (command_is_invalid) + command = nullptr; + + if (command_holder && process_pool) + { + bool valid_command = configuration.read_fixed_number_of_rows && current_read_rows >= configuration.number_of_rows_to_read; + + if (command && valid_command) + command_holder->returnCommand(std::move(command)); + + process_pool->returnObject(std::move(command_holder)); + } + } + + protected: + + Chunk generate() override + { + rethrowExceptionDuringSendDataIfNeeded(); + + Chunk chunk; + + try + { + if (configuration.read_fixed_number_of_rows) + { + if (!executor && configuration.read_number_of_rows_from_process_output) + { + readText(configuration.number_of_rows_to_read, timeout_command_out); + char dummy; + readChar(dummy, timeout_command_out); + + size_t max_block_size = configuration.number_of_rows_to_read; + pipeline = QueryPipeline(Pipe(context->getInputFormat(format, timeout_command_out, sample_block, max_block_size))); + executor = std::make_unique(pipeline); + } + + if (current_read_rows >= configuration.number_of_rows_to_read) + return {}; + } + + if (!executor->pull(chunk)) + return {}; + + current_read_rows += chunk.getNumRows(); + } + catch (...) 
+ { + command_is_invalid = true; + throw; + } + + return chunk; + } + + Status prepare() override + { + auto status = SourceWithProgress::prepare(); + + if (status == Status::Finished) + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + rethrowExceptionDuringSendDataIfNeeded(); + } + + return status; + } + + String getName() const override { return "ShellCommandSource"; } + + private: + + void rethrowExceptionDuringSendDataIfNeeded() + { + std::lock_guard lock(send_data_lock); + if (exception_during_send_data) + { + command_is_invalid = true; + std::rethrow_exception(exception_during_send_data); + } + } + + ContextPtr context; + std::string format; + Block sample_block; + + std::unique_ptr command; + ShellCommandSourceConfiguration configuration; + + TimeoutReadBufferFromFileDescriptor timeout_command_out; + + size_t current_read_rows = 0; + + ShellCommandHolderPtr command_holder; + std::shared_ptr process_pool; + + QueryPipeline pipeline; + std::unique_ptr executor; + + std::vector send_data_threads; + + std::mutex send_data_lock; + std::exception_ptr exception_during_send_data; + + std::atomic command_is_invalid {false}; + }; + + class SendingChunkHeaderTransform final : public ISimpleTransform + { + public: + SendingChunkHeaderTransform(const Block & header, std::shared_ptr buffer_) + : ISimpleTransform(header, header, false) + , buffer(buffer_) + { + } + + String getName() const override { return "SendingChunkHeaderTransform"; } + + protected: + + void transform(Chunk & chunk) override + { + writeText(chunk.getNumRows(), *buffer); + writeChar('\n', *buffer); + } + + private: + std::shared_ptr buffer; + }; + +} + +ShellCommandSourceCoordinator::ShellCommandSourceCoordinator(const Configuration & configuration_) + : configuration(configuration_) +{ + if (configuration.is_executable_pool) + process_pool = std::make_shared(configuration.pool_size ? 
configuration.pool_size : std::numeric_limits::max()); +} + +Pipe ShellCommandSourceCoordinator::createPipe( + const std::string & command, + const std::vector & arguments, + std::vector && input_pipes, + Block sample_block, + ContextPtr context, + const ShellCommandSourceConfiguration & source_configuration) +{ + ShellCommand::Config command_config(command); + command_config.arguments = arguments; + for (size_t i = 1; i < input_pipes.size(); ++i) + command_config.write_fds.emplace_back(i + 2); + + std::unique_ptr process; + std::unique_ptr process_holder; + + auto destructor_strategy = ShellCommand::DestructorStrategy{true /*terminate_in_destructor*/, configuration.command_termination_timeout_seconds}; + command_config.terminate_in_destructor_strategy = destructor_strategy; + + bool is_executable_pool = (process_pool != nullptr); + if (is_executable_pool) + { + bool execute_direct = configuration.execute_direct; + + bool result = process_pool->tryBorrowObject( + process_holder, + [command_config, execute_direct]() + { + ShellCommandHolder::ShellCommandBuilderFunc func = [command_config, execute_direct]() mutable + { + if (execute_direct) + return ShellCommand::executeDirect(command_config); + else + return ShellCommand::execute(command_config); + }; + + return std::make_unique(std::move(func)); + }, + configuration.max_command_execution_time_seconds * 10000); + + if (!result) + throw Exception( + ErrorCodes::TIMEOUT_EXCEEDED, + "Could not get process from pool, max command execution timeout exceeded {} seconds", + configuration.max_command_execution_time_seconds); + + process = process_holder->buildCommand(); + } + else + { + if (configuration.execute_direct) + process = ShellCommand::executeDirect(command_config); + else + process = ShellCommand::execute(command_config); + } + + std::vector tasks; + tasks.reserve(input_pipes.size()); + + for (size_t i = 0; i < input_pipes.size(); ++i) + { + WriteBufferFromFile * write_buffer = nullptr; + + if (i == 0) + { + write_buffer = &process->in; + } + else + { + auto descriptor = i + 2; + auto it = process->write_fds.find(descriptor); + if (it == process->write_fds.end()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Process does not contain descriptor to write {}", descriptor); + + write_buffer = &it->second; + } + + int write_buffer_fd = write_buffer->getFD(); + auto timeout_write_buffer = std::make_shared(write_buffer_fd, configuration.command_write_timeout_milliseconds); + + input_pipes[i].resize(1); + + if (configuration.send_chunk_header) + { + auto transform = std::make_shared(input_pipes[i].getHeader(), timeout_write_buffer); + input_pipes[i].addTransform(std::move(transform)); + } + + auto pipeline = std::make_shared(std::move(input_pipes[i])); + auto out = context->getOutputFormat(configuration.format, *timeout_write_buffer, materializeBlock(pipeline->getHeader())); + out->setAutoFlush(); + pipeline->complete(std::move(out)); + + ShellCommandSource::SendDataTask task = [pipeline, timeout_write_buffer, write_buffer, is_executable_pool]() + { + CompletedPipelineExecutor executor(*pipeline); + executor.execute(); + + if (!is_executable_pool) + { + timeout_write_buffer->next(); + timeout_write_buffer->reset(); + + write_buffer->close(); + } + }; + + tasks.emplace_back(std::move(task)); + } + + auto source = std::make_unique( + context, + configuration.format, + configuration.command_read_timeout_milliseconds, + std::move(sample_block), + std::move(process), + std::move(tasks), + source_configuration, + std::move(process_holder), + 
process_pool); + auto pipe = Pipe(std::move(source)); + + return pipe; +} + +} diff --git a/src/Processors/Sources/ShellCommandSource.h b/src/Processors/Sources/ShellCommandSource.h index 4974c33f290..649c713afcb 100644 --- a/src/Processors/Sources/ShellCommandSource.h +++ b/src/Processors/Sources/ShellCommandSource.h @@ -19,14 +19,10 @@ namespace DB { -/** A stream, that get child process and sends data using tasks in background threads. - * For each send data task background thread is created. Send data task must send data to process input pipes. - * ShellCommandPoolSource receives data from process stdout. - * - * If process_pool is passed in constructor then after source is destroyed process is returned to pool. - */ +class ShellCommandHolder; +using ShellCommandHolderPtr = std::unique_ptr; -using ProcessPool = BorrowedObjectPool>; +using ProcessPool = BorrowedObjectPool; struct ShellCommandSourceConfiguration { @@ -37,148 +33,92 @@ struct ShellCommandSourceConfiguration /// Valid only if read_fixed_number_of_rows = true size_t number_of_rows_to_read = 0; /// Max block size - size_t max_block_size = DBMS_DEFAULT_BUFFER_SIZE; + size_t max_block_size = DEFAULT_BLOCK_SIZE; }; -class ShellCommandSource final : public SourceWithProgress +class ShellCommandSourceCoordinator { public: - using SendDataTask = std::function; + struct Configuration + { - ShellCommandSource( + /// Script output format + std::string format; + + /// Command termination timeout in seconds + size_t command_termination_timeout_seconds = 10; + + /// Timeout for reading data from command stdout + size_t command_read_timeout_milliseconds = 10000; + + /// Timeout for writing data to command stdin + size_t command_write_timeout_milliseconds = 10000; + + /// Pool size valid only if executable_pool = true + size_t pool_size = 16; + + /// Max command execution time in milliseconds. Valid only if executable_pool = true + size_t max_command_execution_time_seconds = 10; + + /// Should pool of processes be created. + bool is_executable_pool = false; + + /// Send number_of_rows\n before sending chunk to process. + bool send_chunk_header = false; + + /// Execute script direct or with /bin/bash. + bool execute_direct = true; + + }; + + explicit ShellCommandSourceCoordinator(const Configuration & configuration_); + + const Configuration & getConfiguration() const + { + return configuration; + } + + Pipe createPipe( + const std::string & command, + const std::vector & arguments, + std::vector && input_pipes, + Block sample_block, ContextPtr context, - const std::string & format, - const Block & sample_block, - std::unique_ptr && command_, - std::vector && send_data_tasks = {}, - const ShellCommandSourceConfiguration & configuration_ = {}, - std::shared_ptr process_pool_ = nullptr) - : SourceWithProgress(sample_block) - , command(std::move(command_)) - , configuration(configuration_) - , process_pool(process_pool_) + const ShellCommandSourceConfiguration & source_configuration = {}); + + Pipe createPipe( + const std::string & command, + std::vector && input_pipes, + Block sample_block, + ContextPtr context, + const ShellCommandSourceConfiguration & source_configuration = {}) { - for (auto && send_data_task : send_data_tasks) - { - send_data_threads.emplace_back([task = std::move(send_data_task), this]() - { - try - { - task(); - } - catch (...) 
- { - std::lock_guard lock(send_data_lock); - exception_during_send_data = std::current_exception(); - } - }); - } - - size_t max_block_size = configuration.max_block_size; - - if (configuration.read_fixed_number_of_rows) - { - /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, - * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. - */ - auto context_for_reading = Context::createCopy(context); - context_for_reading->setSetting("input_format_parallel_parsing", false); - context = context_for_reading; - - if (configuration.read_number_of_rows_from_process_output) - { - readText(configuration.number_of_rows_to_read, command->out); - char dummy; - readChar(dummy, command->out); - } - - max_block_size = configuration.number_of_rows_to_read; - } - - pipeline = QueryPipeline(Pipe(context->getInputFormat(format, command->out, sample_block, max_block_size))); - executor = std::make_unique(pipeline); + return createPipe(command, {}, std::move(input_pipes), std::move(sample_block), std::move(context), source_configuration); } - ~ShellCommandSource() override + Pipe createPipe( + const std::string & command, + const std::vector & arguments, + Block sample_block, + ContextPtr context) { - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - if (command && process_pool) - process_pool->returnObject(std::move(command)); + return createPipe(command, arguments, {}, std::move(sample_block), std::move(context), {}); } -protected: - - Chunk generate() override + Pipe createPipe( + const std::string & command, + Block sample_block, + ContextPtr context) { - rethrowExceptionDuringSendDataIfNeeded(); - - if (configuration.read_fixed_number_of_rows && configuration.number_of_rows_to_read == current_read_rows) - return {}; - - Chunk chunk; - - try - { - if (!executor->pull(chunk)) - return {}; - - current_read_rows += chunk.getNumRows(); - } - catch (...) 
- { - command = nullptr; - throw; - } - - return chunk; + return createPipe(command, {}, {}, std::move(sample_block), std::move(context), {}); } - Status prepare() override - { - auto status = SourceWithProgress::prepare(); - - if (status == Status::Finished) - { - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - rethrowExceptionDuringSendDataIfNeeded(); - } - - return status; - } - - String getName() const override { return "ShellCommandSource"; } - private: - void rethrowExceptionDuringSendDataIfNeeded() - { - std::lock_guard lock(send_data_lock); - if (exception_during_send_data) - { - command = nullptr; - std::rethrow_exception(exception_during_send_data); - } - } + Configuration configuration; - std::unique_ptr command; - ShellCommandSourceConfiguration configuration; - - size_t current_read_rows = 0; - - std::shared_ptr process_pool; - - QueryPipeline pipeline; - std::unique_ptr executor; - - std::vector send_data_threads; - std::mutex send_data_lock; - std::exception_ptr exception_during_send_data; + std::shared_ptr process_pool = nullptr; }; + } diff --git a/src/Processors/Sources/SourceWithProgress.cpp b/src/Processors/Sources/SourceWithProgress.cpp index 9b7a5c6a762..60c39c919f6 100644 --- a/src/Processors/Sources/SourceWithProgress.cpp +++ b/src/Processors/Sources/SourceWithProgress.cpp @@ -26,6 +26,8 @@ SourceWithProgress::SourceWithProgress(Block header, bool enable_auto_progress) void SourceWithProgress::setProcessListElement(QueryStatus * elem) { process_list_elem = elem; + if (!elem) + return; /// Update total_rows_approx as soon as possible. /// diff --git a/src/Processors/TTL/ITTLAlgorithm.h b/src/Processors/TTL/ITTLAlgorithm.h index d219f9f7ad3..49cd2c46d9d 100644 --- a/src/Processors/TTL/ITTLAlgorithm.h +++ b/src/Processors/TTL/ITTLAlgorithm.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index 3602b7f7d03..fae1ede1f9c 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -231,8 +231,10 @@ IProcessor::Status AggregatingInOrderTransform::prepare() input.setNeeded(); return Status::NeedData; } + assert(!is_consume_finished); current_chunk = input.pull(true /* set_not_needed */); + convertToFullIfSparse(current_chunk); return Status::Ready; } diff --git a/src/Processors/Transforms/AggregatingTransform.cpp b/src/Processors/Transforms/AggregatingTransform.cpp index 8357a997960..5b58530f3d5 100644 --- a/src/Processors/Transforms/AggregatingTransform.cpp +++ b/src/Processors/Transforms/AggregatingTransform.cpp @@ -329,7 +329,7 @@ private: if (num_threads > first->aggregates_pools.size()) { Arenas & first_pool = first->aggregates_pools; - for (size_t j = first_pool.size(); j < num_threads; j++) + for (size_t j = first_pool.size(); j < num_threads; ++j) first_pool.emplace_back(std::make_shared()); } diff --git a/src/Processors/Transforms/CheckConstraintsTransform.cpp b/src/Processors/Transforms/CheckConstraintsTransform.cpp index b7849b8a627..50ec86f33b6 100644 --- a/src/Processors/Transforms/CheckConstraintsTransform.cpp +++ b/src/Processors/Transforms/CheckConstraintsTransform.cpp @@ -35,7 +35,7 @@ CheckConstraintsTransform::CheckConstraintsTransform( } -void CheckConstraintsTransform::transform(Chunk & chunk) +void CheckConstraintsTransform::onConsume(Chunk chunk) { if (chunk.getNumRows() > 0) { 
@@ -123,6 +123,7 @@ void CheckConstraintsTransform::transform(Chunk & chunk) } rows_written += chunk.getNumRows(); + cur_chunk = std::move(chunk); } } diff --git a/src/Processors/Transforms/CheckConstraintsTransform.h b/src/Processors/Transforms/CheckConstraintsTransform.h index 3198ec84198..09833ff396b 100644 --- a/src/Processors/Transforms/CheckConstraintsTransform.h +++ b/src/Processors/Transforms/CheckConstraintsTransform.h @@ -23,12 +23,19 @@ public: String getName() const override { return "CheckConstraintsTransform"; } - void transform(Chunk & chunk) override; + void onConsume(Chunk chunk) override; + GenerateResult onGenerate() override + { + GenerateResult res; + res.chunk = std::move(cur_chunk); + return res; + } private: StorageID table_id; const ASTs constraints_to_check; const ConstraintsExpressions expressions; size_t rows_written = 0; + Chunk cur_chunk; }; } diff --git a/src/Processors/Transforms/CountingTransform.cpp b/src/Processors/Transforms/CountingTransform.cpp index 88ecbe6adc3..eb191b36586 100644 --- a/src/Processors/Transforms/CountingTransform.cpp +++ b/src/Processors/Transforms/CountingTransform.cpp @@ -16,22 +16,23 @@ namespace ProfileEvents namespace DB { -void CountingTransform::transform(Chunk & chunk) +void CountingTransform::onConsume(Chunk chunk) { - Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); + Progress local_progress{WriteProgress(chunk.getNumRows(), chunk.bytes())}; progress.incrementPiecewiseAtomically(local_progress); //std::cerr << "============ counting adding progress for " << static_cast(thread_status) << ' ' << chunk.getNumRows() << " rows\n"; if (thread_status) { - thread_status->performance_counters.increment(ProfileEvents::InsertedRows, local_progress.read_rows); - thread_status->performance_counters.increment(ProfileEvents::InsertedBytes, local_progress.read_bytes); + thread_status->performance_counters.increment(ProfileEvents::InsertedRows, local_progress.written_rows); + thread_status->performance_counters.increment(ProfileEvents::InsertedBytes, local_progress.written_bytes); + thread_status->progress_out.incrementPiecewiseAtomically(local_progress); } else { - ProfileEvents::increment(ProfileEvents::InsertedRows, local_progress.read_rows); - ProfileEvents::increment(ProfileEvents::InsertedBytes, local_progress.read_bytes); + ProfileEvents::increment(ProfileEvents::InsertedRows, local_progress.written_rows); + ProfileEvents::increment(ProfileEvents::InsertedBytes, local_progress.written_bytes); } if (process_elem) @@ -39,6 +40,8 @@ void CountingTransform::transform(Chunk & chunk) if (progress_callback) progress_callback(local_progress); + + cur_chunk = std::move(chunk); } } diff --git a/src/Processors/Transforms/CountingTransform.h b/src/Processors/Transforms/CountingTransform.h index e7100e8510b..877f6a0a543 100644 --- a/src/Processors/Transforms/CountingTransform.h +++ b/src/Processors/Transforms/CountingTransform.h @@ -34,13 +34,20 @@ public: return progress; } - void transform(Chunk & chunk) override; + void onConsume(Chunk chunk) override; + GenerateResult onGenerate() override + { + GenerateResult res; + res.chunk = std::move(cur_chunk); + return res; + } protected: Progress progress; ProgressCallback progress_callback; QueryStatus * process_elem = nullptr; ThreadStatus * thread_status = nullptr; + Chunk cur_chunk; }; } diff --git a/src/Processors/Transforms/DistinctTransform.cpp b/src/Processors/Transforms/DistinctTransform.cpp index fddfe663af5..cf6a128aa40 100644 --- 
a/src/Processors/Transforms/DistinctTransform.cpp +++ b/src/Processors/Transforms/DistinctTransform.cpp @@ -54,6 +54,9 @@ void DistinctTransform::buildFilter( void DistinctTransform::transform(Chunk & chunk) { + /// Convert to full column, because SetVariant for sparse column is not implemented. + convertToFullIfSparse(chunk); + auto num_rows = chunk.getNumRows(); auto columns = chunk.detachColumns(); diff --git a/src/Processors/Transforms/ExceptionKeepingTransform.cpp b/src/Processors/Transforms/ExceptionKeepingTransform.cpp index 5c968471e1b..f2b29a45f84 100644 --- a/src/Processors/Transforms/ExceptionKeepingTransform.cpp +++ b/src/Processors/Transforms/ExceptionKeepingTransform.cpp @@ -21,8 +21,13 @@ ExceptionKeepingTransform::ExceptionKeepingTransform(const Block & in_header, co IProcessor::Status ExceptionKeepingTransform::prepare() { - if (!ignore_on_start_and_finish && !was_on_start_called) - return Status::Ready; + if (stage == Stage::Start) + { + if (ignore_on_start_and_finish) + stage = Stage::Consume; + else + return Status::Ready; + } /// Check can output. @@ -43,12 +48,19 @@ IProcessor::Status ExceptionKeepingTransform::prepare() return Status::PortFull; } - if (!ready_input) + if (stage == Stage::Generate) + return Status::Ready; + + while (!ready_input) { if (input.isFinished()) { - if (!ignore_on_start_and_finish && !was_on_finish_called && !has_exception) - return Status::Ready; + if (stage != Stage::Exception && stage != Stage::Finish) + { + stage = Stage::Finish; + if (!ignore_on_start_and_finish) + return Status::Ready; + } output.finish(); return Status::Finished; @@ -63,12 +75,13 @@ IProcessor::Status ExceptionKeepingTransform::prepare() if (data.exception) { - has_exception = true; + stage = Stage::Exception; + onException(); output.pushData(std::move(data)); return Status::PortFull; } - if (has_exception) + if (stage == Stage::Exception) /// In case of exception, just drop all other data. 
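Several hunks in this patch (IMergingAlgorithm, AggregatingInOrderTransform, DistinctTransform above, SortingTransform and TTLTransform below) call convertToFullIfSparse before row-wise processing: a sparse column stores only its non-default rows, so random access needs a binary search over the stored offsets. A toy model of that trade-off (simplified types, not the real ColumnSparse):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy sparse column: non-default rows are kept as sorted offsets plus their
// values, so reading an arbitrary row costs O(log K) for K non-default rows.
// Converting to a full column restores O(1) access at the cost of
// materializing the default values.
struct SparseUInt64
{
    std::vector<uint64_t> offsets;   // row numbers with non-default values, sorted
    std::vector<uint64_t> values;    // the values at those rows

    uint64_t get(uint64_t row) const
    {
        const auto it = std::lower_bound(offsets.begin(), offsets.end(), row);
        if (it != offsets.end() && *it == row)
            return values[static_cast<size_t>(it - offsets.begin())];
        return 0;                    // default value
    }

    std::vector<uint64_t> toFull(size_t total_rows) const
    {
        std::vector<uint64_t> full(total_rows, 0);
        for (size_t i = 0; i < offsets.size(); ++i)
            full[offsets[i]] = values[i];
        return full;
    }
};

int main()
{
    const SparseUInt64 col{{3, 7}, {42, 99}};
    std::cout << col.get(3) << ' ' << col.get(5) << ' ' << col.toFull(10)[7] << '\n';  // 42 0 99
}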
/// If transform is stateful, it's state may be broken after exception from transform() data.chunk.clear(); @@ -117,40 +130,66 @@ static std::exception_ptr runStep(std::function step, ThreadStatus * thr void ExceptionKeepingTransform::work() { - if (!ignore_on_start_and_finish && !was_on_start_called) + if (stage == Stage::Start) { - was_on_start_called = true; + stage = Stage::Consume; if (auto exception = runStep([this] { onStart(); }, thread_status, elapsed_counter_ms)) { - has_exception = true; + stage = Stage::Exception; ready_output = true; data.exception = std::move(exception); + onException(); } } - else if (ready_input) + else if (stage == Stage::Consume || stage == Stage::Generate) { - ready_input = false; - - if (auto exception = runStep([this] { transform(data.chunk); }, thread_status, elapsed_counter_ms)) + if (stage == Stage::Consume) { - has_exception = true; - data.chunk.clear(); - data.exception = std::move(exception); + ready_input = false; + + if (auto exception = runStep([this] { onConsume(std::move(data.chunk)); }, thread_status, elapsed_counter_ms)) + { + stage = Stage::Exception; + ready_output = true; + data.exception = std::move(exception); + onException(); + } + else + stage = Stage::Generate; } - if (data.chunk || data.exception) - ready_output = true; - } - else if (!ignore_on_start_and_finish && !was_on_finish_called) - { - was_on_finish_called = true; + if (stage == Stage::Generate) + { + GenerateResult res; + if (auto exception = runStep([this, &res] { res = onGenerate(); }, thread_status, elapsed_counter_ms)) + { + stage = Stage::Exception; + ready_output = true; + data.exception = std::move(exception); + onException(); + } + else + { + if (res.chunk) + { + data.chunk = std::move(res.chunk); + ready_output = true; + } + if (res.is_done) + stage = Stage::Consume; + } + } + } + else if (stage == Stage::Finish) + { if (auto exception = runStep([this] { onFinish(); }, thread_status, elapsed_counter_ms)) { - has_exception = true; + stage = Stage::Exception; ready_output = true; data.exception = std::move(exception); + onException(); } } } diff --git a/src/Processors/Transforms/ExceptionKeepingTransform.h b/src/Processors/Transforms/ExceptionKeepingTransform.h index 867f13bf53a..e2bc161971e 100644 --- a/src/Processors/Transforms/ExceptionKeepingTransform.h +++ b/src/Processors/Transforms/ExceptionKeepingTransform.h @@ -28,18 +28,31 @@ protected: OutputPort & output; Port::Data data; + enum class Stage + { + Start, + Consume, + Generate, + Finish, + Exception, + }; + + Stage stage = Stage::Start; bool ready_input = false; bool ready_output = false; - bool has_exception = false; - const bool ignore_on_start_and_finish = true; - bool was_on_start_called = false; - bool was_on_finish_called = false; -//protected: - virtual void transform(Chunk & chunk) = 0; + struct GenerateResult + { + Chunk chunk; + bool is_done = true; + }; + virtual void onStart() {} + virtual void onConsume(Chunk chunk) = 0; + virtual GenerateResult onGenerate() = 0; virtual void onFinish() {} + virtual void onException() {} public: ExceptionKeepingTransform(const Block & in_header, const Block & out_header, bool ignore_on_start_and_finish_ = true); diff --git a/src/Processors/Transforms/ExpressionTransform.cpp b/src/Processors/Transforms/ExpressionTransform.cpp index ca788f1dd9f..0d3341b000c 100644 --- a/src/Processors/Transforms/ExpressionTransform.cpp +++ b/src/Processors/Transforms/ExpressionTransform.cpp @@ -31,7 +31,7 @@ ConvertingTransform::ConvertingTransform(const Block & header_, 
ExpressionAction { } -void ConvertingTransform::transform(Chunk & chunk) +void ConvertingTransform::onConsume(Chunk chunk) { size_t num_rows = chunk.getNumRows(); auto block = getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()); @@ -39,6 +39,7 @@ void ConvertingTransform::transform(Chunk & chunk) expression->execute(block, num_rows); chunk.setColumns(block.getColumns(), num_rows); + cur_chunk = std::move(chunk); } } diff --git a/src/Processors/Transforms/ExpressionTransform.h b/src/Processors/Transforms/ExpressionTransform.h index a76dc733e14..ea73c8fb1da 100644 --- a/src/Processors/Transforms/ExpressionTransform.h +++ b/src/Processors/Transforms/ExpressionTransform.h @@ -43,10 +43,17 @@ public: String getName() const override { return "ConvertingTransform"; } protected: - void transform(Chunk & chunk) override; + void onConsume(Chunk chunk) override; + GenerateResult onGenerate() override + { + GenerateResult res; + res.chunk = std::move(cur_chunk); + return res; + } private: ExpressionActionsPtr expression; + Chunk cur_chunk; }; } diff --git a/src/Processors/Transforms/MaterializingTransform.cpp b/src/Processors/Transforms/MaterializingTransform.cpp index abf416e8047..1eaa5458d37 100644 --- a/src/Processors/Transforms/MaterializingTransform.cpp +++ b/src/Processors/Transforms/MaterializingTransform.cpp @@ -1,4 +1,5 @@ #include +#include namespace DB { @@ -12,7 +13,7 @@ void MaterializingTransform::transform(Chunk & chunk) auto columns = chunk.detachColumns(); for (auto & col : columns) - col = col->convertToFullColumnIfConst(); + col = recursiveRemoveSparse(col->convertToFullColumnIfConst()); chunk.setColumns(std::move(columns), num_rows); } diff --git a/src/Processors/Transforms/MongoDBSource.cpp b/src/Processors/Transforms/MongoDBSource.cpp index 30ad9746520..4f5be41a89d 100644 --- a/src/Processors/Transforms/MongoDBSource.cpp +++ b/src/Processors/Transforms/MongoDBSource.cpp @@ -36,6 +36,7 @@ namespace ErrorCodes extern const int MONGODB_CANNOT_AUTHENTICATE; extern const int NOT_FOUND_COLUMN_IN_BLOCK; extern const int UNKNOWN_TYPE; + extern const int MONGODB_ERROR; } @@ -327,6 +328,14 @@ Chunk MongoDBSource::generate() for (auto & document : response.documents()) { + if (document->exists("ok") && document->exists("$err") + && document->exists("code") && document->getInteger("ok") == 0) + { + auto code = document->getInteger("code"); + const Poco::MongoDB::Element::Ptr value = document->get("$err"); + auto message = static_cast &>(*value).value(); + throw Exception(ErrorCodes::MONGODB_ERROR, "Got error from MongoDB: {}, code: {}", message, code); + } ++num_rows; for (const auto idx : collections::range(0, size)) diff --git a/src/Processors/Transforms/PostgreSQLSource.cpp b/src/Processors/Transforms/PostgreSQLSource.cpp index ac8408d8338..88f092a2533 100644 --- a/src/Processors/Transforms/PostgreSQLSource.cpp +++ b/src/Processors/Transforms/PostgreSQLSource.cpp @@ -74,7 +74,17 @@ template void PostgreSQLSource::onStart() { if (!tx) - tx = std::make_shared(connection_holder->get()); + { + try + { + tx = std::make_shared(connection_holder->get()); + } + catch (const pqxx::broken_connection &) + { + connection_holder->update(); + tx = std::make_shared(connection_holder->get()); + } + } stream = std::make_unique(*tx, pqxx::from_query, std::string_view(query_str)); } diff --git a/src/Processors/Transforms/SortingTransform.cpp b/src/Processors/Transforms/SortingTransform.cpp index 7bdc927d0d8..8fa9d7adb84 100644 --- a/src/Processors/Transforms/SortingTransform.cpp +++ 
b/src/Processors/Transforms/SortingTransform.cpp @@ -31,6 +31,11 @@ MergeSorter::MergeSorter(Chunks chunks_, SortDescription & description_, size_t if (chunk.getNumRows() == 0) continue; + /// Convert to full column, because sparse column has + /// access to element in O(log(K)), where K is number of non-default rows, + /// which can be inefficient. + convertToFullIfSparse(chunk); + cursors.emplace_back(chunk.getColumns(), description); has_collation |= cursors.back().has_collation; diff --git a/src/Processors/Transforms/SquashingChunksTransform.cpp b/src/Processors/Transforms/SquashingChunksTransform.cpp index f5aef01463a..908f6c0ff34 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.cpp +++ b/src/Processors/Transforms/SquashingChunksTransform.cpp @@ -11,14 +11,22 @@ SquashingChunksTransform::SquashingChunksTransform( { } -void SquashingChunksTransform::transform(Chunk & chunk) +void SquashingChunksTransform::onConsume(Chunk chunk) { if (auto block = squashing.add(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()))) { - chunk.setColumns(block.getColumns(), block.rows()); + cur_chunk.setColumns(block.getColumns(), block.rows()); } } +SquashingChunksTransform::GenerateResult SquashingChunksTransform::onGenerate() +{ + GenerateResult res; + res.chunk = std::move(cur_chunk); + res.is_done = true; + return res; +} + void SquashingChunksTransform::onFinish() { auto block = squashing.add({}); @@ -27,7 +35,7 @@ void SquashingChunksTransform::onFinish() void SquashingChunksTransform::work() { - if (has_exception) + if (stage == Stage::Exception) { data.chunk.clear(); ready_input = false; diff --git a/src/Processors/Transforms/SquashingChunksTransform.h b/src/Processors/Transforms/SquashingChunksTransform.h index bf4a051891b..531efe0d6a2 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.h +++ b/src/Processors/Transforms/SquashingChunksTransform.h @@ -17,12 +17,14 @@ public: void work() override; protected: - void transform(Chunk & chunk) override; + void onConsume(Chunk chunk) override; + GenerateResult onGenerate() override; void onFinish() override; private: SquashingTransform squashing; + Chunk cur_chunk; Chunk finish_chunk; }; diff --git a/src/Processors/Transforms/TTLCalcTransform.h b/src/Processors/Transforms/TTLCalcTransform.h index 14592c07155..495879400dc 100644 --- a/src/Processors/Transforms/TTLCalcTransform.h +++ b/src/Processors/Transforms/TTLCalcTransform.h @@ -6,7 +6,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index a515a50fafb..7d0da3dca91 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -105,6 +105,7 @@ void TTLTransform::consume(Chunk chunk) return; } + convertToFullIfSparse(chunk); auto block = getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()); for (const auto & algorithm : algorithms) diff --git a/src/Processors/Transforms/TTLTransform.h b/src/Processors/Transforms/TTLTransform.h index 9207c68448b..3f0dffd1998 100644 --- a/src/Processors/Transforms/TTLTransform.h +++ b/src/Processors/Transforms/TTLTransform.h @@ -7,7 +7,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 1f8376f4700..0da7541556b 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -4,10 +4,12 @@ #include 
#include #include +#include #include #include #include #include +#include #include #include @@ -383,7 +385,7 @@ void WindowTransform::advancePartitionEnd() // prev_frame_start, partition_end); size_t i = 0; - for (; i < partition_by_columns; i++) + for (; i < partition_by_columns; ++i) { const auto * reference_column = inputAt(prev_frame_start)[partition_by_indices[i]].get(); @@ -665,7 +667,7 @@ bool WindowTransform::arePeers(const RowNumber & x, const RowNumber & y) const } size_t i = 0; - for (; i < n; i++) + for (; i < n; ++i) { const auto * column_x = inputAt(x)[order_by_indices[i]].get(); const auto * column_y = inputAt(y)[order_by_indices[i]].get(); @@ -1005,6 +1007,12 @@ static void assertSameColumns(const Columns & left_all, assert(left_column); assert(right_column); + if (const auto * left_lc = typeid_cast(left_column)) + left_column = left_lc->getDictionary().getNestedColumn().get(); + + if (const auto * right_lc = typeid_cast(right_column)) + right_column = right_lc->getDictionary().getNestedColumn().get(); + assert(typeid(*left_column).hash_code() == typeid(*right_column).hash_code()); @@ -1056,10 +1064,13 @@ void WindowTransform::appendChunk(Chunk & chunk) // Another problem with Const columns is that the aggregate functions // can't work with them, so we have to materialize them like the // Aggregator does. + // Likewise, aggregate functions can't work with LowCardinality, + // so we have to materialize them too. // Just materialize everything. auto columns = chunk.detachColumns(); + block.original_input_columns = columns; for (auto & column : columns) - column = std::move(column)->convertToFullColumnIfConst(); + column = recursiveRemoveLowCardinality(std::move(column)->convertToFullColumnIfConst()); block.input_columns = std::move(columns); // Initialize output columns. @@ -1302,7 +1313,7 @@ IProcessor::Status WindowTransform::prepare() // Output the ready block. 
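The WindowTransform hunk above materializes LowCardinality columns with recursiveRemoveLowCardinality because aggregate functions cannot read through the dictionary encoding, and assertSameColumns now compares the dictionaries' nested columns; the untouched inputs are kept in original_input_columns so the outgoing block still carries the original representation. A toy model of what materializing a dictionary-encoded column means (simplified types, not the real ColumnLowCardinality):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy dictionary-encoded ("low cardinality") string column: a dictionary of
// unique values plus one small index per row. Materializing it gathers the
// dictionary values into a plain full column any consumer can read row by row.
struct LowCardinalityString
{
    std::vector<std::string> dictionary;   // unique values
    std::vector<uint32_t> indexes;         // one entry per row

    std::vector<std::string> materialize() const
    {
        std::vector<std::string> full;
        full.reserve(indexes.size());
        for (const auto idx : indexes)
            full.push_back(dictionary[idx]);
        return full;
    }
};

int main()
{
    const LowCardinalityString col{{"a", "b"}, {0, 1, 1, 0}};
    for (const auto & v : col.materialize())
        std::cout << v << ' ';             // a b b a
    std::cout << '\n';
}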
const auto i = next_output_block_number - first_block_number; auto & block = blocks[i]; - auto columns = block.input_columns; + auto columns = block.original_input_columns; for (auto & res : block.output_columns) { columns.push_back(ColumnPtr(std::move(res))); diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index 5fbdd6d38e1..077979e83b9 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -39,6 +39,7 @@ struct WindowFunctionWorkspace struct WindowTransformBlock { + Columns original_input_columns; Columns input_columns; MutableColumns output_columns; diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 503f34593c7..17075e2b318 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +25,12 @@ #include #include +namespace ProfileEvents +{ + extern const Event SelectedBytes; + extern const Event SelectedRows; +} + namespace DB { @@ -83,11 +91,26 @@ public: String getName() const override { return "ExecutingInnerQueryFromView"; } protected: - void transform(Chunk & chunk) override; + void onConsume(Chunk chunk) override; + GenerateResult onGenerate() override; private: ViewsDataPtr views_data; ViewRuntimeData & view; + + struct State + { + QueryPipeline pipeline; + PullingPipelineExecutor executor; + + explicit State(QueryPipeline pipeline_) + : pipeline(std::move(pipeline_)) + , executor(pipeline) + { + } + }; + + std::optional state; }; /// Insert into LiveView. 
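ExecutingInnerQueryFromViewTransform above, like the sinks and transforms earlier in this patch, now implements the onConsume/onGenerate pair introduced in ExceptionKeepingTransform: onConsume stores the input, and onGenerate is polled until it reports is_done, which lets one input produce several output chunks. A toy standalone sketch of that contract (hypothetical names, far simpler than the real interface):

#include <cctype>
#include <iostream>
#include <optional>
#include <string>

// Minimal model of the consume/generate protocol: the driver feeds one input
// via onConsume(), then calls onGenerate() until is_done is true before
// feeding the next input, so a transform may emit zero, one, or many outputs
// per input without doing all the work inside a single transform() call.
struct GenerateResult
{
    std::optional<std::string> chunk;
    bool is_done = true;
};

class UpperCaseTransform
{
public:
    void onConsume(std::string chunk) { pending = std::move(chunk); }

    GenerateResult onGenerate()
    {
        GenerateResult res;
        if (pending)
        {
            for (auto & c : *pending)
                c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
            res.chunk = std::move(*pending);
            pending.reset();
        }
        res.is_done = true;   // this toy emits a single output per input
        return res;
    }

private:
    std::optional<std::string> pending;
};

int main()
{
    UpperCaseTransform t;
    t.onConsume("hello");
    const auto out = t.onGenerate();
    std::cout << (out.chunk ? *out.chunk : std::string("<none>")) << '\n';  // HELLO
}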
@@ -389,7 +412,7 @@ Chain buildPushingToViewsChain( return result_chain; } -static void process(Block & block, ViewRuntimeData & view, const ViewsData & views_data) +static QueryPipeline process(Block block, ViewRuntimeData & view, const ViewsData & views_data) { const auto & context = views_data.context; @@ -400,7 +423,7 @@ static void process(Block & block, ViewRuntimeData & view, const ViewsData & vie local_context->addViewSource(StorageValues::create( views_data.source_storage_id, views_data.source_metadata_snapshot->getColumns(), - block, + std::move(block), views_data.source_storage->getVirtuals())); /// We need keep InterpreterSelectQuery, until the processing will be finished, since: @@ -436,23 +459,7 @@ static void process(Block & block, ViewRuntimeData & view, const ViewsData & vie pipeline.getHeader(), std::make_shared(std::move(converting)))); - pipeline.setProgressCallback([context](const Progress & progress) - { - CurrentThread::updateProgressIn(progress); - if (auto callback = context->getProgressCallback()) - callback(progress); - }); - - auto query_pipeline = QueryPipelineBuilder::getPipeline(std::move(pipeline)); - PullingPipelineExecutor executor(query_pipeline); - if (!executor.pull(block)) - { - block.clear(); - return; - } - - if (executor.pull(block)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Single chunk is expected from view inner query {}", view.query); + return QueryPipelineBuilder::getPipeline(std::move(pipeline)); } static void logQueryViews(std::list & views, ContextPtr context) @@ -550,14 +557,33 @@ ExecutingInnerQueryFromViewTransform::ExecutingInnerQueryFromViewTransform( { } -void ExecutingInnerQueryFromViewTransform::transform(Chunk & chunk) +void ExecutingInnerQueryFromViewTransform::onConsume(Chunk chunk) { auto block = getInputPort().getHeader().cloneWithColumns(chunk.getColumns()); - process(block, view, *views_data); - chunk.setColumns(block.getColumns(), block.rows()); + state.emplace(process(block, view, *views_data)); } +ExecutingInnerQueryFromViewTransform::GenerateResult ExecutingInnerQueryFromViewTransform::onGenerate() +{ + GenerateResult res; + if (!state.has_value()) + return res; + + res.is_done = false; + while (!res.is_done) + { + res.is_done = !state->executor.pull(res.chunk); + if (res.chunk) + break; + } + + if (res.is_done) + state.reset(); + + return res; +} + PushingToLiveViewSink::PushingToLiveViewSink(const Block & header, StorageLiveView & live_view_, StoragePtr storage_holder_, ContextPtr context_) : SinkToStorage(header) , live_view(live_view_) @@ -570,7 +596,11 @@ void PushingToLiveViewSink::consume(Chunk chunk) { Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); StorageLiveView::writeIntoLiveView(live_view, getHeader().cloneWithColumns(chunk.detachColumns()), context); - CurrentThread::updateProgressIn(local_progress); + auto * process = context->getProcessListElement(); + if (process) + process->updateProgressIn(local_progress); + ProfileEvents::increment(ProfileEvents::SelectedRows, local_progress.read_rows); + ProfileEvents::increment(ProfileEvents::SelectedBytes, local_progress.read_bytes); } @@ -589,7 +619,11 @@ void PushingToWindowViewSink::consume(Chunk chunk) Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); StorageWindowView::writeIntoWindowView( window_view, getHeader().cloneWithColumns(chunk.detachColumns()), context); - CurrentThread::updateProgressIn(local_progress); + auto * process = context->getProcessListElement(); + if (process) + process->updateProgressIn(local_progress); + 
ProfileEvents::increment(ProfileEvents::SelectedRows, local_progress.read_rows); + ProfileEvents::increment(ProfileEvents::SelectedBytes, local_progress.read_bytes); } diff --git a/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp b/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp index df3901e2eb1..ee661b39fac 100644 --- a/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp +++ b/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp @@ -27,7 +27,8 @@ TEST(Processors, PortsConnected) processors.emplace_back(std::move(source)); processors.emplace_back(std::move(sink)); - PipelineExecutor executor(processors); + QueryStatus * element = nullptr; + PipelineExecutor executor(processors, element); executor.execute(1); } @@ -51,7 +52,8 @@ TEST(Processors, PortsNotConnected) try { - PipelineExecutor executor(processors); + QueryStatus * element = nullptr; + PipelineExecutor executor(processors, element); executor.execute(1); ASSERT_TRUE(false) << "Should have thrown."; } diff --git a/src/QueryPipeline/BlockIO.h b/src/QueryPipeline/BlockIO.h index 0f05beca4a8..5918b4b27fc 100644 --- a/src/QueryPipeline/BlockIO.h +++ b/src/QueryPipeline/BlockIO.h @@ -31,13 +31,13 @@ struct BlockIO /// When it is true, don't bother sending any non-empty blocks to the out stream bool null_format = false; - /// Call these functions if you want to log the request. void onFinish() { if (finish_callback) { finish_callback(pipeline); } + pipeline.reset(); } void onException() const diff --git a/src/QueryPipeline/Chain.h b/src/QueryPipeline/Chain.h index c5fdc34cecf..60dbad10131 100644 --- a/src/QueryPipeline/Chain.h +++ b/src/QueryPipeline/Chain.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -42,6 +43,7 @@ public: void addTableLock(TableLockHolder lock) { holder.table_locks.emplace_back(std::move(lock)); } void addStorageHolder(StoragePtr storage) { holder.storage_holders.emplace_back(std::move(storage)); } void attachResources(PipelineResourcesHolder holder_) { holder = std::move(holder_); } + void addInterpreterContext(ContextPtr context) { holder.interpreter_context.emplace_back(std::move(context)); } PipelineResourcesHolder detachResources() { return std::move(holder); } void reset(); diff --git a/src/QueryPipeline/ConnectionCollector.cpp b/src/QueryPipeline/ConnectionCollector.cpp index a6a0afb68d3..c2cdd1a1133 100644 --- a/src/QueryPipeline/ConnectionCollector.cpp +++ b/src/QueryPipeline/ConnectionCollector.cpp @@ -46,7 +46,7 @@ struct AsyncDrainTask std::shared_ptr shared_connections; void operator()() const { - ConnectionCollector::drainConnections(*shared_connections); + ConnectionCollector::drainConnections(*shared_connections, /* throw_error= */ false); } // We don't have std::unique_function yet. Wrap it in shared_ptr to make the functor copyable. @@ -71,7 +71,7 @@ std::shared_ptr ConnectionCollector::enqueueConnectionCleanup( return connections; } -void ConnectionCollector::drainConnections(IConnections & connections) noexcept +void ConnectionCollector::drainConnections(IConnections & connections, bool throw_error) { bool is_drained = false; try @@ -90,6 +90,9 @@ void ConnectionCollector::drainConnections(IConnections & connections) noexcept break; default: + /// Connection should be closed in case of unknown packet, + /// since this means that the connection in some bad state. 
+ is_drained = false; throw Exception( ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", @@ -111,6 +114,9 @@ void ConnectionCollector::drainConnections(IConnections & connections) noexcept tryLogCurrentException(&Poco::Logger::get("ConnectionCollector"), __PRETTY_FUNCTION__); } } + + if (throw_error) + throw; } } diff --git a/src/QueryPipeline/ConnectionCollector.h b/src/QueryPipeline/ConnectionCollector.h index 5b6e82d000e..44482607277 100644 --- a/src/QueryPipeline/ConnectionCollector.h +++ b/src/QueryPipeline/ConnectionCollector.h @@ -17,7 +17,7 @@ public: static ConnectionCollector & init(ContextMutablePtr global_context_, size_t max_threads); static std::shared_ptr enqueueConnectionCleanup(const ConnectionPoolWithFailoverPtr & pool, std::shared_ptr connections) noexcept; - static void drainConnections(IConnections & connections) noexcept; + static void drainConnections(IConnections & connections, bool throw_error); private: explicit ConnectionCollector(ContextMutablePtr global_context_, size_t max_threads); diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 40c64046560..dba7c7cb8f7 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -560,6 +560,7 @@ QueryPipeline QueryPipelineBuilder::getPipeline(QueryPipelineBuilder builder) { QueryPipeline res(std::move(builder.pipe)); res.setNumThreads(builder.getNumThreads()); + res.setProcessListElement(builder.process_list_element); return res; } diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index 653d9a2bbf8..142e56ceb25 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -495,14 +495,26 @@ void RemoteQueryExecutor::finish(std::unique_ptr * read_context) /// Send the request to abort the execution of the request, if not already sent. tryCancel("Cancelling query because enough data has been read", read_context); - /// Try to drain connections asynchronously. - if (auto conn = ConnectionCollector::enqueueConnectionCleanup(pool, connections)) + + if (context->getSettingsRef().drain_timeout != Poco::Timespan(-1000000)) { - /// Drain connections synchronously. + auto connections_left = ConnectionCollector::enqueueConnectionCleanup(pool, connections); + if (connections_left) + { + /// Drain connections synchronously and suppress errors. + CurrentMetrics::Increment metric_increment(CurrentMetrics::ActiveSyncDrainedConnections); + ConnectionCollector::drainConnections(*connections_left, /* throw_error= */ false); + CurrentMetrics::add(CurrentMetrics::SyncDrainedConnections, 1); + } + } + else + { + /// Drain connections synchronously w/o suppressing errors. 
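The ConnectionCollector and RemoteQueryExecutor hunks above add a throw_error flag to drainConnections: background cleanup and the timed synchronous path keep swallowing errors after logging them, while the no-timeout synchronous path now rethrows. A toy model of that flag (hypothetical packet type, not the real IConnections API):

#include <iostream>
#include <queue>
#include <stdexcept>

// Read the packets left over from a finished query. The error is always
// logged; whether it is rethrown depends on who is draining: background
// cleanup swallows it, the synchronous caller still wants to hear about it.
enum class Packet { Data, Progress, EndOfStream, Unknown };

static void drain(std::queue<Packet> & packets, bool throw_error)
{
    try
    {
        while (!packets.empty())
        {
            const Packet p = packets.front();
            packets.pop();
            if (p == Packet::EndOfStream)
                return;                                    // drained cleanly
            if (p == Packet::Unknown)
                throw std::runtime_error("unknown packet while draining");
        }
    }
    catch (...)
    {
        std::cerr << "error while draining connection\n";  // always log
        if (throw_error)
            throw;                                         // only the synchronous path propagates
    }
}

int main()
{
    std::queue<Packet> q;
    q.push(Packet::Progress);
    q.push(Packet::EndOfStream);
    drain(q, /*throw_error=*/ true);                       // completes without throwing
}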
CurrentMetrics::Increment metric_increment(CurrentMetrics::ActiveSyncDrainedConnections); - ConnectionCollector::drainConnections(*conn); + ConnectionCollector::drainConnections(*connections, /* throw_error= */ true); CurrentMetrics::add(CurrentMetrics::SyncDrainedConnections, 1); } + finished = true; } diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index d5603fd2281..655bd5603de 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -227,7 +227,7 @@ private: void processMergeTreeReadTaskRequest(PartitionReadRequest request); - /// Cancell query and restart it with info about duplicated UUIDs + /// Cancel query and restart it with info about duplicate UUIDs /// only for `allow_experimental_query_deduplication`. std::variant restartQueryWithoutDuplicatedUUIDs(std::unique_ptr * read_context = nullptr); diff --git a/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp b/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp index fb10601216e..e19d2c7114b 100644 --- a/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp +++ b/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp @@ -16,7 +16,7 @@ static Block getBlockWithSize(const std::vector & columns, size_t r ColumnsWithTypeAndName cols; size_t size_of_row_in_bytes = columns.size() * sizeof(UInt64); - for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; i++) + for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; ++i) { auto column = ColumnUInt64::create(rows, 0); for (size_t j = 0; j < rows; ++j) diff --git a/src/QueryPipeline/tests/gtest_check_sorted_stream.cpp b/src/QueryPipeline/tests/gtest_check_sorted_stream.cpp index 751f7ef8635..7b30958f0c4 100644 --- a/src/QueryPipeline/tests/gtest_check_sorted_stream.cpp +++ b/src/QueryPipeline/tests/gtest_check_sorted_stream.cpp @@ -29,7 +29,7 @@ static Block getSortedBlockWithSize( { ColumnsWithTypeAndName cols; size_t size_of_row_in_bytes = columns.size() * sizeof(UInt64); - for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; i++) + for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; ++i) { auto column = ColumnUInt64::create(rows, 0); for (size_t j = 0; j < rows; ++j) @@ -47,7 +47,7 @@ static Block getUnSortedBlockWithSize(const std::vector & columns, { ColumnsWithTypeAndName cols; size_t size_of_row_in_bytes = columns.size() * sizeof(UInt64); - for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; i++) + for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; ++i) { auto column = ColumnUInt64::create(rows, 0); for (size_t j = 0; j < rows; ++j) @@ -71,7 +71,7 @@ static Block getEqualValuesBlockWithSize( { ColumnsWithTypeAndName cols; size_t size_of_row_in_bytes = columns.size() * sizeof(UInt64); - for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; i++) + for (size_t i = 0; i * sizeof(UInt64) < size_of_row_in_bytes; ++i) { auto column = ColumnUInt64::create(rows, 0); for (size_t j = 0; j < rows; ++j) diff --git a/src/Server/GRPCServer.cpp b/src/Server/GRPCServer.cpp index 096194455b1..589bdd63f41 100644 --- a/src/Server/GRPCServer.cpp +++ b/src/Server/GRPCServer.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -618,7 +619,11 @@ namespace ASTInsertQuery * insert_query = nullptr; String input_format; String input_data_delimiter; + PODArray output; String output_format; + CompressionMethod compression_method = CompressionMethod::None; + int compression_level = 0; + 
uint64_t interactive_delay = 100000; bool send_exception_with_stacktrace = true; bool input_function_is_used = false; @@ -635,8 +640,10 @@ namespace bool responder_finished = false; bool cancelled = false; - std::optional read_buffer; - std::optional write_buffer; + std::unique_ptr read_buffer; + std::unique_ptr write_buffer; + WriteBufferFromVector> * nested_write_buffer = nullptr; + WriteBuffer * compressing_write_buffer = nullptr; std::unique_ptr pipeline; std::unique_ptr pipeline_executor; std::shared_ptr output_format_processor; @@ -818,6 +825,10 @@ namespace if (output_format.empty()) output_format = query_context->getDefaultFormat(); + /// Choose compression. + compression_method = chooseCompressionMethod("", query_info.compression_type()); + compression_level = query_info.compression_level(); + /// Set callback to create and fill external tables query_context->setExternalTablesInitializer([this] (ContextPtr context) { @@ -891,7 +902,7 @@ namespace void Call::initializeBlockInputStream(const Block & header) { assert(!read_buffer); - read_buffer.emplace([this]() -> std::pair + read_buffer = std::make_unique([this]() -> std::pair { if (need_input_data_from_insert_query) { @@ -947,6 +958,8 @@ namespace return {nullptr, 0}; /// no more input data }); + read_buffer = wrapReadBufferWithCompressionMethod(std::move(read_buffer), compression_method); + assert(!pipeline); auto source = query_context->getInputFormat( input_format, *read_buffer, header, query_context->getSettings().max_insert_block_size); @@ -1030,7 +1043,10 @@ namespace /// The data will be written directly to the table. auto metadata_snapshot = storage->getInMemoryMetadataPtr(); auto sink = storage->write(ASTPtr(), metadata_snapshot, query_context); - ReadBufferFromMemory data(external_table.data().data(), external_table.data().size()); + + std::unique_ptr buf = std::make_unique(external_table.data().data(), external_table.data().size()); + buf = wrapReadBufferWithCompressionMethod(std::move(buf), chooseCompressionMethod("", external_table.compression_type())); + String format = external_table.format(); if (format.empty()) format = "TabSeparated"; @@ -1047,7 +1063,7 @@ namespace external_table_context->applySettingsChanges(settings_changes); } auto in = external_table_context->getInputFormat( - format, data, metadata_snapshot->getSampleBlock(), + format, *buf, metadata_snapshot->getSampleBlock(), external_table_context->getSettings().max_insert_block_size); QueryPipelineBuilder cur_pipeline; @@ -1101,7 +1117,18 @@ namespace if (io.pipeline.pulling()) header = io.pipeline.getHeader(); - write_buffer.emplace(*result.mutable_output()); + if (compression_method != CompressionMethod::None) + output.resize(DBMS_DEFAULT_BUFFER_SIZE); /// Must have enough space for compressed data. 
+ write_buffer = std::make_unique>>(output); + nested_write_buffer = static_cast> *>(write_buffer.get()); + if (compression_method != CompressionMethod::None) + { + write_buffer = wrapWriteBufferWithCompressionMethod(std::move(write_buffer), compression_method, compression_level); + compressing_write_buffer = write_buffer.get(); + } + + auto has_output = [&] { return (nested_write_buffer->position() != output.data()) || (compressing_write_buffer && compressing_write_buffer->offset()); }; + output_format_processor = query_context->getOutputFormat(output_format, *write_buffer, header); Stopwatch after_send_progress; @@ -1143,8 +1170,7 @@ namespace addLogsToResult(); - bool has_output = write_buffer->offset(); - if (has_output || result.has_progress() || result.logs_size()) + if (has_output() || result.has_progress() || result.logs_size()) sendResult(); throwIfFailedToSendResult(); @@ -1164,13 +1190,11 @@ namespace auto executor = std::make_shared(io.pipeline); auto callback = [&]() -> bool { - throwIfFailedToSendResult(); addProgressToResult(); addLogsToResult(); - bool has_output = write_buffer->offset(); - if (has_output || result.has_progress() || result.logs_size()) + if (has_output() || result.has_progress() || result.logs_size()) sendResult(); throwIfFailedToSendResult(); @@ -1260,6 +1284,8 @@ namespace /// immediately after it receives our final result, and it's prohibited to have /// two queries executed at the same time with the same query ID or session ID. io.process_list_entry.reset(); + if (query_context) + query_context->setProcessListElement(nullptr); if (session) session->releaseSessionID(); } @@ -1272,6 +1298,8 @@ namespace output_format_processor.reset(); read_buffer.reset(); write_buffer.reset(); + nested_write_buffer = nullptr; + compressing_write_buffer = nullptr; io = {}; query_scope.reset(); query_context.reset(); @@ -1390,10 +1418,17 @@ namespace if (!totals) return; - WriteBufferFromString buf{*result.mutable_totals()}; - auto format = query_context->getOutputFormat(output_format, buf, totals); + PODArray memory; + if (compression_method != CompressionMethod::None) + memory.resize(DBMS_DEFAULT_BUFFER_SIZE); /// Must have enough space for compressed data. + std::unique_ptr buf = std::make_unique>>(memory); + buf = wrapWriteBufferWithCompressionMethod(std::move(buf), compression_method, compression_level); + auto format = query_context->getOutputFormat(output_format, *buf, totals); format->write(materializeBlock(totals)); format->finalize(); + buf->finalize(); + + result.mutable_totals()->assign(memory.data(), memory.size()); } void Call::addExtremesToResult(const Block & extremes) @@ -1401,10 +1436,17 @@ namespace if (!extremes) return; - WriteBufferFromString buf{*result.mutable_extremes()}; - auto format = query_context->getOutputFormat(output_format, buf, extremes); + PODArray memory; + if (compression_method != CompressionMethod::None) + memory.resize(DBMS_DEFAULT_BUFFER_SIZE); /// Must have enough space for compressed data. 
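
In the GRPCServer changes above, formatted output is no longer written straight into the result message: it is accumulated in a reusable in-memory buffer, optionally run through a compressing writer, and only then copied into the corresponding result field. The stand-alone sketch below shows that accumulate-compress-assign pattern under the assumption of a passthrough compress() placeholder; it is not ClickHouse's wrapWriteBufferWithCompressionMethod.

#include <iostream>
#include <string>
#include <vector>

// Placeholder for a real codec (gzip/zstd/...); it just copies the bytes and
// records the level, so the control flow stays runnable without extra deps.
std::vector<char> compress(const std::vector<char> & plain, int level)
{
    std::cerr << "compressing " << plain.size() << " bytes at level " << level << '\n';
    return plain;
}

struct Result
{
    std::string output;   // stands in for result.mutable_output()
};

int main()
{
    const std::string compression_type = "gzip"; // taken from the query, "" means none
    const int compression_level = 6;

    std::vector<char> buffer;                    // reused between blocks of data

    // Format one block of data into the buffer...
    const std::string block = "1\tabc\n2\tdef\n";
    buffer.assign(block.begin(), block.end());

    // ...then compress (if requested) and hand the bytes to the result message.
    Result result;
    if (!compression_type.empty())
    {
        auto packed = compress(buffer, compression_level);
        result.output.assign(packed.data(), packed.size());
    }
    else
        result.output.assign(buffer.data(), buffer.size());

    buffer.clear();                              // ready for the next block
    std::cout << "sending " << result.output.size() << " bytes\n";
}
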
+ std::unique_ptr buf = std::make_unique>>(memory); + buf = wrapWriteBufferWithCompressionMethod(std::move(buf), compression_method, compression_level); + auto format = query_context->getOutputFormat(output_format, *buf, extremes); format->write(materializeBlock(extremes)); format->finalize(); + buf->finalize(); + + result.mutable_extremes()->assign(memory.data(), memory.size()); } void Call::addProfileInfoToResult(const ProfileInfo & info) @@ -1475,6 +1517,38 @@ namespace if (!send_final_message && !isOutputStreaming(call_type)) return; + /// Copy output to `result.output`, with optional compressing. + if (write_buffer) + { + size_t output_size; + if (send_final_message) + { + if (compressing_write_buffer) + LOG_DEBUG(log, "Compressing final {} bytes", compressing_write_buffer->offset()); + write_buffer->finalize(); + output_size = output.size(); + } + else + { + if (compressing_write_buffer && compressing_write_buffer->offset()) + { + LOG_DEBUG(log, "Compressing {} bytes", compressing_write_buffer->offset()); + compressing_write_buffer->sync(); + } + output_size = nested_write_buffer->position() - output.data(); + } + + if (output_size) + { + result.mutable_output()->assign(output.data(), output_size); + nested_write_buffer->restart(); /// We're going to reuse the same buffer again for next block of data. + } + } + + if (!send_final_message && result.output().empty() && result.totals().empty() && result.extremes().empty() && !result.logs_size() + && !result.has_progress() && !result.has_stats() && !result.has_exception() && !result.cancelled()) + return; /// Nothing to send. + /// Wait for previous write to finish. /// (gRPC doesn't allow to start sending another result while the previous is still being sending.) if (sending_result.get()) @@ -1488,9 +1562,6 @@ namespace /// Start sending the result. LOG_DEBUG(log, "Sending {} result to the client: {}", (send_final_message ? "final" : "intermediate"), getResultDescription(result)); - if (write_buffer) - write_buffer->finalize(); - sending_result.set(true); auto callback = [this](bool ok) { @@ -1511,8 +1582,6 @@ namespace /// gRPC has already retrieved all data from `result`, so we don't have to keep it. result.Clear(); - if (write_buffer) - write_buffer->restart(); if (send_final_message) { diff --git a/src/Server/GRPCServer.h b/src/Server/GRPCServer.h index 25c3813c11d..e2b48f1c16b 100644 --- a/src/Server/GRPCServer.h +++ b/src/Server/GRPCServer.h @@ -4,6 +4,7 @@ #if USE_GRPC #include +#include #include "clickhouse_grpc.grpc.pb.h" namespace Poco { class Logger; } @@ -30,6 +31,9 @@ public: /// Stops the server. No new connections will be accepted. void stop(); + /// Returns the port this server is listening to. + UInt16 portNumber() const { return address_to_listen.port(); } + /// Returns the number of currently handled connections. 
size_t currentConnections() const; diff --git a/src/Server/HTTP/HTMLForm.cpp b/src/Server/HTTP/HTMLForm.cpp index 86e08f3c8e7..d9d897d20c4 100644 --- a/src/Server/HTTP/HTMLForm.cpp +++ b/src/Server/HTTP/HTMLForm.cpp @@ -183,8 +183,8 @@ void HTMLForm::readMultipart(ReadBuffer & in_, PartHandler & handler) size_t fields = 0; MultipartReadBuffer in(in_, boundary); - /// Assume there is at least one part - in.skipToNextBoundary(); + if (!in.skipToNextBoundary()) + throw Poco::Net::HTMLFormException("No boundary line found"); /// Read each part until next boundary (or last boundary) while (!in.eof()) @@ -241,7 +241,9 @@ HTMLForm::MultipartReadBuffer::MultipartReadBuffer(ReadBuffer & in_, const std:: bool HTMLForm::MultipartReadBuffer::skipToNextBoundary() { - assert(working_buffer.empty() || eof()); + if (in.eof()) + return false; + assert(boundary_hit); boundary_hit = false; @@ -257,7 +259,7 @@ bool HTMLForm::MultipartReadBuffer::skipToNextBoundary() } } - throw Poco::Net::HTMLFormException("No boundary line found"); + return false; } std::string HTMLForm::MultipartReadBuffer::readLine(bool append_crlf) diff --git a/src/Server/HTTP/HTTPServer.cpp b/src/Server/HTTP/HTTPServer.cpp index 42e6467d0af..2e91fad1c0f 100644 --- a/src/Server/HTTP/HTTPServer.cpp +++ b/src/Server/HTTP/HTTPServer.cpp @@ -5,31 +5,13 @@ namespace DB { -HTTPServer::HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory_, - UInt16 port_number, - Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), port_number, params), factory(factory_) -{ -} - -HTTPServer::HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory_, - const Poco::Net::ServerSocket & socket, - Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), socket, params), factory(factory_) -{ -} - HTTPServer::HTTPServer( ContextPtr context, HTTPRequestHandlerFactoryPtr factory_, Poco::ThreadPool & thread_pool, - const Poco::Net::ServerSocket & socket, + Poco::Net::ServerSocket & socket_, Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), thread_pool, socket, params), factory(factory_) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), thread_pool, socket_, params), factory(factory_) { } diff --git a/src/Server/HTTP/HTTPServer.h b/src/Server/HTTP/HTTPServer.h index 3518fd66d20..07ad54d267f 100644 --- a/src/Server/HTTP/HTTPServer.h +++ b/src/Server/HTTP/HTTPServer.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include -#include #include @@ -13,26 +13,14 @@ namespace DB class Context; -class HTTPServer : public Poco::Net::TCPServer +class HTTPServer : public TCPServer { public: explicit HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory, - UInt16 port_number = 80, - Poco::Net::HTTPServerParams::Ptr params = new Poco::Net::HTTPServerParams); - - HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory, - const Poco::Net::ServerSocket & socket, - Poco::Net::HTTPServerParams::Ptr params); - - HTTPServer( ContextPtr context, HTTPRequestHandlerFactoryPtr factory, Poco::ThreadPool & thread_pool, - const Poco::Net::ServerSocket & socket, + Poco::Net::ServerSocket & socket, Poco::Net::HTTPServerParams::Ptr params); ~HTTPServer() override; diff --git a/src/Server/HTTP/HTTPServerConnection.cpp b/src/Server/HTTP/HTTPServerConnection.cpp index de81da20ead..7020b8e9a23 100644 --- 
a/src/Server/HTTP/HTTPServerConnection.cpp +++ b/src/Server/HTTP/HTTPServerConnection.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -7,10 +8,11 @@ namespace DB HTTPServerConnection::HTTPServerConnection( ContextPtr context_, + TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket, Poco::Net::HTTPServerParams::Ptr params_, HTTPRequestHandlerFactoryPtr factory_) - : TCPServerConnection(socket), context(Context::createCopy(context_)), params(params_), factory(factory_), stopped(false) + : TCPServerConnection(socket), context(Context::createCopy(context_)), tcp_server(tcp_server_), params(params_), factory(factory_), stopped(false) { poco_check_ptr(factory); } @@ -20,12 +22,12 @@ void HTTPServerConnection::run() std::string server = params->getSoftwareVersion(); Poco::Net::HTTPServerSession session(socket(), params); - while (!stopped && session.hasMoreRequests()) + while (!stopped && tcp_server.isOpen() && session.hasMoreRequests()) { try { std::unique_lock lock(mutex); - if (!stopped) + if (!stopped && tcp_server.isOpen()) { HTTPServerResponse response(session); HTTPServerRequest request(context, response, session); @@ -48,6 +50,11 @@ void HTTPServerConnection::run() response.set("Server", server); try { + if (!tcp_server.isOpen()) + { + sendErrorResponse(session, Poco::Net::HTTPResponse::HTTP_SERVICE_UNAVAILABLE); + break; + } std::unique_ptr handler(factory->createRequestHandler(request)); if (handler) diff --git a/src/Server/HTTP/HTTPServerConnection.h b/src/Server/HTTP/HTTPServerConnection.h index 1c7ae6cd2b7..db3969f6ffb 100644 --- a/src/Server/HTTP/HTTPServerConnection.h +++ b/src/Server/HTTP/HTTPServerConnection.h @@ -9,12 +9,14 @@ namespace DB { +class TCPServer; class HTTPServerConnection : public Poco::Net::TCPServerConnection { public: HTTPServerConnection( ContextPtr context, + TCPServer & tcp_server, const Poco::Net::StreamSocket & socket, Poco::Net::HTTPServerParams::Ptr params, HTTPRequestHandlerFactoryPtr factory); @@ -26,6 +28,7 @@ protected: private: ContextPtr context; + TCPServer & tcp_server; Poco::Net::HTTPServerParams::Ptr params; HTTPRequestHandlerFactoryPtr factory; bool stopped; diff --git a/src/Server/HTTP/HTTPServerConnectionFactory.cpp b/src/Server/HTTP/HTTPServerConnectionFactory.cpp index 0e4fb6cfcec..008da222c79 100644 --- a/src/Server/HTTP/HTTPServerConnectionFactory.cpp +++ b/src/Server/HTTP/HTTPServerConnectionFactory.cpp @@ -11,9 +11,9 @@ HTTPServerConnectionFactory::HTTPServerConnectionFactory( poco_check_ptr(factory); } -Poco::Net::TCPServerConnection * HTTPServerConnectionFactory::createConnection(const Poco::Net::StreamSocket & socket) +Poco::Net::TCPServerConnection * HTTPServerConnectionFactory::createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) { - return new HTTPServerConnection(context, socket, params, factory); + return new HTTPServerConnection(context, tcp_server, socket, params, factory); } } diff --git a/src/Server/HTTP/HTTPServerConnectionFactory.h b/src/Server/HTTP/HTTPServerConnectionFactory.h index 3f11eca0f69..a19dc6d4d5c 100644 --- a/src/Server/HTTP/HTTPServerConnectionFactory.h +++ b/src/Server/HTTP/HTTPServerConnectionFactory.h @@ -2,19 +2,19 @@ #include #include +#include #include -#include namespace DB { -class HTTPServerConnectionFactory : public Poco::Net::TCPServerConnectionFactory +class HTTPServerConnectionFactory : public TCPServerConnectionFactory { public: HTTPServerConnectionFactory(ContextPtr context, Poco::Net::HTTPServerParams::Ptr params, HTTPRequestHandlerFactoryPtr factory); 
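
A recurring change in these handler/factory hunks is that createConnection() now receives a reference to the owning TCPServer, so long-lived connection loops can notice that the server has been asked to stop. A condensed sketch of that wiring, using a plain atomic flag instead of the real Poco/ClickHouse classes (Server, Connection and ConnectionFactory below are illustrative stand-ins):

#include <atomic>
#include <iostream>
#include <memory>

// Minimal stand-ins for the server and a protocol handler; the point is only
// that the factory forwards a server reference into every new connection.
class Server
{
public:
    bool isOpen() const { return open.load(); }
    void stop() { open.store(false); }   // existing connections see this and wind down

private:
    std::atomic<bool> open{true};
};

class Connection
{
public:
    explicit Connection(Server & server_) : server(server_) {}

    void run()
    {
        // Handle requests only while the owning server is still open.
        while (server.isOpen() && hasMoreRequests())
            std::cout << "handling one request\n";
    }

private:
    bool hasMoreRequests() { return --remaining >= 0; }

    Server & server;
    int remaining = 3;
};

class ConnectionFactory
{
public:
    std::unique_ptr<Connection> createConnection(Server & server) const
    {
        return std::make_unique<Connection>(server);
    }
};

int main()
{
    Server server;
    ConnectionFactory factory;
    auto conn = factory.createConnection(server);
    conn->run();        // would stop early if server.stop() were called concurrently
}
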
- Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override; + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) override; private: ContextPtr context; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 384799c4687..673edfb6719 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -719,9 +719,16 @@ void HTTPHandler::processQuery( context->checkSettingsConstraints(settings_changes); context->applySettingsChanges(settings_changes); - // Set the query id supplied by the user, if any, and also update the OpenTelemetry fields. + /// Set the query id supplied by the user, if any, and also update the OpenTelemetry fields. context->setCurrentQueryId(params.get("query_id", request.get("X-ClickHouse-Query-Id", ""))); + /// Initialize query scope, once query_id is initialized. + /// (To track as much allocations as possible) + query_scope.emplace(context); + + /// NOTE: this may create pretty huge allocations that will not be accounted in trace_log, + /// because memory_profiler_sample_probability/memory_profiler_step are not applied yet, + /// they will be applied in ProcessList::insert() from executeQuery() itself. const auto & query = getQuery(request, params, context); std::unique_ptr in_param = std::make_unique(query); in = has_external_data ? std::move(in_param) : std::make_unique(*in_param, *in_post_maybe_compressed); @@ -769,7 +776,7 @@ void HTTPHandler::processQuery( if (settings.readonly > 0 && settings.cancel_http_readonly_queries_on_client_close) { - append_callback([context = context, &request](const Progress &) + append_callback([&context, &request](const Progress &) { /// Assume that at the point this method is called no one is reading data from the socket any more: /// should be true for read-only queries. 
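
The HTTPHandler hunks move query_scope.emplace(context) up to right after the query id is set (the hunk below removes it from its old position), so that allocations made while reading the query text are already attributed to the query. The idea is an RAII scope; a hypothetical stand-alone version, with a QueryScope type invented purely for illustration, might look like:

#include <iostream>
#include <string>

// Hypothetical allocation-tracking scope: attach to a query id on construction,
// detach on destruction. Anything allocated in between is charged to that query.
class QueryScope
{
public:
    explicit QueryScope(std::string query_id_) : query_id(std::move(query_id_))
    {
        std::cout << "tracking allocations for query " << query_id << '\n';
    }
    ~QueryScope()
    {
        std::cout << "stopped tracking query " << query_id << '\n';
    }

    QueryScope(const QueryScope &) = delete;
    QueryScope & operator=(const QueryScope &) = delete;

private:
    std::string query_id;
};

void processQuery(const std::string & query_id)
{
    // Create the scope as early as possible, right after the id is known,
    // so even reading/parsing the (possibly large) query body is accounted.
    QueryScope scope(query_id);

    std::string query_text = "SELECT 1";   // stands in for getQuery(...)
    std::cout << "executing: " << query_text << '\n';
}   // scope ends here, together with the query

int main()
{
    processQuery("q-123");
}
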
@@ -780,8 +787,6 @@ void HTTPHandler::processQuery( customizeContext(request, context); - query_scope.emplace(context); - executeQuery(*in, *used_output.out_maybe_delayed_and_compressed, /* allow_into_outfile = */ false, context, [&response] (const String & current_query_id, const String & content_type, const String & format, const String & timezone) { diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index 67abd6db13a..0c5d7d93689 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -557,6 +557,8 @@ void KeeperTCPHandler::updateStats(Coordination::ZooKeeperResponsePtr & response std::lock_guard lock(conn_stats_mutex); conn_stats.updateLatency(elapsed); } + + operations.erase(response->xid); keeper_dispatcher->updateKeeperStatLatency(elapsed); last_op.set(std::make_unique(LastOp{ diff --git a/src/Server/KeeperTCPHandler.h b/src/Server/KeeperTCPHandler.h index fb6541d1f53..f98b269b8be 100644 --- a/src/Server/KeeperTCPHandler.h +++ b/src/Server/KeeperTCPHandler.h @@ -93,7 +93,7 @@ private: Poco::Timestamp established; - using Operations = std::map; + using Operations = std::unordered_map; Operations operations; LastOpMultiVersion last_op; diff --git a/src/Server/KeeperTCPHandlerFactory.h b/src/Server/KeeperTCPHandlerFactory.h index 67bb3dab268..58dc73d7c27 100644 --- a/src/Server/KeeperTCPHandlerFactory.h +++ b/src/Server/KeeperTCPHandlerFactory.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include @@ -10,7 +10,7 @@ namespace DB { -class KeeperTCPHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class KeeperTCPHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -29,7 +29,7 @@ public: { } - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer &) override { try { diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index deebc073ad5..2836ee05c30 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -62,10 +63,11 @@ static String showTableStatusReplacementQuery(const String & query); static String killConnectionIdReplacementQuery(const String & query); static String selectLimitReplacementQuery(const String & query); -MySQLHandler::MySQLHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, +MySQLHandler::MySQLHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_) : Poco::Net::TCPServerConnection(socket_) , server(server_) + , tcp_server(tcp_server_) , log(&Poco::Logger::get("MySQLHandler")) , connection_id(connection_id_) , auth_plugin(new MySQLProtocol::Authentication::Native41()) @@ -138,11 +140,14 @@ void MySQLHandler::run() OKPacket ok_packet(0, handshake_response.capability_flags, 0, 0, 0); packet_endpoint->sendPacket(ok_packet, true); - while (true) + while (tcp_server.isOpen()) { packet_endpoint->resetSequenceId(); MySQLPacketPayloadReadBuffer payload = packet_endpoint->getPayload(); + while (!in->poll(1000000)) + if (!tcp_server.isOpen()) + return; char command = 0; payload.readStrict(command); @@ -152,6 +157,8 @@ void MySQLHandler::run() LOG_DEBUG(log, "Received command: {}. 
Connection id: {}.", static_cast(static_cast(command)), connection_id); + if (!tcp_server.isOpen()) + return; try { switch (command) @@ -369,8 +376,8 @@ void MySQLHandler::finishHandshakeSSL( } #if USE_SSL -MySQLHandlerSSL::MySQLHandlerSSL(IServer & server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_) - : MySQLHandler(server_, socket_, ssl_enabled, connection_id_) +MySQLHandlerSSL::MySQLHandlerSSL(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_) + : MySQLHandler(server_, tcp_server_, socket_, ssl_enabled, connection_id_) , public_key(public_key_) , private_key(private_key_) {} diff --git a/src/Server/MySQLHandler.h b/src/Server/MySQLHandler.h index 7ef212bf36e..3af5f7a0eb2 100644 --- a/src/Server/MySQLHandler.h +++ b/src/Server/MySQLHandler.h @@ -24,11 +24,14 @@ namespace CurrentMetrics namespace DB { +class ReadBufferFromPocoSocket; +class TCPServer; + /// Handler for MySQL wire protocol connections. Allows to connect to ClickHouse using MySQL client. class MySQLHandler : public Poco::Net::TCPServerConnection { public: - MySQLHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_); + MySQLHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_); void run() final; @@ -52,6 +55,7 @@ protected: virtual void finishHandshakeSSL(size_t packet_size, char * buf, size_t pos, std::function read_bytes, MySQLProtocol::ConnectionPhase::HandshakeResponse & packet); IServer & server; + TCPServer & tcp_server; Poco::Logger * log; UInt64 connection_id = 0; @@ -68,7 +72,7 @@ protected: Replacements replacements; std::unique_ptr auth_plugin; - std::shared_ptr in; + std::shared_ptr in; std::shared_ptr out; bool secure_connection = false; }; @@ -77,7 +81,7 @@ protected: class MySQLHandlerSSL : public MySQLHandler { public: - MySQLHandlerSSL(IServer & server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_); + MySQLHandlerSSL(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_); private: void authPluginSSL() override; diff --git a/src/Server/MySQLHandlerFactory.cpp b/src/Server/MySQLHandlerFactory.cpp index 7a0bfd8ab09..f7bb073e275 100644 --- a/src/Server/MySQLHandlerFactory.cpp +++ b/src/Server/MySQLHandlerFactory.cpp @@ -118,14 +118,14 @@ void MySQLHandlerFactory::generateRSAKeys() } #endif -Poco::Net::TCPServerConnection * MySQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket) +Poco::Net::TCPServerConnection * MySQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) { size_t connection_id = last_connection_id++; LOG_TRACE(log, "MySQL connection. Id: {}. 
Address: {}", connection_id, socket.peerAddress().toString()); #if USE_SSL - return new MySQLHandlerSSL(server, socket, ssl_enabled, connection_id, *public_key, *private_key); + return new MySQLHandlerSSL(server, tcp_server, socket, ssl_enabled, connection_id, *public_key, *private_key); #else - return new MySQLHandler(server, socket, ssl_enabled, connection_id); + return new MySQLHandler(server, tcp_server, socket, ssl_enabled, connection_id); #endif } diff --git a/src/Server/MySQLHandlerFactory.h b/src/Server/MySQLHandlerFactory.h index 106fdfdf341..25f1af85273 100644 --- a/src/Server/MySQLHandlerFactory.h +++ b/src/Server/MySQLHandlerFactory.h @@ -1,9 +1,9 @@ #pragma once -#include #include #include #include +#include #include @@ -13,8 +13,9 @@ namespace DB { +class TCPServer; -class MySQLHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class MySQLHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -43,7 +44,7 @@ public: void generateRSAKeys(); - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override; + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) override; }; } diff --git a/src/Server/PostgreSQLHandler.cpp b/src/Server/PostgreSQLHandler.cpp index fee4ace3452..9808b538280 100644 --- a/src/Server/PostgreSQLHandler.cpp +++ b/src/Server/PostgreSQLHandler.cpp @@ -6,6 +6,7 @@ #include #include "PostgreSQLHandler.h" #include +#include #include #include #include @@ -28,11 +29,13 @@ namespace ErrorCodes PostgreSQLHandler::PostgreSQLHandler( const Poco::Net::StreamSocket & socket_, IServer & server_, + TCPServer & tcp_server_, bool ssl_enabled_, Int32 connection_id_, std::vector> & auth_methods_) : Poco::Net::TCPServerConnection(socket_) , server(server_) + , tcp_server(tcp_server_) , ssl_enabled(ssl_enabled_) , connection_id(connection_id_) , authentication_manager(auth_methods_) @@ -60,11 +63,18 @@ void PostgreSQLHandler::run() if (!startup()) return; - while (true) + while (tcp_server.isOpen()) { message_transport->send(PostgreSQLProtocol::Messaging::ReadyForQuery(), true); + + constexpr size_t connection_check_timeout = 1; // 1 second + while (!in->poll(1000000 * connection_check_timeout)) + if (!tcp_server.isOpen()) + return; PostgreSQLProtocol::Messaging::FrontMessageType message_type = message_transport->receiveMessageType(); + if (!tcp_server.isOpen()) + return; switch (message_type) { case PostgreSQLProtocol::Messaging::FrontMessageType::QUERY: diff --git a/src/Server/PostgreSQLHandler.h b/src/Server/PostgreSQLHandler.h index ded9616296a..4fd08cc2606 100644 --- a/src/Server/PostgreSQLHandler.h +++ b/src/Server/PostgreSQLHandler.h @@ -1,15 +1,12 @@ #pragma once #include +#include #include #include #include #include "IServer.h" -#if !defined(ARCADIA_BUILD) -# include -#endif - #if USE_SSL # include #endif @@ -21,8 +18,9 @@ namespace CurrentMetrics namespace DB { - +class ReadBufferFromPocoSocket; class Session; +class TCPServer; /** PostgreSQL wire protocol implementation. 
* For more info see https://www.postgresql.org/docs/current/protocol.html @@ -33,6 +31,7 @@ public: PostgreSQLHandler( const Poco::Net::StreamSocket & socket_, IServer & server_, + TCPServer & tcp_server_, bool ssl_enabled_, Int32 connection_id_, std::vector> & auth_methods_); @@ -43,12 +42,13 @@ private: Poco::Logger * log = &Poco::Logger::get("PostgreSQLHandler"); IServer & server; + TCPServer & tcp_server; std::unique_ptr session; bool ssl_enabled = false; Int32 connection_id = 0; Int32 secret_key = 0; - std::shared_ptr in; + std::shared_ptr in; std::shared_ptr out; std::shared_ptr message_transport; diff --git a/src/Server/PostgreSQLHandlerFactory.cpp b/src/Server/PostgreSQLHandlerFactory.cpp index 1158cf5835e..6f2124861e7 100644 --- a/src/Server/PostgreSQLHandlerFactory.cpp +++ b/src/Server/PostgreSQLHandlerFactory.cpp @@ -1,5 +1,4 @@ #include "PostgreSQLHandlerFactory.h" -#include #include #include @@ -17,11 +16,11 @@ PostgreSQLHandlerFactory::PostgreSQLHandlerFactory(IServer & server_) }; } -Poco::Net::TCPServerConnection * PostgreSQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket) +Poco::Net::TCPServerConnection * PostgreSQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) { Int32 connection_id = last_connection_id++; LOG_TRACE(log, "PostgreSQL connection. Id: {}. Address: {}", connection_id, socket.peerAddress().toString()); - return new PostgreSQLHandler(socket, server, ssl_enabled, connection_id, auth_methods); + return new PostgreSQLHandler(socket, server, tcp_server, ssl_enabled, connection_id, auth_methods); } } diff --git a/src/Server/PostgreSQLHandlerFactory.h b/src/Server/PostgreSQLHandlerFactory.h index 9103cbaad90..e9241da6f0e 100644 --- a/src/Server/PostgreSQLHandlerFactory.h +++ b/src/Server/PostgreSQLHandlerFactory.h @@ -1,19 +1,16 @@ #pragma once -#include #include #include #include +#include #include - -#if !defined(ARCADIA_BUILD) -# include -#endif +#include namespace DB { -class PostgreSQLHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class PostgreSQLHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -31,6 +28,6 @@ private: public: explicit PostgreSQLHandlerFactory(IServer & server_); - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override; + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & server) override; }; } diff --git a/src/Server/ProtocolServerAdapter.cpp b/src/Server/ProtocolServerAdapter.cpp index 6ec1ec572f7..b41ad2376f1 100644 --- a/src/Server/ProtocolServerAdapter.cpp +++ b/src/Server/ProtocolServerAdapter.cpp @@ -1,5 +1,5 @@ #include -#include +#include #if USE_GRPC #include @@ -11,20 +11,29 @@ namespace DB class ProtocolServerAdapter::TCPServerAdapterImpl : public Impl { public: - explicit TCPServerAdapterImpl(std::unique_ptr tcp_server_) : tcp_server(std::move(tcp_server_)) {} + explicit TCPServerAdapterImpl(std::unique_ptr tcp_server_) : tcp_server(std::move(tcp_server_)) {} ~TCPServerAdapterImpl() override = default; void start() override { tcp_server->start(); } void stop() override { tcp_server->stop(); } + bool isStopping() const override { return !tcp_server->isOpen(); } + UInt16 portNumber() const override { return tcp_server->portNumber(); } size_t currentConnections() const override { return tcp_server->currentConnections(); } size_t currentThreads() const override { return tcp_server->currentThreads(); } private: - 
std::unique_ptr tcp_server; + std::unique_ptr tcp_server; }; -ProtocolServerAdapter::ProtocolServerAdapter(const char * port_name_, std::unique_ptr tcp_server_) - : port_name(port_name_), impl(std::make_unique(std::move(tcp_server_))) +ProtocolServerAdapter::ProtocolServerAdapter( + const std::string & listen_host_, + const char * port_name_, + const std::string & description_, + std::unique_ptr tcp_server_) + : listen_host(listen_host_) + , port_name(port_name_) + , description(description_) + , impl(std::make_unique(std::move(tcp_server_))) { } @@ -36,16 +45,30 @@ public: ~GRPCServerAdapterImpl() override = default; void start() override { grpc_server->start(); } - void stop() override { grpc_server->stop(); } + void stop() override + { + is_stopping = true; + grpc_server->stop(); + } + bool isStopping() const override { return is_stopping; } + UInt16 portNumber() const override { return grpc_server->portNumber(); } size_t currentConnections() const override { return grpc_server->currentConnections(); } size_t currentThreads() const override { return grpc_server->currentThreads(); } private: std::unique_ptr grpc_server; + bool is_stopping = false; }; -ProtocolServerAdapter::ProtocolServerAdapter(const char * port_name_, std::unique_ptr grpc_server_) - : port_name(port_name_), impl(std::make_unique(std::move(grpc_server_))) +ProtocolServerAdapter::ProtocolServerAdapter( + const std::string & listen_host_, + const char * port_name_, + const std::string & description_, + std::unique_ptr grpc_server_) + : listen_host(listen_host_) + , port_name(port_name_) + , description(description_) + , impl(std::make_unique(std::move(grpc_server_))) { } #endif diff --git a/src/Server/ProtocolServerAdapter.h b/src/Server/ProtocolServerAdapter.h index 04c46b53356..9b3b1af0301 100644 --- a/src/Server/ProtocolServerAdapter.h +++ b/src/Server/ProtocolServerAdapter.h @@ -2,14 +2,14 @@ #include +#include #include #include -namespace Poco::Net { class TCPServer; } - namespace DB { class GRPCServer; +class TCPServer; /// Provides an unified interface to access a protocol implementing server /// no matter what type it has (HTTPServer, TCPServer, MySQLServer, GRPCServer, ...). @@ -19,10 +19,10 @@ class ProtocolServerAdapter public: ProtocolServerAdapter(ProtocolServerAdapter && src) = default; ProtocolServerAdapter & operator =(ProtocolServerAdapter && src) = default; - ProtocolServerAdapter(const char * port_name_, std::unique_ptr tcp_server_); + ProtocolServerAdapter(const std::string & listen_host_, const char * port_name_, const std::string & description_, std::unique_ptr tcp_server_); #if USE_GRPC - ProtocolServerAdapter(const char * port_name_, std::unique_ptr grpc_server_); + ProtocolServerAdapter(const std::string & listen_host_, const char * port_name_, const std::string & description_, std::unique_ptr grpc_server_); #endif /// Starts the server. A new thread will be created that waits for and accepts incoming connections. @@ -31,14 +31,23 @@ public: /// Stops the server. No new connections will be accepted. void stop() { impl->stop(); } + bool isStopping() const { return impl->isStopping(); } + /// Returns the number of currently handled connections. size_t currentConnections() const { return impl->currentConnections(); } /// Returns the number of current threads. size_t currentThreads() const { return impl->currentThreads(); } + /// Returns the port this server is listening to. 
+ UInt16 portNumber() const { return impl->portNumber(); } + + const std::string & getListenHost() const { return listen_host; } + const std::string & getPortName() const { return port_name; } + const std::string & getDescription() const { return description; } + private: class Impl { @@ -46,13 +55,17 @@ private: virtual ~Impl() {} virtual void start() = 0; virtual void stop() = 0; + virtual bool isStopping() const = 0; + virtual UInt16 portNumber() const = 0; virtual size_t currentConnections() const = 0; virtual size_t currentThreads() const = 0; }; class TCPServerAdapterImpl; class GRPCServerAdapterImpl; + std::string listen_host; std::string port_name; + std::string description; std::unique_ptr impl; }; diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index cdf1838c06b..6b4f77dd7d0 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -81,9 +82,10 @@ namespace ErrorCodes extern const int UNKNOWN_PROTOCOL; } -TCPHandler::TCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_) +TCPHandler::TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_) : Poco::Net::TCPServerConnection(socket_) , server(server_) + , tcp_server(tcp_server_) , parse_proxy_protocol(parse_proxy_protocol_) , log(&Poco::Logger::get("TCPHandler")) , server_display_name(std::move(server_display_name_)) @@ -172,13 +174,13 @@ void TCPHandler::runImpl() throw; } - while (true) + while (tcp_server.isOpen()) { /// We are waiting for a packet from the client. Thus, every `poll_interval` seconds check whether we need to shut down. { Stopwatch idle_time; UInt64 timeout_ms = std::min(poll_interval, idle_connection_timeout) * 1000000; - while (!server.isCancelled() && !static_cast(*in).poll(timeout_ms)) + while (tcp_server.isOpen() && !server.isCancelled() && !static_cast(*in).poll(timeout_ms)) { if (idle_time.elapsedSeconds() > idle_connection_timeout) { @@ -189,7 +191,7 @@ void TCPHandler::runImpl() } /// If we need to shut down, or client disconnects. - if (server.isCancelled() || in->eof()) + if (!tcp_server.isOpen() || server.isCancelled() || in->eof()) break; Stopwatch watch; @@ -233,8 +235,6 @@ void TCPHandler::runImpl() /// NOTE: these settings are applied only for current connection (not for distributed tables' connections) state.timeout_setter = std::make_unique(socket(), receive_timeout, send_timeout); - std::mutex fatal_error_mutex; - /// Should we send internal logs to client? const auto client_logs_level = query_context->getSettingsRef().send_logs_level; if (client_tcp_protocol_version >= DBMS_MIN_REVISION_WITH_SERVER_LOGS @@ -243,7 +243,7 @@ void TCPHandler::runImpl() state.logs_queue = std::make_shared(); state.logs_queue->max_priority = Poco::Logger::parseLevel(client_logs_level.toString()); CurrentThread::attachInternalTextLogsQueue(state.logs_queue, client_logs_level); - CurrentThread::setFatalErrorCallback([this, &fatal_error_mutex] + CurrentThread::setFatalErrorCallback([this] { std::lock_guard lock(fatal_error_mutex); sendLogs(); @@ -351,7 +351,7 @@ void TCPHandler::runImpl() /// Should not check for cancel in case of input. 
if (!state.need_receive_data_for_input) { - auto callback = [this, &fatal_error_mutex]() + auto callback = [this]() { std::lock_guard lock(fatal_error_mutex); @@ -949,28 +949,27 @@ void TCPHandler::sendProfileEvents() ThreadIdToCountersSnapshot new_snapshots; ProfileEventsSnapshot group_snapshot; { - std::lock_guard guard(thread_group->mutex); - snapshots.reserve(thread_group->threads.size()); - for (auto * thread : thread_group->threads) + auto stats = thread_group->getProfileEventsCountersAndMemoryForThreads(); + snapshots.reserve(stats.size()); + + for (auto & stat : stats) { - auto const thread_id = thread->thread_id; + auto const thread_id = stat.thread_id; if (thread_id == current_thread_id) continue; auto current_time = time(nullptr); - auto counters = thread->performance_counters.getPartiallyAtomicSnapshot(); - auto memory_usage = thread->memory_tracker.get(); auto previous_snapshot = last_sent_snapshots.find(thread_id); auto increment = previous_snapshot != last_sent_snapshots.end() - ? CountersIncrement(counters, previous_snapshot->second) - : CountersIncrement(counters); + ? CountersIncrement(stat.counters, previous_snapshot->second) + : CountersIncrement(stat.counters); snapshots.push_back(ProfileEventsSnapshot{ thread_id, std::move(increment), - memory_usage, + stat.memory_usage, current_time }); - new_snapshots[thread_id] = std::move(counters); + new_snapshots[thread_id] = std::move(stat.counters); } group_snapshot.thread_id = 0; diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 4a340e328ed..4c4aeb0d913 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -35,6 +35,7 @@ class Session; struct Settings; class ColumnsDescription; struct ProfileInfo; +class TCPServer; /// State of query processing. struct QueryState @@ -127,7 +128,7 @@ public: * because it allows to check the IP ranges of the trusted proxy. * Proxy-forwarded (original client) IP address is used for quota accounting if quota is keyed by forwarded IP. */ - TCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_); + TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_); ~TCPHandler() override; void run() override; @@ -137,6 +138,7 @@ public: private: IServer & server; + TCPServer & tcp_server; bool parse_proxy_protocol = false; Poco::Logger * log; @@ -177,6 +179,7 @@ private: String cluster_secret; std::mutex task_callback_mutex; + std::mutex fatal_error_mutex; /// At the moment, only one ongoing query in the connection is supported at a time. 
QueryState state; diff --git a/src/Server/TCPHandlerFactory.h b/src/Server/TCPHandlerFactory.h index e610bea330c..03b2592198d 100644 --- a/src/Server/TCPHandlerFactory.h +++ b/src/Server/TCPHandlerFactory.h @@ -1,17 +1,17 @@ #pragma once -#include #include #include #include #include +#include namespace Poco { class Logger; } namespace DB { -class TCPHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class TCPHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -38,13 +38,13 @@ public: server_display_name = server.config().getString("display_name", getFQDNOrHostName()); } - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) override { try { LOG_TRACE(log, "TCP Request. Address: {}", socket.peerAddress().toString()); - return new TCPHandler(server, socket, parse_proxy_protocol, server_display_name); + return new TCPHandler(server, tcp_server, socket, parse_proxy_protocol, server_display_name); } catch (const Poco::Net::NetException &) { diff --git a/src/Server/TCPServer.cpp b/src/Server/TCPServer.cpp new file mode 100644 index 00000000000..380c4ef9924 --- /dev/null +++ b/src/Server/TCPServer.cpp @@ -0,0 +1,36 @@ +#include +#include + +namespace DB +{ + +class TCPServerConnectionFactoryImpl : public Poco::Net::TCPServerConnectionFactory +{ +public: + TCPServerConnectionFactoryImpl(TCPServer & tcp_server_, DB::TCPServerConnectionFactory::Ptr factory_) + : tcp_server(tcp_server_) + , factory(factory_) + {} + + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override + { + return factory->createConnection(socket, tcp_server); + } +private: + TCPServer & tcp_server; + DB::TCPServerConnectionFactory::Ptr factory; +}; + +TCPServer::TCPServer( + TCPServerConnectionFactory::Ptr factory_, + Poco::ThreadPool & thread_pool, + Poco::Net::ServerSocket & socket_, + Poco::Net::TCPServerParams::Ptr params) + : Poco::Net::TCPServer(new TCPServerConnectionFactoryImpl(*this, factory_), thread_pool, socket_, params) + , factory(factory_) + , socket(socket_) + , is_open(true) + , port_number(socket.address().port()) +{} + +} diff --git a/src/Server/TCPServer.h b/src/Server/TCPServer.h new file mode 100644 index 00000000000..219fed5342b --- /dev/null +++ b/src/Server/TCPServer.h @@ -0,0 +1,47 @@ +#pragma once + +#include + +#include +#include + + +namespace DB +{ +class Context; + +class TCPServer : public Poco::Net::TCPServer +{ +public: + explicit TCPServer( + TCPServerConnectionFactory::Ptr factory, + Poco::ThreadPool & thread_pool, + Poco::Net::ServerSocket & socket, + Poco::Net::TCPServerParams::Ptr params = new Poco::Net::TCPServerParams); + + /// Close the socket and ask existing connections to stop serving queries + void stop() + { + Poco::Net::TCPServer::stop(); + // This notifies already established connections that they should stop serving + // queries and close their socket as soon as they can. + is_open = false; + // Poco's stop() stops listening on the socket but leaves it open. + // To be able to hand over control of the listening port to a new server, and + // to get fast connection refusal instead of timeouts, we also need to close + // the listening socket. 
+ socket.close(); + } + + bool isOpen() const { return is_open; } + + UInt16 portNumber() const { return port_number; } + +private: + TCPServerConnectionFactory::Ptr factory; + Poco::Net::ServerSocket socket; + std::atomic is_open; + UInt16 port_number; +}; + +} diff --git a/src/Server/TCPServerConnectionFactory.h b/src/Server/TCPServerConnectionFactory.h new file mode 100644 index 00000000000..613f98352bd --- /dev/null +++ b/src/Server/TCPServerConnectionFactory.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace Poco +{ +namespace Net +{ + class StreamSocket; + class TCPServerConnection; +} +} +namespace DB +{ +class TCPServer; + +class TCPServerConnectionFactory +{ +public: + using Ptr = Poco::SharedPtr; + + virtual ~TCPServerConnectionFactory() = default; + + /// Same as Poco::Net::TCPServerConnectionFactory except we can pass the TCPServer + virtual Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) = 0; +}; +} diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index 1dcad049f49..50aa0be4778 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include namespace DB diff --git a/src/Server/grpc_protos/clickhouse_grpc.proto b/src/Server/grpc_protos/clickhouse_grpc.proto index c6cafaf6e40..c86c74535c5 100644 --- a/src/Server/grpc_protos/clickhouse_grpc.proto +++ b/src/Server/grpc_protos/clickhouse_grpc.proto @@ -37,6 +37,10 @@ message ExternalTable { // Format of the data to insert to the external table. string format = 4; + // Compression type used to compress `data`. + // Supported values: none, gzip(gz), deflate, brotli(br), lzma(xz), zstd(zst), lz4, bz2. + string compression_type = 6; + // Settings for executing that insertion, applied after QueryInfo.settings. map settings = 5; } @@ -101,6 +105,25 @@ message QueryInfo { /// Controls how a ClickHouse server will compress query execution results before sending back to the client. /// If not set the compression settings from the configuration file will be used. Compression result_compression = 17; + + // Compression type for `input_data`, `output_data`, `totals` and `extremes`. + // Supported compression types: none, gzip(gz), deflate, brotli(br), lzma(xz), zstd(zst), lz4, bz2. + // When used for `input_data` the client is responsible to compress data before putting it into `input_data`. + // When used for `output_data` or `totals` or `extremes` the client receives compressed data and should decompress it by itself. + // In the latter case consider to specify also `compression_level`. + string compression_type = 18; + + // Compression level. + // WARNING: If it's not specified the compression level is set to zero by default which might be not the best choice for some compression types (see below). 
+ // The compression level should be in the following range (the higher the number, the better the compression): + // none: compression level isn't used + // gzip: 0..9; 0 means no compression, 6 is recommended by default (compression level -1 also means 6) + // brotli: 0..11 + // lzma: 0..9; 6 is recommended by default + // zstd: 1..22; 3 is recommended by default (compression level 0 also means 3) + // lz4: 0..16; values < 0 mean fast acceleration + // bz2: 1..9 + int32 compression_level = 19; } enum LogsLevel { diff --git a/src/Storages/ExecutableSettings.h b/src/Storages/ExecutableSettings.h index 9c0cfc05fa5..c6c1f0b9eb2 100644 --- a/src/Storages/ExecutableSettings.h +++ b/src/Storages/ExecutableSettings.h @@ -9,16 +9,23 @@ namespace DB class ASTStorage; #define LIST_OF_EXECUTABLE_SETTINGS(M) \ - M(UInt64, send_chunk_header, false, "Send number_of_rows\n before sending chunk to process", 0) \ - M(UInt64, pool_size, 16, "Processes pool size. If size == 0, then there is no size restrictions", 0) \ + M(Bool, send_chunk_header, false, "Send number_of_rows\n before sending chunk to process.", 0) \ + M(UInt64, pool_size, 16, "Processes pool size. If size == 0, then there is no size restrictions.", 0) \ M(UInt64, max_command_execution_time, 10, "Max command execution time in seconds.", 0) \ M(UInt64, command_termination_timeout, 10, "Command termination timeout in seconds.", 0) \ + M(UInt64, command_read_timeout, 10000, "Timeout for reading data from command stdout in milliseconds.", 0) \ + M(UInt64, command_write_timeout, 10000, "Timeout for writing data to command stdin in milliseconds.", 0) DECLARE_SETTINGS_TRAITS(ExecutableSettingsTraits, LIST_OF_EXECUTABLE_SETTINGS) /// Settings for ExecutablePool engine. struct ExecutableSettings : public BaseSettings { + std::string script_name; + std::vector script_arguments; + + bool is_executable_pool = false; + void loadFromQuery(ASTStorage & storage_def); }; diff --git a/src/Storages/ExternalDataSourceConfiguration.cpp b/src/Storages/ExternalDataSourceConfiguration.cpp index 8389c432db2..42b3b148551 100644 --- a/src/Storages/ExternalDataSourceConfiguration.cpp +++ b/src/Storages/ExternalDataSourceConfiguration.cpp @@ -54,6 +54,7 @@ void ExternalDataSourceConfiguration::set(const ExternalDataSourceConfiguration database = conf.database; table = conf.table; schema = conf.schema; + addresses = conf.addresses; addresses_expr = conf.addresses_expr; } @@ -86,7 +87,7 @@ std::optional getExternalDataSourceConfiguration(const configuration.username = config.getString(collection_prefix + ".user", ""); configuration.password = config.getString(collection_prefix + ".password", ""); configuration.database = config.getString(collection_prefix + ".database", ""); - configuration.table = config.getString(collection_prefix + ".table", ""); + configuration.table = config.getString(collection_prefix + ".table", config.getString(collection_prefix + ".collection", "")); configuration.schema = config.getString(collection_prefix + ".schema", ""); configuration.addresses_expr = config.getString(collection_prefix + ".addresses_expr", ""); diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index b214caa9a12..f33fd938092 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -40,7 +40,6 @@ struct StorageMySQLConfiguration : ExternalDataSourceConfiguration struct StorageMongoDBConfiguration : ExternalDataSourceConfiguration { - String collection; String 
options; }; @@ -89,7 +88,7 @@ struct URLBasedDataSourceConfiguration String url; String format; String compression_method = "auto"; - String structure; + String structure = "auto"; std::vector> headers; String http_method; diff --git a/src/Storages/FileLog/DirectoryWatcherBase.cpp b/src/Storages/FileLog/DirectoryWatcherBase.cpp index f2737219fd8..005e1e5fd1b 100644 --- a/src/Storages/FileLog/DirectoryWatcherBase.cpp +++ b/src/Storages/FileLog/DirectoryWatcherBase.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index 68d5f60a2aa..8aceed05b72 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -22,7 +22,7 @@ namespace ErrorCodes } const String HDFSBuilderWrapper::CONFIG_PREFIX = "hdfs"; -const String HDFS_URL_REGEXP = "^hdfs://[^:/]*:[0-9]*/.*"; +const String HDFS_URL_REGEXP = "^hdfs://[^/]*/.*"; void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration & config, const String & config_path, bool isUser) diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 70aa3d28174..f22f6f66ced 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -10,12 +10,10 @@ #include #include #include -#include #include #include #include -#include #include #include @@ -30,6 +28,8 @@ #include #include + +#include #include #include @@ -52,6 +52,69 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ACCESS_DENIED; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; +} +namespace +{ + /* Recursive directory listing with matched paths as a result. + * Have the same method in StorageFile. + */ + Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) + { + const size_t first_glob = for_match.find_first_of("*?{"); + + const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); + const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' + const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' + + const size_t next_slash = suffix_with_globs.find('/', 1); + re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); + + HDFSFileInfo ls; + ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); + Strings result; + for (int i = 0; i < ls.length; ++i) + { + const String full_path = String(ls.file_info[i].mName); + const size_t last_slash = full_path.rfind('/'); + const String file_name = full_path.substr(last_slash); + const bool looking_for_directory = next_slash != std::string::npos; + const bool is_directory = ls.file_info[i].mKind == 'D'; + /// Condition with type of current file_info means what kind of path is it in current iteration of ls + if (!is_directory && !looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + result.push_back(String(ls.file_info[i].mName)); + } + } + else if (is_directory && looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + Strings result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash)); + /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. 
+ std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); + } + } + } + + return result; + } + + std::pair getPathFromUriAndUriWithoutPath(const String & uri) + { + const size_t begin_of_path = uri.find('/', uri.find("//") + 2); + return {uri.substr(begin_of_path), uri.substr(0, begin_of_path)}; + } + + std::vector getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context) + { + HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); + HDFSFSPtr fs = createHDFSFS(builder.get()); + + return LSWithRegexpMatching("/", fs, path_from_uri); + } } StorageHDFS::StorageHDFS( @@ -62,173 +125,230 @@ StorageHDFS::StorageHDFS( const ConstraintsDescription & constraints_, const String & comment, ContextPtr context_, - const String & compression_method_ = "", + const String & compression_method_, + const bool distributed_processing_, ASTPtr partition_by_) : IStorage(table_id_) , WithContext(context_) , uri(uri_) , format_name(format_name_) , compression_method(compression_method_) + , distributed_processing(distributed_processing_) , partition_by(partition_by_) { context_->getRemoteHostFilter().checkURL(Poco::URI(uri)); checkHDFSURL(uri); StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + + if (columns_.empty()) + { + auto columns = getTableStructureFromData(format_name, uri, compression_method, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } -using StorageHDFSPtr = std::shared_ptr; - -class HDFSSource : public SourceWithProgress, WithContext +ColumnsDescription StorageHDFS::getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + ContextPtr ctx) { -public: - struct SourcesInfo + auto read_buffer_creator = [&]() { - std::vector uris; - std::atomic next_uri_to_read = 0; + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); + auto paths = getPathsList(path_from_uri, uri, ctx); + if (paths.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path. You must " + "specify table structure manually", + format); - bool need_path_column = false; - bool need_file_column = false; + auto compression = chooseCompressionMethod(paths[0], compression_method); + return wrapReadBufferWithCompressionMethod( + std::make_unique(uri_without_path, paths[0], ctx->getGlobalContext()->getConfigRef()), compression); }; - using SourcesInfoPtr = std::shared_ptr; + return readSchemaFromFormat(format, std::nullopt, read_buffer_creator, ctx); +} - static Block getHeader(const StorageMetadataPtr & metadata_snapshot, bool need_path_column, bool need_file_column) +class HDFSSource::DisclosedGlobIterator::Impl +{ +public: + Impl(ContextPtr context_, const String & uri) { - auto header = metadata_snapshot->getSampleBlock(); - - /// Note: AddingDefaultsBlockInputStream doesn't change header. 
- - if (need_path_column) - header.insert({DataTypeString().createColumn(), std::make_shared(), "_path"}); - if (need_file_column) - header.insert({DataTypeString().createColumn(), std::make_shared(), "_file"}); - - return header; + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); + uris = getPathsList(path_from_uri, uri_without_path, context_); + for (auto & elem : uris) + elem = uri_without_path + elem; + uris_iter = uris.begin(); } - static Block getBlockForSource( - const StorageHDFSPtr & storage, - const StorageMetadataPtr & metadata_snapshot, - const ColumnsDescription & columns_description, - const SourcesInfoPtr & files_info) + String next() + { + std::lock_guard lock(mutex); + if (uris_iter != uris.end()) + { + auto answer = *uris_iter; + ++uris_iter; + return answer; + } + return {}; + } +private: + std::mutex mutex; + Strings uris; + Strings::iterator uris_iter; +}; + +Block HDFSSource::getHeader(const StorageMetadataPtr & metadata_snapshot, bool need_path_column, bool need_file_column) +{ + auto header = metadata_snapshot->getSampleBlock(); + /// Note: AddingDefaultsBlockInputStream doesn't change header. + if (need_path_column) + header.insert({DataTypeString().createColumn(), std::make_shared(), "_path"}); + if (need_file_column) + header.insert({DataTypeString().createColumn(), std::make_shared(), "_file"}); + return header; +} + +Block HDFSSource::getBlockForSource( + const StorageHDFSPtr & storage, + const StorageMetadataPtr & metadata_snapshot, + const ColumnsDescription & columns_description, + bool need_path_column, + bool need_file_column) +{ + if (storage->isColumnOriented()) + return metadata_snapshot->getSampleBlockForColumns( + columns_description.getNamesOfPhysical(), storage->getVirtuals(), storage->getStorageID()); + else + return getHeader(metadata_snapshot, need_path_column, need_file_column); +} + +HDFSSource::DisclosedGlobIterator::DisclosedGlobIterator(ContextPtr context_, const String & uri) + : pimpl(std::make_shared(context_, uri)) {} + +String HDFSSource::DisclosedGlobIterator::next() +{ + return pimpl->next(); +} + + +HDFSSource::HDFSSource( + StorageHDFSPtr storage_, + const StorageMetadataPtr & metadata_snapshot_, + ContextPtr context_, + UInt64 max_block_size_, + bool need_path_column_, + bool need_file_column_, + std::shared_ptr file_iterator_, + ColumnsDescription columns_description_) + : SourceWithProgress(getBlockForSource(storage_, metadata_snapshot_, columns_description_, need_path_column_, need_file_column_)) + , WithContext(context_) + , storage(std::move(storage_)) + , metadata_snapshot(metadata_snapshot_) + , max_block_size(max_block_size_) + , need_path_column(need_path_column_) + , need_file_column(need_file_column_) + , file_iterator(file_iterator_) + , columns_description(std::move(columns_description_)) +{ + initialize(); +} + +void HDFSSource::onCancel() +{ + if (reader) + reader->cancel(); +} + +bool HDFSSource::initialize() +{ + current_path = (*file_iterator)(); + if (current_path.empty()) + return false; + const size_t begin_of_path = current_path.find('/', current_path.find("//") + 2); + const String path_from_uri = current_path.substr(begin_of_path); + const String uri_without_path = current_path.substr(0, begin_of_path); + + auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); + read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression); + + auto 
get_block_for_format = [&]() -> Block { if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns( - columns_description.getNamesOfPhysical(), storage->getVirtuals(), storage->getStorageID()); - else - return getHeader(metadata_snapshot, files_info->need_path_column, files_info->need_file_column); - } + return metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + return metadata_snapshot->getSampleBlock(); + }; - HDFSSource( - StorageHDFSPtr storage_, - const StorageMetadataPtr & metadata_snapshot_, - ContextPtr context_, - UInt64 max_block_size_, - SourcesInfoPtr source_info_, - String uri_without_path_, - ColumnsDescription columns_description_) - : SourceWithProgress(getBlockForSource(storage_, metadata_snapshot_, columns_description_, source_info_)) - , WithContext(context_) - , storage(std::move(storage_)) - , metadata_snapshot(metadata_snapshot_) - , source_info(std::move(source_info_)) - , uri_without_path(std::move(uri_without_path_)) - , max_block_size(max_block_size_) - , columns_description(std::move(columns_description_)) - { - } + auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, get_block_for_format(), max_block_size); - String getName() const override + QueryPipelineBuilder builder; + builder.init(Pipe(input_format)); + if (columns_description.hasDefaults()) { - return "HDFS"; - } - - Chunk generate() override - { - while (true) + builder.addSimpleTransform([&](const Block & header) { - if (!reader) - { - auto pos = source_info->next_uri_to_read.fetch_add(1); - if (pos >= source_info->uris.size()) - return {}; + return std::make_shared(header, columns_description, *input_format, getContext()); + }); + } + pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); + reader = std::make_unique(*pipeline); + return true; +} - auto path = source_info->uris[pos]; - current_path = uri_without_path + path; +String HDFSSource::getName() const +{ + return "HDFSSource"; +} - auto compression = chooseCompressionMethod(path, storage->compression_method); - read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path, getContext()->getGlobalContext()->getConfigRef()), compression); +Chunk HDFSSource::generate() +{ + if (!reader) + return {}; - auto get_block_for_format = [&]() -> Block - { - if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); - return metadata_snapshot->getSampleBlock(); - }; - auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, get_block_for_format(), max_block_size); + Chunk chunk; + if (reader->pull(chunk)) + { + Columns columns = chunk.getColumns(); + UInt64 num_rows = chunk.getNumRows(); - QueryPipelineBuilder builder; - builder.init(Pipe(input_format)); - if (columns_description.hasDefaults()) - { - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, columns_description, *input_format, getContext()); - }); - } - pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); - reader = std::make_unique(*pipeline); - } - - Block res; - if (reader->pull(res)) - { - Columns columns = res.getColumns(); - UInt64 num_rows = res.rows(); - - /// Enrich with virtual columns. 
- if (source_info->need_path_column) - { - auto column = DataTypeString().createColumnConst(num_rows, current_path); - columns.push_back(column->convertToFullColumnIfConst()); - } - - if (source_info->need_file_column) - { - size_t last_slash_pos = current_path.find_last_of('/'); - auto file_name = current_path.substr(last_slash_pos + 1); - - auto column = DataTypeString().createColumnConst(num_rows, std::move(file_name)); - columns.push_back(column->convertToFullColumnIfConst()); - } - - return Chunk(std::move(columns), num_rows); - } - - reader.reset(); - pipeline.reset(); - read_buf.reset(); + /// Enrich with virtual columns. + if (need_path_column) + { + auto column = DataTypeString().createColumnConst(num_rows, current_path); + columns.push_back(column->convertToFullColumnIfConst()); } + + if (need_file_column) + { + size_t last_slash_pos = current_path.find_last_of('/'); + auto file_name = current_path.substr(last_slash_pos + 1); + + auto column = DataTypeString().createColumnConst(num_rows, std::move(file_name)); + columns.push_back(column->convertToFullColumnIfConst()); + } + + return Chunk(std::move(columns), num_rows); } -private: - StorageHDFSPtr storage; - StorageMetadataPtr metadata_snapshot; - SourcesInfoPtr source_info; - String uri_without_path; - UInt64 max_block_size; - ColumnsDescription columns_description; + reader.reset(); + pipeline.reset(); + read_buf.reset(); + + if (!initialize()) + return {}; + return generate(); +} - std::unique_ptr read_buf; - std::unique_ptr pipeline; - std::unique_ptr reader; - String current_path; -}; class HDFSSink : public SinkToStorage { @@ -300,7 +420,6 @@ public: private: const String uri; - const String format; const Block sample_block; ContextPtr context; @@ -308,51 +427,6 @@ private: }; -/* Recursive directory listing with matched paths as a result. - * Have the same method in StorageFile. - */ -Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) -{ - const size_t first_glob = for_match.find_first_of("*?{"); - - const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); - const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' - const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' - - const size_t next_slash = suffix_with_globs.find('/', 1); - re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); - - HDFSFileInfo ls; - ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); - Strings result; - for (int i = 0; i < ls.length; ++i) - { - const String full_path = String(ls.file_info[i].mName); - const size_t last_slash = full_path.rfind('/'); - const String file_name = full_path.substr(last_slash); - const bool looking_for_directory = next_slash != std::string::npos; - const bool is_directory = ls.file_info[i].mKind == 'D'; - /// Condition with type of current file_info means what kind of path is it in current iteration of ls - if (!is_directory && !looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - result.push_back(String(ls.file_info[i].mName)); - } - } - else if (is_directory && looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - Strings result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash)); - /// Recursion depth is limited by pattern. 
'*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. - std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); - } - } - } - return result; -} - bool StorageHDFS::isColumnOriented() const { return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); @@ -367,29 +441,34 @@ Pipe StorageHDFS::read( size_t max_block_size, unsigned num_streams) { - const size_t begin_of_path = uri.find('/', uri.find("//") + 2); - const String path_from_uri = uri.substr(begin_of_path); - const String uri_without_path = uri.substr(0, begin_of_path); - - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context_->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - - auto sources_info = std::make_shared(); - sources_info->uris = LSWithRegexpMatching("/", fs, path_from_uri); - - if (sources_info->uris.empty()) - LOG_WARNING(log, "No file in HDFS matches the path: {}", uri); + bool need_path_column = false; + bool need_file_column = false; for (const auto & column : column_names) { if (column == "_path") - sources_info->need_path_column = true; + need_path_column = true; if (column == "_file") - sources_info->need_file_column = true; + need_file_column = true; } - if (num_streams > sources_info->uris.size()) - num_streams = sources_info->uris.size(); + std::shared_ptr iterator_wrapper{nullptr}; + if (distributed_processing) + { + iterator_wrapper = std::make_shared( + [callback = context_->getReadTaskCallback()]() -> String { + return callback(); + }); + } + else + { + /// Iterate through disclosed globs and make a source for each file + auto glob_iterator = std::make_shared(context_, uri); + iterator_wrapper = std::make_shared([glob_iterator]() + { + return glob_iterator->next(); + }); + } Pipes pipes; auto this_ptr = std::static_pointer_cast(shared_from_this()); @@ -409,8 +488,9 @@ Pipe StorageHDFS::read( metadata_snapshot, context_, max_block_size, - sources_info, - uri_without_path, + need_path_column, + need_file_column, + iterator_wrapper, get_columns_for_format())); } return Pipe::unitePipes(std::move(pipes)); @@ -443,13 +523,13 @@ SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataP } } -void StorageHDFS::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, ContextPtr context_, TableExclusiveLockHolder &) +void StorageHDFS::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &) { const size_t begin_of_path = uri.find('/', uri.find("//") + 2); const String path = uri.substr(begin_of_path); const String url = uri.substr(0, begin_of_path); - HDFSBuilderWrapper builder = createHDFSBuilder(url + "/", context_->getGlobalContext()->getConfigRef()); + HDFSBuilderWrapper builder = createHDFSBuilder(url + "/", local_context->getGlobalContext()->getConfigRef()); HDFSFSPtr fs = createHDFSFS(builder.get()); int ret = hdfsDelete(fs.get(), path.data(), 0); @@ -488,10 +568,11 @@ void registerStorageHDFS(StorageFactory & factory) partition_by = args.storage_def->partition_by->clone(); return StorageHDFS::create( - url, args.table_id, format_name, args.columns, args.constraints, args.comment, args.getContext(), compression_method, partition_by); + url, args.table_id, format_name, args.columns, args.constraints, args.comment, args.getContext(), compression_method, false, partition_by); }, { .supports_sort_order = true, // for partition by + 
.supports_schema_inference = true, .source_access_type = AccessType::HDFS, }); } diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index db6b078265d..9e845d8fd74 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -4,6 +4,7 @@ #if USE_HDFS +#include #include #include #include @@ -30,9 +31,13 @@ public: size_t max_block_size, unsigned num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/) override; - void truncate(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context_, TableExclusiveLockHolder &) override; + void truncate( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context, + TableExclusiveLockHolder &) override; NamesAndTypesList getVirtuals() const override; @@ -44,6 +49,12 @@ public: /// format to read only them. Note: this hack cannot be done with ordinary formats like TSV. bool isColumnOriented() const; + static ColumnsDescription getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + ContextPtr ctx); + protected: friend class HDFSSource; StorageHDFS( @@ -54,17 +65,84 @@ protected: const ConstraintsDescription & constraints_, const String & comment, ContextPtr context_, - const String & compression_method_, + const String & compression_method_ = "", + bool distributed_processing_ = false, ASTPtr partition_by = nullptr); private: const String uri; String format_name; String compression_method; + const bool distributed_processing; ASTPtr partition_by; Poco::Logger * log = &Poco::Logger::get("StorageHDFS"); }; + +class PullingPipelineExecutor; + +class HDFSSource : public SourceWithProgress, WithContext +{ +public: + class DisclosedGlobIterator + { + public: + DisclosedGlobIterator(ContextPtr context_, const String & uri_); + String next(); + private: + class Impl; + /// shared_ptr to have copy constructor + std::shared_ptr pimpl; + }; + + using IteratorWrapper = std::function; + using StorageHDFSPtr = std::shared_ptr; + + static Block getHeader( + const StorageMetadataPtr & metadata_snapshot, + bool need_path_column, + bool need_file_column); + + static Block getBlockForSource( + const StorageHDFSPtr & storage, + const StorageMetadataPtr & metadata_snapshot, + const ColumnsDescription & columns_description, + bool need_path_column, + bool need_file_column); + + HDFSSource( + StorageHDFSPtr storage_, + const StorageMetadataPtr & metadata_snapshot_, + ContextPtr context_, + UInt64 max_block_size_, + bool need_path_column_, + bool need_file_column_, + std::shared_ptr file_iterator_, + ColumnsDescription columns_description_); + + String getName() const override; + + Chunk generate() override; + + void onCancel() override; + +private: + StorageHDFSPtr storage; + StorageMetadataPtr metadata_snapshot; + UInt64 max_block_size; + bool need_path_column; + bool need_file_column; + std::shared_ptr file_iterator; + ColumnsDescription columns_description; + + std::unique_ptr read_buf; + std::unique_ptr pipeline; + std::unique_ptr reader; + String current_path; + + /// Recreate ReadBuffer and PullingPipelineExecutor for each file. 
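+ /// Returns false when the file iterator is exhausted and there is nothing left to read.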
+ bool initialize(); +}; } #endif diff --git a/src/Storages/HDFS/StorageHDFSCluster.cpp b/src/Storages/HDFS/StorageHDFSCluster.cpp new file mode 100644 index 00000000000..ba1cc045fbf --- /dev/null +++ b/src/Storages/HDFS/StorageHDFSCluster.cpp @@ -0,0 +1,149 @@ +#include + +#if USE_HDFS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ +StorageHDFSCluster::StorageHDFSCluster( + String cluster_name_, + const String & uri_, + const StorageID & table_id_, + const String & format_name_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & compression_method_) + : IStorage(table_id_) + , cluster_name(cluster_name_) + , uri(uri_) + , format_name(format_name_) + , compression_method(compression_method_) +{ + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); + setInMemoryMetadata(storage_metadata); +} + +/// The code executes on initiator +Pipe StorageHDFSCluster::read( + const Names & column_names, + const StorageMetadataPtr & metadata_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t /*max_block_size*/, + unsigned /*num_streams*/) +{ + auto cluster = context->getCluster(cluster_name)->getClusterWithReplicasAsShards(context->getSettings()); + + auto iterator = std::make_shared(context, uri); + auto callback = std::make_shared([iterator]() mutable -> String + { + return iterator->next(); + }); + + /// Calculate the header. This is significant, because some columns could be thrown away in some cases like query with count(*) + Block header = + InterpreterSelectQuery(query_info.query, context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); + + const Scalars & scalars = context->hasQueryContext() ? context->getQueryContext()->getScalars() : Scalars{}; + + Pipes pipes; + + const bool add_agg_info = processed_stage == QueryProcessingStage::WithMergeableState; + + for (const auto & replicas : cluster->getShardsAddresses()) + { + /// There will be only one replica, because we consider each replica as a shard + for (const auto & node : replicas) + { + auto connection = std::make_shared( + node.host_name, node.port, context->getGlobalContext()->getCurrentDatabase(), + node.user, node.password, node.cluster, node.cluster_secret, + "HDFSClusterInititiator", + node.compression, + node.secure + ); + + + /// For unknown reason global context is passed to IStorage::read() method + /// So, task_identifier is passed as constructor argument. It is more obvious. 
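+ /// Each replica pulls the next file to process from the initiator through the task_iterator callback.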
+ auto remote_query_executor = std::make_shared( + connection, + queryToString(query_info.query), + header, + context, + /*throttler=*/nullptr, + scalars, + Tables(), + processed_stage, + RemoteQueryExecutor::Extension{.task_iterator = callback}); + + pipes.emplace_back(std::make_shared(remote_query_executor, add_agg_info, false)); + } + } + + metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + return Pipe::unitePipes(std::move(pipes)); +} + +QueryProcessingStage::Enum StorageHDFSCluster::getQueryProcessingStage( + ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageMetadataPtr &, SelectQueryInfo &) const +{ + /// Initiator executes query on remote node. + if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + if (to_stage >= QueryProcessingStage::Enum::WithMergeableState) + return QueryProcessingStage::Enum::WithMergeableState; + + /// Follower just reads the data. + return QueryProcessingStage::Enum::FetchColumns; +} + + +NamesAndTypesList StorageHDFSCluster::getVirtuals() const +{ + return NamesAndTypesList{ + {"_path", std::make_shared()}, + {"_file", std::make_shared()} + }; +} + + +} + +#endif diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h new file mode 100644 index 00000000000..0e568a9faf8 --- /dev/null +++ b/src/Storages/HDFS/StorageHDFSCluster.h @@ -0,0 +1,55 @@ +#pragma once + +#include + +#if USE_HDFS + +#include +#include + +#include + +#include +#include +#include + +namespace DB +{ + +class Context; + +class StorageHDFSCluster : public shared_ptr_helper, public IStorage +{ + friend struct shared_ptr_helper; +public: + std::string getName() const override { return "HDFSCluster"; } + + Pipe read(const Names &, const StorageMetadataPtr &, SelectQueryInfo &, + ContextPtr, QueryProcessingStage::Enum, size_t /*max_block_size*/, unsigned /*num_streams*/) override; + + QueryProcessingStage::Enum + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + + NamesAndTypesList getVirtuals() const override; + +protected: + StorageHDFSCluster( + String cluster_name_, + const String & uri_, + const StorageID & table_id_, + const String & format_name_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & compression_method_); + +private: + String cluster_name; + String uri; + String format_name; + String compression_method; +}; + + +} + +#endif diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 021335fea1f..a923258b111 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -139,7 +139,6 @@ void IStorage::alter(const AlterCommands & params, ContextPtr context, AlterLock setInMemoryMetadata(new_metadata); } - void IStorage::checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const { for (const auto & command : commands) diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 8432e5c48d1..bcbc771815b 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -511,7 +511,7 @@ public: virtual void shutdown() {} /// Called before shutdown() to flush data to underlying storage - /// (for Buffer) + /// Data in memory need to be persistent virtual void flush() {} /// Asks table to stop executing some action identified by action_type diff --git a/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp b/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp index 7b736e95d25..748ea02ac6d 
100644 --- a/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp +++ b/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp @@ -103,7 +103,7 @@ void WriteBufferToKafkaProducer::countRow(const Columns & columns, size_t curren producer->poll(timeout); continue; } - throw e; + throw; } break; @@ -126,7 +126,7 @@ void WriteBufferToKafkaProducer::flush() { if (e.get_error() == RD_KAFKA_RESP_ERR__TIMED_OUT) continue; - throw e; + throw; } break; diff --git a/src/Storages/LiveView/StorageLiveView.cpp b/src/Storages/LiveView/StorageLiveView.cpp index 6ea7b8c2f27..83578e3b5b9 100644 --- a/src/Storages/LiveView/StorageLiveView.cpp +++ b/src/Storages/LiveView/StorageLiveView.cpp @@ -280,7 +280,8 @@ StorageLiveView::StorageLiveView( const StorageID & table_id_, ContextPtr context_, const ASTCreateQuery & query, - const ColumnsDescription & columns_) + const ColumnsDescription & columns_, + const String & comment) : IStorage(table_id_) , WithContext(context_->getGlobalContext()) { @@ -291,6 +292,9 @@ StorageLiveView::StorageLiveView( StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); + if (!comment.empty()) + storage_metadata.setComment(comment); + setInMemoryMetadata(storage_metadata); if (!query.select) @@ -621,7 +625,7 @@ void registerStorageLiveView(StorageFactory & factory) "Experimental LIVE VIEW feature is not enabled (the setting 'allow_experimental_live_view')", ErrorCodes::SUPPORT_IS_DISABLED); - return StorageLiveView::create(args.table_id, args.getLocalContext(), args.query, args.columns); + return StorageLiveView::create(args.table_id, args.getLocalContext(), args.query, args.columns, args.comment); }); } diff --git a/src/Storages/LiveView/StorageLiveView.h b/src/Storages/LiveView/StorageLiveView.h index 265f5d60ec4..17e2f50e7ec 100644 --- a/src/Storages/LiveView/StorageLiveView.h +++ b/src/Storages/LiveView/StorageLiveView.h @@ -232,8 +232,8 @@ private: const StorageID & table_id_, ContextPtr context_, const ASTCreateQuery & query, - const ColumnsDescription & columns - ); + const ColumnsDescription & columns, + const String & comment); }; } diff --git a/src/Storages/MergeTree/ActiveDataPartSet.cpp b/src/Storages/MergeTree/ActiveDataPartSet.cpp index 0f6cd8050ca..b21910158ad 100644 --- a/src/Storages/MergeTree/ActiveDataPartSet.cpp +++ b/src/Storages/MergeTree/ActiveDataPartSet.cpp @@ -49,7 +49,7 @@ bool ActiveDataPartSet::add(const String & name, Strings * out_replaced_parts) if (out_replaced_parts) out_replaced_parts->push_back(it->second); - part_info_to_name.erase(it++); + it = part_info_to_name.erase(it); } if (out_replaced_parts) @@ -61,7 +61,7 @@ bool ActiveDataPartSet::add(const String & name, Strings * out_replaced_parts) assert(part_info != it->first); if (out_replaced_parts) out_replaced_parts->push_back(it->second); - part_info_to_name.erase(it++); + it = part_info_to_name.erase(it); } if (it != part_info_to_name.end() && !part_info.isDisjoint(it->first)) diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index e952deb5a87..2855e21356d 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -591,6 +591,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( block.getNamesAndTypesList(), {}, CompressionCodecFactory::instance().get("NONE", {})); + part_out.write(block); part_out.writeSuffixAndFinalizePart(new_projection_part); new_projection_part->checksums.checkEqual(checksums, /* have_uncompressed = */ true); @@ -612,7 +613,9 @@ 
MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( new_data_part->partition.create(metadata_snapshot, block, 0, context); MergedBlockOutputStream part_out( - new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {}, CompressionCodecFactory::instance().get("NONE", {})); + new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {}, + CompressionCodecFactory::instance().get("NONE", {})); + part_out.write(block); part_out.writeSuffixAndFinalizePart(new_data_part); new_data_part->checksums.checkEqual(checksums, /* have_uncompressed = */ true); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 21dbedbb6ac..83328594363 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -270,7 +270,7 @@ static void decrementTypeMetric(MergeTreeDataPartType type) IMergeTreeDataPart::IMergeTreeDataPart( - MergeTreeData & storage_, + const MergeTreeData & storage_, const String & name_, const VolumePtr & volume_, const std::optional & relative_path_, @@ -407,69 +407,95 @@ std::pair IMergeTreeDataPart::getMinMaxTime() const } -void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns) +void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos) { columns = new_columns; + column_name_to_position.clear(); column_name_to_position.reserve(new_columns.size()); size_t pos = 0; + for (const auto & column : columns) { - column_name_to_position.emplace(column.name, pos); - for (const auto & subcolumn : column.type->getSubcolumnNames()) - column_name_to_position.emplace(Nested::concatenateName(column.name, subcolumn), pos); - ++pos; + column_name_to_position.emplace(column.name, pos++); + + auto it = new_infos.find(column.name); + if (it != new_infos.end()) + { + auto & old_info = serialization_infos[column.name]; + const auto & new_info = it->second; + + if (old_info) + old_info->replaceData(*new_info); + else + old_info = new_info->clone(); + } } } +SerializationPtr IMergeTreeDataPart::getSerialization(const NameAndTypePair & column) const +{ + auto it = serialization_infos.find(column.getNameInStorage()); + return it == serialization_infos.end() + ? IDataType::getSerialization(column) + : IDataType::getSerialization(column, *it->second); +} + void IMergeTreeDataPart::removeIfNeeded() { - if (state == State::DeleteOnDestroy || is_temp) + if (!is_temp && state != State::DeleteOnDestroy) + return; + + try { - try - { - auto path = getFullRelativePath(); + auto path = getFullRelativePath(); - if (!volume->getDisk()->exists(path)) + if (!volume->getDisk()->exists(path)) + return; + + if (is_temp) + { + String file_name = fileName(relative_path); + + if (file_name.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "relative_path {} of part {} is invalid or not set", relative_path, name); + + if (!startsWith(file_name, "tmp") && !endsWith(file_name, ".tmp_proj")) + { + LOG_ERROR( + storage.log, + "~DataPart() should remove part {} but its name doesn't start with \"tmp\" or end with \".tmp_proj\". 
Too " + "suspicious, keeping the part.", + path); return; - - if (is_temp) - { - String file_name = fileName(relative_path); - - if (file_name.empty()) - throw Exception("relative_path " + relative_path + " of part " + name + " is invalid or not set", ErrorCodes::LOGICAL_ERROR); - - if (!startsWith(file_name, "tmp") && !endsWith(file_name, ".tmp_proj")) - { - LOG_ERROR( - storage.log, - "~DataPart() should remove part {} but its name doesn't start with \"tmp\" or end with \".tmp_proj\". Too " - "suspicious, keeping the part.", - path); - return; - } - } - - if (parent_part) - { - std::optional keep_shared_data = keepSharedDataInDecoupledStorage(); - if (!keep_shared_data.has_value()) - return; - projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data); - } - else - remove(); - - if (state == State::DeleteOnDestroy) - { - LOG_TRACE(storage.log, "Removed part from old location {}", path); } } - catch (...) + + if (parent_part) { - tryLogCurrentException(__PRETTY_FUNCTION__); + std::optional keep_shared_data = keepSharedDataInDecoupledStorage(); + if (!keep_shared_data.has_value()) + return; + projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data); } + else + remove(); + + if (state == State::DeleteOnDestroy) + { + LOG_TRACE(storage.log, "Removed part from old location {}", path); + } + } + catch (...) + { + /// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime). + /// If it's tmp_merge_ or tmp_fetch_, + /// then all future attempts to execute part producing operation will fail with "directory already exists". + /// Seems like it's especially important for remote disks, because removal may fail due to network issues. + tryLogCurrentException(__PRETTY_FUNCTION__); + assert(!is_temp); + assert(state != State::DeleteOnDestroy); + assert(state != State::Temporary); } } @@ -601,8 +627,8 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks if (check_consistency) checkConsistency(require_columns_checksums); - loadDefaultCompressionCodec(); + loadDefaultCompressionCodec(); } void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency) @@ -653,13 +679,13 @@ void IMergeTreeDataPart::loadIndex() size_t marks_count = index_granularity.getMarksCount(); - Serializations serializations(key_size); + Serializations key_serializations(key_size); for (size_t j = 0; j < key_size; ++j) - serializations[j] = primary_key.data_types[j]->getDefaultSerialization(); + key_serializations[j] = primary_key.data_types[j]->getDefaultSerialization(); for (size_t i = 0; i < marks_count; ++i) //-V756 for (size_t j = 0; j < key_size; ++j) - serializations[j]->deserializeBinary(*loaded_index[j], *index_file); + key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file); for (size_t i = 0; i < key_size; ++i) { @@ -750,14 +776,8 @@ CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const auto column_size = getColumnSize(part_column.name); if (column_size.data_compressed != 0 && !storage_columns.hasCompressionCodec(part_column.name)) { - auto serialization = IDataType::getSerialization(part_column, - [&](const String & stream_name) - { - return volume->getDisk()->exists(stream_name + IMergeTreeDataPart::DATA_FILE_EXTENSION); - }); - String path_to_data_file; - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + getSerialization(part_column)->enumerateStreams([&](const ISerialization::SubstreamPath & 
substream_path) { if (path_to_data_file.empty()) { @@ -865,6 +885,14 @@ void IMergeTreeDataPart::loadChecksums(bool require) void IMergeTreeDataPart::loadRowsCount() { String path = fs::path(getFullRelativePath()) / "count.txt"; + + auto read_rows_count = [&]() + { + auto buf = openForReading(volume->getDisk(), path); + readIntText(rows_count, *buf); + assertEOF(*buf); + }; + if (index_granularity.empty()) { rows_count = 0; @@ -874,16 +902,16 @@ void IMergeTreeDataPart::loadRowsCount() if (!volume->getDisk()->exists(path)) throw Exception("No count.txt in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); - auto buf = openForReading(volume->getDisk(), path); - readIntText(rows_count, *buf); - assertEOF(*buf); + read_rows_count(); #ifndef NDEBUG /// columns have to be loaded for (const auto & column : getColumns()) { /// Most trivial types - if (column.type->isValueRepresentedByNumber() && !column.type->haveSubtypes()) + if (column.type->isValueRepresentedByNumber() + && !column.type->haveSubtypes() + && getSerialization(column)->getKind() == ISerialization::Kind::DEFAULT) { auto size = getColumnSize(column.name); @@ -927,9 +955,15 @@ void IMergeTreeDataPart::loadRowsCount() } else { + if (volume->getDisk()->exists(path)) + { + read_rows_count(); + return; + } + for (const NameAndTypePair & column : columns) { - ColumnPtr column_col = column.type->createColumn(); + ColumnPtr column_col = column.type->createColumn(*getSerialization(column)); if (!column_col->isFixedAndContiguous() || column_col->lowCardinality()) continue; @@ -1044,7 +1078,18 @@ void IMergeTreeDataPart::loadColumns(bool require) } } - setColumns(loaded_columns); + SerializationInfo::Settings settings = + { + .ratio_of_defaults_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization, + .choose_kind = false, + }; + + SerializationInfoByName infos(loaded_columns, settings); + path = getFullRelativePath() + SERIALIZATION_FILE_NAME; + if (volume->getDisk()->exists(path)) + infos.readJSON(*volume->getDisk()->readFile(path)); + + setColumns(loaded_columns, infos); } bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const @@ -1157,14 +1202,17 @@ void IMergeTreeDataPart::remove() const * And a race condition can happen that will lead to "File not found" error here. */ + /// NOTE We rename part to delete_tmp_ instead of delete_tmp_ to avoid race condition + /// when we try to remove two parts with the same name, but different relative paths, + /// for example all_1_2_1 (in Deleting state) and tmp_merge_all_1_2_1 (in Temporary state). fs::path from = fs::path(storage.relative_data_path) / relative_path; - fs::path to = fs::path(storage.relative_data_path) / ("delete_tmp_" + name); + fs::path to = fs::path(storage.relative_data_path) / ("delete_tmp_" + relative_path); // TODO directory delete_tmp_ is never removed if server crashes before returning from this function auto disk = volume->getDisk(); if (disk->exists(to)) { - LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart. Removing it.", fullPath(disk, to)); + LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. 
Removing it.", fullPath(disk, to)); try { disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data); @@ -1309,7 +1357,7 @@ String IMergeTreeDataPart::getRelativePathForPrefix(const String & prefix, bool else if (parent_part) full_relative_path /= parent_part->relative_path; - for (int try_no = 0; try_no < 10; try_no++) + for (int try_no = 0; try_no < 10; ++try_no) { res = (prefix.empty() ? "" : prefix + "_") + name + (try_no ? "_try" + DB::toString(try_no) : ""); @@ -1340,9 +1388,7 @@ void IMergeTreeDataPart::renameToDetached(const String & prefix) const void IMergeTreeDataPart::makeCloneInDetached(const String & prefix, const StorageMetadataPtr & /*metadata_snapshot*/) const { String destination_path = fs::path(storage.relative_data_path) / getRelativePathForDetachedPart(prefix); - - /// Backup is not recursive (max_level is 0), so do not copy inner directories - localBackup(volume->getDisk(), getFullRelativePath(), destination_path, 0); + localBackup(volume->getDisk(), getFullRelativePath(), destination_path); volume->getDisk()->removeFileIfExists(fs::path(destination_path) / DELETE_ON_DESTROY_MARKER_FILE_NAME); } @@ -1558,15 +1604,6 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada return true; } -SerializationPtr IMergeTreeDataPart::getSerializationForColumn(const NameAndTypePair & column) const -{ - return IDataType::getSerialization(column, - [&](const String & stream_name) - { - return checksums.files.count(stream_name + DATA_FILE_EXTENSION) != 0; - }); -} - String IMergeTreeDataPart::getUniqueId() const { auto disk = volume->getDisk(); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 3515da20fa9..ab08ca1c33a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -70,7 +71,7 @@ public: const IMergeTreeDataPart * parent_part_); IMergeTreeDataPart( - MergeTreeData & storage_, + const MergeTreeData & storage_, const String & name_, const VolumePtr & volume, const std::optional & relative_path, @@ -127,9 +128,12 @@ public: String getTypeName() const { return getType().toString(); } - void setColumns(const NamesAndTypesList & new_columns); + void setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos = {}); const NamesAndTypesList & getColumns() const { return columns; } + const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; } + SerializationInfoByName & getSerializationInfos() { return serialization_infos; } + SerializationPtr getSerialization(const NameAndTypePair & column) const; /// Throws an exception if part is not stored in on-disk format. void assertOnDisk() const; @@ -192,12 +196,12 @@ public: size_t rows_count = 0; - time_t modification_time = 0; /// When the part is removed from the working set. Changes once. mutable std::atomic remove_time { std::numeric_limits::max() }; /// If true, the destructor will delete the directory with the part. + /// FIXME Why do we need this flag? What's difference from Temporary and DeleteOnDestroy state? Can we get rid of this? 
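+ /// removeIfNeeded() refuses to delete such a part unless its directory name starts with "tmp" or ends with ".tmp_proj".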
bool is_temp = false; /// If true it means that there are no ZooKeeper node for this part, so it should be deleted only from filesystem @@ -390,14 +394,16 @@ public: static inline constexpr auto UUID_FILE_NAME = "uuid.txt"; + /// File that contains information about kinds of serialization of columns + /// and information that helps to choose kind of serialization later during merging + /// (number of rows, number of rows with default values, etc). + static inline constexpr auto SERIALIZATION_FILE_NAME = "serialization.json"; + /// Checks that all TTLs (table min/max, column ttls, so on) for part /// calculated. Part without calculated TTL may exist if TTL was added after /// part creation (using alter query with materialize_ttl setting). bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const; - /// Returns serialization for column according to files in which column is written in part. - SerializationPtr getSerializationForColumn(const NameAndTypePair & column) const; - /// Return some uniq string for file /// Required for distinguish different copies of the same part on S3 String getUniqueId() const; @@ -420,6 +426,7 @@ protected: /// Columns description. Cannot be changed, after part initialization. NamesAndTypesList columns; + const Type part_type; /// Not null when it's a projection part. @@ -444,6 +451,9 @@ private: /// In compact parts order of columns is necessary NameToNumber column_name_to_position; + /// Map from name of column to its serialization info. + SerializationInfoByName serialization_infos; + /// Reads part unique identifier (if exists) from uuid.txt void loadUUID(); diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 75eb01ed73c..79186402027 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -297,7 +297,7 @@ IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const St { if (typeid_cast(part_column.type.get())) { - auto position = data_part->getColumnPosition(part_column.name); + auto position = data_part->getColumnPosition(part_column.getNameInStorage()); if (position && Nested::extractTableName(part_column.name) == table_name) return position; } diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp index 48fd9e583bf..5393d71ff86 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp @@ -4,19 +4,34 @@ namespace DB { + IMergedBlockOutputStream::IMergedBlockOutputStream( const MergeTreeDataPartPtr & data_part, - const StorageMetadataPtr & metadata_snapshot_) + const StorageMetadataPtr & metadata_snapshot_, + const NamesAndTypesList & columns_list, + bool reset_columns_) : storage(data_part->storage) , metadata_snapshot(metadata_snapshot_) , volume(data_part->volume) , part_path(data_part->isStoredOnDisk() ? 
data_part->getFullRelativePath() : "") + , reset_columns(reset_columns_) { + if (reset_columns) + { + SerializationInfo::Settings info_settings = + { + .ratio_of_defaults_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization, + .choose_kind = false, + }; + + new_serialization_infos = SerializationInfoByName(columns_list, info_settings); + } } NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( const MergeTreeDataPartPtr & data_part, NamesAndTypesList & columns, + SerializationInfoByName & serialization_infos, MergeTreeData::DataPart::Checksums & checksums) { const NameSet & empty_columns = data_part->expired_columns; @@ -28,10 +43,9 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( /// Collect counts for shared streams of different columns. As an example, Nested columns have shared stream with array sizes. std::map stream_counts; - for (const NameAndTypePair & column : columns) + for (const auto & column : columns) { - auto serialization = data_part->getSerializationForColumn(column); - serialization->enumerateStreams( + data_part->getSerialization(column)->enumerateStreams( [&](const ISerialization::SubstreamPath & substream_path) { ++stream_counts[ISerialization::getFileNameForStream(column, substream_path)]; @@ -57,8 +71,8 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( } }; - auto serialization = data_part->getSerializationForColumn(*column_with_type); - serialization->enumerateStreams(callback); + data_part->getSerialization(*column_with_type)->enumerateStreams(callback); + serialization_infos.erase(column_name); } /// Remove files on disk and checksums diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.h b/src/Storages/MergeTree/IMergedBlockOutputStream.h index 36fbe76cca2..a7c25edabd4 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.h +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.h @@ -13,7 +13,9 @@ class IMergedBlockOutputStream public: IMergedBlockOutputStream( const MergeTreeDataPartPtr & data_part, - const StorageMetadataPtr & metadata_snapshot_); + const StorageMetadataPtr & metadata_snapshot_, + const NamesAndTypesList & columns_list, + bool reset_columns_); virtual ~IMergedBlockOutputStream() = default; @@ -36,6 +38,7 @@ protected: static NameSet removeEmptyColumnsFromPart( const MergeTreeDataPartPtr & data_part, NamesAndTypesList & columns, + SerializationInfoByName & serialization_infos, MergeTreeData::DataPart::Checksums & checksums); const MergeTreeData & storage; @@ -45,6 +48,9 @@ protected: String part_path; IMergeTreeDataPart::MergeTreeWriterPtr writer; + + bool reset_columns = false; + SerializationInfoByName new_serialization_infos; }; using IMergedBlockOutputStreamPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/LeaderElection.h b/src/Storages/MergeTree/LeaderElection.h index afaf2e7e841..aadaf953e73 100644 --- a/src/Storages/MergeTree/LeaderElection.h +++ b/src/Storages/MergeTree/LeaderElection.h @@ -48,6 +48,7 @@ void checkNoOldLeaders(Poco::Logger * log, ZooKeeper & zookeeper, const String p } else { + std::sort(potential_leaders.begin(), potential_leaders.end()); if (potential_leaders.front() == persistent_multiple_leaders) return; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index a3e549ecda3..b4ecfbebdcb 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -158,12 +159,20 @@ bool 
MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->parent_part); global_ctx->new_data_part->uuid = global_ctx->future_part->uuid; - global_ctx->new_data_part->setColumns(global_ctx->storage_columns); global_ctx->new_data_part->partition.assign(global_ctx->future_part->getPartition()); global_ctx->new_data_part->is_temp = global_ctx->parent_part == nullptr; ctx->need_remove_expired_values = false; ctx->force_ttl = false; + + SerializationInfo::Settings info_settings = + { + .ratio_of_defaults_for_sparse = global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization, + .choose_kind = true, + }; + + SerializationInfoByName infos(global_ctx->storage_columns, info_settings); + for (const auto & part : global_ctx->future_part->parts) { global_ctx->new_data_part->ttl_infos.update(part->ttl_infos); @@ -173,8 +182,12 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() ctx->need_remove_expired_values = true; ctx->force_ttl = true; } + + infos.add(part->getSerializationInfos()); } + global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos); + const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl; if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge) ctx->need_remove_expired_values = true; @@ -248,6 +261,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->merging_columns, MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()), ctx->compression_codec, + /*reset_columns=*/ true, ctx->blocks_are_granules_size); global_ctx->rows_written = 0; @@ -395,7 +409,7 @@ bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const { - const String & column_name = ctx->it_name_and_type->name; + const auto & [column_name, column_type] = *ctx->it_name_and_type; Names column_names{column_name}; ctx->progress_before = global_ctx->merge_list_element_ptr->progress.load(std::memory_order_relaxed); @@ -501,7 +515,14 @@ bool MergeTask::VerticalMergeStage::finalizeVerticalMergeForAllColumns() const bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() const { for (const auto & part : global_ctx->future_part->parts) - global_ctx->new_data_part->minmax_idx->merge(*part->minmax_idx); + { + /// Skip empty parts, + /// (that can be created in StorageReplicatedMergeTree::createEmptyPartInsteadOfLost()) + /// since they can incorrectly set min, + /// that will be changed after one more merge/OPTIMIZE. + if (!part->isEmpty()) + global_ctx->new_data_part->minmax_idx->merge(*part->minmax_idx); + } /// Print overall profiling info. 
NOTE: it may duplicates previous messages { diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e58472e572b..6597c28360d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -224,7 +224,6 @@ MergeTreeData::MergeTreeData( { try { - checkPartitionKeyAndInitMinMax(metadata_.partition_key); setProperties(metadata_, metadata_, attach); if (minmax_idx_date_column_pos == -1) @@ -1525,6 +1524,24 @@ void MergeTreeData::removePartsFinally(const MergeTreeData::DataPartsVector & pa } } +void MergeTreeData::flushAllInMemoryPartsIfNeeded() +{ + if (getSettings()->in_memory_parts_enable_wal) + return; + + auto metadata_snapshot = getInMemoryMetadataPtr(); + DataPartsVector parts = getDataPartsVector(); + for (const auto & part : parts) + { + if (auto part_in_memory = asInMemoryPart(part)) + { + const auto & storage_relative_path = part_in_memory->storage.relative_data_path; + part_in_memory->flushToDisk(storage_relative_path, part_in_memory->relative_path, metadata_snapshot); + } + } + +} + size_t MergeTreeData::clearOldPartsFromFilesystem(bool force) { DataPartsVector parts_to_remove = grabOldParts(force); @@ -3725,6 +3742,27 @@ std::unordered_set MergeTreeData::getPartitionIDsFromQuery(const ASTs & return partition_ids; } +std::set MergeTreeData::getPartitionIdsAffectedByCommands( + const MutationCommands & commands, ContextPtr query_context) const +{ + std::set affected_partition_ids; + + for (const auto & command : commands) + { + if (!command.partition) + { + affected_partition_ids.clear(); + break; + } + + affected_partition_ids.insert( + getPartitionIDFromQuery(command.partition, query_context) + ); + } + + return affected_partition_ids; +} + MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVector( const DataPartStates & affordable_states, DataPartStateVector * out_states, bool require_projection_parts) const diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 8830aaad7a1..380c2f4f4c5 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -548,6 +548,9 @@ public: /// Removes parts from data_parts, they should be in Deleting state void removePartsFinally(const DataPartsVector & parts); + /// When WAL is not enabled, the InMemoryParts need to be persistent. + void flushAllInMemoryPartsIfNeeded(); + /// Delete irrelevant parts from memory and disk. /// If 'force' - don't wait for old_parts_lifetime. size_t clearOldPartsFromFilesystem(bool force = false); @@ -685,6 +688,7 @@ public: /// For ATTACH/DETACH/DROP PARTITION. 
String getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr context) const; std::unordered_set getPartitionIDsFromQuery(const ASTs & asts, ContextPtr context) const; + std::set getPartitionIdsAffectedByCommands(const MutationCommands & commands, ContextPtr query_context) const; /// Extracts MergeTreeData of other *MergeTree* storage /// and checks that their structure suitable for ALTER TABLE ATTACH PARTITION FROM diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 6161c4c32a3..cb9fa7e6086 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -664,27 +664,55 @@ void MergeTreeDataMergerMutator::splitMutationCommands( } -NamesAndTypesList MergeTreeDataMergerMutator::getColumnsForNewDataPart( +std::pair +MergeTreeDataMergerMutator::getColumnsForNewDataPart( MergeTreeData::DataPartPtr source_part, const Block & updated_header, NamesAndTypesList storage_columns, + const SerializationInfoByName & serialization_infos, const MutationCommands & commands_for_removes) { - /// In compact parts we read all columns, because they all stored in a - /// single file - if (!isWidePart(source_part)) - return updated_header.getNamesAndTypesList(); - NameSet removed_columns; NameToNameMap renamed_columns_to_from; + NameToNameMap renamed_columns_from_to; + ColumnsDescription part_columns(source_part->getColumns()); + /// All commands are validated in AlterCommand so we don't care about order for (const auto & command : commands_for_removes) { + /// If we don't have this column in source part, than we don't need to materialize it + if (!part_columns.has(command.column_name)) + continue; + if (command.type == MutationCommand::DROP_COLUMN) removed_columns.insert(command.column_name); + if (command.type == MutationCommand::RENAME_COLUMN) + { renamed_columns_to_from.emplace(command.rename_to, command.column_name); + renamed_columns_from_to.emplace(command.column_name, command.rename_to); + } } + + bool is_wide_part = isWidePart(source_part); + SerializationInfoByName new_serialization_infos; + for (const auto & [name, info] : serialization_infos) + { + if (is_wide_part && removed_columns.count(name)) + continue; + + auto it = renamed_columns_from_to.find(name); + if (it != renamed_columns_from_to.end()) + new_serialization_infos.emplace(it->second, info); + else + new_serialization_infos.emplace(name, info); + } + + /// In compact parts we read all columns, because they all stored in a + /// single file + if (!is_wide_part) + return {updated_header.getNamesAndTypesList(), new_serialization_infos}; + Names source_column_names = source_part->getColumns().getNames(); NameSet source_columns_name_set(source_column_names.begin(), source_column_names.end()); for (auto it = storage_columns.begin(); it != storage_columns.end();) @@ -711,18 +739,9 @@ NamesAndTypesList MergeTreeDataMergerMutator::getColumnsForNewDataPart( } else { - bool was_renamed = false; - bool was_removed = removed_columns.count(it->name); - /// Check that this column was renamed to some other name - for (const auto & [rename_to, rename_from] : renamed_columns_to_from) - { - if (rename_from == it->name) - { - was_renamed = true; - break; - } - } + bool was_renamed = renamed_columns_from_to.count(it->name); + bool was_removed = removed_columns.count(it->name); /// If we want to rename this column to some other name, than it /// should it's previous version should be dropped or removed @@ -731,7 +750,6 @@ 
NamesAndTypesList MergeTreeDataMergerMutator::getColumnsForNewDataPart( ErrorCodes::LOGICAL_ERROR, "Incorrect mutation commands, trying to rename column {} to {}, but part {} already has column {}", renamed_columns_to_from[it->name], it->name, source_part->name, it->name); - /// Column was renamed and no other column renamed to it's name /// or column is dropped. if (!renamed_columns_to_from.count(it->name) && (was_renamed || was_removed)) @@ -742,7 +760,7 @@ NamesAndTypesList MergeTreeDataMergerMutator::getColumnsForNewDataPart( } } - return storage_columns; + return {storage_columns, new_serialization_infos}; } diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h index e5c8a4d8285..bcac642eb16 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h @@ -150,10 +150,11 @@ private: MutationCommands & for_file_renames); /// Get the columns list of the resulting part in the same order as storage_columns. - static NamesAndTypesList getColumnsForNewDataPart( + static std::pair getColumnsForNewDataPart( MergeTreeData::DataPartPtr source_part, const Block & updated_header, NamesAndTypesList storage_columns, + const SerializationInfoByName & serialization_infos, const MutationCommands & commands_for_removes); static ExecuteTTLType shouldExecuteTTL( diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index c4c2e65547b..f4da730b1f0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -125,7 +125,7 @@ void MergeTreeDataPartCompact::loadIndexGranularity() bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) const { - if (!getColumnPosition(column.name)) + if (!getColumnPosition(column.getNameInStorage())) return false; auto bin_checksum = checksums.files.find(DATA_FILE_NAME_WITH_EXTENSION); diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp index 3a1ea474d74..4ec53d88339 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp @@ -121,7 +121,9 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri auto projection_compression_codec = storage.getContext()->chooseCompressionCodec(0, 0); auto projection_indices = MergeTreeIndexFactory::instance().getMany(desc.metadata->getSecondaryIndices()); MergedBlockOutputStream projection_out( - projection_data_part, desc.metadata, projection_part->columns, projection_indices, projection_compression_codec); + projection_data_part, desc.metadata, projection_part->columns, projection_indices, + projection_compression_codec); + projection_out.write(projection_part->block); projection_out.writeSuffixAndFinalizePart(projection_data_part); new_data_part->addProjectionPart(projection_name, std::move(projection_data_part)); diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h index d1a0344859d..c5ee9ebd01f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h @@ -45,7 +45,7 @@ public: bool isStoredOnDisk() const override { return false; } bool isStoredOnRemoteDisk() const override { return false; } - bool hasColumnFiles(const NameAndTypePair & column) const override { return 
!!getColumnPosition(column.name); } + bool hasColumnFiles(const NameAndTypePair & column) const override { return !!getColumnPosition(column.getNameInStorage()); } String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return ""; } void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const override; void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 312f5b435d6..b279c1aba6a 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -80,8 +80,7 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl( if (checksums.empty()) return size; - auto serialization = getSerializationForColumn(column); - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + getSerialization(column)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { String file_name = ISerialization::getFileNameForStream(column, substream_path); @@ -163,8 +162,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const { for (const NameAndTypePair & name_type : columns) { - auto serialization = getSerializationForColumn(name_type); - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + getSerialization(name_type)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { String file_name = ISerialization::getFileNameForStream(name_type, substream_path); String mrk_file_name = file_name + index_granularity_info.marks_file_extension; @@ -178,7 +176,6 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const }); } } - } else { @@ -186,13 +183,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const std::optional marks_size; for (const NameAndTypePair & name_type : columns) { - auto serialization = IDataType::getSerialization(name_type, - [&](const String & stream_name) - { - return volume->getDisk()->exists(stream_name + DATA_FILE_EXTENSION); - }); - - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + getSerialization(name_type)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { auto file_path = path + ISerialization::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension; @@ -227,8 +218,7 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const }; bool res = true; - auto serialization = IDataType::getSerialization(column, check_stream_exists); - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + getSerialization(column)->enumerateStreams([&](const auto & substream_path) { String file_name = ISerialization::getFileNameForStream(column, substream_path); if (!check_stream_exists(file_name)) @@ -241,8 +231,7 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const String MergeTreeDataPartWide::getFileNameForColumn(const NameAndTypePair & column) const { String filename; - auto serialization = column.type->getDefaultSerialization(); - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + getSerialization(column)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { if (filename.empty()) filename = 
ISerialization::getFileNameForStream(column, substream_path); @@ -261,7 +250,10 @@ void MergeTreeDataPartWide::calculateEachColumnSizes(ColumnSizeByName & each_col #ifndef NDEBUG /// Most trivial types - if (rows_count != 0 && column.type->isValueRepresentedByNumber() && !column.type->haveSubtypes()) + if (rows_count != 0 + && column.type->isValueRepresentedByNumber() + && !column.type->haveSubtypes() + && getSerialization(column)->getKind() == ISerialization::Kind::DEFAULT) { size_t rows_in_column = size.data_uncompressed / column.type->getSizeOfValueInMemory(); if (rows_in_column != rows_count) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 5d17d6235e1..ce85bc75c80 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -66,7 +66,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, }; ISerialization::SubstreamPath path; - serializations[column.name]->enumerateStreams(path, callback, column.type, nullptr); + data_part->getSerialization(column)->enumerateStreams(path, callback, column.type); } namespace @@ -207,7 +207,7 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G writeIntBinary(UInt64(0), marks); writeColumnSingleGranule( - block.getByName(name_and_type->name), serializations[name_and_type->name], + block.getByName(name_and_type->name), data_part->getSerialization(*name_and_type), stream_getter, granule.start_row, granule.rows_to_write); /// Each type always have at least one substream diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 4263640c1e0..03ae6688beb 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -85,9 +85,6 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( if (!disk->exists(part_path)) disk->createDirectories(part_path); - for (const auto & column : columns_list) - serializations.emplace(column.name, column.type->getDefaultSerialization()); - if (settings.rewrite_primary_key) initPrimaryIndex(); initSkipIndices(); @@ -119,7 +116,7 @@ static size_t computeIndexGranularityImpl( } else { - size_t size_of_row_in_bytes = block_size_in_memory / rows_in_block; + size_t size_of_row_in_bytes = std::max(block_size_in_memory / rows_in_block, 1UL); index_granularity_for_block = index_granularity_bytes / size_of_row_in_bytes; } } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index e64ba9edec0..fb46175c2aa 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -131,9 +131,6 @@ protected: MergeTreeIndexAggregators skip_indices_aggregators; std::vector skip_index_accumulated_marks; - using SerializationsMap = std::unordered_map; - SerializationsMap serializations; - std::unique_ptr index_file_stream; std::unique_ptr index_stream; DataTypes index_types; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 224a197c3c8..b620bf8130e 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -85,7 +86,6 @@ 
MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( addStreams(it, columns.getCodecDescOrDefault(it.name, default_codec)); } - void MergeTreeDataPartWriterWide::addStreams( const NameAndTypePair & column, const ASTPtr & effective_codec_desc) @@ -94,6 +94,7 @@ void MergeTreeDataPartWriterWide::addStreams( { assert(!substream_path.empty()); String stream_name = ISerialization::getFileNameForStream(column, substream_path); + /// Shared offsets for Nested type. if (column_streams.count(stream_name)) return; @@ -117,7 +118,7 @@ void MergeTreeDataPartWriterWide::addStreams( }; ISerialization::SubstreamPath path; - serializations[column.name]->enumerateStreams(path, callback, column.type, nullptr); + data_part->getSerialization(column)->enumerateStreams(path, callback, column.type); } @@ -196,7 +197,9 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm fillIndexGranularity(index_granularity_for_block, block.rows()); } - auto granules_to_write = getGranulesToWrite(index_granularity, block.rows(), getCurrentMark(), rows_written_in_last_mark); + Block block_to_write = block; + + auto granules_to_write = getGranulesToWrite(index_granularity, block_to_write.rows(), getCurrentMark(), rows_written_in_last_mark); auto offset_columns = written_offset_columns ? *written_offset_columns : WrittenOffsetColumns{}; Block primary_key_block; @@ -208,7 +211,10 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm auto it = columns_list.begin(); for (size_t i = 0; i < columns_list.size(); ++i, ++it) { - const ColumnWithTypeAndName & column = block.getByName(it->name); + auto & column = block_to_write.getByName(it->name); + + if (data_part->getSerialization(*it)->getKind() != ISerialization::Kind::SPARSE) + column.column = recursiveRemoveSparse(column.column); if (permutation) { @@ -269,7 +275,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( ISerialization::SubstreamPath & path) { StreamsWithMarks result; - serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + data_part->getSerialization(column)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; @@ -304,7 +310,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule( ISerialization::SerializeBinaryBulkSettings & serialize_settings, const Granule & granule) { - const auto & serialization = serializations[name_and_type.name]; + const auto & serialization = data_part->getSerialization(name_and_type); serialization->serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state); /// So that instead of the marks pointing to the end of the compressed block, there were marks pointing to the beginning of the next one. 
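In the `MergeTreeDataPartWriterWide::write` hunk above, a column is expanded back to a full column with `recursiveRemoveSparse` whenever the part's chosen serialization for it is not SPARSE, so a sparse in-memory representation never reaches a writer that expects the default layout. The sketch below is a toy model of that expansion, assuming a sparse column stores only its non-default values plus their row numbers; `SparseColumn` and `toDense` are names invented for this example, not ClickHouse's `ColumnSparse` API.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Toy sparse column: only non-default values are stored, together with the
// rows that hold them; every other row implicitly contains the default (0).
// This mirrors the idea of ColumnSparse, not its real layout.
struct SparseColumn
{
    std::size_t size = 0;              // total number of rows
    std::vector<std::size_t> offsets;  // rows holding non-default values
    std::vector<double> values;        // the non-default values themselves
};

// Expand to a dense vector: the moral equivalent of recursiveRemoveSparse()
// before handing the column to a writer that expects the default layout.
static std::vector<double> toDense(const SparseColumn & sparse)
{
    std::vector<double> dense(sparse.size, 0.0);
    for (std::size_t i = 0; i < sparse.offsets.size(); ++i)
        dense[sparse.offsets[i]] = sparse.values[i];
    return dense;
}

int main()
{
    SparseColumn sparse{8, {2, 5}, {3.5, -1.0}};
    for (double v : toDense(sparse))
        std::cout << v << ' ';
    std::cout << '\n'; // prints: 0 0 3.5 0 0 -1 0 0
}
```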
@@ -334,12 +340,13 @@ void MergeTreeDataPartWriterWide::writeColumn( const auto & [name, type] = name_and_type; auto [it, inserted] = serialization_states.emplace(name, nullptr); + auto serialization = data_part->getSerialization(name_and_type); if (inserted) { ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = createStreamGetter(name_and_type, offset_columns); - serializations[name]->serializeBinaryBulkStatePrefix(serialize_settings, it->second); + serialization->serializeBinaryBulkStatePrefix(serialize_settings, it->second); } const auto & global_settings = storage.getContext()->getSettingsRef(); @@ -380,7 +387,7 @@ void MergeTreeDataPartWriterWide::writeColumn( } } - serializations[name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) @@ -392,10 +399,13 @@ void MergeTreeDataPartWriterWide::writeColumn( } -void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, const IDataType & type) +void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePair & name_type) { - if (!type.isValueRepresentedByNumber() || type.haveSubtypes()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot validate column of non fixed type {}", type.getName()); + const auto & [name, type] = name_type; + const auto & serialization = data_part->getSerialization(name_type); + + if (!type->isValueRepresentedByNumber() || type->haveSubtypes() || serialization->getKind() != ISerialization::Kind::DEFAULT) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot validate column of non fixed type {}", type->getName()); auto disk = data_part->volume->getDisk(); String escaped_name = escapeForFileName(name); @@ -410,7 +420,6 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, size_t mark_num; - const auto & serialization = serializations[name]; for (mark_num = 0; !mrk_in->eof(); ++mark_num) { if (mark_num > index_granularity.getMarksCount()) @@ -436,7 +445,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, if (index_granularity_rows == 0) { - auto column = type.createColumn(); + auto column = type->createColumn(); serialization->deserializeBinaryBulk(*column, bin_in, 1000000000, 0.0); @@ -456,7 +465,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, ErrorCodes::LOGICAL_ERROR, "Incorrect mark rows for part {} for mark #{} (compressed offset {}, decompressed offset {}), in-memory {}, on disk {}, total marks {}", data_part->getFullPath(), mark_num, offset_in_compressed_file, offset_in_decompressed_block, index_granularity.getMarkRows(mark_num), index_granularity_rows, index_granularity.getMarksCount()); - auto column = type.createColumn(); + auto column = type->createColumn(); serialization->deserializeBinaryBulk(*column, bin_in, index_granularity_rows, 0.0); @@ -495,7 +504,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, "Still have something in marks stream, last mark #{} index granularity size {}, last rows {}", mark_num, index_granularity.getMarksCount(), index_granularity_rows); if (!bin_in.eof()) { - auto column = type.createColumn(); + auto column = type->createColumn(); serialization->deserializeBinaryBulk(*column, bin_in, 1000000000, 0.0); @@ -531,7 +540,7 
@@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch if (!serialization_states.empty()) { serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? *written_offset_columns : offset_columns); - serializations[it->name]->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); + data_part->getSerialization(*it)->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); } if (write_final_mark) @@ -554,8 +563,12 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch /// data according to marks. Otherwise throws LOGICAL_ERROR (equal to abort in debug mode) for (const auto & column : columns_list) { - if (column.type->isValueRepresentedByNumber() && !column.type->haveSubtypes()) - validateColumnOfFixedSize(column.name, *column.type); + if (column.type->isValueRepresentedByNumber() + && !column.type->haveSubtypes() + && data_part->getSerialization(column)->getKind() == ISerialization::Kind::DEFAULT) + { + validateColumnOfFixedSize(column); + } } #endif @@ -580,7 +593,7 @@ void MergeTreeDataPartWriterWide::writeFinalMark( { writeSingleMark(column, offset_columns, 0, path); /// Memoize information about offsets - serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + data_part->getSerialization(column)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index 5eaaa0c1bbe..6303fbbac0d 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -84,7 +84,7 @@ private: /// Method for self check (used in debug-build only). Checks that written /// data and corresponding marks are consistent. Otherwise throws logical /// errors. 
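The reshaped `validateColumnOfFixedSize` above only runs for fixed-size, non-nested columns written with the DEFAULT serialization, since a sparsely written column no longer stores one fixed-size value per row and the byte-count arithmetic would not hold. A minimal sketch of that eligibility test and the expected-row calculation; `SerializationKind` and `ColumnTraits` are stand-ins invented for the example, not the real interfaces.

```cpp
#include <cstddef>
#include <iostream>

// Stand-ins invented for this example; ClickHouse uses IDataType and
// ISerialization::Kind instead.
enum class SerializationKind { Default, Sparse };

struct ColumnTraits
{
    bool value_represented_by_number;  // fixed-size scalar (numbers, dates, ...)
    bool has_subtypes;                 // Array, Tuple, Nullable, ...
    std::size_t value_size;            // sizeof one value when fixed-size
};

// Only plainly (non-sparse) serialized fixed-size columns can be checked by
// dividing the uncompressed byte count by the value size.
static bool canValidateByByteCount(const ColumnTraits & c, SerializationKind kind)
{
    return c.value_represented_by_number
        && !c.has_subtypes
        && kind == SerializationKind::Default;
}

static std::size_t expectedRows(const ColumnTraits & c, std::size_t data_uncompressed_bytes)
{
    return data_uncompressed_bytes / c.value_size; // meaningful only when eligible
}

int main()
{
    ColumnTraits uint32_col{true, false, 4};
    ColumnTraits array_col{false, true, 0};

    std::cout << std::boolalpha
              << canValidateByByteCount(uint32_col, SerializationKind::Default) << ' '   // true
              << canValidateByByteCount(uint32_col, SerializationKind::Sparse) << ' '    // false
              << canValidateByByteCount(array_col, SerializationKind::Default) << '\n';  // false

    std::cout << expectedRows(uint32_col, 4096) << " rows expected\n"; // 1024
}
```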
- void validateColumnOfFixedSize(const String & name, const IDataType & type); + void validateColumnOfFixedSize(const NameAndTypePair & name_type); void fillIndexGranularity(size_t index_granularity_for_block, size_t rows_in_block) override; diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 2cf24215d28..9f17a44a7f8 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -198,36 +198,41 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts( return result; } -Block MergeTreeDataWriter::mergeBlock(const Block & block, SortDescription sort_description, Names & partition_key_columns, IColumn::Permutation *& permutation) +Block MergeTreeDataWriter::mergeBlock( + const Block & block, + SortDescription sort_description, + const Names & partition_key_columns, + IColumn::Permutation *& permutation, + const MergeTreeData::MergingParams & merging_params) { size_t block_size = block.rows(); auto get_merging_algorithm = [&]() -> std::shared_ptr { - switch (data.merging_params.mode) + switch (merging_params.mode) { /// There is nothing to merge in single block in ordinary MergeTree case MergeTreeData::MergingParams::Ordinary: return nullptr; case MergeTreeData::MergingParams::Replacing: return std::make_shared( - block, 1, sort_description, data.merging_params.version_column, block_size + 1); + block, 1, sort_description, merging_params.version_column, block_size + 1); case MergeTreeData::MergingParams::Collapsing: return std::make_shared( - block, 1, sort_description, data.merging_params.sign_column, + block, 1, sort_description, merging_params.sign_column, false, block_size + 1, &Poco::Logger::get("MergeTreeBlockOutputStream")); case MergeTreeData::MergingParams::Summing: return std::make_shared( - block, 1, sort_description, data.merging_params.columns_to_sum, + block, 1, sort_description, merging_params.columns_to_sum, partition_key_columns, block_size + 1); case MergeTreeData::MergingParams::Aggregating: return std::make_shared(block, 1, sort_description, block_size + 1); case MergeTreeData::MergingParams::VersionedCollapsing: return std::make_shared( - block, 1, sort_description, data.merging_params.sign_column, block_size + 1); + block, 1, sort_description, merging_params.sign_column, block_size + 1); case MergeTreeData::MergingParams::Graphite: return std::make_shared( - block, 1, sort_description, block_size + 1, data.merging_params.graphite_params, time(nullptr)); + block, 1, sort_description, block_size + 1, merging_params.graphite_params, time(nullptr)); } __builtin_unreachable(); @@ -330,7 +335,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart( Names partition_key_columns = metadata_snapshot->getPartitionKey().column_names; if (context->getSettingsRef().optimize_on_insert) - block = mergeBlock(block, sort_description, partition_key_columns, perm_ptr); + block = mergeBlock(block, sort_description, partition_key_columns, perm_ptr, data.merging_params); /// Size of part would not be greater than block.bytes() + epsilon size_t expected_size = block.bytes(); @@ -359,7 +364,13 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart( if (data.storage_settings.get()->assign_part_uuids) new_data_part->uuid = UUIDHelpers::generateV4(); - new_data_part->setColumns(columns); + const auto & data_settings = data.getSettings(); + + SerializationInfo::Settings settings{data_settings->ratio_of_defaults_for_sparse_serialization, true}; + 
SerializationInfoByName infos(columns, settings); + infos.add(block); + + new_data_part->setColumns(columns, infos); new_data_part->rows_count = block.rows(); new_data_part->partition = std::move(partition); new_data_part->minmax_idx = std::move(minmax_idx); @@ -407,8 +418,10 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart( auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0); const auto & index_factory = MergeTreeIndexFactory::instance(); - MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec); - bool sync_on_insert = data.getSettings()->fsync_after_insert; + MergedBlockOutputStream out(new_data_part, metadata_snapshot,columns, + index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec); + + bool sync_on_insert = data_settings->fsync_after_insert; out.writeWithPermutation(block, perm_ptr); @@ -429,7 +442,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart( } MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl( - const String part_name, + const String & part_name, MergeTreeDataPartType part_type, const String & relative_path, bool is_temp, @@ -437,8 +450,9 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl( const MergeTreeData & data, Poco::Logger * log, Block block, - const StorageMetadataPtr & metadata_snapshot) + const ProjectionDescription & projection) { + const StorageMetadataPtr & metadata_snapshot = projection.metadata; MergeTreePartInfo new_part_info("all", 0, 0, 0); auto new_data_part = data.createPart( part_name, @@ -450,7 +464,11 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl( new_data_part->is_temp = is_temp; NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); - new_data_part->setColumns(columns); + SerializationInfo::Settings settings{data.getSettings()->ratio_of_defaults_for_sparse_serialization, true}; + SerializationInfoByName infos(columns, settings); + infos.add(block); + + new_data_part->setColumns(columns, infos); if (new_data_part->isStoredOnDisk()) { @@ -494,6 +512,13 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl( ProfileEvents::increment(ProfileEvents::MergeTreeDataProjectionWriterBlocksAlreadySorted); } + if (projection.type == ProjectionDescription::Type::Aggregate) + { + MergeTreeData::MergingParams projection_merging_params; + projection_merging_params.mode = MergeTreeData::MergingParams::Aggregating; + block = mergeBlock(block, sort_description, {}, perm_ptr, projection_merging_params); + } + /// This effectively chooses minimal compression method: /// either default lz4 or compression method with zero thresholds on absolute and relative part size. 
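The `SerializationInfoByName infos(columns, settings); infos.add(block);` step above tallies, per column, how many values equal the type's default; that ratio is later compared against `ratio_of_defaults_for_sparse_serialization` (declared in MergeTreeSettings.h further down) to choose between sparse and full serialization. Below is a standalone sketch of that decision for a numeric column whose default is 0; `ColumnStats` and `shouldUseSparse` are illustrative names, not the SerializationInfo interface.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Per-column bookkeeping in the spirit of SerializationInfo: count how many
// values equal the type's default (0 for numbers) and decide whether the
// column would be written sparsely under a given threshold.
struct ColumnStats
{
    std::size_t total_rows = 0;
    std::size_t default_rows = 0;

    void add(const std::vector<double> & column)
    {
        for (double v : column)
        {
            ++total_rows;
            if (v == 0.0)
                ++default_rows;
        }
    }

    // ratio_threshold plays the role of ratio_of_defaults_for_sparse_serialization;
    // per the setting's description, values >= 1 keep full serialization.
    bool shouldUseSparse(double ratio_threshold) const
    {
        if (total_rows == 0 || ratio_threshold >= 1.0)
            return false;
        double ratio = static_cast<double>(default_rows) / total_rows;
        return ratio >= ratio_threshold;
    }
};

int main()
{
    ColumnStats stats;
    stats.add({0, 0, 0, 0, 0, 0, 0, 0, 0, 3.5}); // 9 of 10 values are defaults

    std::cout << std::boolalpha
              << stats.shouldUseSparse(0.9) << ' '   // true: ratio 0.9 reaches the threshold
              << stats.shouldUseSparse(1.0) << '\n'; // false: >= 1 disables sparse writing
}
```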
auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0); @@ -542,7 +567,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPart( data, log, block, - projection.metadata); + projection); } /// This is used for projection materialization process which may contain multiple stages of @@ -579,7 +604,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempProjectionPart( data, log, block, - projection.metadata); + projection); } MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeInMemoryProjectionPart( @@ -598,7 +623,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeInMemoryProjectionPa data, log, block, - projection.metadata); + projection); } } diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index 006f897c3e2..f16ec877113 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -33,7 +33,7 @@ using BlocksWithPartition = std::vector; class MergeTreeDataWriter { public: - MergeTreeDataWriter(MergeTreeData & data_) : data(data_), log(&Poco::Logger::get(data.getLogName() + " (Writer)")) {} + explicit MergeTreeDataWriter(MergeTreeData & data_) : data(data_), log(&Poco::Logger::get(data.getLogName() + " (Writer)")) {} /** Split the block to blocks, each of them must be written as separate part. * (split rows by partition) @@ -74,11 +74,16 @@ public: const ProjectionDescription & projection, const IMergeTreeDataPart * parent_part); - Block mergeBlock(const Block & block, SortDescription sort_description, Names & partition_key_columns, IColumn::Permutation *& permutation); + static Block mergeBlock( + const Block & block, + SortDescription sort_description, + const Names & partition_key_columns, + IColumn::Permutation *& permutation, + const MergeTreeData::MergingParams & merging_params); private: static MergeTreeData::MutableDataPartPtr writeProjectionPartImpl( - const String part_name, + const String & part_name, MergeTreeDataPartType part_type, const String & relative_path, bool is_temp, @@ -86,7 +91,7 @@ private: const MergeTreeData & data, Poco::Logger * log, Block block, - const StorageMetadataPtr & metadata_snapshot); + const ProjectionDescription & projection); MergeTreeData & data; diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index a8820b3f6d4..9332f4fd442 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -112,7 +112,7 @@ void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, size_t element_start_row = column_offsets[current_position - 1]; size_t elements_size = column_offsets[current_position] - element_start_row; - for (size_t row_num = 0; row_num < elements_size; row_num++) + for (size_t row_num = 0; row_num < elements_size; ++row_num) { auto ref = column_key.getDataAt(element_start_row + row_num); token_extractor->stringPaddedToBloomFilter(ref.data, ref.size, granule->bloom_filters[col]); diff --git a/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.cpp b/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.cpp index 6900ae1e69b..84195eb71b2 100644 --- a/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.cpp @@ -146,10 +146,15 @@ bool MergeTreeIndexhypothesisMergedCondition::mayBeTrueOnGranule(const MergeTree 
values.push_back(granule->met); } - if (const auto it = answer_cache.find(values); it != std::end(answer_cache)) - return it->second; + const ComparisonGraph * graph = nullptr; - const auto & graph = getGraph(values); + { + std::lock_guard lock(cache_mutex); + if (const auto it = answer_cache.find(values); it != std::end(answer_cache)) + return it->second; + + graph = getGraph(values); + } bool always_false = false; expression_cnf->iterateGroups( @@ -166,7 +171,7 @@ bool MergeTreeIndexhypothesisMergedCondition::mayBeTrueOnGranule(const MergeTree if (func && func->arguments->children.size() == 2) { const auto expected = ComparisonGraph::atomToCompareResult(atom); - if (graph.isPossibleCompare(expected, func->arguments->children[0], func->arguments->children[1])) + if (graph->isPossibleCompare(expected, func->arguments->children[0], func->arguments->children[1])) { /// If graph failed use matching. /// We don't need to check constraints. @@ -177,6 +182,8 @@ bool MergeTreeIndexhypothesisMergedCondition::mayBeTrueOnGranule(const MergeTree always_false = true; }); + std::lock_guard lock(cache_mutex); + answer_cache[values] = !always_false; return !always_false; } @@ -195,11 +202,13 @@ std::unique_ptr MergeTreeIndexhypothesisMergedCondition::buildG return std::make_unique(active_atomic_formulas); } -const ComparisonGraph & MergeTreeIndexhypothesisMergedCondition::getGraph(const std::vector & values) const +const ComparisonGraph * MergeTreeIndexhypothesisMergedCondition::getGraph(const std::vector & values) const { - if (!graph_cache.contains(values)) - graph_cache[values] = buildGraph(values); - return *graph_cache.at(values); + auto [it, inserted] = graph_cache.try_emplace(values); + if (inserted) + it->second = buildGraph(values); + + return it->second.get(); } } diff --git a/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.h b/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.h index 530e14e15cc..9ebcbe9d7dc 100644 --- a/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.h +++ b/src/Storages/MergeTree/MergeTreeIndexHypothesisMergedCondition.h @@ -21,11 +21,14 @@ public: private: void addConstraints(const ConstraintsDescription & constraints_description); std::unique_ptr buildGraph(const std::vector & values) const; - const ComparisonGraph & getGraph(const std::vector & values) const; + const ComparisonGraph * getGraph(const std::vector & values) const; ASTPtr expression_ast; std::unique_ptr expression_cnf; + /// Part analysis can be done in parallel. + /// So, we have shared answer and graph cache. + mutable std::mutex cache_mutex; mutable std::unordered_map, std::unique_ptr> graph_cache; mutable std::unordered_map, bool> answer_cache; diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 8f10b2c51ba..1e001d01ada 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -147,9 +147,11 @@ struct IMergeTreeIndex /// Returns extension for deserialization. /// /// Return pair. - virtual MergeTreeIndexFormat getDeserializedFormat(const DiskPtr, const std::string & /* relative_path_prefix */) const + virtual MergeTreeIndexFormat getDeserializedFormat(const DiskPtr disk, const std::string & relative_path_prefix) const { - return {1, ".idx"}; + if (disk->exists(relative_path_prefix + ".idx")) + return {1, ".idx"}; + return {0 /*unknown*/, ""}; } /// Checks whether the column is in data skipping index. 
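The hypothesis-index hunk above makes `graph_cache` and `answer_cache` safe for parallel part analysis: lookups and insertions now happen under `cache_mutex`, and `getGraph` uses `try_emplace` so the comparison graph is built exactly once per key and handed out as a stable pointer. A self-contained sketch of that pattern, with generic names (`Expensive`, `GraphCache`) standing in for the real types.

```cpp
#include <map>
#include <memory>
#include <mutex>
#include <vector>

// Placeholder for the expensive-to-build object (the comparison graph).
struct Expensive
{
    explicit Expensive(std::vector<bool> key_) : key(std::move(key_)) {}
    std::vector<bool> key;
};

// Thread-safe "build once, then reuse" cache keyed by the vector of granule
// answers, in the spirit of getGraph() guarded by cache_mutex.
class GraphCache
{
public:
    // The returned pointer stays valid for the cache's lifetime:
    // std::map never invalidates references to existing elements.
    const Expensive * get(const std::vector<bool> & values)
    {
        std::lock_guard<std::mutex> lock(mutex);
        auto [it, inserted] = cache.try_emplace(values);
        if (inserted)
            it->second = std::make_unique<Expensive>(values); // built only once per key
        return it->second.get();
    }

private:
    std::mutex mutex;
    std::map<std::vector<bool>, std::unique_ptr<Expensive>> cache;
};

int main()
{
    GraphCache cache;
    const Expensive * a = cache.get({true, false});
    const Expensive * b = cache.get({true, false});
    return a == b ? 0 : 1; // the second lookup reuses the first object
}
```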
diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 15e7ed4c1d0..5a889ea5e8b 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -123,6 +123,9 @@ bool MergeTreePartsMover::selectPartsForMove( auto metadata_snapshot = data->getInMemoryMetadataPtr(); + if (need_to_move.empty() && !metadata_snapshot->hasAnyMoveTTL()) + return false; + for (const auto & part : data_parts) { String reason; @@ -228,6 +231,7 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part->getFullPath()); cloned_part->loadColumnsChecksumsIndexes(true, true); + cloned_part->modification_time = disk->getLastModified(cloned_part->getFullRelativePath()).epochTime(); return cloned_part; } diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 09542c30636..c89affb5365 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -198,7 +198,9 @@ std::vector MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & for (const auto i : collections::range(0, parts.size())) { const auto & part = parts[i]; - is_part_on_remote_disk[i] = part.data_part->isStoredOnRemoteDisk(); + bool part_on_remote_disk = part.data_part->isStoredOnRemoteDisk(); + is_part_on_remote_disk[i] = part_on_remote_disk; + do_not_steal_tasks |= part_on_remote_disk; /// Read marks for every data part. size_t sum_marks = 0; diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 1f8642db886..b594b59fdfa 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -54,7 +54,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( { auto column_from_part = getColumnFromPart(*name_and_type); - auto position = data_part->getColumnPosition(column_from_part.name); + auto position = data_part->getColumnPosition(column_from_part.getNameInStorage()); if (!position && typeid_cast(column_from_part.type.get())) { /// If array of Nested column is missing in part, @@ -140,8 +140,12 @@ size_t MergeTreeReaderCompact::readRows( if (!column_positions[i]) continue; + auto column_from_part = getColumnFromPart(*column_it); if (res_columns[i] == nullptr) - res_columns[i] = getColumnFromPart(*column_it).type->createColumn(); + { + auto serialization = data_part->getSerialization(column_from_part); + res_columns[i] = column_from_part.type->createColumn(*serialization); + } } while (read_rows < max_rows_to_read) @@ -199,6 +203,8 @@ void MergeTreeReaderCompact::readData( { const auto & [name, type] = name_and_type; + adjustUpperBound(current_task_last_mark); /// Must go before seek. + if (!isContinuousReading(from_mark, column_position)) seekToMark(from_mark, column_position); @@ -207,8 +213,6 @@ void MergeTreeReaderCompact::readData( if (only_offsets && (substream_path.size() != 1 || substream_path[0].type != ISerialization::Substream::ArraySizes)) return nullptr; - /// For asynchronous reading from remote fs. 
- data_buffer->setReadUntilPosition(marks_loader.getMark(current_task_last_mark).offset_in_compressed_file); return data_buffer; }; @@ -220,9 +224,11 @@ void MergeTreeReaderCompact::readData( if (name_and_type.isSubcolumn()) { const auto & type_in_storage = name_and_type.getTypeInStorage(); - ColumnPtr temp_column = type_in_storage->createColumn(); + const auto & name_in_storage = name_and_type.getNameInStorage(); + + auto serialization = data_part->getSerialization(NameAndTypePair{name_in_storage, type_in_storage}); + ColumnPtr temp_column = type_in_storage->createColumn(*serialization); - auto serialization = type_in_storage->getDefaultSerialization(); serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, state); serialization->deserializeBinaryBulkWithMultipleStreams(temp_column, rows_to_read, deserialize_settings, state, nullptr); @@ -236,7 +242,7 @@ void MergeTreeReaderCompact::readData( } else { - auto serialization = type->getDefaultSerialization(); + auto serialization = data_part->getSerialization(name_and_type); serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, state); serialization->deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state, nullptr); } @@ -269,6 +275,34 @@ void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index) } } +void MergeTreeReaderCompact::adjustUpperBound(size_t last_mark) +{ + auto right_offset = marks_loader.getMark(last_mark).offset_in_compressed_file; + if (!right_offset) + { + /// If already reading till the end of file. + if (last_right_offset && *last_right_offset == 0) + return; + + last_right_offset = 0; // Zero value means the end of file. + if (cached_buffer) + cached_buffer->setReadUntilEnd(); + if (non_cached_buffer) + non_cached_buffer->setReadUntilEnd(); + } + else + { + if (last_right_offset && right_offset <= last_right_offset.value()) + return; + + last_right_offset = right_offset; + if (cached_buffer) + cached_buffer->setReadUntilPosition(right_offset); + if (non_cached_buffer) + non_cached_buffer->setReadUntilPosition(right_offset); + } +} + bool MergeTreeReaderCompact::isContinuousReading(size_t mark, size_t column_position) { if (!last_read_granule) diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index 350c8427eff..381b212df3c 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -52,6 +52,9 @@ private: /// Should we read full column or only it's offsets std::vector read_only_offsets; + /// For asynchronous reading from remote fs. Same meaning as in MergeTreeReaderStream. + std::optional last_right_offset; + size_t next_mark = 0; std::optional> last_read_granule; @@ -67,6 +70,9 @@ private: MergeTreeMarksLoader & marks_loader, const ColumnPositions & column_positions, const MarkRanges & mark_ranges); + + /// For asynchronous reading from remote fs. + void adjustUpperBound(size_t last_mark); }; } diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 50650ef66e5..5e51a2931e4 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -105,7 +106,10 @@ size_t MergeTreeReaderWide::readRows( /// The column is already present in the block so we will append the values to the end. 
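`MergeTreeReaderCompact::adjustUpperBound` above replaces the per-stream `setReadUntilPosition` call that was removed: the reader remembers the last right offset it requested, only ever widens the read window, and treats offset 0 as "read until the end of the file". The sketch below models that state machine; `DummyBuffer` is a stand-in for the cached/non-cached read buffers, not their real interface.

```cpp
#include <cstddef>
#include <iostream>
#include <optional>

// Stand-in for the remote-fs read buffer: it only records what it was asked to do.
struct DummyBuffer
{
    void setReadUntilPosition(std::size_t pos) { std::cout << "read until " << pos << '\n'; }
    void setReadUntilEnd() { std::cout << "read until end\n"; }
};

class UpperBoundTracker
{
public:
    explicit UpperBoundTracker(DummyBuffer & buf_) : buf(buf_) {}

    // right_offset == 0 means the last mark points past the final compressed
    // block, i.e. read to the end of the file.
    void adjust(std::size_t right_offset)
    {
        if (right_offset == 0)
        {
            if (last_right_offset && *last_right_offset == 0)
                return;                          // already reading to the end
            last_right_offset = 0;
            buf.setReadUntilEnd();
        }
        else
        {
            if (last_right_offset && right_offset <= *last_right_offset)
                return;                          // never shrink the window
            last_right_offset = right_offset;
            buf.setReadUntilPosition(right_offset);
        }
    }

private:
    DummyBuffer & buf;
    std::optional<std::size_t> last_right_offset;
};

int main()
{
    DummyBuffer buf;
    UpperBoundTracker tracker(buf);
    tracker.adjust(4096); // widens the window
    tracker.adjust(1024); // ignored, smaller than the current bound
    tracker.adjust(0);    // switches to "until end"
}
```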
bool append = res_columns[pos] != nullptr; if (!append) - res_columns[pos] = type->createColumn(); + { + auto serialization = data_part->getSerialization(column_from_part); + res_columns[pos] = type->createColumn(*serialization); + } auto & column = res_columns[pos]; try @@ -184,9 +188,7 @@ void MergeTreeReaderWide::addStreams(const NameAndTypePair & name_and_type, profile_callback, clock_type)); }; - auto serialization = data_part->getSerializationForColumn(name_and_type); - serialization->enumerateStreams(callback); - serializations.emplace(name_and_type.name, std::move(serialization)); + data_part->getSerialization(name_and_type)->enumerateStreams(callback); } @@ -220,6 +222,23 @@ static ReadBuffer * getStream( return stream.data_buffer; } +void MergeTreeReaderWide::deserializePrefix( + const SerializationPtr & serialization, + const NameAndTypePair & name_and_type, + size_t current_task_last_mark, + ISerialization::SubstreamsCache & cache) +{ + const auto & name = name_and_type.name; + if (deserialize_binary_bulk_state_map.count(name) == 0) + { + ISerialization::DeserializeBinaryBulkSettings deserialize_settings; + deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path) + { + return getStream(/* seek_to_start = */true, substream_path, streams, name_and_type, 0, /* seek_to_mark = */false, current_task_last_mark, cache); + }; + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]); + } +} void MergeTreeReaderWide::prefetch( const NameAndTypePair & name_and_type, @@ -229,8 +248,8 @@ void MergeTreeReaderWide::prefetch( ISerialization::SubstreamsCache & cache, std::unordered_set & prefetched_streams) { - const auto & name = name_and_type.name; - auto & serialization = serializations[name]; + auto serialization = data_part->getSerialization(name_and_type); + deserializePrefix(serialization, name_and_type, current_task_last_mark, cache); serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { @@ -258,16 +277,9 @@ void MergeTreeReaderWide::readData( deserialize_settings.avg_value_size_hint = avg_value_size_hint; const auto & name = name_and_type.name; - auto & serialization = serializations[name]; + auto serialization = data_part->getSerialization(name_and_type); - if (deserialize_binary_bulk_state_map.count(name) == 0) - { - deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path) - { - return getStream(/* seek_to_start = */true, substream_path, streams, name_and_type, from_mark, /* seek_to_mark = */false, current_task_last_mark, cache); - }; - serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]); - } + deserializePrefix(serialization, name_and_type, current_task_last_mark, cache); deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path) { diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.h b/src/Storages/MergeTree/MergeTreeReaderWide.h index e27dd85643b..41219560ecc 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.h +++ b/src/Storages/MergeTree/MergeTreeReaderWide.h @@ -34,11 +34,9 @@ public: bool canReadIncompleteGranules() const override { return true; } using FileStreams = std::map>; - using Serializations = std::map; private: FileStreams streams; - Serializations serializations; DiskPtr disk; void addStreams(const NameAndTypePair & name_and_type, @@ -57,6 +55,12 @@ private: size_t current_task_last_mark, 
ISerialization::SubstreamsCache & cache, std::unordered_set & prefetched_streams); /// if stream was already prefetched do nothing + + void deserializePrefix( + const SerializationPtr & serialization, + const NameAndTypePair & name_and_type, + size_t current_task_last_mark, + ISerialization::SubstreamsCache & cache); }; } diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index decc72df14c..b991166b3b6 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -33,6 +33,7 @@ struct Settings; M(UInt64, min_rows_for_compact_part, 0, "Experimental. Minimal number of rows to create part in compact format instead of saving it in RAM", 0) \ M(Bool, in_memory_parts_enable_wal, true, "Whether to write blocks in Native format to write-ahead-log before creation in-memory part", 0) \ M(UInt64, write_ahead_log_max_bytes, 1024 * 1024 * 1024, "Rotate WAL, if it exceeds that amount of bytes", 0) \ + M(Float, ratio_of_defaults_for_sparse_serialization, 1.0, "Minimal ratio of number of default values to number of all values in column to store it in sparse serializations. If >= 1, columns will be always written in full serialization.", 0) \ \ /** Merge settings. */ \ M(UInt64, merge_max_block_size, DEFAULT_MERGE_BLOCK_SIZE, "How many rows in blocks should be formed for merge operations.", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp index 9323249946a..694357ab0c2 100644 --- a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp +++ b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp @@ -199,6 +199,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor part->minmax_idx->update(block, storage.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey())); part->partition.create(metadata_snapshot, block, 0, context); + part->setColumns(block.getNamesAndTypesList()); if (metadata_snapshot->hasSortingKey()) metadata_snapshot->getSortingKey().expression->execute(block); diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 43146709686..cbdbb2339df 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -18,8 +18,9 @@ MergedBlockOutputStream::MergedBlockOutputStream( const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, CompressionCodecPtr default_codec_, + bool reset_columns_, bool blocks_are_granules_size) - : IMergedBlockOutputStream(data_part, metadata_snapshot_) + : IMergedBlockOutputStream(data_part, metadata_snapshot_, columns_list_, reset_columns_) , columns_list(columns_list_) , default_codec(default_codec_) { @@ -77,10 +78,16 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart( else part_columns = *total_columns_list; - if (new_part->isStoredOnDisk()) - finalizePartOnDisk(new_part, part_columns, checksums, sync); + auto & serialization_infos = reset_columns + ? 
new_serialization_infos + : new_part->getSerializationInfos(); + + if (new_part->isStoredOnDisk()) + finalizePartOnDisk(new_part, part_columns, serialization_infos, checksums, sync); + + if (reset_columns) + new_part->setColumns(part_columns, serialization_infos); - new_part->setColumns(part_columns); new_part->rows_count = rows_count; new_part->modification_time = time(nullptr); new_part->index = writer->releaseIndexColumns(); @@ -97,6 +104,7 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart( void MergedBlockOutputStream::finalizePartOnDisk( const MergeTreeData::MutableDataPartPtr & new_part, NamesAndTypesList & part_columns, + SerializationInfoByName & serialization_infos, MergeTreeData::DataPart::Checksums & checksums, bool sync) { @@ -127,15 +135,17 @@ void MergedBlockOutputStream::finalizePartOnDisk( out->sync(); } - if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING || isCompactPart(new_part)) + if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) { new_part->partition.store(storage, volume->getDisk(), part_path, checksums); if (new_part->minmax_idx->initialized) new_part->minmax_idx->store(storage, volume->getDisk(), part_path, checksums); else if (rows_count) throw Exception("MinMax index was not initialized for new non-empty part " + new_part->name - + ". It is a bug.", ErrorCodes::LOGICAL_ERROR); + + ". It is a bug.", ErrorCodes::LOGICAL_ERROR); + } + { auto count_out = volume->getDisk()->writeFile(fs::path(part_path) / "count.txt", 4096); HashingWriteBuffer count_out_hashing(*count_out); writeIntText(rows_count, count_out_hashing); @@ -161,7 +171,19 @@ void MergedBlockOutputStream::finalizePartOnDisk( out->sync(); } - removeEmptyColumnsFromPart(new_part, part_columns, checksums); + removeEmptyColumnsFromPart(new_part, part_columns, serialization_infos, checksums); + + if (!serialization_infos.empty()) + { + auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096); + HashingWriteBuffer out_hashing(*out); + serialization_infos.writeJSON(out_hashing); + checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count(); + checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_hash = out_hashing.getHash(); + out->finalize(); + if (sync) + out->sync(); + } { /// Write a file with a description of columns. 
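The new `serialization.json` written in `finalizePartOnDisk` above is treated like the other small part metadata files: its bytes pass through a hashing buffer so the file's size and hash end up in `checksums` next to `count.txt` and `columns.txt`. The sketch below shows the same bookkeeping with a plain FNV-1a hash standing in for the hashing write buffer; the file contents and struct names are placeholders, not the real serialization.json schema or checksum types.

```cpp
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <string>

struct FileChecksum
{
    std::uint64_t file_size = 0;
    std::uint64_t file_hash = 0;
};

// FNV-1a, used here only as a stand-in for the hashing write buffer.
static std::uint64_t fnv1a(const std::string & data)
{
    std::uint64_t hash = 1469598103934665603ULL;
    for (unsigned char c : data)
    {
        hash ^= c;
        hash *= 1099511628211ULL;
    }
    return hash;
}

// Write a small metadata file and record its size and hash in the checksums
// map, mirroring how serialization.json is added next to count.txt and columns.txt.
static void writeMetadataFile(
    const std::string & dir,
    const std::string & name,
    const std::string & contents,
    std::map<std::string, FileChecksum> & checksums)
{
    std::ofstream out(dir + "/" + name, std::ios::binary);
    out << contents;
    out.flush();

    checksums[name] = FileChecksum{contents.size(), fnv1a(contents)};
}

int main()
{
    std::map<std::string, FileChecksum> checksums;
    writeMetadataFile(".", "serialization.json", R"({"illustrative":"content"})", checksums);
    std::cout << checksums["serialization.json"].file_size << '\n';
}
```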
@@ -202,6 +224,9 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm return; writer->write(block, permutation); + if (reset_columns) + new_serialization_infos.add(block); + rows_count += rows; } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index 5965331ee81..ffc740bf410 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -19,6 +19,7 @@ public: const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, CompressionCodecPtr default_codec_, + bool reset_columns_ = false, bool blocks_are_granules_size = false); Block getHeader() const { return metadata_snapshot->getSampleBlock(); } @@ -48,6 +49,7 @@ private: void finalizePartOnDisk( const MergeTreeData::MutableDataPartPtr & new_part, NamesAndTypesList & part_columns, + SerializationInfoByName & serialization_infos, MergeTreeData::DataPart::Checksums & checksums, bool sync); diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 4b760103750..ff79a187490 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -18,7 +18,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( WrittenOffsetColumns * offset_columns_, const MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularityInfo * index_granularity_info) - : IMergedBlockOutputStream(data_part, metadata_snapshot_) + : IMergedBlockOutputStream(data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true) , header(header_) { const auto & global_settings = data_part->storage.getContext()->getSettings(); @@ -51,6 +51,7 @@ void MergedColumnOnlyOutputStream::write(const Block & block) return; writer->write(block, nullptr); + new_serialization_infos.add(block); } MergeTreeData::DataPart::Checksums @@ -71,12 +72,12 @@ MergedColumnOnlyOutputStream::writeSuffixAndGetChecksums( auto columns = new_part->getColumns(); - auto removed_files = removeEmptyColumnsFromPart(new_part, columns, checksums); + auto removed_files = removeEmptyColumnsFromPart(new_part, columns, new_serialization_infos, checksums); for (const String & removed_file : removed_files) if (all_checksums.files.count(removed_file)) all_checksums.files.erase(removed_file); - new_part->setColumns(columns); + new_part->setColumns(columns, new_serialization_infos); return checksums; } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index accf167f5ff..86a692c8a48 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -315,8 +315,7 @@ NameSet collectFilesToSkip( files_to_skip.insert(stream_name + mrk_extension); }; - auto serialization = source_part->getSerializationForColumn({entry.name, entry.type}); - serialization->enumerateStreams(callback); + source_part->getSerialization({entry.name, entry.type})->enumerateStreams(callback); } for (const auto & index : indices_to_recalc) { @@ -341,8 +340,7 @@ static NameToNameVector collectFilesForRenames( std::map stream_counts; for (const auto & column : source_part->getColumns()) { - auto serialization = source_part->getSerializationForColumn(column); - serialization->enumerateStreams( + source_part->getSerialization(column)->enumerateStreams( [&](const ISerialization::SubstreamPath & substream_path) { 
++stream_counts[ISerialization::getFileNameForStream(column, substream_path)]; @@ -386,10 +384,7 @@ static NameToNameVector collectFilesForRenames( auto column = source_part->getColumns().tryGetByName(command.column_name); if (column) - { - auto serialization = source_part->getSerializationForColumn(*column); - serialization->enumerateStreams(callback); - } + source_part->getSerialization(*column)->enumerateStreams(callback); } else if (command.type == MutationCommand::Type::RENAME_COLUMN) { @@ -411,10 +406,7 @@ static NameToNameVector collectFilesForRenames( auto column = source_part->getColumns().tryGetByName(command.column_name); if (column) - { - auto serialization = source_part->getSerializationForColumn(*column); - serialization->enumerateStreams(callback); - } + source_part->getSerialization(*column)->enumerateStreams(callback); } } @@ -1301,7 +1293,12 @@ bool MutateTask::prepare() /// It shouldn't be changed by mutation. ctx->new_data_part->index_granularity_info = ctx->source_part->index_granularity_info; - ctx->new_data_part->setColumns(MergeTreeDataMergerMutator::getColumnsForNewDataPart(ctx->source_part, ctx->updated_header, ctx->storage_columns, ctx->for_file_renames)); + + auto [new_columns, new_infos] = MergeTreeDataMergerMutator::getColumnsForNewDataPart( + ctx->source_part, ctx->updated_header, ctx->storage_columns, + ctx->source_part->getSerializationInfos(), ctx->commands_for_part); + + ctx->new_data_part->setColumns(new_columns, new_infos); ctx->new_data_part->partition.assign(ctx->source_part->partition); ctx->disk = ctx->new_data_part->volume->getDisk(); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index cc9a142c65c..b3da3d47684 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1033,7 +1033,7 @@ void ReplicatedMergeTreeQueue::removePartProducingOpsInRange( min_unprocessed_insert_time_changed, max_processed_insert_time_changed, lock); (*it)->removed_by_other_entry = true; - queue.erase(it++); + it = queue.erase(it); ++removed_entries; } else diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index d312a7f9c3e..eabd901eb24 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -98,6 +98,23 @@ IMergeTreeDataPart::Checksums checkDataPart( }; }; + SerializationInfoByName serialization_infos(columns_txt, {}); + auto serialization_path = path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME; + + if (disk->exists(serialization_path)) + { + auto serialization_file = disk->readFile(serialization_path); + serialization_infos.readJSON(*serialization_file); + } + + auto get_serialization = [&serialization_infos](const auto & column) + { + auto it = serialization_infos.find(column.name); + return it == serialization_infos.end() + ? column.type->getDefaultSerialization() + : column.type->getSerialization(*it->second); + }; + /// This function calculates only checksum of file content (compressed or uncompressed). /// It also calculates checksum of projections. 
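The `get_serialization` helper introduced just above in checkDataPart.cpp enumerates each column's streams using the serialization recorded in the part's `serialization.json` when an entry exists, and falls back to the type's default serialization otherwise. A simplified sketch of that lookup-with-fallback; `Serialization` and `SerializationInfos` here are stand-ins, not the ISerialization / SerializationInfoByName interfaces.

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Simplified stand-ins: in ClickHouse these are ISerialization and the
// per-column info map loaded from serialization.json.
struct Serialization
{
    std::string kind;
};

using SerializationInfos = std::map<std::string, std::string>; // column name -> kind

static std::shared_ptr<Serialization> getSerializationFor(
    const std::string & column_name,
    const std::string & default_kind,
    const SerializationInfos & infos)
{
    auto it = infos.find(column_name);
    if (it == infos.end())
        return std::make_shared<Serialization>(Serialization{default_kind}); // no entry: use the type's default
    return std::make_shared<Serialization>(Serialization{it->second});
}

int main()
{
    SerializationInfos infos{{"visits", "Sparse"}};
    std::cout << getSerializationFor("visits", "Default", infos)->kind << '\n';  // Sparse
    std::cout << getSerializationFor("user_id", "Default", infos)->kind << '\n'; // Default
}
```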
auto checksum_file = [&](const String & file_path, const String & file_name) @@ -132,12 +149,7 @@ IMergeTreeDataPart::Checksums checkDataPart( const NamesAndTypesList & projection_columns_list = projection->getColumns(); for (const auto & projection_column : projection_columns_list) { - auto serialization = IDataType::getSerialization(projection_column, [&](const String & stream_name) - { - return disk->exists(stream_name + IMergeTreeDataPart::DATA_FILE_EXTENSION); - }); - - serialization->enumerateStreams( + get_serialization(projection_column)->enumerateStreams( [&](const ISerialization::SubstreamPath & substream_path) { String projection_file_name = ISerialization::getFileNameForStream(projection_column, substream_path) + ".bin"; @@ -209,13 +221,7 @@ IMergeTreeDataPart::Checksums checkDataPart( { for (const auto & column : columns_list) { - auto serialization = IDataType::getSerialization(column, - [&](const String & stream_name) - { - return disk->exists(stream_name + IMergeTreeDataPart::DATA_FILE_EXTENSION); - }); - - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + get_serialization(column)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { String file_name = ISerialization::getFileNameForStream(column, substream_path) + ".bin"; checksums_data.files[file_name] = checksum_compressed_file(disk, path + file_name); diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index cb52c8b86c0..fc3eff7459b 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -35,6 +35,7 @@ namespace ErrorCodes extern const int NO_ELEMENTS_IN_CONFIG; extern const int UNKNOWN_STORAGE; extern const int NO_REPLICA_NAME_GIVEN; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } @@ -258,6 +259,34 @@ If you use the Replicated version of engines, see https://clickhouse.com/docs/en return help; } +static ColumnsDescription getColumnsDescriptionFromZookeeper(const String & raw_zookeeper_path, ContextMutablePtr context) +{ + String zookeeper_name = zkutil::extractZooKeeperName(raw_zookeeper_path); + String zookeeper_path = zkutil::extractZooKeeperPath(raw_zookeeper_path, true); + + if (!context->hasZooKeeper() && !context->hasAuxiliaryZooKeeper(zookeeper_name)) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure without zookeeper, you must specify the structure manually"}; + + zkutil::ZooKeeperPtr zookeeper; + try + { + if (zookeeper_name == StorageReplicatedMergeTree::getDefaultZooKeeperName()) + zookeeper = context->getZooKeeper(); + else + zookeeper = context->getAuxiliaryZooKeeper(zookeeper_name); + } + catch (...) + { + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure from zookeeper, because cannot get zookeeper: {}. You must specify structure manually", getCurrentExceptionMessage(false)}; + } + + if (!zookeeper->exists(zookeeper_path + "/replicas")) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure, because there no other replicas in zookeeper. 
You must specify the structure manually"}; + + Coordination::Stat columns_stat; + return ColumnsDescription::parse(zookeeper->get(fs::path(zookeeper_path) / "columns", &columns_stat)); +} + static StoragePtr create(const StorageFactory::Arguments & args) { @@ -638,7 +667,14 @@ static StoragePtr create(const StorageFactory::Arguments & args) String date_column_name; StorageInMemoryMetadata metadata; - metadata.setColumns(args.columns); + + ColumnsDescription columns; + if (args.columns.empty() && replicated) + columns = getColumnsDescriptionFromZookeeper(zookeeper_path, args.getContext()); + else + columns = args.columns; + + metadata.setColumns(columns); metadata.setComment(args.comment); std::unique_ptr storage_settings; @@ -705,12 +741,12 @@ static StoragePtr create(const StorageFactory::Arguments & args) if (args.query.columns_list && args.query.columns_list->indices) for (auto & index : args.query.columns_list->indices->children) - metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, args.columns, args.getContext())); + metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, columns, args.getContext())); if (args.query.columns_list && args.query.columns_list->projections) for (auto & projection_ast : args.query.columns_list->projections->children) { - auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, args.columns, args.getContext()); + auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, columns, args.getContext()); metadata.projections.add(std::move(projection)); } @@ -720,10 +756,10 @@ static StoragePtr create(const StorageFactory::Arguments & args) constraints.push_back(constraint); metadata.constraints = ConstraintsDescription(constraints); - auto column_ttl_asts = args.columns.getColumnTTLs(); + auto column_ttl_asts = columns.getColumnTTLs(); for (const auto & [name, ast] : column_ttl_asts) { - auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, args.columns, args.getContext(), metadata.primary_key); + auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, columns, args.getContext(), metadata.primary_key); metadata.column_ttls_by_name[name] = new_ttl_entry; } @@ -850,6 +886,7 @@ void registerStorageMergeTree(StorageFactory & factory) features.supports_replication = true; features.supports_deduplication = true; + features.supports_schema_inference = true; factory.registerStorage("ReplicatedMergeTree", create, features); factory.registerStorage("ReplicatedCollapsingMergeTree", create, features); diff --git a/src/Storages/MySQL/MySQLHelpers.cpp b/src/Storages/MySQL/MySQLHelpers.cpp new file mode 100644 index 00000000000..e7745e6c0bb --- /dev/null +++ b/src/Storages/MySQL/MySQLHelpers.cpp @@ -0,0 +1,26 @@ +#include "MySQLHelpers.h" + +#if USE_MYSQL +#include +#include +#include + +namespace DB +{ + +mysqlxx::PoolWithFailover +createMySQLPoolWithFailover(const StorageMySQLConfiguration & configuration, const MySQLSettings & mysql_settings) +{ + return mysqlxx::PoolWithFailover( + configuration.database, configuration.addresses, configuration.username, configuration.password, + MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, + mysql_settings.connection_pool_size, + mysql_settings.connection_max_tries, + mysql_settings.connection_wait_timeout, + mysql_settings.connect_timeout, + mysql_settings.read_write_timeout); +} + +} + +#endif diff --git a/src/Storages/MySQL/MySQLHelpers.h b/src/Storages/MySQL/MySQLHelpers.h new file mode 100644 index 00000000000..712c5a2c719 --- /dev/null 
+++ b/src/Storages/MySQL/MySQLHelpers.h @@ -0,0 +1,19 @@ +#pragma once +#include "config_core.h" + +#if USE_MYSQL +#include + +namespace mysqlxx { class PoolWithFailover; } + +namespace DB +{ +struct StorageMySQLConfiguration; +struct MySQLSettings; + +mysqlxx::PoolWithFailover +createMySQLPoolWithFailover(const StorageMySQLConfiguration & configuration, const MySQLSettings & mysql_settings); + +} + +#endif diff --git a/src/Storages/MySQL/MySQLSettings.h b/src/Storages/MySQL/MySQLSettings.h index 872b0607e20..aa2c2703d6b 100644 --- a/src/Storages/MySQL/MySQLSettings.h +++ b/src/Storages/MySQL/MySQLSettings.h @@ -19,6 +19,8 @@ class ASTStorage; M(UInt64, connection_max_tries, 3, "Number of retries for pool with failover", 0) \ M(UInt64, connection_wait_timeout, 5, "Timeout (in seconds) for waiting for free connection (in case of there is already connection_pool_size active connections), 0 - do not wait.", 0) \ M(Bool, connection_auto_close, true, "Auto-close connection after query execution, i.e. disable connection reuse.", 0) \ + M(UInt64, connect_timeout, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connect timeout (in seconds)", 0) \ + M(UInt64, read_write_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Read/write timeout (in seconds)", 0) \ DECLARE_SETTINGS_TRAITS(MySQLSettingsTraits, LIST_OF_MYSQL_SETTINGS) diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp index 7cc71a63443..4848ae6c9ea 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp @@ -1,13 +1,15 @@ #include "PostgreSQLReplicationHandler.h" +#include +#include #include #include #include #include +#include #include #include #include -#include #include #include #include @@ -18,6 +20,7 @@ namespace DB static const auto RESCHEDULE_MS = 1000; static const auto BACKOFF_TRESHOLD_MS = 10000; +static const auto CLEANUP_RESCHEDULE_MS = 600000 * 3; /// 30 min namespace ErrorCodes { @@ -26,6 +29,30 @@ namespace ErrorCodes extern const int POSTGRESQL_REPLICATION_INTERNAL_ERROR; } +class TemporaryReplicationSlot +{ +public: + TemporaryReplicationSlot( + PostgreSQLReplicationHandler * handler_, + std::shared_ptr tx_, + String & start_lsn, + String & snapshot_name) + : handler(handler_), tx(tx_) + { + handler->createReplicationSlot(*tx, start_lsn, snapshot_name, /* temporary */true); + } + + ~TemporaryReplicationSlot() + { + handler->dropReplicationSlot(*tx, /* temporary */true); + } + +private: + PostgreSQLReplicationHandler * handler; + std::shared_ptr tx; +}; + + PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( const String & replication_identifier, const String & postgres_database_, @@ -67,6 +94,7 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( startup_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ checkConnectionAndStart(); }); consumer_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ consumerFunc(); }); + cleanup_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ cleanupFunc(); }); } @@ -146,6 +174,7 @@ void PostgreSQLReplicationHandler::shutdown() stop_synchronization.store(true); startup_task->deactivate(); consumer_task->deactivate(); + cleanup_task->deactivate(); } @@ -266,6 +295,7 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) (is_materialized_postgresql_database ? postgres_database : postgres_database + '.' 
+ tables_list)); consumer_task->activateAndSchedule(); + cleanup_task->activateAndSchedule(); /// Do not rely anymore on saved storage pointers. materialized_storages.clear(); @@ -276,10 +306,12 @@ ASTPtr PostgreSQLReplicationHandler::getCreateNestedTableQuery(StorageMaterializ { postgres::Connection connection(connection_info); pqxx::nontransaction tx(connection.getRef()); - auto table_structure = std::make_unique(fetchPostgreSQLTableStructure(tx, table_name, postgres_schema, true, true, true)); - if (!table_structure) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to get PostgreSQL table structure"); - return storage->getCreateNestedTableQuery(std::move(table_structure)); + + auto [postgres_table_schema, postgres_table_name] = getSchemaAndTableName(table_name); + auto table_structure = std::make_unique(fetchPostgreSQLTableStructure(tx, postgres_table_name, postgres_table_schema, true, true, true)); + + auto table_override = tryGetTableOverride(current_database_name, table_name); + return storage->getCreateNestedTableQuery(std::move(table_structure), table_override ? table_override->as() : nullptr); } @@ -297,7 +329,8 @@ StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & query_str = fmt::format("SELECT * FROM {}", quoted_name); LOG_DEBUG(log, "Loading PostgreSQL table {}.{}", postgres_database, quoted_name); - materialized_storage->createNestedIfNeeded(fetchTableStructure(*tx, table_name)); + auto table_override = tryGetTableOverride(current_database_name, table_name); + materialized_storage->createNestedIfNeeded(fetchTableStructure(*tx, table_name), table_override ? table_override->as() : nullptr); auto nested_storage = materialized_storage->getNested(); auto insert = std::make_shared(); @@ -326,6 +359,21 @@ StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & } +void PostgreSQLReplicationHandler::cleanupFunc() +{ + /// It is very important to make sure temporary replication slots are removed! + /// So just in case every 30 minutes check if one still exists. 
+ postgres::Connection connection(connection_info); + String last_committed_lsn; + connection.execWithRetry([&](pqxx::nontransaction & tx) + { + if (isReplicationSlotExist(tx, last_committed_lsn, /* temporary */true)) + dropReplicationSlot(tx, /* temporary */true); + }); + cleanup_task->scheduleAfter(CLEANUP_RESCHEDULE_MS); +} + + void PostgreSQLReplicationHandler::consumerFunc() { std::vector> skipped_tables; @@ -511,17 +559,25 @@ void PostgreSQLReplicationHandler::dropPublication(pqxx::nontransaction & tx) void PostgreSQLReplicationHandler::addTableToPublication(pqxx::nontransaction & ntx, const String & table_name) { - std::string query_str = fmt::format("ALTER PUBLICATION {} ADD TABLE ONLY {}", publication_name, doubleQuoteString(table_name)); + std::string query_str = fmt::format("ALTER PUBLICATION {} ADD TABLE ONLY {}", publication_name, doubleQuoteWithSchema(table_name)); ntx.exec(query_str); - LOG_TRACE(log, "Added table `{}` to publication `{}`", table_name, publication_name); + LOG_TRACE(log, "Added table {} to publication `{}`", doubleQuoteWithSchema(table_name), publication_name); } void PostgreSQLReplicationHandler::removeTableFromPublication(pqxx::nontransaction & ntx, const String & table_name) { - std::string query_str = fmt::format("ALTER PUBLICATION {} DROP TABLE ONLY {}", publication_name, doubleQuoteString(table_name)); - ntx.exec(query_str); - LOG_TRACE(log, "Removed table `{}` from publication `{}`", table_name, publication_name); + try + { + std::string query_str = fmt::format("ALTER PUBLICATION {} DROP TABLE ONLY {}", publication_name, doubleQuoteWithSchema(table_name)); + ntx.exec(query_str); + LOG_TRACE(log, "Removed table `{}` from publication `{}`", doubleQuoteWithSchema(table_name), publication_name); + } + catch (const pqxx::undefined_table &) + { + /// Removing table from replication must succeed even if table does not exist in PostgreSQL. + LOG_WARNING(log, "Did not remove table {} from publication, because table does not exist in PostgreSQL", doubleQuoteWithSchema(table_name), publication_name); + } } @@ -762,10 +818,12 @@ void PostgreSQLReplicationHandler::addTableToReplication(StorageMaterializedPost StoragePtr nested_storage; { - pqxx::nontransaction tx(replication_connection.getRef()); - if (isReplicationSlotExist(tx, start_lsn, /* temporary */true)) - dropReplicationSlot(tx, /* temporary */true); - createReplicationSlot(tx, start_lsn, snapshot_name, /* temporary */true); + auto tx = std::make_shared(replication_connection.getRef()); + + if (isReplicationSlotExist(*tx, start_lsn, /* temporary */true)) + dropReplicationSlot(*tx, /* temporary */true); + + TemporaryReplicationSlot temporary_slot(this, tx, start_lsn, snapshot_name); /// Protect against deadlock. 
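// Illustrative sketch (standalone C++, hypothetical Handler/Tx types -- not code from this patch):
// TemporaryReplicationSlot added above is a plain RAII guard: the temporary slot is created in the
// constructor and dropped in the destructor, so any early return or exception between slot creation
// and the final drop still releases it.
#include <memory>
#include <string>

struct Tx {};                 // stand-in for the nontransaction shared with the handler
struct Handler                // stand-in for the replication handler owning the slot operations
{
    void createReplicationSlot(Tx &, std::string & lsn, std::string & snapshot, bool /*temporary*/)
    {
        lsn = "0/0";
        snapshot = "exported_snapshot";
    }
    void dropReplicationSlot(Tx &, bool /*temporary*/) noexcept {}
};

class TemporarySlotGuard
{
public:
    TemporarySlotGuard(Handler & handler_, std::shared_ptr<Tx> tx_, std::string & lsn, std::string & snapshot)
        : handler(handler_), tx(std::move(tx_))
    {
        handler.createReplicationSlot(*tx, lsn, snapshot, /* temporary */ true);
    }

    // Dropping must not throw: the guard frequently unwinds while an exception is already in flight.
    ~TemporarySlotGuard() { handler.dropReplicationSlot(*tx, /* temporary */ true); }

private:
    Handler & handler;
    std::shared_ptr<Tx> tx;
};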
auto nested = DatabaseCatalog::instance().tryGetTable(materialized_storage->getNestedStorageID(), materialized_storage->getNestedTableContext()); @@ -836,81 +894,81 @@ void PostgreSQLReplicationHandler::reloadFromSnapshot(const std::vector(replication_connection.getRef()); - String snapshot_name, start_lsn; - - if (isReplicationSlotExist(tx, start_lsn, /* temporary */true)) - dropReplicationSlot(tx, /* temporary */true); - - createReplicationSlot(tx, start_lsn, snapshot_name, /* temporary */true); - postgres::Connection tmp_connection(connection_info); - - for (const auto & [relation_id, table_name] : relation_data) { - auto storage = DatabaseCatalog::instance().getTable(StorageID(current_database_name, table_name), context); - auto * materialized_storage = storage->as (); - auto materialized_table_lock = materialized_storage->lockForShare(String(), context->getSettingsRef().lock_acquire_timeout); + String snapshot_name, start_lsn; + if (isReplicationSlotExist(*tx, start_lsn, /* temporary */true)) + dropReplicationSlot(*tx, /* temporary */true); - /// If for some reason this temporary table already exists - also drop it. - auto temp_materialized_storage = materialized_storage->createTemporary(); + TemporaryReplicationSlot temporary_slot(this, tx, start_lsn, snapshot_name); + postgres::Connection tmp_connection(connection_info); - /// This snapshot is valid up to the end of the transaction, which exported it. - StoragePtr temp_nested_storage = loadFromSnapshot(tmp_connection, snapshot_name, table_name, - temp_materialized_storage->as ()); - - auto table_id = materialized_storage->getNestedStorageID(); - auto temp_table_id = temp_nested_storage->getStorageID(); - - LOG_DEBUG(log, "Starting background update of table {} ({} with {})", - table_name, table_id.getNameForLogs(), temp_table_id.getNameForLogs()); - - auto ast_rename = std::make_shared(); - ASTRenameQuery::Element elem + for (const auto & [relation_id, table_name] : relation_data) { - ASTRenameQuery::Table{table_id.database_name, table_id.table_name}, - ASTRenameQuery::Table{temp_table_id.database_name, temp_table_id.table_name} - }; - ast_rename->elements.push_back(std::move(elem)); - ast_rename->exchange = true; + auto storage = DatabaseCatalog::instance().getTable(StorageID(current_database_name, table_name), context); + auto * materialized_storage = storage->as (); + auto materialized_table_lock = materialized_storage->lockForShare(String(), context->getSettingsRef().lock_acquire_timeout); - auto nested_context = materialized_storage->getNestedTableContext(); + /// If for some reason this temporary table already exists - also drop it. + auto temp_materialized_storage = materialized_storage->createTemporary(); - try - { - InterpreterRenameQuery(ast_rename, nested_context).execute(); + /// This snapshot is valid up to the end of the transaction, which exported it. 
+ StoragePtr temp_nested_storage = loadFromSnapshot(tmp_connection, snapshot_name, table_name, + temp_materialized_storage->as ()); - auto nested_storage = DatabaseCatalog::instance().getTable(StorageID(table_id.database_name, table_id.table_name, temp_table_id.uuid), nested_context); - materialized_storage->set(nested_storage); + auto table_id = materialized_storage->getNestedStorageID(); + auto temp_table_id = temp_nested_storage->getStorageID(); - auto nested_sample_block = nested_storage->getInMemoryMetadataPtr()->getSampleBlock(); - auto materialized_sample_block = materialized_storage->getInMemoryMetadataPtr()->getSampleBlock(); - assertBlocksHaveEqualStructure(nested_sample_block, materialized_sample_block, "while reloading table in the background"); + LOG_DEBUG(log, "Starting background update of table {} ({} with {})", + table_name, table_id.getNameForLogs(), temp_table_id.getNameForLogs()); - LOG_INFO(log, "Updated table {}. New structure: {}", - nested_storage->getStorageID().getNameForLogs(), nested_sample_block.dumpStructure()); + auto ast_rename = std::make_shared(); + ASTRenameQuery::Element elem + { + ASTRenameQuery::Table{table_id.database_name, table_id.table_name}, + ASTRenameQuery::Table{temp_table_id.database_name, temp_table_id.table_name} + }; + ast_rename->elements.push_back(std::move(elem)); + ast_rename->exchange = true; - /// Pass pointer to new nested table into replication consumer, remove current table from skip list and set start lsn position. - consumer->updateNested(table_name, nested_storage, relation_id, start_lsn); + auto nested_context = materialized_storage->getNestedTableContext(); - auto table_to_drop = DatabaseCatalog::instance().getTable(StorageID(temp_table_id.database_name, temp_table_id.table_name, table_id.uuid), nested_context); - auto drop_table_id = table_to_drop->getStorageID(); + try + { + InterpreterRenameQuery(ast_rename, nested_context).execute(); - if (drop_table_id == nested_storage->getStorageID()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Cannot drop table because is has the same uuid as new table: {}", drop_table_id.getNameForLogs()); + auto nested_storage = DatabaseCatalog::instance().getTable(StorageID(table_id.database_name, table_id.table_name, temp_table_id.uuid), nested_context); + materialized_storage->set(nested_storage); - LOG_DEBUG(log, "Dropping table {}", drop_table_id.getNameForLogs()); - InterpreterDropQuery::executeDropQuery(ASTDropQuery::Kind::Drop, nested_context, nested_context, drop_table_id, true); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); + auto nested_sample_block = nested_storage->getInMemoryMetadataPtr()->getSampleBlock(); + auto materialized_sample_block = materialized_storage->getInMemoryMetadataPtr()->getSampleBlock(); + assertBlocksHaveEqualStructure(nested_sample_block, materialized_sample_block, "while reloading table in the background"); + + LOG_INFO(log, "Updated table {}. New structure: {}", + nested_storage->getStorageID().getNameForLogs(), nested_sample_block.dumpStructure()); + + /// Pass pointer to new nested table into replication consumer, remove current table from skip list and set start lsn position. 
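// Illustrative sketch (standalone C++, hypothetical Table handles -- not code from this patch):
// the reloadFromSnapshot loop above follows a "build aside, then swap" scheme: load the fresh copy
// into a temporary nested table, exchange it with the live one (RENAME ... EXCHANGE in the diff,
// a single atomic step there), and only then drop what used to be live.
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>

struct Table
{
    std::string name;
    std::string uuid;
};
using TablePtr = std::shared_ptr<Table>;

// Stand-in for loadFromSnapshot(): fill a fresh table while the snapshot pins a consistent view.
TablePtr loadTempCopy(const Table & live, const std::string & /*snapshot*/)
{
    return std::make_shared<Table>(Table{live.name + "_tmp", "uuid-of-temp-table"});
}

TablePtr reloadBySwap(TablePtr old_live, const std::string & snapshot)
{
    auto fresh = loadTempCopy(*old_live, snapshot);

    // Stand-in for the exchange rename: after the swap the fresh table carries the live name.
    std::swap(old_live->name, fresh->name);

    // Same guard as in the diff: never drop the table that was just promoted.
    if (old_live->uuid == fresh->uuid)
        throw std::logic_error("Cannot drop table: it has the same uuid as the new table");

    // old_live now carries the temporary name and is the one to drop; fresh is what queries will see.
    return fresh;
}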
+ consumer->updateNested(table_name, nested_storage, relation_id, start_lsn); + + auto table_to_drop = DatabaseCatalog::instance().getTable(StorageID(temp_table_id.database_name, temp_table_id.table_name, table_id.uuid), nested_context); + auto drop_table_id = table_to_drop->getStorageID(); + + if (drop_table_id == nested_storage->getStorageID()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot drop table because is has the same uuid as new table: {}", drop_table_id.getNameForLogs()); + + LOG_DEBUG(log, "Dropping table {}", drop_table_id.getNameForLogs()); + InterpreterDropQuery::executeDropQuery(ASTDropQuery::Kind::Drop, nested_context, nested_context, drop_table_id, true); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } } - dropReplicationSlot(tx, /* temporary */true); - tx.commit(); + tx->commit(); } catch (...) { diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h index cf44101db76..c0a2a6f2559 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h @@ -15,6 +15,8 @@ struct SettingChange; class PostgreSQLReplicationHandler { +friend class TemporaryReplicationSlot; + public: PostgreSQLReplicationHandler( const String & replication_identifier, @@ -52,6 +54,8 @@ public: void setSetting(const SettingChange & setting); + void cleanupFunc(); + private: using MaterializedStorages = std::unordered_map; @@ -133,7 +137,9 @@ private: /// Replication consumer. Manages decoding of replication stream and syncing into tables. std::shared_ptr consumer; - BackgroundSchedulePool::TaskHolder startup_task, consumer_task; + BackgroundSchedulePool::TaskHolder startup_task; + BackgroundSchedulePool::TaskHolder consumer_task; + BackgroundSchedulePool::TaskHolder cleanup_task; std::atomic stop_synchronization = false; diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index 591e10a88b9..aefd1aedbf7 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -2,28 +2,38 @@ #if USE_LIBPQXX #include + #include #include #include + #include +#include + #include #include #include #include +#include + #include #include + #include #include #include #include -#include + +#include #include #include #include + #include #include #include -#include + +#include namespace DB @@ -181,18 +191,18 @@ StorageID StorageMaterializedPostgreSQL::getNestedStorageID() const } -void StorageMaterializedPostgreSQL::createNestedIfNeeded(PostgreSQLTableStructurePtr table_structure) +void StorageMaterializedPostgreSQL::createNestedIfNeeded(PostgreSQLTableStructurePtr table_structure, const ASTTableOverride * table_override) { if (tryGetNested()) return; - const auto ast_create = getCreateNestedTableQuery(std::move(table_structure)); - auto table_id = getStorageID(); - auto tmp_nested_table_id = StorageID(table_id.database_name, getNestedTableName()); - LOG_DEBUG(log, "Creating clickhouse table for postgresql table {}", table_id.getNameForLogs()); - try { + const auto ast_create = getCreateNestedTableQuery(std::move(table_structure), table_override); + auto table_id = getStorageID(); + auto tmp_nested_table_id = StorageID(table_id.database_name, getNestedTableName()); + LOG_DEBUG(log, "Creating clickhouse table for postgresql table {}", table_id.getNameForLogs()); + InterpreterCreateQuery interpreter(ast_create, 
nested_context); interpreter.execute(); @@ -200,10 +210,10 @@ void StorageMaterializedPostgreSQL::createNestedIfNeeded(PostgreSQLTableStructur /// Save storage_id with correct uuid. nested_table_id = nested_storage->getStorageID(); } - catch (Exception & e) + catch (...) { - e.addMessage("while creating nested table: {}", tmp_nested_table_id.getNameForLogs()); tryLogCurrentException(__PRETTY_FUNCTION__); + throw; } } @@ -362,12 +372,31 @@ ASTPtr StorageMaterializedPostgreSQL::getColumnDeclaration(const DataTypePtr & d } +std::shared_ptr StorageMaterializedPostgreSQL::getColumnsExpressionList(const NamesAndTypesList & columns) const +{ + auto columns_expression_list = std::make_shared(); + for (const auto & [name, type] : columns) + { + const auto & column_declaration = std::make_shared(); + + column_declaration->name = name; + column_declaration->type = getColumnDeclaration(type); + + columns_expression_list->children.emplace_back(column_declaration); + } + return columns_expression_list; +} + + /// For single storage MaterializedPostgreSQL get columns and primary key columns from storage definition. /// For database engine MaterializedPostgreSQL get columns and primary key columns by fetching from PostgreSQL, also using the same /// transaction with snapshot, which is used for initial tables dump. -ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery(PostgreSQLTableStructurePtr table_structure) +ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( + PostgreSQLTableStructurePtr table_structure, const ASTTableOverride * table_override) { auto create_table_query = std::make_shared(); + if (table_override) + applyTableOverrideToCreateQuery(*table_override, create_table_query.get()); auto table_id = getStorageID(); create_table_query->setTable(getNestedTableName()); @@ -375,40 +404,85 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery(PostgreSQLTableS if (is_materialized_postgresql_database) create_table_query->uuid = table_id.uuid; + auto storage = std::make_shared(); + storage->set(storage->engine, makeASTFunction("ReplacingMergeTree", std::make_shared("_version"))); + auto columns_declare_list = std::make_shared(); - auto columns_expression_list = std::make_shared(); auto order_by_expression = std::make_shared(); auto metadata_snapshot = getInMemoryMetadataPtr(); - const auto & columns = metadata_snapshot->getColumns(); + + ConstraintsDescription constraints; NamesAndTypesList ordinary_columns_and_types; - if (!is_materialized_postgresql_database) + if (is_materialized_postgresql_database) { - ordinary_columns_and_types = columns.getOrdinary(); - } - else - { - if (!table_structure) + if (!table_structure && !table_override) { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "No table structure returned for table {}.{}", table_id.database_name, table_id.table_name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No table structure returned for table {}.{}", + table_id.database_name, table_id.table_name); } - if (!table_structure->columns) + if (!table_structure->columns && (!table_override || !table_override->columns)) { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "No columns returned for table {}.{}", table_id.database_name, table_id.table_name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No columns returned for table {}.{}", + table_id.database_name, table_id.table_name); } - ordinary_columns_and_types = *table_structure->columns; + bool has_order_by_override = table_override && table_override->storage && table_override->storage->order_by; + if 
(has_order_by_override && !table_structure->replica_identity_columns) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Having PRIMARY KEY OVERRIDE is allowed only if there is " + "replica identity index for PostgreSQL table. (table {}.{})", + table_id.database_name, table_id.table_name); + } if (!table_structure->primary_key_columns && !table_structure->replica_identity_columns) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table {}.{} has no primary key and no replica identity index", table_id.database_name, table_id.table_name); + "Table {}.{} has no primary key and no replica identity index", + table_id.database_name, table_id.table_name); } + if (table_override && table_override->columns) + { + if (table_override->columns) + { + auto children = table_override->columns->children; + const auto & columns = children[0]->as(); + if (columns) + { + for (const auto & child : columns->children) + { + const auto * column_declaration = child->as(); + auto type = DataTypeFactory::instance().get(column_declaration->type); + ordinary_columns_and_types.emplace_back(NameAndTypePair(column_declaration->name, type)); + } + } + + columns_declare_list->set(columns_declare_list->columns, children[0]); + } + else + { + ordinary_columns_and_types = *table_structure->columns; + columns_declare_list->set(columns_declare_list->columns, getColumnsExpressionList(ordinary_columns_and_types)); + } + + auto * columns = table_override->columns; + if (columns && columns->constraints) + constraints = ConstraintsDescription(columns->constraints->children); + } + else + { + ordinary_columns_and_types = *table_structure->columns; + columns_declare_list->set(columns_declare_list->columns, getColumnsExpressionList(ordinary_columns_and_types)); + } + + if (ordinary_columns_and_types.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Table {}.{} has no columns", table_id.database_name, table_id.table_name); + NamesAndTypesList merging_columns; if (table_structure->primary_key_columns) merging_columns = *table_structure->primary_key_columns; @@ -417,39 +491,28 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery(PostgreSQLTableS order_by_expression->name = "tuple"; order_by_expression->arguments = std::make_shared(); - for (const auto & column : merging_columns) order_by_expression->arguments->children.emplace_back(std::make_shared(column.name)); - } - for (const auto & [name, type] : ordinary_columns_and_types) + storage->set(storage->order_by, order_by_expression); + } + else { - const auto & column_declaration = std::make_shared(); + ordinary_columns_and_types = metadata_snapshot->getColumns().getOrdinary(); + columns_declare_list->set(columns_declare_list->columns, getColumnsExpressionList(ordinary_columns_and_types)); - column_declaration->name = name; - column_declaration->type = getColumnDeclaration(type); + auto primary_key_ast = metadata_snapshot->getPrimaryKeyAST(); + if (!primary_key_ast) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Storage MaterializedPostgreSQL must have primary key"); + storage->set(storage->order_by, primary_key_ast); - columns_expression_list->children.emplace_back(column_declaration); + constraints = metadata_snapshot->getConstraints(); } - columns_declare_list->set(columns_declare_list->columns, columns_expression_list); - columns_declare_list->columns->children.emplace_back(getMaterializedColumnsDeclaration("_sign", "Int8", 1)); columns_declare_list->columns->children.emplace_back(getMaterializedColumnsDeclaration("_version", "UInt64", 1)); - 
create_table_query->set(create_table_query->columns_list, columns_declare_list); - /// Not nullptr for single storage (because throws exception if not specified), nullptr otherwise. - auto primary_key_ast = getInMemoryMetadataPtr()->getPrimaryKeyAST(); - - auto storage = std::make_shared(); - storage->set(storage->engine, makeASTFunction("ReplacingMergeTree", std::make_shared("_version"))); - - if (primary_key_ast) - storage->set(storage->order_by, primary_key_ast); - else - storage->set(storage->order_by, order_by_expression); - create_table_query->set(create_table_query->storage, storage); /// Add columns _sign and _version, so that they can be accessed from nested ReplacingMergeTree table if needed. @@ -458,8 +521,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery(PostgreSQLTableS StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(ColumnsDescription(ordinary_columns_and_types)); - storage_metadata.setConstraints(metadata_snapshot->getConstraints()); - + storage_metadata.setConstraints(constraints); setInMemoryMetadata(storage_metadata); return create_table_query; diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h index 10724fb9bf0..9e11f314738 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h @@ -99,7 +99,11 @@ public: /// only once - when nested table is successfully created and is never changed afterwards. bool hasNested() { return has_nested.load(); } - void createNestedIfNeeded(PostgreSQLTableStructurePtr table_structure); + void createNestedIfNeeded(PostgreSQLTableStructurePtr table_structure, const ASTTableOverride * table_override); + + ASTPtr getCreateNestedTableQuery(PostgreSQLTableStructurePtr table_structure, const ASTTableOverride * table_override); + + std::shared_ptr getColumnsExpressionList(const NamesAndTypesList & columns) const; StoragePtr getNested() const; @@ -120,8 +124,6 @@ public: bool supportsFinal() const override { return true; } - ASTPtr getCreateNestedTableQuery(PostgreSQLTableStructurePtr table_structure); - protected: StorageMaterializedPostgreSQL( const StorageID & table_id_, diff --git a/src/Storages/ProjectionsDescription.cpp b/src/Storages/ProjectionsDescription.cpp index f1a0372a07d..791583e2495 100644 --- a/src/Storages/ProjectionsDescription.cpp +++ b/src/Storages/ProjectionsDescription.cpp @@ -201,7 +201,7 @@ ProjectionDescription ProjectionDescription::getMinMaxCountProjection( select_expression_list->children.push_back(makeASTFunction("count")); select_query->setExpression(ASTProjectionSelectQuery::Expression::SELECT, std::move(select_expression_list)); - if (partition_columns) + if (partition_columns && !partition_columns->children.empty()) select_query->setExpression(ASTProjectionSelectQuery::Expression::GROUP_BY, partition_columns->clone()); result.definition_ast = select_query; @@ -211,7 +211,9 @@ ProjectionDescription ProjectionDescription::getMinMaxCountProjection( auto external_storage_holder = std::make_shared(query_context, columns, ConstraintsDescription{}); StoragePtr storage = external_storage_holder->getTable(); InterpreterSelectQuery select( - result.query_ast, query_context, storage, {}, SelectQueryOptions{QueryProcessingStage::WithMergeableState}.modify().ignoreAlias()); + result.query_ast, query_context, storage, {}, + /// Here we ignore ast optimizations because otherwise aggregation keys may be removed from result header as constants. 
+ SelectQueryOptions{QueryProcessingStage::WithMergeableState}.modify().ignoreAlias().ignoreASTOptimizationsAlias()); result.required_columns = select.getRequiredColumns(); result.sample_block = select.getSampleBlock(); diff --git a/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.cpp b/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.cpp index ac60d748e36..c8f199d098e 100644 --- a/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.cpp +++ b/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.cpp @@ -20,7 +20,6 @@ namespace ErrorCodes } ReadBufferFromRabbitMQConsumer::ReadBufferFromRabbitMQConsumer( - ChannelPtr consumer_channel_, RabbitMQHandler & event_handler_, std::vector & queues_, size_t channel_id_base_, @@ -30,7 +29,6 @@ ReadBufferFromRabbitMQConsumer::ReadBufferFromRabbitMQConsumer( uint32_t queue_size_, const std::atomic & stopped_) : ReadBuffer(nullptr, 0) - , consumer_channel(std::move(consumer_channel_)) , event_handler(event_handler_) , queues(queues_) , channel_base(channel_base_) @@ -129,9 +127,6 @@ void ReadBufferFromRabbitMQConsumer::setupChannel() if (!consumer_channel) return; - /// We mark initialized only once. - initialized = true; - wait_subscription.store(true); consumer_channel->onReady([&]() diff --git a/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.h b/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.h index 55d129856b8..8a527011a3c 100644 --- a/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.h +++ b/src/Storages/RabbitMQ/ReadBufferFromRabbitMQConsumer.h @@ -20,7 +20,6 @@ class ReadBufferFromRabbitMQConsumer : public ReadBuffer public: ReadBufferFromRabbitMQConsumer( - ChannelPtr consumer_channel_, RabbitMQHandler & event_handler_, std::vector & queues_, size_t channel_id_base_, @@ -37,7 +36,7 @@ public: UInt64 delivery_tag; String channel_id; - AckTracker() : delivery_tag(0), channel_id("") {} + AckTracker() = default; AckTracker(UInt64 tag, String id) : delivery_tag(tag), channel_id(id) {} }; @@ -75,12 +74,6 @@ public: auto getMessageID() const { return current.message_id; } auto getTimestamp() const { return current.timestamp; } - void initialize() - { - if (!initialized) - setupChannel(); - } - private: bool nextImpl() override; @@ -105,9 +98,6 @@ private: AckTracker last_inserted_record_info; UInt64 prev_tag = 0, channel_id_counter = 0; - - /// Has initial setup after constructor been made? - bool initialized = false; }; } diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 66772e7015b..ac299657ae6 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -577,7 +577,7 @@ bool StorageRabbitMQ::updateChannel(ChannelPtr & channel) try { channel = connection->createChannel(); - return channel->usable(); + return true; } catch (...) { @@ -587,6 +587,21 @@ bool StorageRabbitMQ::updateChannel(ChannelPtr & channel) } +void StorageRabbitMQ::prepareChannelForBuffer(ConsumerBufferPtr buffer) +{ + if (!buffer) + return; + + if (buffer->queuesCount() != queues.size()) + buffer->updateQueues(queues); + + buffer->updateAckTracker(); + + if (updateChannel(buffer->getChannel())) + buffer->setupChannel(); +} + + void StorageRabbitMQ::unbindExchange() { /* This is needed because with RabbitMQ (without special adjustments) can't, for example, properly make mv if there was insert query @@ -715,9 +730,9 @@ void StorageRabbitMQ::startup() } catch (...) 
{ - tryLogCurrentException(log); if (!is_attach) throw; + tryLogCurrentException(log); } } else @@ -731,15 +746,14 @@ void StorageRabbitMQ::startup() try { auto buffer = createReadBuffer(); - if (rabbit_is_ready) - buffer->initialize(); pushReadBuffer(std::move(buffer)); ++num_created_consumers; } - catch (const AMQP::Exception & e) + catch (...) { - LOG_ERROR(log, "Got AMQ exception {}", e.what()); - throw; + if (!is_attach) + throw; + tryLogCurrentException(log); } } @@ -871,9 +885,8 @@ ConsumerBufferPtr StorageRabbitMQ::popReadBuffer(std::chrono::milliseconds timeo ConsumerBufferPtr StorageRabbitMQ::createReadBuffer() { - ChannelPtr consumer_channel = connection->createChannel(); return std::make_shared( - std::move(consumer_channel), connection->getHandler(), queues, ++consumer_id, + connection->getHandler(), queues, ++consumer_id, unique_strbase, log, row_delimiter, queue_size, shutdown_called); } @@ -921,7 +934,7 @@ void StorageRabbitMQ::initializeBuffers() if (!initialized) { for (const auto & buffer : buffers) - buffer->initialize(); + prepareChannelForBuffer(buffer); initialized = true; } } @@ -1086,19 +1099,7 @@ bool StorageRabbitMQ::streamToViews() if (source->needChannelUpdate()) { auto buffer = source->getBuffer(); - if (buffer) - { - if (buffer->queuesCount() != queues.size()) - buffer->updateQueues(queues); - - buffer->updateAckTracker(); - - if (updateChannel(buffer->getChannel())) - { - LOG_TRACE(log, "Connection is active, but channel update is needed"); - buffer->setupChannel(); - } - } + prepareChannelForBuffer(buffer); } /* false is returned by the sendAck function in only two cases: diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index a27a5bd59f1..9633326366d 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -66,6 +66,7 @@ public: bool updateChannel(ChannelPtr & channel); void updateQueues(std::vector & queues_) { queues_ = queues; } + void prepareChannelForBuffer(ConsumerBufferPtr buffer); void incrementReader(); void decrementReader(); diff --git a/src/Storages/RocksDB/EmbeddedRocksDBSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBSink.cpp index 1c918c15775..b42f2214d88 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBSink.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBSink.cpp @@ -38,7 +38,7 @@ void EmbeddedRocksDBSink::consume(Chunk chunk) rocksdb::WriteBatch batch; rocksdb::Status status; - for (size_t i = 0; i < rows; i++) + for (size_t i = 0; i < rows; ++i) { wb_key.restart(); wb_value.restart(); diff --git a/src/Storages/SelectQueryDescription.cpp b/src/Storages/SelectQueryDescription.cpp index 018a9f0ea98..2cc8f769cf1 100644 --- a/src/Storages/SelectQueryDescription.cpp +++ b/src/Storages/SelectQueryDescription.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -12,7 +13,6 @@ namespace DB namespace ErrorCodes { extern const int QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW; -extern const int LOGICAL_ERROR; } SelectQueryDescription::SelectQueryDescription(const SelectQueryDescription & other) @@ -60,9 +60,9 @@ StorageID extractDependentTableFromSelectQuery(ASTSelectQuery & query, ContextPt { auto * ast_select = subquery->as(); if (!ast_select) - throw Exception("Logical error while creating StorageMaterializedView. 
" - "Could not retrieve table name from select query.", - DB::ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW, + "StorageMaterializedView cannot be created from table functions ({})", + serializeAST(*subquery)); if (ast_select->list_of_selects->children.size() != 1) throw Exception("UNION is not supported for MATERIALIZED VIEW", ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW); diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 6417aa9f72c..0cc401aa93c 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -126,7 +126,13 @@ StorageBuffer::StorageBuffer( , bg_pool(getContext()->getBufferFlushSchedulePool()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto dest_table = DatabaseCatalog::instance().getTable(destination_id, context_); + storage_metadata.setColumns(dest_table->getInMemoryMetadataPtr()->getColumns()); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); @@ -455,10 +461,8 @@ static void appendBlock(const Block & from, Block & to) size_t rows = from.rows(); size_t bytes = from.bytes(); - CurrentMetrics::add(CurrentMetrics::StorageBufferRows, rows); - CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, bytes); - size_t old_rows = to.rows(); + size_t old_bytes = to.bytes(); MutableColumnPtr last_col; try @@ -468,6 +472,8 @@ static void appendBlock(const Block & from, Block & to) if (to.rows() == 0) { to = from; + CurrentMetrics::add(CurrentMetrics::StorageBufferRows, rows); + CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, bytes); } else { @@ -480,6 +486,8 @@ static void appendBlock(const Block & from, Block & to) to.getByPosition(column_no).column = std::move(last_col); } + CurrentMetrics::add(CurrentMetrics::StorageBufferRows, rows); + CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, to.bytes() - old_bytes); } } catch (...) @@ -1108,7 +1116,7 @@ void registerStorageBuffer(StorageFactory & factory) // After we evaluated all expressions, check that all arguments are // literals. 
- for (size_t i = 0; i < engine_args.size(); i++) + for (size_t i = 0; i < engine_args.size(); ++i) { if (!typeid_cast(engine_args[i].get())) { @@ -1165,6 +1173,7 @@ void registerStorageBuffer(StorageFactory & factory) }, { .supports_parallel_insert = true, + .supports_schema_inference = true, }); } diff --git a/src/Storages/StorageDictionary.cpp b/src/Storages/StorageDictionary.cpp index f6b330fe3df..da8c5f115b2 100644 --- a/src/Storages/StorageDictionary.cpp +++ b/src/Storages/StorageDictionary.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB @@ -21,6 +22,7 @@ namespace ErrorCodes extern const int THERE_IS_NO_COLUMN; extern const int CANNOT_DETACH_DICTIONARY_AS_TABLE; extern const int DICTIONARY_ALREADY_EXISTS; + extern const int NOT_IMPLEMENTED; } namespace @@ -111,10 +113,11 @@ StorageDictionary::StorageDictionary( const StorageID & table_id_, const String & dictionary_name_, const DictionaryStructure & dictionary_structure_, + const String & comment, Location location_, ContextPtr context_) : StorageDictionary( - table_id_, dictionary_name_, ColumnsDescription{getNamesAndTypes(dictionary_structure_)}, String{}, location_, context_) + table_id_, dictionary_name_, ColumnsDescription{getNamesAndTypes(dictionary_structure_)}, comment, location_, context_) { } @@ -126,6 +129,7 @@ StorageDictionary::StorageDictionary( table_id, table_id.getFullNameNotQuoted(), context_->getExternalDictionariesLoader().getDictionaryStructure(*dictionary_configuration), + dictionary_configuration->getString("dictionary.comment", ""), Location::SameDatabaseAndNameAsDictionary, context_) { @@ -230,7 +234,7 @@ void StorageDictionary::renameInMemory(const StorageID & new_table_id) if (move_to_atomic) configuration->setString("dictionary.uuid", toString(new_table_id.uuid)); else if (move_to_ordinary) - configuration->remove("dictionary.uuid"); + configuration->remove("dictionary.uuid"); } /// Dictionary is moving between databases of different engines or is renaming inside Ordinary database @@ -260,6 +264,40 @@ void StorageDictionary::renameInMemory(const StorageID & new_table_id) } } +void StorageDictionary::checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const +{ + for (const auto & command : commands) + { + if (location == Location::DictionaryDatabase || command.type != AlterCommand::COMMENT_TABLE) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Alter of type '{}' is not supported by storage {}", + command.type, getName()); + } +} + +void StorageDictionary::alter(const AlterCommands & params, ContextPtr alter_context, AlterLockHolder & lock_holder) +{ + IStorage::alter(params, alter_context, lock_holder); + + if (location == Location::Custom) + return; + + auto new_comment = getInMemoryMetadataPtr()->comment; + + auto storage_id = getStorageID(); + const auto & external_dictionaries_loader = getContext()->getExternalDictionariesLoader(); + auto result = external_dictionaries_loader.getLoadResult(storage_id.getInternalDictionaryName()); + + if (result.object) + { + auto dictionary = std::static_pointer_cast(result.object); + auto * dictionary_non_const = const_cast(dictionary.get()); + dictionary_non_const->setDictionaryComment(new_comment); + } + + std::lock_guard lock(dictionary_config_mutex); + configuration->setString("dictionary.comment", std::move(new_comment)); +} + void registerStorageDictionary(StorageFactory & factory) { factory.registerStorage("Dictionary", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageDictionary.h 
b/src/Storages/StorageDictionary.h index 7d0af8c0ee3..855d02b0947 100644 --- a/src/Storages/StorageDictionary.h +++ b/src/Storages/StorageDictionary.h @@ -42,6 +42,10 @@ public: void renameInMemory(const StorageID & new_table_id) override; + void checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const override; + + void alter(const AlterCommands & params, ContextPtr alter_context, AlterLockHolder &) override; + Poco::Timestamp getUpdateTime() const; LoadablesConfigurationPtr getConfiguration() const; @@ -89,6 +93,7 @@ private: const StorageID & table_id_, const String & dictionary_name_, const DictionaryStructure & dictionary_structure, + const String & comment, Location location_, ContextPtr context_); diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index e033d319fc8..19869b77106 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include #include @@ -36,13 +36,12 @@ #include #include #include -#include +#include #include #include #include #include -#include #include #include #include @@ -63,7 +62,6 @@ #include #include -#include #include #include @@ -71,8 +69,6 @@ #include #include -#include - #include #include #include @@ -329,7 +325,16 @@ StorageDistributed::StorageDistributed( , rng(randomSeed()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + StorageID id = StorageID::createEmpty(); + id.table_name = remote_table; + id.database_name = remote_database; + storage_metadata.setColumns(getStructureOfRemoteTable(*getCluster(), id, getContext(), remote_table_function_ptr)); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); @@ -730,7 +735,15 @@ QueryPipelineBuilderPtr StorageDistributed::distributedWrite(const ASTInsertQuer std::vector> pipelines; - String new_query_str = queryToString(new_query); + String new_query_str; + { + WriteBufferFromOwnString buf; + IAST::FormatSettings ast_format_settings(buf, /*one_line*/ true); + ast_format_settings.always_quote_identifiers = true; + new_query->IAST::format(ast_format_settings); + new_query_str = buf.str(); + } + for (size_t shard_index : collections::range(0, shards_info.size())) { const auto & shard_info = shards_info[shard_index]; @@ -1390,6 +1403,7 @@ void registerStorageDistributed(StorageFactory & factory) { .supports_settings = true, .supports_parallel_insert = true, + .supports_schema_inference = true, .source_access_type = AccessType::REMOTE, }); } diff --git a/src/Storages/StorageExecutable.cpp b/src/Storages/StorageExecutable.cpp index 51ecfc1e884..21143438725 100644 --- a/src/Storages/StorageExecutable.cpp +++ b/src/Storages/StorageExecutable.cpp @@ -2,6 +2,8 @@ #include +#include + #include #include @@ -16,13 +18,12 @@ #include #include #include +#include #include #include #include #include -#include - namespace DB { @@ -30,80 +31,78 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int TIMEOUT_EXCEEDED; } -StorageExecutable::StorageExecutable( - const StorageID & table_id_, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & 
input_queries_, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints) - : IStorage(table_id_) - , script_name(script_name_) - , arguments(arguments_) - , format(format_) - , input_queries(input_queries_) - , log(&Poco::Logger::get("StorageExecutable")) +namespace { - StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns); - storage_metadata.setConstraints(constraints); - setInMemoryMetadata(storage_metadata); + void transformToSingleBlockSources(Pipes & inputs) + { + size_t inputs_size = inputs.size(); + for (size_t i = 0; i < inputs_size; ++i) + { + auto && input = inputs[i]; + QueryPipeline input_pipeline(std::move(input)); + PullingPipelineExecutor input_pipeline_executor(input_pipeline); + + auto header = input_pipeline_executor.getHeader(); + auto result_block = header.cloneEmpty(); + + size_t result_block_columns = result_block.columns(); + + Block result; + while (input_pipeline_executor.pull(result)) + { + for (size_t result_block_index = 0; result_block_index < result_block_columns; ++result_block_index) + { + auto & block_column = result.safeGetByPosition(result_block_index); + auto & result_block_column = result_block.safeGetByPosition(result_block_index); + + result_block_column.column->assumeMutable()->insertRangeFrom(*block_column.column, 0, block_column.column->size()); + } + } + + auto source = std::make_shared(std::move(result_block)); + inputs[i] = Pipe(std::move(source)); + } + } } StorageExecutable::StorageExecutable( const StorageID & table_id_, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, + const String & format, const ExecutableSettings & settings_, + const std::vector & input_queries_, const ColumnsDescription & columns, const ConstraintsDescription & constraints) : IStorage(table_id_) - , script_name(script_name_) - , arguments(arguments_) - , format(format_) - , input_queries(input_queries_) , settings(settings_) - /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. - , process_pool(std::make_shared(settings.pool_size == 0 ? std::numeric_limits::max() : settings.pool_size)) - , log(&Poco::Logger::get("StorageExecutablePool")) + , input_queries(input_queries_) + , log(settings.is_executable_pool ? 
&Poco::Logger::get("StorageExecutablePool") : &Poco::Logger::get("StorageExecutable")) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns); storage_metadata.setConstraints(constraints); setInMemoryMetadata(storage_metadata); + + ShellCommandSourceCoordinator::Configuration configuration + { + .format = format, + .command_termination_timeout_seconds = settings.command_termination_timeout, + .command_read_timeout_milliseconds = settings.command_read_timeout, + .command_write_timeout_milliseconds = settings.command_write_timeout, + + .pool_size = settings.pool_size, + .max_command_execution_time_seconds = settings.max_command_execution_time, + + .is_executable_pool = settings.is_executable_pool, + .send_chunk_header = settings.send_chunk_header, + .execute_direct = true + }; + + coordinator = std::make_unique(std::move(configuration)); } -class SendingChunkHeaderTransform final : public ISimpleTransform -{ -public: - SendingChunkHeaderTransform(const Block & header, WriteBuffer & buffer_) - : ISimpleTransform(header, header, false) - , buffer(buffer_) - { - } - - String getName() const override { return "SendingChunkHeaderTransform"; } - -protected: - - void transform(Chunk & chunk) override - { - writeText(chunk.getNumRows(), buffer); - writeChar('\n', buffer); - } - -private: - WriteBuffer & buffer; -}; - Pipe StorageExecutable::read( const Names & /*column_names*/, const StorageMetadataPtr & metadata_snapshot, @@ -113,10 +112,12 @@ Pipe StorageExecutable::read( size_t max_block_size, unsigned /*threads*/) { + auto & script_name = settings.script_name; + auto user_scripts_path = context->getUserScriptsPath(); auto script_path = user_scripts_path + '/' + script_name; - if (!pathStartsWith(script_path, user_scripts_path)) + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Executable file {} must be inside user scripts folder {}", script_name, @@ -128,101 +129,31 @@ Pipe StorageExecutable::read( script_name, user_scripts_path); - std::vector inputs; + Pipes inputs; inputs.reserve(input_queries.size()); for (auto & input_query : input_queries) { InterpreterSelectWithUnionQuery interpreter(input_query, context, {}); - inputs.emplace_back(interpreter.buildQueryPipeline()); + inputs.emplace_back(QueryPipelineBuilder::getPipe(interpreter.buildQueryPipeline())); } - ShellCommand::Config config(script_path); - config.arguments = arguments; - for (size_t i = 1; i < inputs.size(); ++i) - config.write_fds.emplace_back(i + 2); - - std::unique_ptr process; - - bool is_executable_pool = (process_pool != nullptr); - if (is_executable_pool) - { - bool result = process_pool->tryBorrowObject(process, [&config, this]() - { - config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, settings.command_termination_timeout }; - auto shell_command = ShellCommand::executeDirect(config); - return shell_command; - }, settings.max_command_execution_time * 10000); - - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - settings.max_command_execution_time); - } - else - { - process = ShellCommand::executeDirect(config); - } - - std::vector tasks; - tasks.reserve(inputs.size()); - - for (size_t i = 0; i < inputs.size(); ++i) - { - WriteBufferFromFile * write_buffer = nullptr; - - if (i == 0) - { - write_buffer = &process->in; - } - else - { - auto descriptor = i + 2; - auto it = 
process->write_fds.find(descriptor); - if (it == process->write_fds.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Process does not contain descriptor to write {}", descriptor); - - write_buffer = &it->second; - } - - inputs[i].resize(1); - if (settings.send_chunk_header) - { - auto transform = std::make_shared(inputs[i].getHeader(), *write_buffer); - inputs[i].addTransform(std::move(transform)); - } - - auto pipeline = std::make_shared(QueryPipelineBuilder::getPipeline(std::move(inputs[i]))); - - auto out = context->getOutputFormat(format, *write_buffer, materializeBlock(pipeline->getHeader())); - out->setAutoFlush(); - pipeline->complete(std::move(out)); - - ShellCommandSource::SendDataTask task = [pipeline, write_buffer, is_executable_pool]() - { - CompletedPipelineExecutor executor(*pipeline); - executor.execute(); - - if (!is_executable_pool) - write_buffer->close(); - }; - - tasks.emplace_back(std::move(task)); - } + /// For executable pool we read data from input streams and convert it to single blocks streams. + if (settings.is_executable_pool) + transformToSingleBlockSources(inputs); auto sample_block = metadata_snapshot->getSampleBlock(); ShellCommandSourceConfiguration configuration; configuration.max_block_size = max_block_size; - if (is_executable_pool) + if (settings.is_executable_pool) { configuration.read_fixed_number_of_rows = true; configuration.read_number_of_rows_from_process_output = true; } - Pipe pipe(std::make_unique(context, format, std::move(sample_block), std::move(process), std::move(tasks), configuration, process_pool)); - return pipe; + return coordinator->createPipe(script_path, settings.script_arguments, std::move(inputs), std::move(sample_block), context, configuration); } void registerStorageExecutable(StorageFactory & factory) @@ -262,6 +193,11 @@ void registerStorageExecutable(StorageFactory & factory) const auto & columns = args.columns; const auto & constraints = args.constraints; + ExecutableSettings settings; + settings.script_name = script_name; + settings.script_arguments = script_name_with_arguments; + settings.is_executable_pool = is_executable_pool; + if (is_executable_pool) { size_t max_command_execution_time = 10; @@ -270,28 +206,28 @@ void registerStorageExecutable(StorageFactory & factory) if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) max_command_execution_time = max_execution_time_seconds; - ExecutableSettings pool_settings; - pool_settings.max_command_execution_time = max_command_execution_time; - if (args.storage_def->settings) - pool_settings.loadFromQuery(*args.storage_def); + settings.max_command_execution_time = max_command_execution_time; + } - return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, pool_settings, columns, constraints); - } - else - { - return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, columns, constraints); - } + if (args.storage_def->settings) + settings.loadFromQuery(*args.storage_def); + + auto global_context = args.getContext()->getGlobalContext(); + return StorageExecutable::create(args.table_id, format, settings, input_queries, columns, constraints); }; + StorageFactory::StorageFeatures storage_features; + storage_features.supports_settings = true; + factory.registerStorage("Executable", [&](const StorageFactory::Arguments & args) { return register_storage(args, false /*is_executable_pool*/); - }); + }, storage_features); 
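// Illustrative sketch (standalone C++, hypothetical Factory/Features types -- not code from this patch):
// Executable and ExecutablePool are now registered through one creator plus a shared StorageFeatures
// value (supports_settings = true), with the pool behaviour selected by a single boolean instead of a
// second constructor. The same "one creator, two names, shared feature flags" shape:
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Features { bool supports_settings = false; };
struct Storage  { bool is_executable_pool = false; };
using Creator = std::function<std::shared_ptr<Storage>()>;

struct Factory
{
    struct Entry { Creator creator; Features features; };
    std::map<std::string, Entry> storages;

    void registerStorage(const std::string & name, Creator creator, Features features)
    {
        storages.emplace(name, Entry{std::move(creator), features});
    }
};

void registerExecutableEngines(Factory & factory)
{
    auto create = [](bool is_executable_pool)
    {
        auto storage = std::make_shared<Storage>();
        storage->is_executable_pool = is_executable_pool;   // mirrors settings.is_executable_pool
        return storage;
    };

    Features features;
    features.supports_settings = true;                      // shared by both registrations

    factory.registerStorage("Executable",     [create] { return create(false); }, features);
    factory.registerStorage("ExecutablePool", [create] { return create(true);  }, features);
}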
factory.registerStorage("ExecutablePool", [&](const StorageFactory::Arguments & args) { return register_storage(args, true /*is_executable_pool*/); - }); + }, storage_features); } }; diff --git a/src/Storages/StorageExecutable.h b/src/Storages/StorageExecutable.h index 74df17f1463..b6248abae97 100644 --- a/src/Storages/StorageExecutable.h +++ b/src/Storages/StorageExecutable.h @@ -23,7 +23,7 @@ public: String getName() const override { - if (process_pool) + if (settings.is_executable_pool) return "ExecutablePool"; else return "Executable"; @@ -42,31 +42,17 @@ protected: StorageExecutable( const StorageID & table_id, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints); - - StorageExecutable( - const StorageID & table_id, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ExecutableSettings & settings_, + const String & format, + const ExecutableSettings & settings, + const std::vector & input_queries, const ColumnsDescription & columns, const ConstraintsDescription & constraints); private: - String script_name; - std::vector arguments; - String format; - std::vector input_queries; ExecutableSettings settings; - std::shared_ptr process_pool; + std::vector input_queries; Poco::Logger * log; + std::unique_ptr coordinator; }; } diff --git a/src/Storages/StorageFactory.cpp b/src/Storages/StorageFactory.cpp index b82db93809b..eae46220c86 100644 --- a/src/Storages/StorageFactory.cpp +++ b/src/Storages/StorageFactory.cpp @@ -156,9 +156,6 @@ StoragePtr StorageFactory::get( throw Exception("Unknown table engine " + name, ErrorCodes::UNKNOWN_STORAGE); } - if (query.comment) - comment = query.comment->as().value.get(); - auto check_feature = [&](String feature_description, FeatureMatcherFn feature_matcher_fn) { if (!feature_matcher_fn(it->second.features)) @@ -204,6 +201,9 @@ StoragePtr StorageFactory::get( } } + if (query.comment) + comment = query.comment->as().value.get(); + ASTs empty_engine_args; Arguments arguments{ .engine_name = name, diff --git a/src/Storages/StorageFactory.h b/src/Storages/StorageFactory.h index 20db1a44897..6ffa6327176 100644 --- a/src/Storages/StorageFactory.h +++ b/src/Storages/StorageFactory.h @@ -66,6 +66,7 @@ public: bool supports_deduplication = false; /// See also IStorage::supportsParallelInsert() bool supports_parallel_insert = false; + bool supports_schema_inference = false; AccessType source_access_type = AccessType::NONE; }; @@ -98,6 +99,7 @@ public: .supports_replication = false, .supports_deduplication = false, .supports_parallel_insert = false, + .supports_schema_inference = false, .source_access_type = AccessType::NONE, }); @@ -126,6 +128,12 @@ public: AccessType getSourceAccessType(const String & table_engine) const; + bool checkIfStorageSupportsSchemaInterface(const String & storage_name) + { + if (storages.contains(storage_name)) + return storages[storage_name].features.supports_schema_inference; + return false; + } private: Storages storages; }; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 6eb2ce1d298..a479f982c70 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -15,8 +15,9 @@ #include #include -#include #include +#include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +65,7 @@ 
namespace ErrorCodes extern const int INCOMPATIBLE_COLUMNS; extern const int CANNOT_STAT; extern const int LOGICAL_ERROR; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } namespace @@ -135,6 +138,56 @@ void checkCreationIsAllowed(ContextPtr context_global, const std::string & db_di throw Exception("File must not be a directory", ErrorCodes::INCORRECT_FILE_NAME); } +std::unique_ptr createReadBuffer( + const String & current_path, + bool use_table_fd, + const String & storage_name, + int table_fd, + const String & compression_method, + ContextPtr context) +{ + std::unique_ptr nested_buffer; + CompressionMethod method; + + struct stat file_stat{}; + + if (use_table_fd) + { + /// Check if file descriptor allows random reads (and reading it twice). + if (0 != fstat(table_fd, &file_stat)) + throwFromErrno("Cannot stat table file descriptor, inside " + storage_name, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(table_fd); + else + nested_buffer = std::make_unique(table_fd); + + method = chooseCompressionMethod("", compression_method); + } + else + { + /// Check if file descriptor allows random reads (and reading it twice). + if (0 != stat(current_path.c_str(), &file_stat)) + throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + else + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + + method = chooseCompressionMethod(current_path, compression_method); + } + + /// For clickhouse-local add progress callback to display progress bar. + if (context->getApplicationType() == Context::ApplicationType::LOCAL) + { + auto & in = static_cast(*nested_buffer); + in.setProgressCallback(context); + } + + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); +} + } Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) @@ -147,8 +200,7 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user Strings paths; /// Do not use fs::canonical or fs::weakly_canonical. /// Otherwise it will not allow to work with symlinks in `user_files_path` directory. - String path = fs::absolute(fs_table_path); - path = fs::path(path).lexically_normal(); /// Normalize path. + String path = fs::absolute(fs_table_path).lexically_normal(); /// Normalize path. 
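/* A POSIX-only sketch of the stat()/fstat() + S_ISREG() dispatch that the new
 * createReadBuffer() helper above performs before choosing between a seekable
 * regular-file buffer and a plain descriptor buffer; the types are simplified
 * stand-ins, not ClickHouse's ReadBuffer classes. */
#include <stdexcept>
#include <string>
#include <sys/stat.h>

enum class InputKind { SeekableRegularFile, NonSeekableDescriptor };

InputKind classifyInput(const std::string & path, bool use_fd, int fd)
{
    struct stat file_stat{};

    if (use_fd)
    {
        if (0 != fstat(fd, &file_stat))
            throw std::runtime_error("Cannot stat table file descriptor");
    }
    else
    {
        if (0 != stat(path.c_str(), &file_stat))
            throw std::runtime_error("Cannot stat file " + path);
    }

    /// Regular files allow random access and can be read twice (e.g. once for schema
    /// inference and once for data); pipes, sockets and FIFOs cannot.
    return S_ISREG(file_stat.st_mode) ? InputKind::SeekableRegularFile
                                      : InputKind::NonSeekableDescriptor;
}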
if (path.find_first_of("*?{") == std::string::npos) { std::error_code error; @@ -165,6 +217,42 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user return paths; } + +ColumnsDescription StorageFile::getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context) +{ + if (format == "Distributed") + { + if (paths.empty()) + throw Exception( + "Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); + + auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); + return ColumnsDescription(source->getOutputs().front().getHeader().getNamesAndTypesList()); + } + + auto read_buffer_creator = [&]() + { + String path; + auto it = std::find_if(paths.begin(), paths.end(), [](const String & p){ return std::filesystem::exists(p); }); + if (it == paths.end()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path. You must specify " + "table structure manually", + format); + + path = *it; + return createReadBuffer(path, false, "File", -1, compression_method, context); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + bool StorageFile::isColumnOriented() const { return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); @@ -183,10 +271,13 @@ StorageFile::StorageFile(int table_fd_, CommonArguments args) throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); if (args.format_name == "Distributed") throw Exception("Distributed format is allowed only with explicit file path", ErrorCodes::INCORRECT_FILE_NAME); + if (args.columns.empty()) + throw Exception("Automatic schema inference is not allowed when using file descriptor as source of storage", ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE); is_db_table = false; use_table_fd = true; table_fd = table_fd_; + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & table_path_, const std::string & user_files_path, CommonArguments args) @@ -195,22 +286,7 @@ StorageFile::StorageFile(const std::string & table_path_, const std::string & us is_db_table = false; paths = getPathsList(table_path_, user_files_path, args.getContext(), total_bytes_to_read); path_for_partitioned_write = table_path_; - - if (args.format_name == "Distributed") - { - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - - auto & first_path = paths[0]; - Block header = StorageDistributedDirectoryMonitor::createSourceFromFile(first_path)->getOutputs().front().getHeader(); - - StorageInMemoryMetadata storage_metadata; - auto columns = ColumnsDescription(header.getNamesAndTypesList()); - if (!args.columns.empty() && columns != args.columns) - throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); - storage_metadata.setColumns(columns); - setInMemoryMetadata(storage_metadata); - } + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArguments args) @@ -226,6 +302,8 @@ StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArgu paths = {getTablePath(table_dir_path, 
format_name)}; if (fs::exists(paths[0])) total_bytes_to_read = fs::file_size(paths[0]); + + setStorageMetadata(args); } StorageFile::StorageFile(CommonArguments args) @@ -234,9 +312,21 @@ StorageFile::StorageFile(CommonArguments args) , format_settings(args.format_settings) , compression_method(args.compression_method) , base_path(args.getContext()->getPath()) +{ +} + +void StorageFile::setStorageMetadata(CommonArguments args) { StorageInMemoryMetadata storage_metadata; - if (args.format_name != "Distributed") + + if (args.format_name == "Distributed" || args.columns.empty()) + { + auto columns = getTableStructureFromData(format_name, paths, compression_method, format_settings, args.getContext()); + if (!args.columns.empty() && args.columns != columns) + throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); + storage_metadata.setColumns(columns); + } + else storage_metadata.setColumns(args.columns); storage_metadata.setConstraints(args.constraints); @@ -351,46 +441,7 @@ public: } } - std::unique_ptr nested_buffer; - CompressionMethod method; - - struct stat file_stat{}; - - if (storage->use_table_fd) - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != fstat(storage->table_fd, &file_stat)) - throwFromErrno("Cannot stat table file descriptor, inside " + storage->getName(), ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(storage->table_fd); - else - nested_buffer = std::make_unique(storage->table_fd); - - method = chooseCompressionMethod("", storage->compression_method); - } - else - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != stat(current_path.c_str(), &file_stat)) - throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - else - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - - method = chooseCompressionMethod(current_path, storage->compression_method); - } - - /// For clickhouse-local add progress callback to display progress bar. 
- if (context->getApplicationType() == Context::ApplicationType::LOCAL) - { - auto & in = static_cast(*nested_buffer); - in.setProgressCallback(context); - } - - read_buf = wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); + read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context); auto get_block_for_format = [&]() -> Block { @@ -854,7 +905,8 @@ void registerStorageFile(StorageFactory & factory) { StorageFactory::StorageFeatures storage_features{ .supports_settings = true, - .source_access_type = AccessType::FILE + .supports_schema_inference = true, + .source_access_type = AccessType::FILE, }; factory.registerStorage( diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index f48d1c285da..6b015976589 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -1,6 +1,7 @@ #pragma once #include + #include #include @@ -70,6 +71,13 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context); + protected: friend class StorageFileSource; friend class StorageFileSink; @@ -86,6 +94,8 @@ protected: private: explicit StorageFile(CommonArguments args); + void setStorageMetadata(CommonArguments args); + std::string format_name; // We use format settings from global context + CREATE query for File table // function -- in this case, format_settings is set. diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index a4dfbfc3f96..19e8f78d877 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -216,6 +216,13 @@ ColumnPtr fillColumnWithRandomData( fillBufferWithRandomData(reinterpret_cast(column->getData().data()), limit * sizeof(UInt16), rng); return column; } + case TypeIndex::Date32: + { + auto column = ColumnInt32::create(); + column->getData().resize(limit); + fillBufferWithRandomData(reinterpret_cast(column->getData().data()), limit * sizeof(Int32), rng); + return column; + } case TypeIndex::UInt32: [[fallthrough]]; case TypeIndex::DateTime: { diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index dcdf3a097e6..061319bc1c0 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -254,16 +254,24 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies(const NameSet for (const auto & projection : getProjections()) add_dependent_columns(&projection, projections_columns); - if (hasRowsTTL()) + auto add_for_rows_ttl = [&](const auto & expression, auto & to_set) { - auto rows_expression = getRowsTTL().expression; - if (add_dependent_columns(rows_expression, required_ttl_columns) && include_ttl_target) + if (add_dependent_columns(expression, to_set) && include_ttl_target) { /// Filter all columns, if rows TTL expression have to be recalculated. 
for (const auto & column : getColumns().getAllPhysical()) updated_ttl_columns.insert(column.name); } - } + }; + + if (hasRowsTTL()) + add_for_rows_ttl(getRowsTTL().expression, required_ttl_columns); + + for (const auto & entry : getRowsWhereTTLs()) + add_for_rows_ttl(entry.expression, required_ttl_columns); + + for (const auto & entry : getGroupByTTLs()) + add_for_rows_ttl(entry.expression, required_ttl_columns); for (const auto & entry : getRecompressionTTLs()) add_dependent_columns(entry.expression, required_ttl_columns); diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 60d5a4a07d1..56844192ee9 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -60,7 +60,8 @@ StorageMaterializedView::StorageMaterializedView( ContextPtr local_context, const ASTCreateQuery & query, const ColumnsDescription & columns_, - bool attach_) + bool attach_, + const String & comment) : IStorage(table_id_), WithMutableContext(local_context->getGlobalContext()) { StorageInMemoryMetadata storage_metadata; @@ -81,6 +82,9 @@ StorageMaterializedView::StorageMaterializedView( auto select = SelectQueryDescription::getSelectQueryFromASTForMatView(query.select->clone(), local_context); storage_metadata.setSelectQuery(select); + if (!comment.empty()) + storage_metadata.setComment(comment); + setInMemoryMetadata(storage_metadata); bool point_to_itself_by_uuid = has_inner_table && query.to_inner_uuid != UUIDHelpers::Nil @@ -432,7 +436,7 @@ void registerStorageMaterializedView(StorageFactory & factory) /// Pass local_context here to convey setting for inner table return StorageMaterializedView::create( args.table_id, args.getLocalContext(), args.query, - args.columns, args.attach); + args.columns, args.attach, args.comment); }); } diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 69b6f0c8c55..c110d0b211c 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -108,7 +108,8 @@ protected: ContextPtr local_context, const ASTCreateQuery & query, const ColumnsDescription & columns_, - bool attach_); + bool attach_, + const String & comment); }; } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index f82f9d21217..bdb7ddb744a 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -43,6 +43,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SAMPLING_NOT_SUPPORTED; extern const int ALTER_OF_COLUMN_IS_FORBIDDEN; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } StorageMerge::StorageMerge( @@ -61,7 +62,7 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } @@ -82,11 +83,19 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? 
getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription StorageMerge::getColumnsDescriptionFromSourceTables() const +{ + auto table = getFirstTable([](auto && t) { return t; }); + if (!table) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "There are no tables satisfied provided regexp, you must specify table structure manually"}; + return table->getInMemoryMetadataPtr()->getColumns(); +} + template StoragePtr StorageMerge::getFirstTable(F && predicate) const { @@ -762,7 +771,6 @@ void StorageMerge::convertingSourceStream( IStorage::ColumnSizeByName StorageMerge::getColumnSizes() const { - auto first_materialized_mysql = getFirstTable([](const StoragePtr & table) { return table && table->getName() == "MaterializedMySQL"; }); if (!first_materialized_mysql) return {}; @@ -816,6 +824,9 @@ void registerStorageMerge(StorageFactory & factory) return StorageMerge::create( args.table_id, args.columns, args.comment, source_database_name_or_regexp, is_regexp, table_name_regexp, args.getContext()); + }, + { + .supports_schema_inference = true }); } diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 56adeab9279..ad3075efd08 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -132,6 +132,8 @@ protected: static SelectQueryInfo getModifiedQueryInfo( const SelectQueryInfo & query_info, ContextPtr modified_context, const StorageID & current_storage_id, bool is_merge_engine); + + ColumnsDescription getColumnsDescriptionFromSourceTables() const; }; } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 470a406dbe4..03ac27d0e46 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -139,6 +139,10 @@ void StorageMergeTree::startup() } } +void StorageMergeTree::flush() +{ + flushAllInMemoryPartsIfNeeded(); +} void StorageMergeTree::shutdown() { @@ -504,6 +508,9 @@ void StorageMergeTree::waitForMutation(Int64 version, const String & file_name) void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context) { + /// Validate partition IDs (if any) before starting mutation + getPartitionIdsAffectedByCommands(commands, query_context); + String mutation_file_name; Int64 version = startMutation(commands, mutation_file_name); @@ -894,6 +901,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( auto commands = MutationCommands::create(); size_t current_ast_elements = 0; + auto last_mutation_to_apply = mutations_end_it; for (auto it = mutations_begin_it; it != mutations_end_it; ++it) { size_t commands_size = 0; @@ -930,7 +938,8 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( MergeTreeMutationEntry & entry = it->second; entry.latest_fail_time = time(nullptr); entry.latest_fail_reason = getCurrentExceptionMessage(false); - continue; + /// NOTE we should not skip mutations, because exception may be retryable (e.g. 
MEMORY_LIMIT_EXCEEDED) + break; } } @@ -939,8 +948,10 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( current_ast_elements += commands_size; commands->insert(commands->end(), it->second.commands.begin(), it->second.commands.end()); + last_mutation_to_apply = it; } + assert(commands->empty() == (last_mutation_to_apply == mutations_end_it)); if (!commands->empty()) { bool is_partition_affected = false; @@ -965,13 +976,13 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( /// Shall not create a new part, but will do that later if mutation with higher version appear. /// This is needed in order to not produce excessive mutations of non-related parts. auto block_range = std::make_pair(part->info.min_block, part->info.max_block); - updated_version_by_block_range[block_range] = current_mutations_by_version.rbegin()->first; + updated_version_by_block_range[block_range] = last_mutation_to_apply->first; were_some_mutations_for_some_parts_skipped = true; continue; } auto new_part_info = part->info; - new_part_info.mutation = current_mutations_by_version.rbegin()->first; + new_part_info.mutation = last_mutation_to_apply->first; future_part->parts.push_back(part); future_part->part_info = new_part_info; diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 7aeca1f1a0c..ee99b412f59 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -31,6 +31,7 @@ class StorageMergeTree final : public shared_ptr_helper, publi friend struct shared_ptr_helper; public: void startup() override; + void flush() override; void shutdown() override; ~StorageMergeTree() override; diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index 9b48f3fc3b3..2c1b44d8685 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -66,10 +66,20 @@ void StorageMongoDB::connectIfNotConnected() if (!authenticated) { + Poco::URI poco_uri(uri); + auto query_params = poco_uri.getQueryParameters(); + auto auth_source = std::find_if(query_params.begin(), query_params.end(), + [&](const std::pair & param) { return param.first == "authSource"; }); + auto auth_db = database_name; + if (auth_source != query_params.end()) + auth_db = auth_source->second; # if POCO_VERSION >= 0x01070800 - Poco::MongoDB::Database poco_db(database_name); - if (!poco_db.authenticate(*connection, username, password, Poco::MongoDB::Database::AUTH_SCRAM_SHA1)) - throw Exception("Cannot authenticate in MongoDB, incorrect user or password", ErrorCodes::MONGODB_CANNOT_AUTHENTICATE); + if (!username.empty() && !password.empty()) + { + Poco::MongoDB::Database poco_db(auth_db); + if (!poco_db.authenticate(*connection, username, password, Poco::MongoDB::Database::AUTH_SCRAM_SHA1)) + throw Exception("Cannot authenticate in MongoDB, incorrect user or password", ErrorCodes::MONGODB_CANNOT_AUTHENTICATE); + } # else authenticate(*connection, database_name, username, password); # endif @@ -112,9 +122,7 @@ StorageMongoDBConfiguration StorageMongoDB::getConfiguration(ASTs engine_args, C for (const auto & [arg_name, arg_value] : storage_specific_args) { - if (arg_name == "collection") - configuration.collection = arg_value->as()->value.safeGet(); - else if (arg_name == "options") + if (arg_name == "options") configuration.options = arg_value->as()->value.safeGet(); else throw Exception(ErrorCodes::BAD_ARGUMENTS, @@ -139,7 +147,7 @@ StorageMongoDBConfiguration StorageMongoDB::getConfiguration(ASTs engine_args, C configuration.host = parsed_host_port.first; 
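/* A simplified sketch of the selectPartsToMutate() change above: when a mutation's
 * commands fail to apply, scanning now stops ("break") instead of skipping the entry,
 * and the new part version is taken from the last mutation actually added to the batch
 * rather than from the newest mutation known to the table. Types are hypothetical. */
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct MutationEntry
{
    std::string command;
    bool commands_parse_ok = true;   /// e.g. false on a transient MEMORY_LIMIT_EXCEEDED
};

int64_t selectVersionToApply(const std::map<int64_t, MutationEntry> & mutations_by_version, int64_t part_version)
{
    auto begin_it = mutations_by_version.upper_bound(part_version);
    auto last_mutation_to_apply = mutations_by_version.end();

    std::vector<std::string> batch;
    for (auto it = begin_it; it != mutations_by_version.end(); ++it)
    {
        if (!it->second.commands_parse_ok)
            break;   /// The failure may be retryable, so later mutations must not overtake it.

        batch.push_back(it->second.command);
        last_mutation_to_apply = it;
    }

    assert(batch.empty() == (last_mutation_to_apply == mutations_by_version.end()));
    return last_mutation_to_apply == mutations_by_version.end() ? part_version : last_mutation_to_apply->first;
}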
configuration.port = parsed_host_port.second; configuration.database = engine_args[1]->as().value.safeGet(); - configuration.collection = engine_args[2]->as().value.safeGet(); + configuration.table = engine_args[2]->as().value.safeGet(); configuration.username = engine_args[3]->as().value.safeGet(); configuration.password = engine_args[4]->as().value.safeGet(); @@ -163,7 +171,7 @@ void registerStorageMongoDB(StorageFactory & factory) configuration.host, configuration.port, configuration.database, - configuration.collection, + configuration.table, configuration.username, configuration.password, configuration.options, diff --git a/src/Storages/StorageMySQL.cpp b/src/Storages/StorageMySQL.cpp index 0b6095e033b..66adf3ae272 100644 --- a/src/Storages/StorageMySQL.cpp +++ b/src/Storages/StorageMySQL.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -306,13 +307,7 @@ void registerStorageMySQL(StorageFactory & factory) if (!mysql_settings.connection_pool_size) throw Exception("connection_pool_size cannot be zero.", ErrorCodes::BAD_ARGUMENTS); - mysqlxx::PoolWithFailover pool( - configuration.database, configuration.addresses, - configuration.username, configuration.password, - MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, - mysql_settings.connection_pool_size, - mysql_settings.connection_max_tries, - mysql_settings.connection_wait_timeout); + mysqlxx::PoolWithFailover pool = createMySQLPoolWithFailover(configuration, mysql_settings); return StorageMySQL::create( args.table_id, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d0d52fd488a..ce5576bd809 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3,7 +3,6 @@ #include "Common/hex.h" #include #include -#include #include #include #include @@ -20,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -35,7 +33,6 @@ #include -#include #include #include @@ -45,7 +42,6 @@ #include #include #include -#include #include #include @@ -68,7 +64,6 @@ #include -#include #include #include @@ -194,56 +189,6 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper() const return res; } -static std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') - zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. - if (!zookeeper_path.empty() && zookeeper_path.front() != '/') - { - /// Do not allow this for new tables, print warning for tables created in old versions - if (check_starts_with_slash) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path); - if (log) - LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. 
It will not be supported in future releases"); - zookeeper_path = "/" + zookeeper_path; - } - - return zookeeper_path; -} - -static String extractZooKeeperName(const String & path) -{ - static constexpr auto default_zookeeper_name = "default"; - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return default_zookeeper_name; - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - auto zookeeper_name = path.substr(0, pos); - if (zookeeper_name.empty()) - throw Exception("Zookeeper path should start with '/' or ':/'", ErrorCodes::BAD_ARGUMENTS); - return zookeeper_name; - } - return default_zookeeper_name; -} - -static String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return normalizeZooKeeperPath(path, check_starts_with_slash, log); - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log); - } - return normalizeZooKeeperPath(path, check_starts_with_slash, log); -} - static MergeTreePartInfo makeDummyDropRangeForMovePartitionOrAttachPartitionFrom(const String & partition_id) { /// NOTE We don't have special log entry type for MOVE PARTITION/ATTACH PARTITION FROM, @@ -287,8 +232,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( true, /// require_part_metadata attach, [this] (const std::string & name) { enqueuePartForCheck(name); }) - , zookeeper_name(extractZooKeeperName(zookeeper_path_)) - , zookeeper_path(extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) + , zookeeper_name(zkutil::extractZooKeeperName(zookeeper_path_)) + , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) , replica_name(replica_name_) , replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_) , reader(*this) @@ -1373,9 +1318,6 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil: const auto storage_settings_ptr = getSettings(); String part_path = fs::path(replica_path) / "parts" / part_name; - //ops.emplace_back(zkutil::makeCheckRequest( - // zookeeper_path + "/columns", expected_columns_version)); - if (storage_settings_ptr->use_minimalistic_part_header_in_zookeeper) { ops.emplace_back(zkutil::makeCreateRequest( @@ -1421,6 +1363,7 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd Coordination::Requests new_ops; for (const String & part_path : absent_part_paths_on_replicas) { + /// NOTE Create request may fail with ZNONODE if replica is being dropped, we will throw an exception new_ops.emplace_back(zkutil::makeCreateRequest(part_path, "", zkutil::CreateMode::Persistent)); new_ops.emplace_back(zkutil::makeRemoveRequest(part_path, -1)); } @@ -4534,28 +4477,6 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer } -std::set StorageReplicatedMergeTree::getPartitionIdsAffectedByCommands( - const MutationCommands & commands, ContextPtr query_context) const -{ - std::set affected_partition_ids; - - for (const auto & command : commands) - { - if (!command.partition) - { - affected_partition_ids.clear(); - break; - } - - affected_partition_ids.insert( - getPartitionIDFromQuery(command.partition, query_context) - ); - 
} - - return affected_partition_ids; -} - - PartitionBlockNumbersHolder StorageReplicatedMergeTree::allocateBlockNumbersInAffectedPartitions( const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const { @@ -5585,8 +5506,8 @@ void StorageReplicatedMergeTree::fetchPartition( info.table_id = getStorageID(); info.table_id.uuid = UUIDHelpers::Nil; auto expand_from = query_context->getMacros()->expand(from_, info); - String auxiliary_zookeeper_name = extractZooKeeperName(expand_from); - String from = extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); + String auxiliary_zookeeper_name = zkutil::extractZooKeeperName(expand_from); + String from = zkutil::extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); if (from.empty()) throw Exception("ZooKeeper path should not be empty", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -6662,7 +6583,7 @@ void StorageReplicatedMergeTree::movePartitionToShard( if (!move_part) throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED); - if (normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) + if (zkutil::normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == zkutil::normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS); auto zookeeper = getZooKeeper(); @@ -7396,7 +7317,6 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP new_data_part->minmax_idx = std::move(minmax_idx); new_data_part->is_temp = true; - SyncGuardPtr sync_guard; if (new_data_part->isStoredOnDisk()) { @@ -7421,7 +7341,9 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP auto compression_codec = getContext()->chooseCompressionCodec(0, 0); const auto & index_factory = MergeTreeIndexFactory::instance(); - MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec); + MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, + index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec); + bool sync_on_insert = settings->fsync_after_insert; out.write(block); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index bcd364df30e..b2721210344 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -263,6 +263,8 @@ public: bool createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name); + static const String getDefaultZooKeeperName() { return default_zookeeper_name; } + private: std::atomic_bool are_restoring_replica {false}; @@ -717,7 +719,6 @@ private: std::unique_ptr getDefaultSettings() const override; - std::set getPartitionIdsAffectedByCommands(const MutationCommands & commands, ContextPtr query_context) const; PartitionBlockNumbersHolder allocateBlockNumbersInAffectedPartitions( const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 3f08dee62b6..3d988472b54 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -3,14 +3,12 @@ #if USE_AWS_S3 -#include #include #include #include -#include #include 
#include #include @@ -25,11 +23,10 @@ #include #include -#include #include -#include #include +#include #include #include @@ -54,13 +51,10 @@ #include #include #include -#include #include namespace fs = std::filesystem; -#include - static const String PARTITION_ID_WILDCARD = "{_partition_id}"; @@ -74,6 +68,8 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int S3_ERROR; extern const int UNEXPECTED_EXPRESSION; + extern const int CANNOT_OPEN_FILE; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } class IOutputFormat; @@ -226,6 +222,13 @@ StorageS3Source::StorageS3Source( } +void StorageS3Source::onCancel() +{ + if (reader) + reader->cancel(); +} + + bool StorageS3Source::initialize() { String current_key = (*file_iterator)(); @@ -312,6 +315,9 @@ public: , sample_block(sample_block_) , format_settings(format_settings_) { + if (key.find_first_of("*?{") != std::string::npos) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "S3 key '{}' contains globs, so the table is in readonly mode", key); + write_buf = wrapWriteBufferWithCompressionMethod( std::make_unique(client, bucket, key, min_upload_part_size, max_single_part_upload_size), compression_method, 3); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, {}, format_settings); @@ -474,13 +480,39 @@ StorageS3::StorageS3( { context_->getGlobalContext()->getRemoteHostFilter().checkURL(uri_.uri); StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + + updateClientAndAuthSettings(context_, client_auth); + if (columns_.empty()) + { + auto columns = getTableStructureFromDataImpl(format_name, client_auth, max_single_read_retries_, compression_method, distributed_processing_, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - updateClientAndAuthSettings(context_, client_auth); } +std::shared_ptr StorageS3::createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context) +{ + std::shared_ptr iterator_wrapper{nullptr}; + if (distributed_processing) + { + return std::make_shared( + [callback = local_context->getReadTaskCallback()]() -> String { + return callback(); + }); + } + + /// Iterate through disclosed globs and make a source for each file + auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); + return std::make_shared([glob_iterator]() + { + return glob_iterator->next(); + }); +} Pipe StorageS3::read( const Names & column_names, @@ -504,23 +536,7 @@ Pipe StorageS3::read( need_file_column = true; } - std::shared_ptr iterator_wrapper{nullptr}; - if (distributed_processing) - { - iterator_wrapper = std::make_shared( - [callback = local_context->getReadTaskCallback()]() -> String { - return callback(); - }); - } - else - { - /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); - iterator_wrapper = std::make_shared([glob_iterator]() - { - return glob_iterator->next(); - }); - } + std::shared_ptr iterator_wrapper = createFileIterator(client_auth, distributed_processing, local_context); for (size_t i = 0; i < num_streams; ++i) { @@ -701,6 +717,51 @@ StorageS3Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPt return configuration; } 
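/* StorageFile, StorageS3 and StorageURL in this patch all share the same schema-inference
 * shape: the storage only supplies a lazy "read buffer creator" and passes it to
 * readSchemaFromFormat(). A self-contained sketch of that shape with simplified stand-in
 * types (std::istream instead of ClickHouse read buffers, a header-line reader instead of
 * the real format machinery): */
#include <functional>
#include <istream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

using ReadBufferCreator = std::function<std::unique_ptr<std::istream>()>;

/// Stand-in for readSchemaFromFormat(): opens the source only when called and derives
/// column names from the first, comma-separated header line.
std::vector<std::string> readSchemaFromHeader(const ReadBufferCreator & creator)
{
    auto in = creator();
    std::string header_line;
    std::getline(*in, header_line);

    std::vector<std::string> columns;
    std::istringstream splitter(header_line);
    for (std::string name; std::getline(splitter, name, ',');)
        columns.push_back(name);
    return columns;
}

int main()
{
    /// In the real code the creator locates the first existing file (or S3 key, or URL)
    /// and throws CANNOT_EXTRACT_TABLE_STRUCTURE if nothing matches; here it wraps a sample.
    ReadBufferCreator creator = []() -> std::unique_ptr<std::istream>
    {
        return std::make_unique<std::istringstream>("id,name,value\n1,foo,10\n");
    };
    return readSchemaFromHeader(creator).size() == 3 ? 0 : 1;
}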
+ColumnsDescription StorageS3::getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + ClientAuthentication client_auth{uri, access_key_id, secret_access_key, max_connections, {}, {}}; + updateClientAndAuthSettings(ctx, client_auth); + return getTableStructureFromDataImpl(format, client_auth, max_single_read_retries, compression_method, distributed_processing, format_settings, ctx); +} + +ColumnsDescription StorageS3::getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + auto read_buffer_creator = [&]() + { + auto file_iterator = createFileIterator(client_auth, distributed_processing, ctx); + String current_key = (*file_iterator)(); + if (current_key.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path in S3. You must specify " + "table structure manually", + format); + + return wrapReadBufferWithCompressionMethod( + std::make_unique(client_auth.client, client_auth.uri.bucket, current_key, max_single_read_retries, ctx->getReadSettings()), + chooseCompressionMethod(current_key, compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, ctx); +} + void registerStorageS3Impl(const String & name, StorageFactory & factory) { @@ -769,6 +830,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) { .supports_settings = true, .supports_sort_order = true, // for partition by + .supports_schema_inference = true, .source_access_type = AccessType::S3, }); } diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 8ce287ff681..0690040915d 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -68,6 +68,8 @@ public: Chunk generate() override; + void onCancel() override; + private: String name; String bucket; @@ -145,8 +147,19 @@ public: static StorageS3Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); -private: + static ColumnsDescription getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); +private: friend class StorageS3Cluster; friend class TableFunctionS3Cluster; @@ -173,6 +186,17 @@ private: ASTPtr partition_by; static void updateClientAndAuthSettings(ContextPtr, ClientAuthentication &); + + static std::shared_ptr createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context); + + static ColumnsDescription getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); }; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 
fe05d168c31..471b460d349 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -13,8 +13,9 @@ #include #include -#include +#include #include +#include #include #include @@ -40,7 +41,7 @@ namespace ErrorCodes IStorageURLBase::IStorageURLBase( const String & uri_, - ContextPtr /*context_*/, + ContextPtr context_, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -61,12 +62,48 @@ IStorageURLBase::IStorageURLBase( , partition_by(partition_by_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription IStorageURLBase::getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context) +{ + auto read_buffer_creator = [&]() + { + auto parsed_uri = Poco::URI(uri); + return wrapReadBufferWithCompressionMethod( + std::make_unique( + parsed_uri, + Poco::Net::HTTPRequest::HTTP_GET, + nullptr, + ConnectionTimeouts::getHTTPTimeouts(context), + Poco::Net::HTTPBasicCredentials{}, + context->getSettingsRef().max_http_get_redirects, + DBMS_DEFAULT_BUFFER_SIZE, + context->getReadSettings(), + headers, + ReadWriteBufferFromHTTP::Range{}, + context->getRemoteHostFilter()), + chooseCompressionMethod(parsed_uri.getPath(), compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + namespace { ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders( @@ -95,11 +132,26 @@ namespace class StorageURLSource : public SourceWithProgress { + using URIParams = std::vector>; public: + struct URIInfo + { + using FailoverOptions = std::vector; + std::vector uri_list_to_read; + std::atomic next_uri_to_read = 0; + }; + using URIInfoPtr = std::shared_ptr; + + void onCancel() override + { + if (reader) + reader->cancel(); + } + StorageURLSource( - const std::vector & uri_options, + URIInfoPtr uri_info_, const std::string & http_method, std::function callback, const String & format, @@ -114,10 +166,12 @@ namespace const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers_ = {}, const URIParams & params = {}) : SourceWithProgress(sample_block), name(std::move(name_)) + , uri_info(uri_info_) { auto headers = getHeaders(headers_); + /// Lazy initialization. We should not perform requests in constructor, because we need to do it in query pipeline. - initialize = [=, this] + initialize = [=, this](const URIInfo::FailoverOptions & uri_options) { WriteBufferFromOwnString error_message; for (auto option = uri_options.begin(); option < uri_options.end(); ++option) @@ -135,10 +189,11 @@ namespace if (n != std::string::npos) { credentials.setUsername(user_info.substr(0, n)); - credentials.setPassword(user_info.substr(n+1)); + credentials.setPassword(user_info.substr(n + 1)); } } + /// Get first alive uri. 
read_buf = wrapReadBufferWithCompressionMethod( std::make_unique( request_uri, @@ -188,29 +243,34 @@ namespace Chunk generate() override { - if (initialize) + while (true) { - initialize(); - initialize = {}; + if (!reader) + { + auto current_uri_pos = uri_info->next_uri_to_read.fetch_add(1); + if (current_uri_pos >= uri_info->uri_list_to_read.size()) + return {}; + + auto current_uri = uri_info->uri_list_to_read[current_uri_pos]; + initialize(current_uri); + } + + Chunk chunk; + if (reader->pull(chunk)) + return chunk; + + pipeline->reset(); + reader.reset(); } - - if (!reader) - return {}; - - Chunk chunk; - if (reader->pull(chunk)) - return chunk; - - pipeline->reset(); - reader.reset(); - - return {}; } private: - std::function initialize; + using InitializeFunc = std::function; + InitializeFunc initialize; String name; + URIInfoPtr uri_info; + std::unique_ptr read_buf; std::unique_ptr pipeline; std::unique_ptr reader; @@ -332,7 +392,7 @@ Pipe IStorageURLBase::read( ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, - unsigned /*num_streams*/) + unsigned num_streams) { auto params = getReadURIParams(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size); bool with_globs = (uri.find('{') != std::string::npos && uri.find('}') != std::string::npos) @@ -341,18 +401,23 @@ Pipe IStorageURLBase::read( if (with_globs) { size_t max_addresses = local_context->getSettingsRef().glob_expansion_max_elements; - std::vector url_descriptions = parseRemoteDescription(uri, 0, uri.size(), ',', max_addresses); - std::vector uri_options; + auto uri_descriptions = parseRemoteDescription(uri, 0, uri.size(), ',', max_addresses); + + if (num_streams > uri_descriptions.size()) + num_streams = uri_descriptions.size(); + + /// For each uri (which acts like shard) check if it has failover options + auto uri_info = std::make_shared(); + for (const auto & description : uri_descriptions) + uri_info->uri_list_to_read.emplace_back(parseRemoteDescription(description, 0, description.size(), '|', max_addresses)); Pipes pipes; - for (const auto & url_description : url_descriptions) - { - /// For each uri (which acts like shard) check if it has failover options - uri_options = parseRemoteDescription(url_description, 0, url_description.size(), '|', max_addresses); - StoragePtr shard; + pipes.reserve(num_streams); + for (size_t i = 0; i < num_streams; ++i) + { pipes.emplace_back(std::make_shared( - uri_options, + uri_info, getReadMethod(), getReadPOSTDataCallback( column_names, metadata_snapshot, query_info, @@ -371,9 +436,10 @@ Pipe IStorageURLBase::read( } else { - std::vector uri_options{uri}; + auto uri_info = std::make_shared(); + uri_info->uri_list_to_read.emplace_back(std::vector{uri}); return Pipe(std::make_shared( - uri_options, + uri_info, getReadMethod(), getReadPOSTDataCallback( column_names, metadata_snapshot, query_info, @@ -402,8 +468,10 @@ Pipe StorageURLWithFailover::read( { auto params = getReadURIParams(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size); + auto uri_info = std::make_shared(); + uri_info->uri_list_to_read.emplace_back(uri_options); auto pipe = Pipe(std::make_shared( - uri_options, + uri_info, getReadMethod(), getReadPOSTDataCallback( column_names, metadata_snapshot, query_info, @@ -611,6 +679,7 @@ void registerStorageURL(StorageFactory & factory) }, { .supports_settings = true, + .supports_schema_inference = true, .source_access_type = AccessType::URL, }); } diff --git 
a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index cf72352a183..790f01135d3 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -41,6 +41,14 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context); + protected: IStorageURLBase( const String & uri_, diff --git a/src/Storages/System/CMakeLists.txt b/src/Storages/System/CMakeLists.txt index 96c05a59173..133761cbe22 100644 --- a/src/Storages/System/CMakeLists.txt +++ b/src/Storages/System/CMakeLists.txt @@ -9,6 +9,36 @@ get_property (BUILD_COMPILE_DEFINITIONS DIRECTORY ${ClickHouse_SOURCE_DIR} PROPE get_property(TZDATA_VERSION GLOBAL PROPERTY TZDATA_VERSION_PROP) + +find_package(Git) +if(Git_FOUND) + # The commit's git hash, and whether the building workspace was dirty or not + execute_process(COMMAND + "${GIT_EXECUTABLE}" rev-parse HEAD + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_HASH + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # Git branch name + execute_process(COMMAND + "${GIT_EXECUTABLE}" rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_BRANCH + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # The date of the commit + SET(ENV{TZ} "UTC") + execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%ad --date=iso-local + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_DATE + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # The subject of the commit + execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%s + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_COMMIT_SUBJECT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + configure_file (StorageSystemBuildOptions.generated.cpp.in ${CONFIG_BUILD}) include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") diff --git a/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index da563cc245b..8a19d7649aa 100644 --- a/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -50,6 +50,10 @@ const char * auto_config_build[] "USE_KRB5", "@USE_KRB5@", "USE_FILELOG", "@USE_FILELOG@", "USE_BZIP2", "@USE_BZIP2@", + "GIT_HASH", "@GIT_HASH@", + "GIT_BRANCH", "@GIT_BRANCH@", + "GIT_DATE", "@GIT_DATE@", + "GIT_COMMIT_SUBJECT", "@GIT_COMMIT_SUBJECT@", nullptr, nullptr }; diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index 1f5def6d6b4..1e303d1aeaa 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -45,7 +45,8 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, ContextPtr co // get an error when trying to get the info about DB from ZK. // Just ignore these inaccessible databases. A good example of a // failing test is `01526_client_start_and_exit`. - try { + try + { writeCluster(res_columns, {name_and_database.first, replicated->getCluster()}); } catch (...) 
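/* The GIT_* entries added to auto_config_build[] above follow the array's existing layout:
 * alternating name/value C strings closed by a nullptr pair. A tiny sketch of how such an
 * array is consumed (the values below are placeholders): */
#include <cstddef>
#include <cstdio>

const char * auto_config_build_example[] = {
    "GIT_HASH", "0123abcd",
    "GIT_BRANCH", "master",
    nullptr, nullptr
};

int main()
{
    for (size_t i = 0; auto_config_build_example[i] != nullptr; i += 2)
        std::printf("%s = %s\n", auto_config_build_example[i], auto_config_build_example[i + 1]);
    return 0;
}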
diff --git a/src/Storages/System/StorageSystemDictionaries.cpp b/src/Storages/System/StorageSystemDictionaries.cpp index d8f92d38081..c0d7d8cc4ed 100644 --- a/src/Storages/System/StorageSystemDictionaries.cpp +++ b/src/Storages/System/StorageSystemDictionaries.cpp @@ -142,7 +142,9 @@ void StorageSystemDictionaries::fillData(MutableColumns & res_columns, ContextPt res_columns[i++]->insertDefault(); if (dict_ptr) + { res_columns[i++]->insert(dict_ptr->getDictionaryComment()); + } else { if (load_result.config && load_result.config->config->has("dictionary.comment")) diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 8dbd73628ca..f1b3a13c332 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include @@ -58,7 +60,11 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_ {"column_bytes_on_disk", std::make_shared()}, {"column_data_compressed_bytes", std::make_shared()}, {"column_data_uncompressed_bytes", std::make_shared()}, - {"column_marks_bytes", std::make_shared()} + {"column_marks_bytes", std::make_shared()}, + {"serialization_kind", std::make_shared()}, + {"subcolumns.names", std::make_shared(std::make_shared())}, + {"subcolumns.types", std::make_shared(std::make_shared())}, + {"subcolumns.serializations", std::make_shared(std::make_shared())} } ) { @@ -216,6 +222,28 @@ void StorageSystemPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(column_size.marks); + auto serialization = part->getSerialization(column); + if (columns_mask[src_index++]) + columns[res_index++]->insert(ISerialization::kindToString(serialization->getKind())); + + Array subcolumn_names; + Array subcolumn_types; + Array subcolumn_sers; + + IDataType::forEachSubcolumn([&](const auto &, const auto & name, const auto & data) + { + subcolumn_names.push_back(name); + subcolumn_types.push_back(data.type->getName()); + subcolumn_sers.push_back(ISerialization::kindToString(data.serialization->getKind())); + }, { serialization, column.type, nullptr, nullptr }); + + if (columns_mask[src_index++]) + columns[res_index++]->insert(subcolumn_names); + if (columns_mask[src_index++]) + columns[res_index++]->insert(subcolumn_types); + if (columns_mask[src_index++]) + columns[res_index++]->insert(subcolumn_sers); + if (has_state_column) columns[res_index++]->insert(part->stateString()); } diff --git a/src/Storages/System/attachInformationSchemaTables.cpp b/src/Storages/System/attachInformationSchemaTables.cpp index 803e9d55dac..68a1eac305e 100644 --- a/src/Storages/System/attachInformationSchemaTables.cpp +++ b/src/Storages/System/attachInformationSchemaTables.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 915e775ff14..25ecc0e16ef 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include @@ -62,7 +62,7 @@ namespace ErrorCodes namespace { - /// Fetch all window info and replace TUMPLE or HOP node names with WINDOW_ID + /// Fetch all window info and replace tumble or hop node names with windowID struct FetchQueryInfoMatcher { using Visitor = InDepthNodeVisitor; @@ 
-85,20 +85,20 @@ namespace { if (auto * t = ast->as()) { - if (t->name == "TUMBLE" || t->name == "HOP") + if (t->name == "tumble" || t->name == "hop") { - data.is_tumble = t->name == "TUMBLE"; - data.is_hop = t->name == "HOP"; + data.is_tumble = t->name == "tumble"; + data.is_hop = t->name == "hop"; auto temp_node = t->clone(); temp_node->setAlias(""); if (startsWith(t->arguments->children[0]->getColumnName(), "toDateTime")) throw Exception( - "The first argument of window function should not be a constant value.", + "The first argument of time window function should not be a constant value.", ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW); if (!data.window_function) { data.serialized_window_function = serializeAST(*temp_node); - t->name = "WINDOW_ID"; + t->name = "windowID"; data.window_id_name = t->getColumnName(); data.window_id_alias = t->alias; data.window_function = t->clone(); @@ -108,15 +108,15 @@ namespace else { if (serializeAST(*temp_node) != data.serialized_window_function) - throw Exception("WINDOW VIEW only support ONE WINDOW FUNCTION", ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW); - t->name = "WINDOW_ID"; + throw Exception("WINDOW VIEW only support ONE TIME WINDOW FUNCTION", ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW); + t->name = "windowID"; } } } } }; - /// Replace WINDOW_ID node name with either TUMBLE or HOP. + /// Replace windowID node name with either tumble or hop struct ReplaceWindowIdMatcher { public: @@ -132,15 +132,15 @@ namespace { if (auto * t = ast->as()) { - if (t->name == "WINDOW_ID") + if (t->name == "windowID") t->name = data.window_name; } } }; - /// GROUP BY TUMBLE(now(), INTERVAL '5' SECOND) + /// GROUP BY tumble(now(), INTERVAL '5' SECOND) /// will become - /// GROUP BY TUMBLE(____timestamp, INTERVAL '5' SECOND) + /// GROUP BY tumble(____timestamp, INTERVAL '5' SECOND) struct ReplaceFunctionNowData { using TypeToVisit = ASTFunction; @@ -151,7 +151,7 @@ namespace void visit(ASTFunction & node, ASTPtr & node_ptr) { - if (node.name == "WINDOW_ID" || node.name == "TUMBLE" || node.name == "HOP") + if (node.name == "windowID" || node.name == "tumble" || node.name == "hop") { if (const auto * t = node.arguments->children[0]->as(); t && t->name == "now") @@ -188,8 +188,8 @@ namespace { if (auto * t = ast->as()) { - if (t->name == "HOP" || t->name == "TUMBLE") - t->name = "WINDOW_ID"; + if (t->name == "hop" || t->name == "tumble") + t->name = "windowID"; } } }; @@ -221,12 +221,12 @@ namespace { if (node.name == "tuple") { - /// tuple(WINDOW_ID(timestamp, toIntervalSecond('5'))) + /// tuple(windowID(timestamp, toIntervalSecond('5'))) return; } else { - /// WINDOW_ID(timestamp, toIntervalSecond('5')) -> identifier. + /// windowID(timestamp, toIntervalSecond('5')) -> identifier. /// and other... 
node_ptr = std::make_shared(node.getColumnName()); } @@ -351,14 +351,14 @@ static size_t getWindowIDColumnPosition(const Block & header) auto position = -1; for (const auto & column : header.getColumnsWithTypeAndName()) { - if (startsWith(column.name, "WINDOW_ID")) + if (startsWith(column.name, "windowID")) { position = header.getPositionByName(column.name); break; } } if (position < 0) - throw Exception("Not found column WINDOW_ID", ErrorCodes::LOGICAL_ERROR); + throw Exception("Not found column windowID", ErrorCodes::LOGICAL_ERROR); return position; } @@ -527,7 +527,7 @@ inline void StorageWindowView::fire(UInt32 watermark) for (auto & watch_stream : watch_streams) { if (auto watch_stream_ptr = watch_stream.lock()) - watch_stream_ptr->addBlock(block); + watch_stream_ptr->addBlock(block, watermark); } } if (!target_table_id.empty()) @@ -631,7 +631,7 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( time_now_visitor.visit(node); function_now_timezone = time_now_data.now_timezone; } - /// TUMBLE/HOP -> WINDOW_ID + /// tumble/hop -> windowID func_window_visitor.visit(node); to_identifier_visitor.visit(node); new_storage->set(field, node); @@ -910,7 +910,11 @@ Pipe StorageWindowView::watch( } auto reader = std::make_shared( - *this, has_limit, limit, + std::static_pointer_cast(shared_from_this()), + query.is_watch_events, + window_view_timezone, + has_limit, + limit, local_context->getSettingsRef().window_view_heartbeat_interval.totalSeconds()); std::lock_guard lock(fire_signal_mutex); @@ -960,7 +964,7 @@ StorageWindowView::StorageWindowView( select_table_id = StorageID(select_database_name, select_table_name); DatabaseCatalog::instance().addDependency(select_table_id, table_id_); - /// Extract all info from query; substitute Function_TUMPLE and Function_HOP with Function_WINDOW_ID. + /// Extract all info from query; substitute Function_tumble and Function_hop with Function_windowID. 
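For orientation while reading these renames: tumble() assigns each timestamp to a fixed-size, non-overlapping window, and windowID is the internal marker for that window in the mergeable query. A minimal Python sketch of the tumbling bucketing (epoch-aligned flooring; the helper name is illustrative and not part of this patch, and the real function also handles month/year intervals and time zones):

from datetime import datetime, timedelta, timezone

def tumble_bounds(ts: datetime, interval: timedelta):
    """Return the [start, end) bounds of the tumbling window containing ts (sketch only)."""
    step = int(interval.total_seconds())
    epoch = int(ts.timestamp())
    start = epoch - epoch % step  # floor to the nearest window boundary
    return (datetime.fromtimestamp(start, tz=timezone.utc),
            datetime.fromtimestamp(start + step, tz=timezone.utc))

# Matches the example given in StorageWindowView.h further below:
# tumble('2021-01-01 00:01:45', INTERVAL 10 SECOND) -> ('2021-01-01 00:01:40', '2021-01-01 00:01:50')
print(tumble_bounds(datetime(2021, 1, 1, 0, 1, 45, tzinfo=timezone.utc), timedelta(seconds=10)))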
auto inner_query = innerQueryParser(select_query->as()); // Parse mergeable query @@ -971,13 +975,13 @@ StorageWindowView::StorageWindowView( if (is_time_column_func_now) window_id_name = func_now_data.window_id_name; - // Parse final query (same as mergeable query but has TUMBLE/HOP instead of WINDOW_ID) + // Parse final query (same as mergeable query but has tumble/hop instead of windowID) final_query = mergeable_query->clone(); ReplaceWindowIdMatcher::Data final_query_data; if (is_tumble) - final_query_data.window_name = "TUMBLE"; + final_query_data.window_name = "tumble"; else - final_query_data.window_name = "HOP"; + final_query_data.window_name = "hop"; ReplaceWindowIdMatcher::Visitor(final_query_data).visit(final_query); is_watermark_strictly_ascending = query.is_watermark_strictly_ascending; @@ -989,9 +993,9 @@ StorageWindowView::StorageWindowView( eventTimeParser(query); if (is_tumble) - window_column_name = std::regex_replace(window_id_name, std::regex("WINDOW_ID"), "TUMBLE"); + window_column_name = std::regex_replace(window_id_name, std::regex("windowID"), "tumble"); else - window_column_name = std::regex_replace(window_id_name, std::regex("WINDOW_ID"), "HOP"); + window_column_name = std::regex_replace(window_id_name, std::regex("windowID"), "hop"); auto generate_inner_table_name = [](const StorageID & storage_id) { @@ -1042,14 +1046,14 @@ ASTPtr StorageWindowView::innerQueryParser(const ASTSelectQuery & query) if (!query_info_data.is_tumble && !query_info_data.is_hop) throw Exception(ErrorCodes::INCORRECT_QUERY, - "WINDOW FUNCTION is not specified for {}", getName()); + "TIME WINDOW FUNCTION is not specified for {}", getName()); window_id_name = query_info_data.window_id_name; window_id_alias = query_info_data.window_id_alias; timestamp_column_name = query_info_data.timestamp_column_name; is_tumble = query_info_data.is_tumble; - // Parse window function + // Parse time window function ASTFunction & window_function = typeid_cast(*query_info_data.window_function); const auto & arguments = window_function.arguments->children; extractWindowArgument( @@ -1077,7 +1081,8 @@ ASTPtr StorageWindowView::innerQueryParser(const ASTSelectQuery & query) ErrorCodes::ILLEGAL_COLUMN, "Illegal column #{} of time zone argument of function, must be constant string", time_zone_arg_num); - time_zone = &DateLUT::instance(time_zone_ast->value.safeGet()); + window_view_timezone = time_zone_ast->value.safeGet(); + time_zone = &DateLUT::instance(window_view_timezone); } else time_zone = &DateLUT::instance(); @@ -1354,9 +1359,12 @@ Block & StorageWindowView::getHeader() const sample_block = InterpreterSelectQuery( select_query->clone(), window_view_context, getParentStorage(), nullptr, SelectQueryOptions(QueryProcessingStage::Complete)).getSampleBlock(); - + /// convert all columns to full columns + /// in case some of them are constant for (size_t i = 0; i < sample_block.columns(); ++i) + { sample_block.safeGetByPosition(i).column = sample_block.safeGetByPosition(i).column->convertToFullColumnIfConst(); + } } return sample_block; } diff --git a/src/Storages/WindowView/StorageWindowView.h b/src/Storages/WindowView/StorageWindowView.h index 08f24816d72..ef552262378 100644 --- a/src/Storages/WindowView/StorageWindowView.h +++ b/src/Storages/WindowView/StorageWindowView.h @@ -22,11 +22,11 @@ using ASTPtr = std::shared_ptr; * [ENGINE [db.]name] * [WATERMARK strategy] [ALLOWED_LATENESS interval_function] * AS SELECT ... 
- * GROUP BY [TUBLE/HOP(...)] + * GROUP BY [tumble/hop(...)] * * - only stores data that has not been triggered yet; * - fire_task checks if there is a window ready to be fired - * (each window result is fired in one output at the end of TUMBLE/HOP window interval); + * (each window result is fired in one output at the end of tumble/hop window interval); * - intermediate data is stored in inner table with * AggregatingMergeTree engine by default, but any other -MergeTree * engine might be used as inner table engine; @@ -35,24 +35,24 @@ using ASTPtr = std::shared_ptr; * Here function in GROUP BY clause results in a "window_id" * represented as Tuple(DateTime, DateTime) - lower and upper bounds of the window. * Function might be one of the following: - * 1. TUMBLE(time_attr, interval [, timezone]) + * 1. tumble(time_attr, interval [, timezone]) * - non-overlapping, continuous windows with a fixed duration (interval); * - example: - * SELECT TUMBLE(toDateTime('2021-01-01 00:01:45'), INTERVAL 10 SECOND) + * SELECT tumble(toDateTime('2021-01-01 00:01:45'), INTERVAL 10 SECOND) * results in ('2021-01-01 00:01:40','2021-01-01 00:01:50') - * 2. HOP(time_attr, hop_interval, window_interval [, timezone]) + * 2. hop(time_attr, hop_interval, window_interval [, timezone]) * - sliding window; * - has a fixed duration (window_interval parameter) and hops by a * specified hop interval (hop_interval parameter); * If the hop_interval is smaller than the window_interval, hopping windows * are overlapping. Thus, records can be assigned to multiple windows. * - example: - * SELECT HOP(toDateTime('2021-01-01 00:00:45'), INTERVAL 3 SECOND, INTERVAL 10 SECOND) + * SELECT hop(toDateTime('2021-01-01 00:00:45'), INTERVAL 3 SECOND, INTERVAL 10 SECOND) * results in ('2021-01-01 00:00:38','2021-01-01 00:00:48') * * DateTime value can be used with the following functions to find out start/end of the window: - * - TUMPLE_START(time_attr, interval [, timezone]), TUMPLE_END(time_attr, interval [, timezone]) - * - HOP_START(time_attr, hop_interval, window_interval [, timezone]), HOP_END(time_attr, hop_interval, window_interval [, timezone]) + * - tumbleStart(time_attr, interval [, timezone]), tumbleEnd(time_attr, interval [, timezone]) + * - hopStart(time_attr, hop_interval, window_interval [, timezone]), hopEnd(time_attr, hop_interval, window_interval [, timezone]) * * * Time processing options. @@ -61,8 +61,8 @@ using ASTPtr = std::shared_ptr; * - produces results based on the time of the local machine; * - example: * CREATE WINDOW VIEW test.wv TO test.dst - * AS SELECT count(number), TUMBLE_START(w_id) as w_start FROM test.mt - * GROUP BY TUMBLE(now(), INTERVAL '5' SECOND) as w_id + * AS SELECT count(number), tumbleStart(w_id) as w_start FROM test.mt + * GROUP BY tumble(now(), INTERVAL '5' SECOND) as w_id * * 2. 
event time * - produces results based on the time that is contained in every record; @@ -79,7 +79,7 @@ using ASTPtr = std::shared_ptr; * CREATE WINDOW VIEW test.wv TO test.dst * WATERMARK=STRICTLY_ASCENDING * AS SELECT count(number) FROM test.mt - * GROUP BY TUMBLE(timestamp, INTERVAL '5' SECOND); + * GROUP BY tumble(timestamp, INTERVAL '5' SECOND); * (where `timestamp` is a DateTime column in test.mt) * * @@ -90,8 +90,8 @@ using ASTPtr = std::shared_ptr; * - Can be enabled by using ALLOWED_LATENESS=INTERVAL, like this: * CREATE WINDOW VIEW test.wv TO test.dst * WATERMARK=ASCENDING ALLOWED_LATENESS=INTERVAL '2' SECOND - * AS SELECT count(a) AS count, TUMBLE_END(wid) AS w_end FROM test.mt - * GROUP BY TUMBLE(timestamp, INTERVAL '5' SECOND) AS wid; + * AS SELECT count(a) AS count, tumbleEnd(wid) AS w_end FROM test.mt + * GROUP BY tumble(timestamp, INTERVAL '5' SECOND) AS wid; * * - Instead of firing at the end of windows, WINDOW VIEW will fire * immediately when encountering late events; @@ -150,11 +150,11 @@ public: private: Poco::Logger * log; - /// Stored query, e.g. SELECT * FROM * GROUP BY TUMBLE(now(), *) + /// Stored query, e.g. SELECT * FROM * GROUP BY tumble(now(), *) ASTPtr select_query; - /// Used to generate the mergeable state of select_query, e.g. SELECT * FROM * GROUP BY WINDOW_ID(____timestamp, *) + /// Used to generate the mergeable state of select_query, e.g. SELECT * FROM * GROUP BY windowID(____timestamp, *) ASTPtr mergeable_query; - /// Used to fetch the mergeable state and generate the final result. e.g. SELECT * FROM * GROUP BY TUMBLE(____timestamp, *) + /// Used to fetch the mergeable state and generate the final result. e.g. SELECT * FROM * GROUP BY tumble(____timestamp, *) ASTPtr final_query; ContextMutablePtr window_view_context; @@ -210,6 +210,7 @@ private: BackgroundSchedulePool::TaskHolder clean_cache_task; BackgroundSchedulePool::TaskHolder fire_task; + String window_view_timezone; String function_now_timezone; ASTPtr innerQueryParser(const ASTSelectQuery & query); diff --git a/src/Storages/WindowView/WindowViewSource.h b/src/Storages/WindowView/WindowViewSource.h index 6a1fe34b553..a726cdc8712 100644 --- a/src/Storages/WindowView/WindowViewSource.h +++ b/src/Storages/WindowView/WindowViewSource.h @@ -11,83 +11,109 @@ class WindowViewSource : public SourceWithProgress { public: WindowViewSource( - StorageWindowView & storage_, + std::shared_ptr storage_, + const bool is_events_, + String window_view_timezone_, const bool has_limit_, const UInt64 limit_, const UInt64 heartbeat_interval_sec_) - : SourceWithProgress(storage_.getHeader()) + : SourceWithProgress( + is_events_ ? 
Block( + {ColumnWithTypeAndName(ColumnUInt32::create(), std::make_shared(window_view_timezone_), "watermark")}) + : storage_->getHeader()) , storage(storage_) + , is_events(is_events_) + , window_view_timezone(window_view_timezone_) , has_limit(has_limit_) , limit(limit_) - , heartbeat_interval_sec(heartbeat_interval_sec_) {} + , heartbeat_interval_sec(heartbeat_interval_sec_) + { + if (is_events) + header.insert( + ColumnWithTypeAndName(ColumnUInt32::create(), std::make_shared(window_view_timezone_), "watermark")); + else + header = storage->getHeader(); + } String getName() const override { return "WindowViewSource"; } - void addBlock(Block block_) + void addBlock(Block block_, UInt32 watermark) { std::lock_guard lock(blocks_mutex); - blocks.push_back(std::move(block_)); + blocks_with_watermark.push_back({std::move(block_), watermark}); } protected: - Block getHeader() const { return storage.getHeader(); } + Block getHeader() const { return header; } Chunk generate() override { - auto block = generateImpl(); - return Chunk(block.getColumns(), block.rows()); + Block block; + UInt32 watermark; + std::tie(block, watermark) = generateImpl(); + if (is_events) + { + return Chunk( + {DataTypeDateTime(window_view_timezone).createColumnConst(block.rows(), watermark)->convertToFullColumnIfConst()}, + block.rows()); + } + else + { + return Chunk(block.getColumns(), block.rows()); + } } - Block generateImpl() + std::pair generateImpl() { - Block res; - if (has_limit && num_updates == static_cast(limit)) - return Block(); + return {Block(), 0}; - if (isCancelled() || storage.shutdown_called) - return Block(); + if (isCancelled() || storage->shutdown_called) + return {Block(), 0}; std::unique_lock lock(blocks_mutex); - if (blocks.empty()) + if (blocks_with_watermark.empty()) { if (!end_of_blocks) { end_of_blocks = true; num_updates += 1; - return getHeader(); + return {getHeader(), 0}; } - storage.fire_condition.wait_for(lock, std::chrono::seconds(heartbeat_interval_sec)); + storage->fire_condition.wait_for(lock, std::chrono::seconds(heartbeat_interval_sec)); - if (isCancelled() || storage.shutdown_called) + if (isCancelled() || storage->shutdown_called) { - return Block(); + return {Block(), 0}; } - if (blocks.empty()) - return getHeader(); + if (blocks_with_watermark.empty()) + return {getHeader(), 0}; else { end_of_blocks = false; - res = blocks.front(); - blocks.pop_front(); + auto res = blocks_with_watermark.front(); + blocks_with_watermark.pop_front(); return res; } } else { - res = blocks.front(); - blocks.pop_front(); + auto res = blocks_with_watermark.front(); + blocks_with_watermark.pop_front(); return res; } } private: - StorageWindowView & storage; + std::shared_ptr storage; - BlocksList blocks; + std::list> blocks_with_watermark; + Block header; + const bool is_events; + String window_view_timezone; const bool has_limit; const UInt64 limit; Int64 num_updates = -1; diff --git a/src/Storages/tests/gtest_transform_query_for_external_database.cpp b/src/Storages/tests/gtest_transform_query_for_external_database.cpp index f161400630b..57b9e73bbbd 100644 --- a/src/Storages/tests/gtest_transform_query_for_external_database.cpp +++ b/src/Storages/tests/gtest_transform_query_for_external_database.cpp @@ -120,7 +120,7 @@ TEST(TransformQueryForExternalDatabase, InWithSingleElement) check(state, 1, "SELECT column FROM test.table WHERE 1 IN (1)", - R"(SELECT "column" FROM "test"."table" WHERE 1)"); + R"(SELECT "column" FROM "test"."table" WHERE 1 = 1)"); check(state, 1, "SELECT column FROM test.table 
WHERE column IN (1, 2)", R"(SELECT "column" FROM "test"."table" WHERE "column" IN (1, 2))"); @@ -135,7 +135,7 @@ TEST(TransformQueryForExternalDatabase, InWithMultipleColumns) check(state, 1, "SELECT column FROM test.table WHERE (1,1) IN ((1,1))", - R"(SELECT "column" FROM "test"."table" WHERE 1)"); + R"(SELECT "column" FROM "test"."table" WHERE 1 = 1)"); check(state, 1, "SELECT field, value FROM test.table WHERE (field, value) IN (('foo', 'bar'))", R"(SELECT "field", "value" FROM "test"."table" WHERE ("field", "value") IN (('foo', 'bar')))"); diff --git a/src/Storages/transformQueryForExternalDatabase.cpp b/src/Storages/transformQueryForExternalDatabase.cpp index 4d6c1787a34..c42fb7fa965 100644 --- a/src/Storages/transformQueryForExternalDatabase.cpp +++ b/src/Storages/transformQueryForExternalDatabase.cpp @@ -306,6 +306,18 @@ String transformQueryForExternalDatabase( throw Exception("Query contains non-compatible expressions (and external_table_strict_query=true)", ErrorCodes::INCORRECT_QUERY); } + auto * literal_expr = typeid_cast(original_where.get()); + UInt64 value; + if (literal_expr && literal_expr->value.tryGet(value) && (value == 0 || value == 1)) + { + /// WHERE 1 -> WHERE 1=1, WHERE 0 -> WHERE 1=0. + if (value) + original_where = makeASTFunction("equals", std::make_shared(1), std::make_shared(1)); + else + original_where = makeASTFunction("equals", std::make_shared(1), std::make_shared(0)); + select->setExpression(ASTSelectQuery::Expression::WHERE, std::move(original_where)); + } + ASTPtr select_ptr = select; dropAliases(select_ptr); diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index fa7f6e52220..42b24abdbbe 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -15,25 +15,23 @@ namespace DB { StoragePtr ITableFunction::execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns) const + ColumnsDescription cached_columns, bool use_global_context) const { ProfileEvents::increment(ProfileEvents::TableFunctionExecute); context->checkAccess(AccessType::CREATE_TEMPORARY_TABLE | StorageFactory::instance().getSourceAccessType(getStorageTypeName())); + auto context_to_use = use_global_context ? context->getGlobalContext() : context; + if (cached_columns.empty()) return executeImpl(ast_function, context, table_name, std::move(cached_columns)); - /// We have table structure, so it's CREATE AS table_function(). - /// We should use global context here because there will be no query context on server startup - /// and because storage lifetime is bigger than query context lifetime. 
- auto global_context = context->getGlobalContext(); if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) - return executeImpl(ast_function, global_context, table_name, std::move(cached_columns)); + return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns)); auto this_table_function = shared_from_this(); auto get_storage = [=]() -> StoragePtr { - return this_table_function->executeImpl(ast_function, global_context, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns); }; /// It will request actual table structure and create underlying storage lazily diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index 56147ffd598..93cf5057e88 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -54,7 +54,7 @@ public: /// Create storage according to the query. StoragePtr - execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}) const; + execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false) const; virtual ~ITableFunction() = default; diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index 699ad698bd8..4395c318983 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -1,4 +1,3 @@ -#include #include #include @@ -6,16 +5,16 @@ #include #include -#include #include #include -#include #include #include +#include + namespace DB { @@ -23,10 +22,27 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int INCORRECT_FILE_NAME; extern const int BAD_ARGUMENTS; } +namespace +{ + void checkIfFormatSupportsAutoStructure(const String & name, const String & format) + { + if (name == "file" && format == "Distributed") + return; + + if (FormatFactory::instance().checkIfFormatHasAnySchemaReader(format)) + return; + + throw Exception( + "Table function '" + name + + "' allows automatic structure determination only for formats that support schema inference and for Distributed format in table function " + "'file'", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + } +} + void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) { /// Parse args @@ -46,21 +62,23 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context filename = args[0]->as().value.safeGet(); format = args[1]->as().value.safeGet(); - if (args.size() == 2 && getName() == "file") + if (args.size() == 2) { - if (format == "Distributed") - return; - throw Exception("Table function '" + getName() + "' allows 2 arguments only for Distributed format.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + checkIfFormatSupportsAutoStructure(getName(), format); + return; } if (args.size() != 3 && args.size() != 4) - throw Exception("Table function '" + getName() + "' requires 3 or 4 arguments: filename, format, structure and compression method (default auto).", + throw Exception("Table function '" + getName() + "' requires 2, 3 or 4 arguments: filename, format, structure (default auto) and compression method (default auto)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); structure = args[2]->as().value.safeGet(); + if (structure == "auto") + 
checkIfFormatSupportsAutoStructure(getName(), format); + if (structure.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table structure is empty for table function '{}'", + "Table structure is empty for table function '{}'. If you want to use automatic schema inference, use 'auto'", ast_function->formatForErrorMessage()); if (args.size() == 4) @@ -69,25 +87,12 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context StoragePtr ITableFunctionFileLike::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - auto columns = getActualTableStructure(context); + ColumnsDescription columns; + if (structure != "auto") + columns = parseColumnsListFromString(structure, context); StoragePtr storage = getStorage(filename, format, columns, context, table_name, compression_method); storage->startup(); return storage; } -ColumnsDescription ITableFunctionFileLike::getActualTableStructure(ContextPtr context) const -{ - if (structure.empty()) - { - assert(getName() == "file" && format == "Distributed"); - size_t total_bytes_to_read = 0; - Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); - return ColumnsDescription{source->getOutputs().front().getHeader().getNamesAndTypesList()}; - } - return parseColumnsListFromString(structure, context); -} - } diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 2069f02b0dd..2ceafdee229 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -8,7 +8,7 @@ class ColumnsDescription; class Context; /* - * function(source, format, structure) - creates a temporary storage from formatted source + * function(source, format, structure[, compression_method]) - creates a temporary storage from formatted source */ class ITableFunctionFileLike : public ITableFunction { @@ -18,7 +18,7 @@ protected: String filename; String format; - String structure; + String structure = "auto"; String compression_method = "auto"; private: @@ -28,8 +28,7 @@ private: const String & source, const String & format, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method) const = 0; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; - - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return structure != "auto"; } }; + } diff --git a/src/TableFunctions/TableFunctionExecutable.cpp b/src/TableFunctions/TableFunctionExecutable.cpp index 9edb75b0a69..41ba2db5c33 100644 --- a/src/TableFunctions/TableFunctionExecutable.cpp +++ b/src/TableFunctions/TableFunctionExecutable.cpp @@ -75,7 +75,12 @@ ColumnsDescription TableFunctionExecutable::getActualTableStructure(ContextPtr c StoragePtr TableFunctionExecutable::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { auto storage_id = StorageID(getDatabaseName(), table_name); - auto storage = StorageExecutable::create(storage_id, script_name, arguments, format, input_queries, 
getActualTableStructure(context), ConstraintsDescription{}); + auto global_context = context->getGlobalContext(); + ExecutableSettings settings; + settings.script_name = script_name; + settings.script_arguments = std::move(arguments); + + auto storage = StorageExecutable::create(storage_id, format, settings, input_queries, getActualTableStructure(context), ConstraintsDescription{}); storage->startup(); return storage; } diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index d8bdb3b45c4..71aba5494e8 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -1,4 +1,5 @@ #include +#include #include "registerTableFunctions.h" #include @@ -9,11 +10,13 @@ namespace DB { + StoragePtr TableFunctionFile::getStorage(const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const { + LOG_DEBUG(&Poco::Logger::get("TableFunctionFile"), "getStorage"); // For `file` table function, we are going to use format settings from the // query context. StorageFile::CommonArguments args{ @@ -30,8 +33,21 @@ StoragePtr TableFunctionFile::getStorage(const String & source, return StorageFile::create(source, global_context->getUserFilesPath(), args); } +ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + { + size_t total_bytes_to_read = 0; + Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); + return StorageFile::getTableStructureFromData(format, paths, compression_method, std::nullopt, context); + } + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionFile(TableFunctionFactory & factory) { factory.registerFunction(); } + } diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 460656a7218..f26e4a9c06d 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -6,7 +6,7 @@ namespace DB { -/* file(path, format, structure) - creates a temporary storage from file +/* file(path, format[, structure, compression]) - creates a temporary storage from file * * The file must be in the clickhouse data directory. * The relative path begins with the clickhouse data directory. 
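The ITableFunctionFileLike changes above boil down to one rule: the structure argument may be omitted (or given as 'auto') only when the chosen format can report its own schema, with file(..., 'Distributed') kept as a special case. A compact Python sketch of that validation; the set of formats here is a made-up stand-in for FormatFactory::checkIfFormatHasAnySchemaReader():

# Hypothetical stand-in for the formats that provide a schema reader.
FORMATS_WITH_SCHEMA_READER = {"Parquet", "ORC", "Arrow", "Native"}

def check_auto_structure(table_function: str, fmt: str) -> None:
    """Raise if automatic structure determination is impossible for this format (sketch)."""
    if table_function == "file" and fmt == "Distributed":
        return  # special case preserved by the patch
    if fmt in FORMATS_WITH_SCHEMA_READER:
        return
    raise ValueError(f"Table function '{table_function}' allows automatic structure "
                     f"determination only for formats that support schema inference")

check_auto_structure("file", "Parquet")      # ok: schema can be inferred from the data
check_auto_structure("file", "Distributed")  # ok: special case
# check_auto_structure("url", "RowBinary")   # would raise: no schema reader for this format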
@@ -20,9 +20,13 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const override; const char * getStorageTypeName() const override { return "File"; } -};} +}; + +} diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 245674b0e06..b626f563977 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -6,9 +6,11 @@ #include #include #include +#include namespace DB { + StoragePtr TableFunctionHDFS::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const @@ -24,12 +26,18 @@ StoragePtr TableFunctionHDFS::getStorage( compression_method_); } +ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); + + return parseColumnsListFromString(structure, context); +} -#if USE_HDFS void registerTableFunctionHDFS(TableFunctionFactory & factory) { factory.registerFunction(); } -#endif + } #endif diff --git a/src/TableFunctions/TableFunctionHDFS.h b/src/TableFunctions/TableFunctionHDFS.h index d9ee9b47868..74139818209 100644 --- a/src/TableFunctions/TableFunctionHDFS.h +++ b/src/TableFunctions/TableFunctionHDFS.h @@ -12,7 +12,7 @@ namespace DB class Context; -/* hdfs(name_node_ip:name_node_port, format, structure) - creates a temporary storage from hdfs file +/* hdfs(URI, format[, structure, compression]) - creates a temporary storage from hdfs files * */ class TableFunctionHDFS : public ITableFunctionFileLike @@ -24,6 +24,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, diff --git a/src/TableFunctions/TableFunctionHDFSCluster.cpp b/src/TableFunctions/TableFunctionHDFSCluster.cpp new file mode 100644 index 00000000000..ca1ac6a11cd --- /dev/null +++ b/src/TableFunctions/TableFunctionHDFSCluster.cpp @@ -0,0 +1,116 @@ +#include + +#if USE_HDFS + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "registerTableFunctions.h" + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + + +void TableFunctionHDFSCluster::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + /// Parse args + ASTs & args_func = ast_function->children; + + if (args_func.size() != 1) + throw Exception("Table function '" + getName() + "' must have arguments.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + ASTs & args = args_func.at(0)->children; + + const auto message = fmt::format( + "The signature of table function {} shall be the following:\n" \ + " - cluster, uri, format, structure", + " - cluster, uri, format, structure, compression_method", + getName()); + + if (args.size() < 4 || args.size() > 5) + throw Exception(message, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + for (auto 
& arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + + /// These arguments are always the first + cluster_name = args[0]->as().value.safeGet(); + uri = args[1]->as().value.safeGet(); + format = args[2]->as().value.safeGet(); + structure = args[3]->as().value.safeGet(); + if (args.size() >= 5) + compression_method = args[4]->as().value.safeGet(); +} + + +ColumnsDescription TableFunctionHDFSCluster::getActualTableStructure(ContextPtr context) const +{ + return parseColumnsListFromString(structure, context); +} + +StoragePtr TableFunctionHDFSCluster::executeImpl( + const ASTPtr & /*function*/, ContextPtr context, + const std::string & table_name, ColumnsDescription /*cached_columns*/) const +{ + StoragePtr storage; + if (context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) + { + /// On worker node this uri won't contain globs + storage = StorageHDFS::create( + uri, + StorageID(getDatabaseName(), table_name), + format, + getActualTableStructure(context), + ConstraintsDescription{}, + String{}, + context, + compression_method, + /*distributed_processing=*/true, + nullptr); + } + else + { + storage = StorageHDFSCluster::create( + cluster_name, uri, StorageID(getDatabaseName(), table_name), + format, getActualTableStructure(context), ConstraintsDescription{}, + compression_method); + } + + storage->startup(); + + return storage; +} + + +void registerTableFunctionHDFSCluster(TableFunctionFactory & factory) +{ + factory.registerFunction(); +} + + +} + +#endif diff --git a/src/TableFunctions/TableFunctionHDFSCluster.h b/src/TableFunctions/TableFunctionHDFSCluster.h new file mode 100644 index 00000000000..58d1c3d9b05 --- /dev/null +++ b/src/TableFunctions/TableFunctionHDFSCluster.h @@ -0,0 +1,54 @@ +#pragma once + +#include + +#if USE_HDFS + +#include + + +namespace DB +{ + +class Context; + +/** + * hdfsCluster(cluster, URI, format, structure, compression_method) + * A table function, which allows to process many files from HDFS on a specific cluster + * On initiator it creates a connection to _all_ nodes in cluster, discloses asterisks + * in HDFS file path and dispatches each file dynamically. + * On worker node it asks initiator about next task to process, processes it. + * This is repeated until the tasks are finished.
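The dispatch model described in this header comment is easiest to picture as a shared work queue: the initiator expands the glob once and hands out one file per request until nothing is left, so faster workers simply ask more often. A toy Python sketch of that flow (class and variable names are illustrative only; the real exchange happens over the cluster connections):

from collections import deque

class Initiator:
    """Expands the glob once and serves matched files to workers one at a time (toy model)."""
    def __init__(self, matched_files):
        self.queue = deque(matched_files)

    def next_task(self):
        return self.queue.popleft() if self.queue else None  # None means: no more work

initiator = Initiator([
    "hdfs://namenode:8020/data/part-0.csv",
    "hdfs://namenode:8020/data/part-1.csv",
    "hdfs://namenode:8020/data/part-2.csv",
])

workers = ["worker-1", "worker-2"]
done = False
while not done:
    done = True
    for name in workers:
        path = initiator.next_task()
        if path is not None:
            done = False
            print(f"{name} processes {path}")  # a real worker would read and stream the file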
+ */ +class TableFunctionHDFSCluster : public ITableFunction +{ +public: + static constexpr auto name = "hdfsCluster"; + std::string getName() const override + { + return name; + } + bool hasStaticStructure() const override { return true; } + +protected: + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns) const override; + + const char * getStorageTypeName() const override { return "HDFSCluster"; } + + ColumnsDescription getActualTableStructure(ContextPtr) const override; + void parseArguments(const ASTPtr &, ContextPtr) override; + + String cluster_name; + String uri; + String format; + String structure; + String compression_method = "auto"; +}; + +} + +#endif diff --git a/src/TableFunctions/TableFunctionMySQL.cpp b/src/TableFunctions/TableFunctionMySQL.cpp index a9cecb11a1c..e959fa754c9 100644 --- a/src/TableFunctions/TableFunctionMySQL.cpp +++ b/src/TableFunctions/TableFunctionMySQL.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,11 @@ void TableFunctionMySQL::parseArguments(const ASTPtr & ast_function, ContextPtr throw Exception("Table function 'mysql' must have arguments.", ErrorCodes::LOGICAL_ERROR); configuration = StorageMySQL::getConfiguration(args_func.arguments->children, context); - pool.emplace(configuration->database, configuration->addresses, configuration->username, configuration->password); + MySQLSettings mysql_settings; + const auto & settings = context->getSettingsRef(); + mysql_settings.connect_timeout = settings.external_storage_connect_timeout_sec; + mysql_settings.read_write_timeout = settings.external_storage_rw_timeout_sec; + pool.emplace(createMySQLPoolWithFailover(*configuration, mysql_settings)); } ColumnsDescription TableFunctionMySQL::getActualTableStructure(ContextPtr context) const diff --git a/src/TableFunctions/TableFunctionPostgreSQL.cpp b/src/TableFunctions/TableFunctionPostgreSQL.cpp index bcfe8d5444c..d948f40588f 100644 --- a/src/TableFunctions/TableFunctionPostgreSQL.cpp +++ b/src/TableFunctions/TableFunctionPostgreSQL.cpp @@ -50,6 +50,7 @@ ColumnsDescription TableFunctionPostgreSQL::getActualTableStructure(ContextPtr c if (!columns) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table structure not returned"); + return ColumnsDescription{*columns}; } diff --git a/src/TableFunctions/TableFunctionRemote.h b/src/TableFunctions/TableFunctionRemote.h index 845c36182dc..976397ddc45 100644 --- a/src/TableFunctions/TableFunctionRemote.h +++ b/src/TableFunctions/TableFunctionRemote.h @@ -27,6 +27,7 @@ public: bool needStructureConversion() const override { return false; } private: + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; const char * getStorageTypeName() const override { return "Distributed"; } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index e26c282c622..c4be01c6b5c 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "registerTableFunctions.h" @@ -28,6 +29,7 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con const auto message = fmt::format( "The signature of table function {} could be the following:\n" \ + " - url, format\n" \ " - url, format, structure\n" \ " - url, format, structure, compression_method\n" \ " - 
url, access_key_id, secret_access_key, format, structure\n" \ @@ -69,17 +71,32 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con /// Size -> argument indexes static auto size_to_args = std::map> { + {2, {{"format", 1}}}, {3, {{"format", 1}, {"structure", 2}}}, - {4, {{"format", 1}, {"structure", 2}, {"compression_method", 3}}}, {5, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}}, {6, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}} }; + std::map args_to_idx; + /// For 4 arguments we support 2 possible variants: + /// s3(source, format, structure, compression_method) and s3(source, access_key_id, access_key_id, format) + /// We can distinguish them by looking at the 4-th argument: check if it's a format name or not. + if (args.size() == 4) + { + auto last_arg = args[3]->as().value.safeGet(); + if (FormatFactory::instance().getAllFormats().contains(last_arg)) + args_to_idx = {{"access_key_id", 1}, {"access_key_id", 2}, {"format", 3}}; + else + args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; + } + else + { + args_to_idx = size_to_args[args.size()]; + } + /// This argument is always the first configuration.url = args[0]->as().value.safeGet(); - auto & args_to_idx = size_to_args[args.size()]; - if (args_to_idx.contains("format")) configuration.format = args[args_to_idx["format"]]->as().value.safeGet(); @@ -101,6 +118,21 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) const { + if (s3_configuration->structure == "auto") + { + return StorageS3::getTableStructureFromData( + s3_configuration->format, + S3::URI(Poco::URI(s3_configuration->url)), + s3_configuration->access_key_id, + s3_configuration->secret_access_key, + context->getSettingsRef().s3_max_connections, + context->getSettingsRef().s3_max_single_read_retries, + s3_configuration->compression_method, + false, + std::nullopt, + context); + } + return parseColumnsListFromString(s3_configuration->structure, context); } @@ -113,6 +145,10 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context UInt64 max_single_part_upload_size = context->getSettingsRef().s3_max_single_part_upload_size; UInt64 max_connections = context->getSettingsRef().s3_max_connections; + ColumnsDescription columns; + if (s3_configuration->structure != "auto") + columns = parseColumnsListFromString(s3_configuration->structure, context); + StoragePtr storage = StorageS3::create( s3_uri, s3_configuration->access_key_id, diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index 8d4c1391236..374e653072e 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -13,7 +13,7 @@ namespace DB class Context; -/* s3(source, [access_key_id, secret_access_key,] format, structure) - creates a temporary storage for a file in S3 +/* s3(source, [access_key_id, secret_access_key,] format, structure[, compression]) - creates a temporary storage for a file in S3 */ class TableFunctionS3 : public ITableFunction { @@ -23,7 +23,7 @@ public: { return name; } - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return s3_configuration->structure != "auto"; } protected: StoragePtr executeImpl( diff --git a/src/TableFunctions/TableFunctionURL.cpp 
b/src/TableFunctions/TableFunctionURL.cpp index c3ea30f800f..7c4d7b4a444 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -2,11 +2,11 @@ #include "registerTableFunctions.h" #include -#include #include #include #include #include +#include #include @@ -59,20 +59,10 @@ void TableFunctionURL::parseArguments(const ASTPtr & ast_function, ContextPtr co } } - StoragePtr TableFunctionURL::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const { - ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; - for (const auto & [header, value] : configuration.headers) - { - auto value_literal = value.safeGet(); - if (header == "Range") - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not allowed"); - headers.emplace_back(std::make_pair(header, value_literal)); - } - return StorageURL::create( source, StorageID(getDatabaseName(), table_name), @@ -83,10 +73,31 @@ StoragePtr TableFunctionURL::getStorage( String{}, global_context, compression_method_, - headers, + getHeaders(), configuration.http_method); } +ReadWriteBufferFromHTTP::HTTPHeaderEntries TableFunctionURL::getHeaders() const +{ + ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; + for (const auto & [header, value] : configuration.headers) + { + auto value_literal = value.safeGet(); + if (header == "Range") + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not allowed"); + headers.emplace_back(std::make_pair(header, value_literal)); + } + return headers; +} + +ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageURL::getTableStructureFromData(format, filename, compression_method, getHeaders(), std::nullopt, context); + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionURL(TableFunctionFactory & factory) { factory.registerFunction(); diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index 9425112acb2..798a37dc478 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB @@ -9,7 +10,7 @@ namespace DB class Context; -/* url(source, format, structure) - creates a temporary storage from url +/* url(source, format[, structure, compression]) - creates a temporary storage from url */ class TableFunctionURL : public ITableFunctionFileLike { @@ -20,6 +21,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + protected: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; @@ -29,6 +32,8 @@ private: const std::string & table_name, const String & compression_method_) const override; const char * getStorageTypeName() const override { return "URL"; } + ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders() const; + URLBasedDataSourceConfiguration configuration; }; diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index b4aab3e5c55..ea5c2c75f94 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -28,6 +28,7 @@ void registerTableFunctions() #if USE_HDFS registerTableFunctionHDFS(factory); + registerTableFunctionHDFSCluster(factory); #endif registerTableFunctionODBC(factory); diff --git 
a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index 8dbb5ebb5fa..8ddd9b7c8ab 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -26,6 +26,7 @@ void registerTableFunctionCOS(TableFunctionFactory & factory); #if USE_HDFS void registerTableFunctionHDFS(TableFunctionFactory & factory); +void registerTableFunctionHDFSCluster(TableFunctionFactory & factory); #endif void registerTableFunctionODBC(TableFunctionFactory & factory); diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 656e9fdbe50..042e0e90459 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -7,9 +7,11 @@ import sys from github import Github +from env_helper import GITHUB_REPOSITORY, TEMP_PATH, REPO_COPY, REPORTS_PATH, GITHUB_SERVER_URL, \ + GITHUB_RUN_ID from s3_helper import S3Helper from get_robot_token import get_best_robot_token -from pr_info import PRInfo, get_event +from pr_info import PRInfo from build_download_helper import get_build_name_for_check, get_build_urls from docker_pull_helper import get_image_with_version from commit_status_helper import post_commit_status @@ -21,12 +23,12 @@ IMAGE_NAME = 'clickhouse/fuzzer' def get_run_command(pr_number, sha, download_url, workspace_path, image): return f'docker run --network=host --volume={workspace_path}:/workspace ' \ - '--cap-add syslog --cap-add sys_admin ' \ + '--cap-add syslog --cap-add sys_admin --cap-add=SYS_PTRACE ' \ f'-e PR_TO_TEST={pr_number} -e SHA_TO_TEST={sha} -e BINARY_URL_TO_DOWNLOAD="{download_url}" '\ f'{image}' def get_commit(gh, commit_sha): - repo = gh.get_repo(os.getenv("GITHUB_REPOSITORY", "ClickHouse/ClickHouse")) + repo = gh.get_repo(GITHUB_REPOSITORY) commit = repo.get_commit(commit_sha) return commit @@ -35,16 +37,16 @@ if __name__ == "__main__": stopwatch = Stopwatch() - temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) - repo_path = os.getenv("REPO_COPY", os.path.abspath("../../")) - reports_path = os.getenv("REPORTS_PATH", "./reports") + temp_path = TEMP_PATH + repo_path = REPO_COPY + reports_path = REPORTS_PATH check_name = sys.argv[1] if not os.path.exists(temp_path): os.makedirs(temp_path) - pr_info = PRInfo(get_event()) + pr_info = PRInfo() gh = Github(get_best_robot_token()) @@ -106,7 +108,7 @@ if __name__ == "__main__": logging.info("Exception uploading file %s text %s", f, ex) paths[f] = '' - report_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + report_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" if paths['runlog.log']: report_url = paths['runlog.log'] if paths['main.log']: diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 36db7d596c9..f37ea49e387 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -7,8 +7,10 @@ import os import sys import time from github import Github + +from env_helper import REPO_COPY, TEMP_PATH, CACHES_PATH, IMAGES_PATH from s3_helper import S3Helper -from pr_info import PRInfo, get_event +from pr_info import PRInfo from get_robot_token import get_best_robot_token from version_helper import get_version_from_repo, update_version_local from ccache_utils import get_ccache_if_not_exists, upload_ccache @@ -76,15 +78,23 @@ def get_image_name(build_config): return 'clickhouse/deb-builder' -def build_clickhouse(packager_cmd, logs_path): +def build_clickhouse(packager_cmd, logs_path, build_output_path): build_log_path = 
os.path.join(logs_path, 'build_log.log') with TeePopen(packager_cmd, build_log_path) as process: retcode = process.wait() + if os.path.exists(build_output_path): + build_results = os.listdir(build_output_path) + else: + build_results = [] + if retcode == 0: - logging.info("Built successfully") + if len(build_results) != 0: + logging.info("Built successfully") + else: + logging.info("Success exit code, but no build artifacts => build failed") else: logging.info("Build failed") - return build_log_path, retcode == 0 + return build_log_path, retcode == 0 and len(build_results) > 0 def get_build_results_if_exists(s3_helper, s3_prefix): @@ -106,15 +116,19 @@ def create_json_artifact(temp_path, build_name, log_url, build_urls, build_confi "status": success, } - with open(os.path.join(temp_path, "build_urls_" + build_name + '.json'), 'w') as build_links: + json_name = "build_urls_" + build_name + '.json' + + print ("Dump json report", result, "to", json_name, "with env", "build_urls_{build_name}") + + with open(os.path.join(temp_path, json_name), 'w') as build_links: json.dump(result, build_links) if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - repo_path = os.getenv("REPO_COPY", os.path.abspath("../../")) - temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) - caches_path = os.getenv("CACHES_PATH", temp_path) + repo_path = REPO_COPY + temp_path = TEMP_PATH + caches_path = CACHES_PATH build_check_name = sys.argv[1] build_name = sys.argv[2] @@ -124,7 +138,7 @@ if __name__ == "__main__": if not os.path.exists(temp_path): os.makedirs(temp_path) - pr_info = PRInfo(get_event()) + pr_info = PRInfo() logging.info("Repo copy path %s", repo_path) @@ -136,8 +150,10 @@ if __name__ == "__main__": if 'release' in pr_info.labels or 'release-lts' in pr_info.labels: # for release pull requests we use branch names prefixes, not pr numbers release_or_pr = pr_info.head_ref - elif pr_info.number == 0: - # for pushes to master - major version + elif pr_info.number == 0 and build_config['package_type'] != "performance": + # for pushes to master - major version, but not for performance builds + # they havily relies on a fixed path for build package and nobody going + # to deploy them somewhere, so it's ok. 
release_or_pr = ".".join(version.as_tuple()[:2]) else: # PR number for anything else @@ -157,11 +173,11 @@ if __name__ == "__main__": log_url = 'https://s3.amazonaws.com/clickhouse-builds/' + url.replace('+', '%2B').replace(' ', '%20') else: build_urls.append('https://s3.amazonaws.com/clickhouse-builds/' + url.replace('+', '%2B').replace(' ', '%20')) - create_json_artifact(temp_path, build_name, log_url, build_urls, build_config, 0, True) + create_json_artifact(temp_path, build_name, log_url, build_urls, build_config, 0, len(build_urls) > 0) sys.exit(0) image_name = get_image_name(build_config) - docker_image = get_image_with_version(os.getenv("IMAGES_PATH"), image_name) + docker_image = get_image_with_version(IMAGES_PATH, image_name) image_version = docker_image.version logging.info("Got version from repo %s", version.get_version_string()) @@ -189,6 +205,10 @@ if __name__ == "__main__": logging.info("cache was not fetched, will create empty dir") os.makedirs(ccache_path) + if build_config['package_type'] == "performance" and pr_info.number != 0: + # because perf tests store some information about git commits + subprocess.check_call(f"cd {repo_path} && git fetch origin master:master", shell=True) + packager_cmd = get_packager_cmd(build_config, os.path.join(repo_path, "docker/packager"), build_output_path, version.get_version_string(), image_version, ccache_path, pr_info) logging.info("Going to run packager with %s", packager_cmd) @@ -197,7 +217,7 @@ if __name__ == "__main__": os.makedirs(build_clickhouse_log) start = time.time() - log_path, success = build_clickhouse(packager_cmd, build_clickhouse_log) + log_path, success = build_clickhouse(packager_cmd, build_clickhouse_log, build_output_path) elapsed = int(time.time() - start) subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {build_output_path}", shell=True) subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {ccache_path}", shell=True) diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index 5ce54423e19..1df96731270 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -92,3 +92,6 @@ def download_unit_tests(check_name, reports_path, result_path): def download_clickhouse_binary(check_name, reports_path, result_path): download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('clickhouse')) + +def download_performance_build(check_name, reports_path, result_path): + download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('performance.tgz')) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 3d97a973017..a85558ebe33 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -5,11 +5,13 @@ import logging import os import sys from github import Github + +from env_helper import REPORTS_PATH, TEMP_PATH, GITHUB_REPOSITORY, GITHUB_SERVER_URL, GITHUB_RUN_ID from report import create_build_html_report from s3_helper import S3Helper from get_robot_token import get_best_robot_token -from pr_info import PRInfo, get_event -from commit_status_helper import get_commit +from pr_info import PRInfo +from commit_status_helper import get_commit from ci_config import CI_CONFIG from rerun_helper import RerunHelper @@ -25,7 +27,7 @@ class BuildResult(): self.with_coverage = with_coverage def group_by_artifacts(build_urls): - groups = {'deb': [], 'binary': [], 'tgz': [], 'rpm': [], 'preformance': []} + groups = {'deb': [], 'binary': [], 'tgz': [], 'rpm': [], 'performance': []} for url in 
build_urls: if url.endswith('performance.tgz'): groups['performance'].append(url) @@ -75,8 +77,8 @@ def get_build_name_from_file_name(file_name): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - reports_path = os.getenv("REPORTS_PATH", "./reports") - temp_path = os.path.join(os.getenv("TEMP_PATH", ".")) + reports_path = REPORTS_PATH + temp_path = TEMP_PATH logging.info("Reports path %s", reports_path) if not os.path.exists(temp_path): @@ -85,7 +87,7 @@ if __name__ == "__main__": build_check_name = sys.argv[1] gh = Github(get_best_robot_token()) - pr_info = PRInfo(get_event()) + pr_info = PRInfo() rerun_helper = RerunHelper(gh, pr_info, build_check_name) if rerun_helper.is_already_finished_by_status(): logging.info("Check is already finished according to github status, exiting") @@ -127,15 +129,15 @@ if __name__ == "__main__": s3_helper = S3Helper('https://s3.amazonaws.com') - pr_info = PRInfo(get_event()) + pr_info = PRInfo() - branch_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/commits/master" + branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commits/master" branch_name = "master" if pr_info.number != 0: branch_name = "PR #{}".format(pr_info.number) - branch_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/pull/{pr_info.number}" - commit_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/commit/{pr_info.sha}" - task_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/actions/runs/{os.getenv('GITHUB_RUN_ID', '0')}" + branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/pull/{pr_info.number}" + commit_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commit/{pr_info.sha}" + task_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID or '0'}" report = create_build_html_report( build_check_name, build_results, diff --git a/tests/ci/approve_lambda/Dockerfile b/tests/ci/cancel_and_rerun_workflow_lambda/Dockerfile similarity index 100% rename from tests/ci/approve_lambda/Dockerfile rename to tests/ci/cancel_and_rerun_workflow_lambda/Dockerfile diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/app.py b/tests/ci/cancel_and_rerun_workflow_lambda/app.py new file mode 100644 index 00000000000..b79eb292dc6 --- /dev/null +++ b/tests/ci/cancel_and_rerun_workflow_lambda/app.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 + +from collections import namedtuple +import json +import time + +import jwt +import requests +import boto3 + +NEED_RERUN_OR_CANCELL_WORKFLOWS = { + 13241696, # PR + 15834118, # Docs + 15516108, # ReleaseCI + 15797242, # BackportPR +} + +# https://docs.github.com/en/rest/reference/actions#cancel-a-workflow-run +# +API_URL = 'https://api.github.com/repos/ClickHouse/ClickHouse' + +MAX_RETRY = 5 + +def get_installation_id(jwt_token): + headers = { + "Authorization": f"Bearer {jwt_token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.get("https://api.github.com/app/installations", headers=headers) + response.raise_for_status() + data = response.json() + return data[0]['id'] + +def get_access_token(jwt_token, installation_id): + headers = { + "Authorization": f"Bearer {jwt_token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.post(f"https://api.github.com/app/installations/{installation_id}/access_tokens", headers=headers) + response.raise_for_status() + data = response.json() + return data['token'] + +def get_key_and_app_from_aws(): + secret_name = "clickhouse_github_secret_key" + session = 
boto3.session.Session() + client = session.client( + service_name='secretsmanager', + ) + get_secret_value_response = client.get_secret_value( + SecretId=secret_name + ) + data = json.loads(get_secret_value_response['SecretString']) + return data['clickhouse-app-key'], int(data['clickhouse-app-id']) + +def get_token_from_aws(): + private_key, app_id = get_key_and_app_from_aws() + payload = { + "iat": int(time.time()) - 60, + "exp": int(time.time()) + (10 * 60), + "iss": app_id, + } + + encoded_jwt = jwt.encode(payload, private_key, algorithm="RS256") + installation_id = get_installation_id(encoded_jwt) + return get_access_token(encoded_jwt, installation_id) + +def _exec_get_with_retry(url): + for i in range(MAX_RETRY): + try: + response = requests.get(url) + response.raise_for_status() + return response.json() + except Exception as ex: + print("Got exception executing request", ex) + time.sleep(i + 1) + + raise Exception("Cannot execute GET request with retries") + + +WorkflowDescription = namedtuple('WorkflowDescription', + ['run_id', 'status', 'rerun_url', 'cancel_url']) + + +def get_workflows_description_for_pull_request(pull_request_event): + head_branch = pull_request_event['head']['ref'] + print("PR", pull_request_event['number'], "has head ref", head_branch) + workflows_data = [] + workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page=1") + workflows_data += workflows['workflow_runs'] + i = 2 + while len(workflows['workflow_runs']) > 0: + workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page={i}") + workflows_data += workflows['workflow_runs'] + i += 1 + if i > 30: + print("Too many workflows found") + break + + workflow_descriptions = [] + for workflow in workflows_data: + # unfortunately we cannot filter workflows from forks in request to API so doing it manually + if (workflow['head_repository']['full_name'] == pull_request_event['head']['repo']['full_name'] + and workflow['workflow_id'] in NEED_RERUN_OR_CANCELL_WORKFLOWS): + workflow_descriptions.append(WorkflowDescription( + run_id=workflow['id'], + status=workflow['status'], + rerun_url=workflow['rerun_url'], + cancel_url=workflow['cancel_url'])) + + return workflow_descriptions + +def get_workflow_description(workflow_id): + workflow = _exec_get_with_retry(API_URL + f"/actions/runs/{workflow_id}") + return WorkflowDescription( + run_id=workflow['id'], + status=workflow['status'], + rerun_url=workflow['rerun_url'], + cancel_url=workflow['cancel_url']) + +def _exec_post_with_retry(url, token): + headers = { + "Authorization": f"token {token}" + } + for i in range(MAX_RETRY): + try: + response = requests.post(url, headers=headers) + response.raise_for_status() + return response.json() + except Exception as ex: + print("Got exception executing request", ex) + time.sleep(i + 1) + + raise Exception("Cannot execute POST request with retry") + +def exec_workflow_url(urls_to_cancel, token): + for url in urls_to_cancel: + print("Post for workflow workflow using url", url) + _exec_post_with_retry(url, token) + print("Workflow post finished") + +def main(event): + token = get_token_from_aws() + event_data = json.loads(event['body']) + + print("Got event for PR", event_data['number']) + action = event_data['action'] + print("Got action", event_data['action']) + pull_request = event_data['pull_request'] + labels = { l['name'] for l in pull_request['labels'] } + print("PR has labels", labels) + if action == 'closed' or 'do not test' in 
labels: + print("PR merged/closed or manually labeled 'do not test' will kill workflows") + workflow_descriptions = get_workflows_description_for_pull_request(pull_request) + urls_to_cancel = [] + for workflow_description in workflow_descriptions: + if workflow_description.status != 'completed': + urls_to_cancel.append(workflow_description.cancel_url) + print(f"Found {len(urls_to_cancel)} workflows to cancel") + exec_workflow_url(urls_to_cancel, token) + elif action == 'labeled' and 'can be tested' in labels: + print("PR marked with can be tested label, rerun workflow") + workflow_descriptions = get_workflows_description_for_pull_request(pull_request) + if not workflow_descriptions: + print("Not found any workflows") + return + + sorted_workflows = list(sorted(workflow_descriptions, key=lambda x: x.run_id)) + most_recent_workflow = sorted_workflows[-1] + print("Latest workflow", most_recent_workflow) + if most_recent_workflow.status != 'completed': + print("Latest workflow is not completed, cancelling") + exec_workflow_url([most_recent_workflow.cancel_url], token) + print("Cancelled") + + for _ in range(30): + latest_workflow_desc = get_workflow_description(most_recent_workflow.run_id) + print("Checking latest workflow", latest_workflow_desc) + if latest_workflow_desc.status in ('completed', 'cancelled'): + print("Finally latest workflow done, going to rerun") + exec_workflow_url([most_recent_workflow.rerun_url], token) + print("Rerun finished, exiting") + break + print("Still have strange status") + time.sleep(3) + + else: + print("Nothing to do") + +def handler(event, _): + main(event) diff --git a/tests/ci/approve_lambda/requirements.txt b/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt similarity index 100% rename from tests/ci/approve_lambda/requirements.txt rename to tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt diff --git a/tests/ci/cancel_workflow_lambda/app.py b/tests/ci/cancel_workflow_lambda/app.py deleted file mode 100644 index e475fcb931a..00000000000 --- a/tests/ci/cancel_workflow_lambda/app.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 - -import json -import time -import jwt - -import requests -import boto3 - -# https://docs.github.com/en/rest/reference/actions#cancel-a-workflow-run -# -API_URL = 'https://api.github.com/repos/ClickHouse/ClickHouse' - -MAX_RETRY = 5 - -def get_installation_id(jwt_token): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get("https://api.github.com/app/installations", headers=headers) - response.raise_for_status() - data = response.json() - return data[0]['id'] - -def get_access_token(jwt_token, installation_id): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.post(f"https://api.github.com/app/installations/{installation_id}/access_tokens", headers=headers) - response.raise_for_status() - data = response.json() - return data['token'] - -def get_key_and_app_from_aws(): - secret_name = "clickhouse_github_secret_key" - session = boto3.session.Session() - client = session.client( - service_name='secretsmanager', - ) - get_secret_value_response = client.get_secret_value( - SecretId=secret_name - ) - data = json.loads(get_secret_value_response['SecretString']) - return data['clickhouse-app-key'], int(data['clickhouse-app-id']) - -def get_token_from_aws(): - private_key, app_id = get_key_and_app_from_aws() - payload = { - "iat": int(time.time()) - 60, - 
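The handler in the new cancel_and_rerun_workflow_lambda expects the GitHub webhook payload as a JSON string under event['body'] (as delivered by whatever HTTP front end invokes the Lambda). Below is a minimal sketch of driving it locally; the payload values and the idea of a local dry run are illustrative, only the field names come from the code above, and a real invocation would also need AWS Secrets Manager and GitHub App credentials.

# Minimal local-invocation sketch for the lambda above. The payload shape
# (number, action, pull_request.labels, head.ref, head.repo.full_name) mirrors
# the fields the handler reads; all values are made up for illustration.
import json

fake_webhook = {
    "number": 12345,                      # hypothetical PR number
    "action": "labeled",
    "pull_request": {
        "number": 12345,
        "labels": [{"name": "can be tested"}],
        "head": {
            "ref": "feature-branch",
            "repo": {"full_name": "contributor/ClickHouse"},
        },
    },
}

# The Lambda receives the webhook as a JSON string under "body".
event = {"body": json.dumps(fake_webhook)}

if __name__ == "__main__":
    # handler(event, context) is the entry point defined in app.py above.
    # Uncomment to run against real GitHub/AWS credentials:
    # handler(event, None)
    print(json.loads(event["body"])["action"])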
"exp": int(time.time()) + (10 * 60), - "iss": app_id, - } - - encoded_jwt = jwt.encode(payload, private_key, algorithm="RS256") - installation_id = get_installation_id(encoded_jwt) - return get_access_token(encoded_jwt, installation_id) - -def _exec_get_with_retry(url): - for i in range(MAX_RETRY): - try: - response = requests.get(url) - response.raise_for_status() - return response.json() - except Exception as ex: - print("Got exception executing request", ex) - time.sleep(i + 1) - - raise Exception("Cannot execute GET request with retries") - - -def get_workflows_cancel_urls_for_pull_request(pull_request_event): - head_branch = pull_request_event['head']['ref'] - print("PR", pull_request_event['number'], "has head ref", head_branch) - workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}") - workflows_urls_to_cancel = set([]) - for workflow in workflows['workflow_runs']: - if workflow['status'] != 'completed': - print("Workflow", workflow['url'], "not finished, going to be cancelled") - workflows_urls_to_cancel.add(workflow['cancel_url']) - else: - print("Workflow", workflow['url'], "already finished, will not try to cancel") - - return workflows_urls_to_cancel - -def _exec_post_with_retry(url, token): - headers = { - "Authorization": f"token {token}" - } - for i in range(MAX_RETRY): - try: - response = requests.post(url, headers=headers) - response.raise_for_status() - return response.json() - except Exception as ex: - print("Got exception executing request", ex) - time.sleep(i + 1) - - raise Exception("Cannot execute POST request with retry") - -def cancel_workflows(urls_to_cancel, token): - for url in urls_to_cancel: - print("Cancelling workflow using url", url) - _exec_post_with_retry(url, token) - print("Workflow cancelled") - -def main(event): - token = get_token_from_aws() - event_data = json.loads(event['body']) - - print("Got event for PR", event_data['number']) - action = event_data['action'] - print("Got action", event_data['action']) - pull_request = event_data['pull_request'] - labels = { l['name'] for l in pull_request['labels'] } - print("PR has labels", labels) - if action == 'closed' or 'do not test' in labels: - print("PR merged/closed or manually labeled 'do not test' will kill workflows") - workflows_to_cancel = get_workflows_cancel_urls_for_pull_request(pull_request) - print(f"Found {len(workflows_to_cancel)} workflows to cancel") - cancel_workflows(workflows_to_cancel, token) - else: - print("Nothing to do") - -def handler(event, _): - main(event) diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index 112b58ef1cf..91a018f158f 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -5,6 +5,7 @@ import logging import os import subprocess +from env_helper import GITHUB_WORKSPACE, TEMP_PATH from get_robot_token import get_parameter_from_ssm from ssh import SSHKey from cherry_pick_utils.backport import Backport @@ -13,8 +14,8 @@ from cherry_pick_utils.cherrypick import CherryPick if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - repo_path = os.path.join(os.getenv("GITHUB_WORKSPACE", os.path.abspath("../../"))) - temp_path = os.path.join(os.getenv("TEMP_PATH")) + repo_path = GITHUB_WORKSPACE + temp_path = TEMP_PATH if not os.path.exists(temp_path): os.makedirs(temp_path) diff --git a/tests/ci/cherry_pick_utils/backport.py b/tests/ci/cherry_pick_utils/backport.py index a28a1510694..9227dbf4108 100644 --- a/tests/ci/cherry_pick_utils/backport.py +++ b/tests/ci/cherry_pick_utils/backport.py @@ -74,7 
+74,7 @@ class Backport: # First pass. Find all must-backports for label in pr['labels']['nodes']: - if label['name'] == 'pr-bugfix' or label['name'] == 'pr-must-backport': + if label['name'] == 'pr-must-backport': backport_map[pr['number']] = branch_set.copy() continue matched = RE_MUST_BACKPORT.match(label['name']) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index ebc311986b7..d5f8757ffdf 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -11,7 +11,7 @@ CI_CONFIG = { "splitted": "unsplitted", "alien_pkgs": True, "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "performance": { "compiler": "clang-13", @@ -21,7 +21,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_gcc": { "compiler": "gcc-11", @@ -31,7 +31,18 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, + }, + "package_aarch64": { + "compiler": "clang-13-aarch64", + "build_type": "", + "sanitizer": "", + "package_type": "deb", + "bundled": "bundled", + "splitted": "unsplitted", + "alien_pkgs": True, + "tidy": "disable", + "with_coverage": False, }, "package_asan": { "compiler": "clang-13", @@ -41,7 +52,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "package_ubsan": { "compiler": "clang-13", @@ -51,7 +62,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "package_tsan": { "compiler": "clang-13", @@ -61,7 +72,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "package_msan": { "compiler": "clang-13", @@ -71,7 +82,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "package_debug": { "compiler": "clang-13", @@ -81,7 +92,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_release": { "compiler": "clang-13", @@ -91,7 +102,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_tidy": { "compiler": "clang-13", @@ -101,7 +112,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "enable", - "with_coverage": False + "with_coverage": False, }, "binary_splitted": { "compiler": "clang-13", @@ -111,7 +122,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "splitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_darwin": { "compiler": "clang-13-darwin", @@ -121,7 +132,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_aarch64": { "compiler": "clang-13-aarch64", @@ -131,7 +142,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_freebsd": { "compiler": "clang-13-freebsd", @@ -141,7 +152,7 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_darwin_aarch64": { "compiler": "clang-13-darwin-aarch64", @@ -151,7 +162,7 @@ CI_CONFIG = { "bundled": 
"bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False + "with_coverage": False, }, "binary_ppc64le": { "compiler": "clang-13-ppc64le", @@ -161,27 +172,29 @@ CI_CONFIG = { "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", - "with_coverage": False - } + "with_coverage": False, + }, }, "builds_report_config": { "ClickHouse build check (actions)": [ "package_release", "performance", + "package_aarch64", "package_asan", "package_ubsan", "package_tsan", "package_msan", "package_debug", - "binary_release" + "binary_release", ], "ClickHouse special build check (actions)": [ "binary_tidy", "binary_splitted", "binary_darwin", - "binary_arrach64", + "binary_aarch64", "binary_freebsd", - "binary_darwin_aarch64" + "binary_darwin_aarch64", + "binary_ppc64le", ], }, "tests_config": { @@ -316,6 +329,9 @@ CI_CONFIG = { }, "ClickHouse Keeper Jepsen (actions)": { "required_build": "binary_release", - } - } + }, + "Performance Comparison (actions)": { + "required_build": "performance", + }, + }, } diff --git a/tests/ci/codebrowser_check.py b/tests/ci/codebrowser_check.py new file mode 100644 index 00000000000..97fd58c3235 --- /dev/null +++ b/tests/ci/codebrowser_check.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + + +import os +import subprocess +import logging + +from github import Github + +from env_helper import IMAGES_PATH, REPO_COPY +from stopwatch import Stopwatch +from upload_result_helper import upload_results +from s3_helper import S3Helper +from get_robot_token import get_best_robot_token +from commit_status_helper import post_commit_status +from docker_pull_helper import get_image_with_version +from tee_popen import TeePopen + +NAME = "Woboq Build (actions)" + +def get_run_command(repo_path, output_path, image): + cmd = "docker run " + \ + f"--volume={repo_path}:/repo_folder " \ + f"--volume={output_path}:/test_output " \ + f"-e 'DATA=https://s3.amazonaws.com/clickhouse-test-reports/codebrowser/data' {image}" + return cmd + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + stopwatch = Stopwatch() + + temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) + + gh = Github(get_best_robot_token()) + + if not os.path.exists(temp_path): + os.makedirs(temp_path) + + docker_image = get_image_with_version(IMAGES_PATH, 'clickhouse/codebrowser') + s3_helper = S3Helper('https://s3.amazonaws.com') + + result_path = os.path.join(temp_path, "result_path") + if not os.path.exists(result_path): + os.makedirs(result_path) + + run_command = get_run_command(REPO_COPY, result_path, docker_image) + + logging.info("Going to run codebrowser: %s", run_command) + + run_log_path = os.path.join(temp_path, "runlog.log") + + with TeePopen(run_command, run_log_path) as process: + retcode = process.wait() + if retcode == 0: + logging.info("Run successfully") + else: + logging.info("Run failed") + + subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) + + report_path = os.path.join(result_path, "html_report") + logging.info("Report path %s", report_path) + s3_path_prefix = "codebrowser" + html_urls = s3_helper.fast_parallel_upload_dir(report_path, s3_path_prefix, 'clickhouse-test-reports') + + index_html = 'HTML report' + + test_results = [(index_html, "Look at the report")] + + report_url = upload_results(s3_helper, 0, os.getenv("GITHUB_SHA"), test_results, [], NAME) + + print(f"::notice ::Report url: {report_url}") + + post_commit_status(gh, os.getenv("GITHUB_SHA"), NAME, "Report built", "success", report_url) diff --git 
a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 5bdbf634715..8396303c5a3 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -1,12 +1,33 @@ #!/usr/bin/env python3 -import os +import time +from env_helper import GITHUB_REPOSITORY + +RETRY = 5 + + +def get_commit(gh, commit_sha, retry_count=RETRY): + for i in range(retry_count): + try: + repo = gh.get_repo(GITHUB_REPOSITORY) + commit = repo.get_commit(commit_sha) + return commit + except Exception as ex: + if i == retry_count - 1: + raise ex + time.sleep(i) + + # just suppress warning + return None -def get_commit(gh, commit_sha): - repo = gh.get_repo(os.getenv("GITHUB_REPOSITORY", "ClickHouse/ClickHouse")) - commit = repo.get_commit(commit_sha) - return commit def post_commit_status(gh, sha, check_name, description, state, report_url): - commit = get_commit(gh, sha) - commit.create_status(context=check_name, description=description, state=state, target_url=report_url) + for i in range(RETRY): + try: + commit = get_commit(gh, sha, 1) + commit.create_status(context=check_name, description=description, state=state, target_url=report_url) + break + except Exception as ex: + if i == RETRY - 1: + raise ex + time.sleep(i) diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index 665f399b040..72626bd6364 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -8,9 +8,10 @@ import sys from github import Github +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH from s3_helper import S3Helper from get_robot_token import get_best_robot_token -from pr_info import PRInfo, get_event +from pr_info import PRInfo from build_download_helper import download_builds_filter from upload_result_helper import upload_results from docker_pull_helper import get_images_with_versions @@ -103,11 +104,11 @@ if __name__ == "__main__": stopwatch = Stopwatch() - temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) - repo_path = os.getenv("REPO_COPY", os.path.abspath("../../")) - reports_path = os.getenv("REPORTS_PATH", "./reports") + temp_path = TEMP_PATH + repo_path = REPO_COPY + reports_path = REPORTS_PATH - pr_info = PRInfo(get_event()) + pr_info = PRInfo() gh = Github(get_best_robot_token()) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 5e05cbaecd7..e389d612f44 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -6,8 +6,10 @@ import os import time import shutil from github import Github + +from env_helper import GITHUB_WORKSPACE, RUNNER_TEMP from s3_helper import S3Helper -from pr_info import PRInfo, get_event +from pr_info import PRInfo from get_robot_token import get_best_robot_token, get_parameter_from_ssm from upload_result_helper import upload_results from commit_status_helper import get_commit @@ -157,8 +159,8 @@ if __name__ == "__main__": stopwatch = Stopwatch() - repo_path = os.getenv("GITHUB_WORKSPACE", os.path.abspath("../../")) - temp_path = os.path.join(os.getenv("RUNNER_TEMP", os.path.abspath("./temp")), 'docker_images_check') + repo_path = GITHUB_WORKSPACE + temp_path = os.path.join(RUNNER_TEMP, 'docker_images_check') dockerhub_password = get_parameter_from_ssm('dockerhub_robot_password') if os.path.exists(temp_path): @@ -167,7 +169,7 @@ if __name__ == "__main__": if not os.path.exists(temp_path): os.makedirs(temp_path) - pr_info = PRInfo(get_event(), need_changed_files=True) + pr_info = PRInfo(need_changed_files=True) changed_images, dockerhub_repo_name = 
get_changed_docker_images(pr_info, repo_path, "docker/images.json") logging.info("Has changed images %s", ', '.join([str(image[0]) for image in changed_images])) pr_commit_version = str(pr_info.number) + '-' + pr_info.sha diff --git a/tests/ci/docker_pull_helper.py b/tests/ci/docker_pull_helper.py index f9804744820..50354da6801 100644 --- a/tests/ci/docker_pull_helper.py +++ b/tests/ci/docker_pull_helper.py @@ -25,6 +25,11 @@ def get_images_with_versions(reports_path, required_image, pull=True): images_path = os.path.join(root, 'changed_images.json') break + if not images_path: + logging.info("Images file not found") + else: + logging.info("Images file path %s", images_path) + if images_path is not None and os.path.exists(images_path): logging.info("Images file exists") with open(images_path, 'r', encoding='utf-8') as images_fd: diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 87c327f2776..2daa75f9663 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -4,8 +4,10 @@ import subprocess import os import sys from github import Github + +from env_helper import TEMP_PATH, REPO_COPY from s3_helper import S3Helper -from pr_info import PRInfo, get_event +from pr_info import PRInfo from get_robot_token import get_best_robot_token from upload_result_helper import upload_results from docker_pull_helper import get_image_with_version @@ -23,10 +25,10 @@ if __name__ == "__main__": stopwatch = Stopwatch() - temp_path = os.path.join(os.getenv("TEMP_PATH")) - repo_path = os.path.join(os.getenv("REPO_COPY")) + temp_path = TEMP_PATH + repo_path = REPO_COPY - pr_info = PRInfo(get_event(), need_changed_files=True) + pr_info = PRInfo(need_changed_files=True) gh = Github(get_best_robot_token()) @@ -36,7 +38,7 @@ if __name__ == "__main__": sys.exit(0) if not pr_info.has_changes_in_documentation(): - logging.info ("No changes in documentation") + logging.info("No changes in documentation") commit = get_commit(gh, pr_info.sha) commit.create_status(context=NAME, description="No changes in docs", state="success") sys.exit(0) diff --git a/tests/ci/docs_release.py b/tests/ci/docs_release.py index 04922e8c5ab..90588848f12 100644 --- a/tests/ci/docs_release.py +++ b/tests/ci/docs_release.py @@ -2,12 +2,12 @@ import logging import subprocess import os -import sys from github import Github +from env_helper import TEMP_PATH, REPO_COPY, CLOUDFLARE_TOKEN from s3_helper import S3Helper -from pr_info import PRInfo, get_event +from pr_info import PRInfo from get_robot_token import get_best_robot_token from ssh import SSHKey from upload_result_helper import upload_results @@ -19,19 +19,12 @@ NAME = "Docs Release (actions)" if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - temp_path = os.path.join(os.getenv("TEMP_PATH")) - repo_path = os.path.join(os.getenv("REPO_COPY")) + temp_path = TEMP_PATH + repo_path = REPO_COPY - pr_info = PRInfo(get_event(), need_changed_files=True) + pr_info = PRInfo(need_changed_files=True) gh = Github(get_best_robot_token()) - if not pr_info.has_changes_in_documentation(): - logging.info ("No changes in documentation") - commit = get_commit(gh, pr_info.sha) - commit.create_status(context=NAME, description="No changes in docs", state="success") - sys.exit(0) - - logging.info("Has changes in docs") if not os.path.exists(temp_path): os.makedirs(temp_path) @@ -42,7 +35,7 @@ if __name__ == "__main__": if not os.path.exists(test_output): os.makedirs(test_output) - token = os.getenv('CLOUDFLARE_TOKEN') + token = CLOUDFLARE_TOKEN cmd = "docker run 
--cap-add=SYS_PTRACE --volume=$SSH_AUTH_SOCK:/ssh-agent -e SSH_AUTH_SOCK=/ssh-agent " \ f"-e CLOUDFLARE_TOKEN={token} --volume={repo_path}:/repo_path --volume={test_output}:/output_path {docker_image}" diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py new file mode 100644 index 00000000000..90178e5c56a --- /dev/null +++ b/tests/ci/env_helper.py @@ -0,0 +1,18 @@ +import os + +CI = bool(os.getenv("CI")) +TEMP_PATH = os.getenv("TEMP_PATH", os.path.abspath(".")) + +CACHES_PATH = os.getenv("CACHES_PATH", TEMP_PATH) +CLOUDFLARE_TOKEN = os.getenv("CLOUDFLARE_TOKEN") +GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH") +GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY", "ClickHouse/ClickHouse") +GITHUB_RUN_ID = os.getenv("GITHUB_RUN_ID") +GITHUB_SERVER_URL = os.getenv("GITHUB_SERVER_URL", "https://github.com") +GITHUB_WORKSPACE = os.getenv("GITHUB_WORKSPACE", os.path.abspath("../../")) +IMAGES_PATH = os.getenv("IMAGES_PATH") +REPORTS_PATH = os.getenv("REPORTS_PATH", "./reports") +REPO_COPY = os.getenv("REPO_COPY", os.path.abspath("../../")) +RUNNER_TEMP = os.getenv("RUNNER_TEMP", os.path.abspath("./tmp")) +S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "clickhouse-builds") +S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "clickhouse-test-reports") diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 30eabe87dce..0eef886625a 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -7,7 +7,9 @@ import csv import sys from github import Github -from pr_info import PRInfo, get_event + +from env_helper import CACHES_PATH, TEMP_PATH +from pr_info import PRInfo from s3_helper import S3Helper from get_robot_token import get_best_robot_token from upload_result_helper import upload_results @@ -39,17 +41,23 @@ def process_results(result_folder): test_files = [f for f in os.listdir(result_folder) if os.path.isfile(os.path.join(result_folder, f))] additional_files = [os.path.join(result_folder, f) for f in test_files] + status = [] status_path = os.path.join(result_folder, "check_status.tsv") - logging.info("Found test_results.tsv") - status = list(csv.reader(open(status_path, 'r'), delimiter='\t')) + if os.path.exists(status_path): + logging.info("Found test_results.tsv") + with open(status_path, 'r', encoding='utf-8') as status_file: + status = list(csv.reader(status_file, delimiter='\t')) if len(status) != 1 or len(status[0]) != 2: + logging.info("Files in result folder %s", os.listdir(result_folder)) return "error", "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] results_path = os.path.join(result_folder, "test_results.tsv") - test_results = list(csv.reader(open(results_path, 'r'), delimiter='\t')) + if os.path.exists(results_path): + with open(results_path, 'r', encoding='utf-8') as results_file: + test_results = list(csv.reader(results_file, delimiter='\t')) if len(test_results) == 0: - raise Exception("Empty results") + return "error", "Empty test_results.tsv", test_results, additional_files return state, description, test_results, additional_files @@ -60,13 +68,13 @@ if __name__ == "__main__": stopwatch = Stopwatch() - temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) - caches_path = os.getenv("CACHES_PATH", temp_path) + temp_path = TEMP_PATH + caches_path = CACHES_PATH if not os.path.exists(temp_path): os.makedirs(temp_path) - pr_info = PRInfo(get_event()) + pr_info = PRInfo() gh = Github(get_best_robot_token()) diff --git a/tests/ci/finish_check.py 
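The new env_helper module gathers the scattered os.getenv calls behind constants with local-friendly defaults, which is the pattern the remaining diffs in this patch switch to. A small usage sketch follows, assuming it is run from tests/ci so the module is importable; the override value is hypothetical.

# Sketch of the pattern the patch converges on: read CI paths once in
# env_helper and import the constants elsewhere, so scripts still run
# locally thanks to the defaults (TEMP_PATH falls back to the current dir).
import os

# Hypothetical local override, set before the module is first imported:
os.environ.setdefault("TEMP_PATH", "/tmp/ci_local")

from env_helper import TEMP_PATH, REPORTS_PATH  # module added above

if not os.path.exists(TEMP_PATH):
    os.makedirs(TEMP_PATH)
print("Working in", TEMP_PATH, "reports expected in", REPORTS_PATH)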
b/tests/ci/finish_check.py index 576b97058c7..72f26daf4cd 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -1,13 +1,15 @@ #!/usr/bin/env python3 import logging -import os from github import Github -from pr_info import PRInfo, get_event + +from env_helper import GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID +from pr_info import PRInfo from get_robot_token import get_best_robot_token from commit_status_helper import get_commit NAME = 'Run Check (actions)' + def filter_statuses(statuses): """ Squash statuses to latest state @@ -23,14 +25,15 @@ def filter_statuses(statuses): filt[status.context] = status return filt + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - pr_info = PRInfo(get_event(), need_orgs=True) + pr_info = PRInfo(need_orgs=True) gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) - url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" statuses = filter_statuses(list(commit.get_statuses())) if NAME in statuses and statuses[NAME].state == "pending": commit.create_status(context=NAME, description="All checks finished", state="success", target_url=url) diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 15b9ab44b31..4419ba1c920 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -8,9 +8,10 @@ import sys from github import Github +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH from s3_helper import S3Helper from get_robot_token import get_best_robot_token -from pr_info import PRInfo, get_event +from pr_info import PRInfo from build_download_helper import download_all_deb_packages from upload_result_helper import upload_results from docker_pull_helper import get_image_with_version @@ -20,15 +21,20 @@ from stopwatch import Stopwatch from rerun_helper import RerunHelper from tee_popen import TeePopen -def get_additional_envs(check_name): +def get_additional_envs(check_name, run_by_hash_num, run_by_hash_total): + result = [] if 'DatabaseReplicated' in check_name: - return ["USE_DATABASE_REPLICATED=1"] + result.append("USE_DATABASE_REPLICATED=1") if 'DatabaseOrdinary' in check_name: - return ["USE_DATABASE_ORDINARY=1"] + result.append("USE_DATABASE_ORDINARY=1") if 'wide parts enabled' in check_name: - return ["USE_POLYMORPHIC_PARTS=1"] + result.append("USE_POLYMORPHIC_PARTS=1") - return [] + if run_by_hash_total != 0: + result.append(f"RUN_BY_HASH_NUM={run_by_hash_num}") + result.append(f"RUN_BY_HASH_TOTAL={run_by_hash_total}") + + return result def get_image_name(check_name): if 'stateless' in check_name.lower(): @@ -88,20 +94,30 @@ def process_results(result_folder, server_log_path): server_log_files = [f for f in os.listdir(server_log_path) if os.path.isfile(os.path.join(server_log_path, f))] additional_files = additional_files + [os.path.join(server_log_path, f) for f in server_log_files] + status = [] status_path = os.path.join(result_folder, "check_status.tsv") - logging.info("Found test_results.tsv") - with open(status_path, 'r', encoding='utf-8') as status_file: - status = list(csv.reader(status_file, delimiter='\t')) + if os.path.exists(status_path): + logging.info("Found test_results.tsv") + with open(status_path, 'r', encoding='utf-8') as status_file: + status = list(csv.reader(status_file, delimiter='\t')) if len(status) != 1 or len(status[0]) != 2: + logging.info("Files in result 
folder %s", os.listdir(result_folder)) return "error", "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] results_path = os.path.join(result_folder, "test_results.tsv") + + if os.path.exists(results_path): + logging.info("Found test_results.tsv") + else: + logging.info("Files in result folder %s", os.listdir(result_folder)) + return "error", "Not found test_results.tsv", test_results, additional_files + with open(results_path, 'r', encoding='utf-8') as results_file: test_results = list(csv.reader(results_file, delimiter='\t')) if len(test_results) == 0: - raise Exception("Empty results") + return "error", "Empty test_results.tsv", test_results, additional_files return state, description, test_results, additional_files @@ -111,18 +127,28 @@ if __name__ == "__main__": stopwatch = Stopwatch() - temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) - repo_path = os.getenv("REPO_COPY", os.path.abspath("../../")) - reports_path = os.getenv("REPORTS_PATH", "./reports") + temp_path = TEMP_PATH + repo_path = REPO_COPY + reports_path = REPORTS_PATH check_name = sys.argv[1] kill_timeout = int(sys.argv[2]) + flaky_check = 'flaky' in check_name.lower() gh = Github(get_best_robot_token()) - pr_info = PRInfo(get_event(), need_changed_files=flaky_check) + pr_info = PRInfo(need_changed_files=flaky_check) - rerun_helper = RerunHelper(gh, pr_info, check_name) + if 'RUN_BY_HASH_NUM' in os.environ: + run_by_hash_num = int(os.getenv('RUN_BY_HASH_NUM')) + run_by_hash_total = int(os.getenv('RUN_BY_HASH_TOTAL')) + check_name_with_group = check_name + f' [{run_by_hash_num + 1}/{run_by_hash_total}]' + else: + run_by_hash_num = 0 + run_by_hash_total = 0 + check_name_with_group = check_name + + rerun_helper = RerunHelper(gh, pr_info, check_name_with_group) if rerun_helper.is_already_finished_by_status(): logging.info("Check is already finished according to github status, exiting") sys.exit(0) @@ -135,7 +161,7 @@ if __name__ == "__main__": tests_to_run = get_tests_to_run(pr_info) if not tests_to_run: commit = get_commit(gh, pr_info.sha) - commit.create_status(context=check_name, description='Not found changed stateless tests', state='success') + commit.create_status(context=check_name_with_group, description='Not found changed stateless tests', state='success') sys.exit(0) image_name = get_image_name(check_name) @@ -157,7 +183,7 @@ if __name__ == "__main__": run_log_path = os.path.join(result_path, "runlog.log") - additional_envs = get_additional_envs(check_name) + additional_envs = get_additional_envs(check_name, run_by_hash_num, run_by_hash_total) run_command = get_run_command(packages_path, result_path, server_log_path, kill_timeout, additional_envs, docker_image, flaky_check, tests_to_run) logging.info("Going to run func tests: %s", run_command) @@ -176,12 +202,12 @@ if __name__ == "__main__": ch_helper = ClickHouseHelper() mark_flaky_tests(ch_helper, check_name, test_results) - report_url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, [run_log_path] + additional_logs, check_name) + report_url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, [run_log_path] + additional_logs, check_name_with_group) print(f"::notice ::Report url: {report_url}") - post_commit_status(gh, pr_info.sha, check_name, description, state, report_url) + post_commit_status(gh, pr_info.sha, check_name_with_group, description, state, report_url) - prepared_events = prepare_tests_results_for_clickhouse(pr_info, test_results, state, 
stopwatch.duration_seconds, stopwatch.start_time_str, report_url, check_name) + prepared_events = prepare_tests_results_for_clickhouse(pr_info, test_results, state, stopwatch.duration_seconds, stopwatch.start_time_str, report_url, check_name_with_group) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) if state != 'success': diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 4a60d825687..20e33f2f2dc 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -9,9 +9,10 @@ import csv from github import Github +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH from s3_helper import S3Helper from get_robot_token import get_best_robot_token -from pr_info import PRInfo, get_event +from pr_info import PRInfo from build_download_helper import download_all_deb_packages from upload_result_helper import upload_results from docker_pull_helper import get_images_with_versions @@ -22,21 +23,19 @@ from rerun_helper import RerunHelper from tee_popen import TeePopen -DOWNLOAD_RETRIES_COUNT = 5 - IMAGES = [ - "yandex/clickhouse-integration-tests-runner", - "yandex/clickhouse-mysql-golang-client", - "yandex/clickhouse-mysql-java-client", - "yandex/clickhouse-mysql-js-client", - "yandex/clickhouse-mysql-php-client", - "yandex/clickhouse-postgresql-java-client", - "yandex/clickhouse-integration-test", - "yandex/clickhouse-kerberos-kdc", - "yandex/clickhouse-integration-helper", + "clickhouse/integration-tests-runner", + "clickhouse/mysql-golang-client", + "clickhouse/mysql-java-client", + "clickhouse/mysql-js-client", + "clickhouse/mysql-php-client", + "clickhouse/postgresql-java-client", + "clickhouse/integration-test", + "clickhouse/kerberos-kdc", + "clickhouse/integration-helper", ] -def get_json_params_dict(check_name, pr_info, docker_images): +def get_json_params_dict(check_name, pr_info, docker_images, run_by_hash_total, run_by_hash_num): return { 'context_name': check_name, 'commit': pr_info.sha, @@ -46,6 +45,8 @@ def get_json_params_dict(check_name, pr_info, docker_images): 'shuffle_test_groups': False, 'use_tmpfs': False, 'disable_net_host': True, + 'run_by_hash_total': run_by_hash_total, + 'run_by_hash_num': run_by_hash_num, } def get_env_for_runner(build_path, repo_path, result_path, work_path): @@ -76,23 +77,24 @@ def process_results(result_folder): test_files = [f for f in os.listdir(result_folder) if os.path.isfile(os.path.join(result_folder, f))] additional_files = [os.path.join(result_folder, f) for f in test_files] + status = [] status_path = os.path.join(result_folder, "check_status.tsv") if os.path.exists(status_path): logging.info("Found test_results.tsv") with open(status_path, 'r', encoding='utf-8') as status_file: status = list(csv.reader(status_file, delimiter='\t')) - else: - status = [] if len(status) != 1 or len(status[0]) != 2: + logging.info("Files in result folder %s", os.listdir(result_folder)) return "error", "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] results_path = os.path.join(result_folder, "test_results.tsv") - with open(results_path, 'r', encoding='utf-8') as results_file: - test_results = list(csv.reader(results_file, delimiter='\t')) + if os.path.exists(results_path): + with open(results_path, 'r', encoding='utf-8') as results_file: + test_results = list(csv.reader(results_file, delimiter='\t')) if len(test_results) == 0: - raise Exception("Empty results") + return "error", "Empty 
test_results.tsv", test_results, additional_files return state, description, test_results, additional_files @@ -101,26 +103,35 @@ if __name__ == "__main__": stopwatch = Stopwatch() - temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) - repo_path = os.getenv("REPO_COPY", os.path.abspath("../../")) - reports_path = os.getenv("REPORTS_PATH", "./reports") + temp_path = TEMP_PATH + repo_path = REPO_COPY + reports_path = REPORTS_PATH check_name = sys.argv[1] + if 'RUN_BY_HASH_NUM' in os.environ: + run_by_hash_num = int(os.getenv('RUN_BY_HASH_NUM')) + run_by_hash_total = int(os.getenv('RUN_BY_HASH_TOTAL')) + check_name_with_group = check_name + f' [{run_by_hash_num + 1}/{run_by_hash_total}]' + else: + run_by_hash_num = 0 + run_by_hash_total = 0 + check_name_with_group = check_name + if not os.path.exists(temp_path): os.makedirs(temp_path) is_flaky_check = 'flaky' in check_name - pr_info = PRInfo(get_event(), need_changed_files=is_flaky_check) + pr_info = PRInfo(need_changed_files=is_flaky_check) gh = Github(get_best_robot_token()) - rerun_helper = RerunHelper(gh, pr_info, check_name) + rerun_helper = RerunHelper(gh, pr_info, check_name_with_group) if rerun_helper.is_already_finished_by_status(): logging.info("Check is already finished according to github status, exiting") sys.exit(0) - images = get_images_with_versions(temp_path, IMAGES) + images = get_images_with_versions(reports_path, IMAGES) images_with_versions = {i.name: i.version for i in images} result_path = os.path.join(temp_path, "output_dir") if not os.path.exists(result_path): @@ -140,7 +151,7 @@ if __name__ == "__main__": json_path = os.path.join(work_path, 'params.json') with open(json_path, 'w', encoding='utf-8') as json_params: - json_params.write(json.dumps(get_json_params_dict(check_name, pr_info, images_with_versions))) + json_params.write(json.dumps(get_json_params_dict(check_name, pr_info, images_with_versions, run_by_hash_total, run_by_hash_num))) output_path_log = os.path.join(result_path, "main_script_log.txt") @@ -162,9 +173,9 @@ if __name__ == "__main__": mark_flaky_tests(ch_helper, check_name, test_results) s3_helper = S3Helper('https://s3.amazonaws.com') - report_url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, [output_path_log] + additional_logs, check_name, False) + report_url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, [output_path_log] + additional_logs, check_name_with_group, False) print(f"::notice ::Report url: {report_url}") - post_commit_status(gh, pr_info.sha, check_name, description, state, report_url) + post_commit_status(gh, pr_info.sha, check_name_with_group, description, state, report_url) - prepared_events = prepare_tests_results_for_clickhouse(pr_info, test_results, state, stopwatch.duration_seconds, stopwatch.start_time_str, report_url, check_name) + prepared_events = prepare_tests_results_for_clickhouse(pr_info, test_results, state, stopwatch.duration_seconds, stopwatch.start_time_str, report_url, check_name_with_group) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) diff --git a/tests/ci/keeper_jepsen_check.py b/tests/ci/keeper_jepsen_check.py new file mode 100644 index 00000000000..b7acc92b0f3 --- /dev/null +++ b/tests/ci/keeper_jepsen_check.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 + +import time +import logging +import os +import sys + +import boto3 +from github import Github +import requests + +from env_helper import REPO_COPY, TEMP_PATH +from stopwatch import Stopwatch +from upload_result_helper 
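Both the functional and integration checks now forward RUN_BY_HASH_NUM / RUN_BY_HASH_TOTAL so a single check can be split across several runners. The runner-side selection is not shown in this part of the patch; the sketch below is an assumed illustration of the usual scheme, a stable hash of the test name modulo the bucket count.

# Assumed illustration of RUN_BY_HASH_NUM / RUN_BY_HASH_TOTAL bucketing; the
# real selection happens inside the test runner images, not in this patch.
import zlib

def belongs_to_bucket(test_name: str, run_by_hash_num: int, run_by_hash_total: int) -> bool:
    if run_by_hash_total == 0:  # splitting disabled, run everything
        return True
    # crc32 gives a hash that is stable across processes, unlike Python's hash().
    return zlib.crc32(test_name.encode()) % run_by_hash_total == run_by_hash_num

tests = ["00001_select", "00002_insert", "00003_join"]
print([t for t in tests if belongs_to_bucket(t, 0, 3)])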
import upload_results +from s3_helper import S3Helper +from get_robot_token import get_best_robot_token, get_parameter_from_ssm +from pr_info import PRInfo +from compress_files import compress_fast +from commit_status_helper import post_commit_status +from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse +from version_helper import get_version_from_repo +from tee_popen import TeePopen +from ssh import SSHKey +from build_download_helper import get_build_name_for_check +from rerun_helper import RerunHelper + +JEPSEN_GROUP_NAME = 'jepsen_group' +DESIRED_INSTANCE_COUNT = 3 +IMAGE_NAME = 'clickhouse/keeper-jepsen-test' +CHECK_NAME = 'ClickHouse Keeper Jepsen (actions)' + + +SUCCESSFUL_TESTS_ANCHOR = "# Successful tests" +INTERMINATE_TESTS_ANCHOR = "# Indeterminate tests" +CRASHED_TESTS_ANCHOR = "# Crashed tests" +FAILED_TESTS_ANCHOR = "# Failed tests" + +def _parse_jepsen_output(path): + test_results = [] + current_type = '' + with open(path, 'r') as f: + for line in f: + if SUCCESSFUL_TESTS_ANCHOR in line: + current_type = 'OK' + elif INTERMINATE_TESTS_ANCHOR in line or CRASHED_TESTS_ANCHOR in line: + current_type = 'ERROR' + elif FAILED_TESTS_ANCHOR in line: + current_type = 'FAIL' + + if (line.startswith('store/clickhouse-keeper') or line.startswith('clickhouse-keeper')) and current_type: + test_results.append((line.strip(), current_type)) + + return test_results + +def get_autoscaling_group_instances_ids(asg_client, group_name): + group_description = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[group_name]) + our_group = group_description['AutoScalingGroups'][0] + instance_ids = [] + for instance in our_group['Instances']: + if instance['LifecycleState'] == 'InService' and instance['HealthStatus'] == 'Healthy': + instance_ids.append(instance['InstanceId']) + + return instance_ids + +def get_instances_addresses(ec2_client, instance_ids): + ec2_response = ec2_client.describe_instances(InstanceIds = instance_ids) + instance_ips = [] + for instances in ec2_response['Reservations']: + for ip in instances['Instances']: + instance_ips.append(ip['PrivateIpAddress']) + return instance_ips + + +def prepare_autoscaling_group_and_get_hostnames(): + asg_client = boto3.client('autoscaling', region_name='us-east-1') + asg_client.set_desired_capacity(AutoScalingGroupName=JEPSEN_GROUP_NAME, DesiredCapacity=DESIRED_INSTANCE_COUNT) + + instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME) + counter = 0 + while len(instances) < DESIRED_INSTANCE_COUNT: + time.sleep(5) + instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME) + counter += 1 + if counter > 30: + raise Exception("Cannot wait autoscaling group") + + ec2_client = boto3.client('ec2', region_name='us-east-1') + return get_instances_addresses(ec2_client, instances) + + +def clear_autoscaling_group(): + asg_client = boto3.client('autoscaling', region_name='us-east-1') + asg_client.set_desired_capacity(AutoScalingGroupName=JEPSEN_GROUP_NAME, DesiredCapacity=0) + instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME) + counter = 0 + while len(instances) > 0: + time.sleep(5) + instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME) + counter += 1 + if counter > 30: + raise Exception("Cannot wait autoscaling group") + + +def save_nodes_to_file(instances, temp_path): + nodes_path = os.path.join(temp_path, "nodes.txt") + with open(nodes_path, 'w') as f: + f.write("\n".join(instances)) + f.flush() + return nodes_path + 
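_parse_jepsen_output above tags each clickhouse-keeper result line with the most recent section anchor it has seen. The following condensed restatement runs the same classification against an assumed log excerpt; the excerpt itself is invented, only the anchors and the line prefixes come from the code.

# Condensed restatement of the classification logic in _parse_jepsen_output,
# applied to an illustrative log excerpt.
SAMPLE_LOG = """\
# Successful tests
store/clickhouse-keeper/set-1/results.edn
# Failed tests
clickhouse-keeper register-2
"""

def classify(lines):
    results, current = [], ''
    for line in lines:
        if "# Successful tests" in line:
            current = 'OK'
        elif "# Indeterminate tests" in line or "# Crashed tests" in line:
            current = 'ERROR'
        elif "# Failed tests" in line:
            current = 'FAIL'
        if (line.startswith('store/clickhouse-keeper') or line.startswith('clickhouse-keeper')) and current:
            results.append((line.strip(), current))
    return results

print(classify(SAMPLE_LOG.splitlines()))
# -> [('store/clickhouse-keeper/set-1/results.edn', 'OK'), ('clickhouse-keeper register-2', 'FAIL')]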
+def get_run_command(ssh_auth_sock, ssh_sock_dir, pr_info, nodes_path, repo_path, build_url, result_path, docker_image): + return f"docker run --network=host -v '{ssh_sock_dir}:{ssh_sock_dir}' -e SSH_AUTH_SOCK={ssh_auth_sock} " \ + f"-e PR_TO_TEST={pr_info.number} -e SHA_TO_TEST={pr_info.sha} -v '{nodes_path}:/nodes.txt' -v {result_path}:/test_output " \ + f"-e 'CLICKHOUSE_PACKAGE={build_url}' -v '{repo_path}:/ch' -e 'CLICKHOUSE_REPO_PATH=/ch' -e NODES_USERNAME=ubuntu {docker_image}" + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + stopwatch = Stopwatch() + + pr_info = PRInfo() + + logging.info("Start at PR number %s, commit sha %s labels %s", pr_info.number, pr_info.sha, pr_info.labels) + + if pr_info.number != 0 and 'jepsen-test' not in pr_info.labels: + logging.info("Not jepsen test label in labels list, skipping") + sys.exit(0) + + gh = Github(get_best_robot_token()) + + rerun_helper = RerunHelper(gh, pr_info, CHECK_NAME) + if rerun_helper.is_already_finished_by_status(): + logging.info("Check is already finished according to github status, exiting") + sys.exit(0) + + if not os.path.exists(TEMP_PATH): + os.makedirs(TEMP_PATH) + + result_path = os.path.join(TEMP_PATH, "result_path") + if not os.path.exists(result_path): + os.makedirs(result_path) + + instances = prepare_autoscaling_group_and_get_hostnames() + nodes_path = save_nodes_to_file(instances, TEMP_PATH) + + # always use latest + docker_image = IMAGE_NAME + + build_name = get_build_name_for_check(CHECK_NAME) + + if pr_info.number == 0: + version = get_version_from_repo(REPO_COPY) + release_or_pr = ".".join(version.as_tuple()[:2]) + else: + # PR number for anything else + release_or_pr = str(pr_info.number) + + # This check run separately from other checks because it requires exclusive + # run (see .github/workflows/jepsen.yml) So we cannot add explicit + # dependency on a build job and using busy loop on it's results. For the + # same reason we are using latest docker image. 
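Because this check cannot declare a normal dependency on the build job, it polls S3 until the binary shows up, as the comment above explains. Here is a generalized sketch of that busy-wait with an explicit deadline; the URL is a placeholder for the one assembled just below from release_or_pr, the commit sha and the build name, and requests.head is the same call the script itself uses.

# Generalized sketch of the "busy loop on the build result" described above:
# poll with HEAD requests until the artifact exists or the attempt budget runs out.
import time
import requests

def wait_for_artifact(url: str, attempts: int = 180, delay: float = 10.0) -> None:
    for _ in range(attempts):
        if requests.head(url, timeout=30).status_code == 200:
            return
        time.sleep(delay)
    raise TimeoutError(f"Artifact did not appear: {url}")

# Example (placeholder URL):
# wait_for_artifact("https://s3.amazonaws.com/clickhouse-builds/<release>/<sha>/<build>/clickhouse")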
+ build_url = f"https://s3.amazonaws.com/clickhouse-builds/{release_or_pr}/{pr_info.sha}/{build_name}/clickhouse" + head = requests.head(build_url) + counter = 0 + while head.status_code != 200: + time.sleep(10) + head = requests.head(build_url) + counter += 1 + if counter >= 180: + post_commit_status(gh, pr_info.sha, CHECK_NAME, "Cannot fetch build to run", "error", "") + raise Exception("Cannot fetch build") + + with SSHKey(key_value=get_parameter_from_ssm("jepsen_ssh_key") + '\n'): + ssh_auth_sock = os.environ['SSH_AUTH_SOCK'] + auth_sock_dir = os.path.dirname(ssh_auth_sock) + cmd = get_run_command(ssh_auth_sock, auth_sock_dir, pr_info, nodes_path, REPO_COPY, build_url, result_path, docker_image) + logging.info("Going to run jepsen: %s", cmd) + + run_log_path = os.path.join(TEMP_PATH, "runlog.log") + + with TeePopen(cmd, run_log_path) as process: + retcode = process.wait() + if retcode == 0: + logging.info("Run successfully") + else: + logging.info("Run failed") + + status = 'success' + description = 'No invalid analysis found ヽ(‘ー`)ノ' + jepsen_log_path = os.path.join(result_path, 'jepsen_run_all_tests.log') + additional_data = [] + try: + test_result = _parse_jepsen_output(jepsen_log_path) + if any(r[1] == 'FAIL' for r in test_result): + status = 'failure' + description = 'Found invalid analysis (ノಥ益ಥ)ノ ┻━┻' + + compress_fast(os.path.join(result_path, 'store'), os.path.join(result_path, 'jepsen_store.tar.gz')) + additional_data.append(os.path.join(result_path, 'jepsen_store.tar.gz')) + except Exception as ex: + print("Exception", ex) + status = 'failure' + description = 'No Jepsen output log' + test_result = [('No Jepsen output log', 'FAIL')] + + s3_helper = S3Helper('https://s3.amazonaws.com') + report_url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_result, [run_log_path] + additional_data, CHECK_NAME) + + print(f"::notice ::Report url: {report_url}") + post_commit_status(gh, pr_info.sha, CHECK_NAME, description, status, report_url) + + ch_helper = ClickHouseHelper() + prepared_events = prepare_tests_results_for_clickhouse(pr_info, test_result, status, stopwatch.duration_seconds, stopwatch.start_time_str, report_url, CHECK_NAME) + ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + clear_autoscaling_group() diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py new file mode 100644 index 00000000000..90c5034bfa7 --- /dev/null +++ b/tests/ci/performance_comparison_check.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 + + +import os +import logging +import sys +import json +import subprocess +import traceback +import re + +from github import Github + +from pr_info import PRInfo +from s3_helper import S3Helper +from get_robot_token import get_best_robot_token +from docker_pull_helper import get_image_with_version +from commit_status_helper import get_commit, post_commit_status +from tee_popen import TeePopen + +IMAGE_NAME = 'clickhouse/performance-comparison' + +def get_run_command(workspace, result_path, pr_to_test, sha_to_test, additional_env, image): + return f"docker run --privileged --volume={workspace}:/workspace --volume={result_path}:/output " \ + f"--cap-add syslog --cap-add sys_admin --cap-add sys_rawio " \ + f"-e PR_TO_TEST={pr_to_test} -e SHA_TO_TEST={sha_to_test} {additional_env} " \ + f"{image}" + +class RamDrive: + def __init__(self, path, size): + self.path = path + self.size = size + + def __enter__(self): + if not os.path.exists(self.path): + os.makedirs(self.path) + + 
subprocess.check_call(f"sudo mount -t tmpfs -o rw,size={self.size} tmpfs {self.path}", shell=True) + + def __exit__(self, exc_type, exc_val, exc_tb): + subprocess.check_call(f"sudo umount {self.path}", shell=True) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) + repo_path = os.getenv("REPO_COPY", os.path.abspath("../../")) + ramdrive_path = os.getenv("RAMDRIVE_PATH", os.path.join(temp_path, "ramdrive")) + # currently unused, doesn't make tests more stable + ramdrive_size = os.getenv("RAMDRIVE_SIZE", '0G') + reports_path = os.getenv("REPORTS_PATH", "./reports") + + check_name = sys.argv[1] + + if not os.path.exists(temp_path): + os.makedirs(temp_path) + + with open(os.getenv('GITHUB_EVENT_PATH'), 'r', encoding='utf-8') as event_file: + event = json.load(event_file) + + gh = Github(get_best_robot_token()) + pr_info = PRInfo(event) + commit = get_commit(gh, pr_info.sha) + + docker_env = '' + + docker_env += " -e S3_URL=https://s3.amazonaws.com/clickhouse-builds" + + if pr_info.number == 0: + pr_link = commit.html_url + else: + pr_link = f"https://github.com/ClickHouse/ClickHouse/pull/{pr_info.number}" + + task_url = f"https://github.com/ClickHouse/ClickHouse/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + docker_env += ' -e CHPC_ADD_REPORT_LINKS="Job (actions) Tested commit"'.format( + task_url, pr_link) + + if 'RUN_BY_HASH_TOTAL' in os.environ: + run_by_hash_total = int(os.getenv('RUN_BY_HASH_TOTAL')) + run_by_hash_num = int(os.getenv('RUN_BY_HASH_NUM')) + docker_env += f' -e CHPC_TEST_RUN_BY_HASH_TOTAL={run_by_hash_total} -e CHPC_TEST_RUN_BY_HASH_NUM={run_by_hash_num}' + check_name_with_group = check_name + f' [{run_by_hash_num + 1}/{run_by_hash_total}]' + else: + check_name_with_group = check_name + + docker_image = get_image_with_version(reports_path, IMAGE_NAME) + + #with RamDrive(ramdrive_path, ramdrive_size): + result_path = ramdrive_path + if not os.path.exists(result_path): + os.makedirs(result_path) + + run_command = get_run_command(result_path, result_path, pr_info.number, pr_info.sha, docker_env, docker_image) + logging.info("Going to run command %s", run_command) + run_log_path = os.path.join(temp_path, "runlog.log") + with TeePopen(run_command, run_log_path) as process: + retcode = process.wait() + if retcode == 0: + logging.info("Run successfully") + else: + logging.info("Run failed") + + subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) + + paths = { + 'compare.log': os.path.join(result_path, 'compare.log'), + 'output.7z': os.path.join(result_path, 'output.7z'), + 'report.html': os.path.join(result_path, 'report.html'), + 'all-queries.html': os.path.join(result_path, 'all-queries.html'), + 'queries.rep': os.path.join(result_path, 'queries.rep'), + 'all-query-metrics.tsv': os.path.join(result_path, 'report/all-query-metrics.tsv'), + 'runlog.log': run_log_path, + } + + check_name_prefix = check_name_with_group.lower().replace(' ', '_').replace('(', '_').replace(')', '_').replace(',', '_') + s3_prefix = f'{pr_info.number}/{pr_info.sha}/{check_name_prefix}/' + s3_helper = S3Helper('https://s3.amazonaws.com') + for file in paths: + try: + paths[file] = s3_helper.upload_test_report_to_s3(paths[file], + s3_prefix + file) + except Exception: + paths[file] = '' + traceback.print_exc() + + # Upload all images and flamegraphs to S3 + try: + s3_helper.upload_test_folder_to_s3( + os.path.join(result_path, 'images'), + s3_prefix + 'images' + ) + except Exception: + 
traceback.print_exc() + + # Try to fetch status from the report. + status = '' + message = '' + try: + report_text = open(os.path.join(result_path, 'report.html'), 'r').read() + status_match = re.search('', report_text) + message_match = re.search('', report_text) + if status_match: + status = status_match.group(1).strip() + if message_match: + message = message_match.group(1).strip() + + # TODO: Remove me, always green mode for the first time + status = 'success' + except Exception: + traceback.print_exc() + status = 'failure' + message = 'Failed to parse the report.' + + if not status: + status = 'failure' + message = 'No status in report.' + elif not message: + status = 'failure' + message = 'No message in report.' + + report_url = task_url + + if paths['runlog.log']: + report_url = paths['runlog.log'] + + if paths['compare.log']: + report_url = paths['compare.log'] + + if paths['output.7z']: + report_url = paths['output.7z'] + + if paths['report.html']: + report_url = paths['report.html'] + + + post_commit_status(gh, pr_info.sha, check_name_with_group, message, status, report_url) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 88d4595bc66..812834824b7 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -6,11 +6,13 @@ import urllib import requests from unidiff import PatchSet +from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL, GITHUB_RUN_ID, GITHUB_EVENT_PATH -DIFF_IN_DOCUMENTATION_EXT = [".html", ".md", ".yml", ".txt", ".css", ".js", ".xml", ".ico", ".conf", ".svg", ".png", ".jpg", ".py", ".sh", ".json"] +DIFF_IN_DOCUMENTATION_EXT = [".html", ".md", ".yml", ".txt", ".css", ".js", ".xml", ".ico", ".conf", ".svg", ".png", + ".jpg", ".py", ".sh", ".json"] def get_pr_for_commit(sha, ref): - try_get_pr_url = f"https://api.github.com/repos/{os.getenv('GITHUB_REPOSITORY', 'ClickHouse/ClickHouse')}/commits/{sha}/pulls" + try_get_pr_url = f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{sha}/pulls" try: response = requests.get(try_get_pr_url) response.raise_for_status() @@ -22,7 +24,7 @@ def get_pr_for_commit(sha, ref): # refs for RPs looks like XX if pr['head']['ref'] in ref: return pr - print ("Cannot find PR with required ref", ref, "returning first one") + print("Cannot find PR with required ref", ref, "returning first one") first_pr = data[0] return first_pr except Exception as ex: @@ -30,24 +32,35 @@ def get_pr_for_commit(sha, ref): return None -def get_event(): - with open(os.getenv('GITHUB_EVENT_PATH'), 'r', encoding='utf-8') as ef: - return json.load(ef) - - class PRInfo: - def __init__(self, github_event, need_orgs=False, need_changed_files=False): - if 'pull_request' in github_event: # pull request and other similar events - self.number = github_event['number'] + def __init__(self, github_event=None, need_orgs=False, need_changed_files=False, labels_from_api=False): + if not github_event: + if GITHUB_EVENT_PATH: + with open(GITHUB_EVENT_PATH, 'r', encoding='utf-8') as event_file: + github_event = json.load(event_file) + else: + github_event = {'commits': 1, 'after': 'HEAD', 'ref': None} + self.event = github_event + self.changed_files = set([]) + + # workflow completed event, used for PRs only + if 'action' in github_event and github_event['action'] == 'completed': + self.sha = github_event['workflow_run']['head_sha'] + prs_for_sha = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{self.sha}/pulls").json() + if len(prs_for_sha) != 0: + github_event['pull_request'] = prs_for_sha[0] + + if 'pull_request' in 
github_event: # pull request and other similar events + self.number = github_event['pull_request']['number'] if 'after' in github_event: self.sha = github_event['after'] else: self.sha = github_event['pull_request']['head']['sha'] - repo_prefix = f"{os.getenv('GITHUB_SERVER_URL', 'https://github.com')}/{os.getenv('GITHUB_REPOSITORY', 'ClickHouse/ClickHouse')}" - self.task_url = f"{repo_prefix}/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" + self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" - self.repo_full_name = os.getenv('GITHUB_REPOSITORY', 'ClickHouse/ClickHouse') + self.repo_full_name = GITHUB_REPOSITORY self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" self.pr_html_url = f"{repo_prefix}/pull/{self.number}" @@ -56,7 +69,12 @@ class PRInfo: self.head_ref = github_event['pull_request']['head']['ref'] self.head_name = github_event['pull_request']['head']['repo']['full_name'] - self.labels = { l['name'] for l in github_event['pull_request']['labels'] } + if labels_from_api: + response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels") + self.labels = {l['name'] for l in response.json()} + else: + self.labels = {l['name'] for l in github_event['pull_request']['labels']} + self.user_login = github_event['pull_request']['user']['login'] self.user_orgs = set([]) if need_orgs: @@ -65,21 +83,15 @@ class PRInfo: response_json = user_orgs_response.json() self.user_orgs = set(org['id'] for org in response_json) - self.changed_files = set([]) - if need_changed_files: - diff_url = github_event['pull_request']['diff_url'] - diff = urllib.request.urlopen(diff_url) - diff_object = PatchSet(diff, diff.headers.get_charsets()[0]) - self.changed_files = { f.path for f in diff_object } - + self.diff_url = github_event['pull_request']['diff_url'] elif 'commits' in github_event: self.sha = github_event['after'] pull_request = get_pr_for_commit(self.sha, github_event['ref']) - repo_prefix = f"{os.getenv('GITHUB_SERVER_URL', 'https://github.com')}/{os.getenv('GITHUB_REPOSITORY', 'ClickHouse/ClickHouse')}" - self.task_url = f"{repo_prefix}/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" + self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" - self.repo_full_name = os.getenv('GITHUB_REPOSITORY', 'ClickHouse/ClickHouse') - if pull_request is None or pull_request['state'] == 'closed': # it's merged PR to master + self.repo_full_name = GITHUB_REPOSITORY + if pull_request is None or pull_request['state'] == 'closed': # it's merged PR to master self.number = 0 self.labels = {} self.pr_html_url = f"{repo_prefix}/commits/master" @@ -87,40 +99,58 @@ class PRInfo: self.base_name = self.repo_full_name self.head_ref = "master" self.head_name = self.repo_full_name + self.diff_url = \ + f"https://api.github.com/repos/{GITHUB_REPOSITORY}/compare/{github_event['before']}...{self.sha}" else: self.number = pull_request['number'] - self.labels = { l['name'] for l in pull_request['labels'] } + if labels_from_api: + response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels") + self.labels = {l['name'] for l in response.json()} + else: + self.labels = {l['name'] for l in pull_request['labels']} + self.base_ref = pull_request['base']['ref'] self.base_name = pull_request['base']['repo']['full_name'] self.head_ref = 
pull_request['head']['ref'] self.head_name = pull_request['head']['repo']['full_name'] self.pr_html_url = pull_request['html_url'] - - if need_changed_files: - if self.number == 0: - commit_before = github_event['before'] - response = requests.get(f"https://api.github.com/repos/{os.getenv('GITHUB_REPOSITORY')}/compare/{commit_before}...{self.sha}") - response.raise_for_status() - diff = response.json() - - if 'files' in diff: - self.changed_files = [f['filename'] for f in diff['files']] - else: - self.changed_files = set([]) + if 'pr-backport' in self.labels: + self.diff_url = f"https://github.com/{GITHUB_REPOSITORY}/compare/master...{self.head_ref}.diff" else: - if 'pr-backport' in self.labels: - diff_url = f"https://github.com/{os.getenv('GITHUB_REPOSITORY')}/compare/master...{self.head_ref}.diff" - else: - diff_url = pull_request['diff_url'] - - diff = urllib.request.urlopen(diff_url) - diff_object = PatchSet(diff, diff.headers.get_charsets()[0]) - self.changed_files = { f.path for f in diff_object } - else: - self.changed_files = set([]) + self.diff_url = pull_request['diff_url'] else: - raise Exception("Cannot detect type of event") + print(json.dumps(github_event, sort_keys=True, indent=4)) + self.sha = os.getenv("GITHUB_SHA") + self.number = 0 + self.labels = {} + repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" + self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" + self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" + self.repo_full_name = GITHUB_REPOSITORY + self.pr_html_url = f"{repo_prefix}/commits/master" + self.base_ref = "master" + self.base_name = self.repo_full_name + self.head_ref = "master" + self.head_name = self.repo_full_name + if need_changed_files: + self.fetch_changed_files() + + def fetch_changed_files(self): + if not self.diff_url: + raise Exception("Diff URL cannot be find for event") + + if 'commits' in self.event and self.number == 0: + response = requests.get(self.diff_url) + response.raise_for_status() + diff = response.json() + + if 'files' in diff: + self.changed_files = [f['filename'] for f in diff['files']] + else: + diff = urllib.request.urlopen(self.diff_url) + diff_object = PatchSet(diff, diff.headers.get_charsets()[0]) + self.changed_files = {f.path for f in diff_object} def get_dict(self): return { diff --git a/tests/ci/pvs_check.py b/tests/ci/pvs_check.py index aa4a130902b..af543211c16 100644 --- a/tests/ci/pvs_check.py +++ b/tests/ci/pvs_check.py @@ -7,8 +7,10 @@ import json import logging import sys from github import Github + +from env_helper import REPO_COPY, TEMP_PATH, GITHUB_RUN_ID, GITHUB_REPOSITORY, GITHUB_SERVER_URL from s3_helper import S3Helper -from pr_info import PRInfo, get_event +from pr_info import PRInfo from get_robot_token import get_best_robot_token, get_parameter_from_ssm from upload_result_helper import upload_results from commit_status_helper import get_commit @@ -22,6 +24,7 @@ LICENCE_NAME = 'Free license: ClickHouse, Yandex' HTML_REPORT_FOLDER = 'pvs-studio-html-report' TXT_REPORT_NAME = 'pvs-studio-task-report.txt' + def _process_txt_report(path): warnings = [] errors = [] @@ -37,15 +40,16 @@ def _process_txt_report(path): return warnings, errors + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) stopwatch = Stopwatch() - repo_path = os.path.join(os.getenv("REPO_COPY", os.path.abspath("../../"))) - temp_path = os.path.join(os.getenv("TEMP_PATH")) + repo_path = REPO_COPY + temp_path = TEMP_PATH - pr_info = PRInfo(get_event()) + pr_info = PRInfo() # this check modify 
repository so copy it to the temp directory logging.info("Repo copy path %s", repo_path) @@ -83,7 +87,8 @@ if __name__ == "__main__": logging.info("Run Ok") if retcode != 0: - commit.create_status(context=NAME, description='PVS report failed to build', state='failure', target_url=f"https://github.com/ClickHouse/ClickHouse/actions/runs/{os.getenv('GITHUB_RUN_ID')}") + commit.create_status(context=NAME, description='PVS report failed to build', state='error', + target_url=f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}") sys.exit(1) try: @@ -97,8 +102,8 @@ if __name__ == "__main__": break if not index_html: - commit.create_status(context=NAME, description='PVS report failed to build', state='failure', - target_url=f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/actions/runs/{os.getenv('GITHUB_RUN_ID')}") + commit.create_status(context=NAME, description='PVS report failed to build', state='error', + target_url=f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}") sys.exit(1) txt_report = os.path.join(temp_path, TXT_REPORT_NAME) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 3739cf88248..2085a5c1e2b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 -import os import sys import logging from github import Github -from pr_info import PRInfo, get_event + +from env_helper import GITHUB_RUN_ID, GITHUB_REPOSITORY, GITHUB_SERVER_URL +from pr_info import PRInfo from get_robot_token import get_best_robot_token from commit_status_helper import get_commit @@ -21,7 +22,7 @@ DO_NOT_TEST_LABEL = "do not test" # Individual trusted contirbutors who are not in any trusted organization. # Can be changed in runtime: we will append users that we learned to be in # a trusted org, to save GitHub API calls. -TRUSTED_CONTRIBUTORS = { +TRUSTED_CONTRIBUTORS = {e.lower() for e in [ "achimbab", "adevyatova ", # DOCSUP "Algunenano", # Raúl Marín, Tinybird @@ -34,6 +35,7 @@ TRUSTED_CONTRIBUTORS = { "bobrik", # Seasoned contributor, CloundFlare "BohuTANG", "codyrobert", # Flickerbox engineer + "cwurm", # Employee "damozhaeva", # DOCSUP "den-crane", "flickerbox-tom", # Flickerbox @@ -64,12 +66,14 @@ TRUSTED_CONTRIBUTORS = { "vdimir", # Employee "vzakaznikov", "YiuRULE", - "zlobober" # Developer of YT -} + "zlobober", # Developer of YT + "ilejn", # Arenadata, responsible for Kerberized Kafka + "thomoco", # ClickHouse +]} def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): - if pr_user_login in TRUSTED_CONTRIBUTORS: + if pr_user_login.lower() in TRUSTED_CONTRIBUTORS: logging.info("User '%s' is trusted", pr_user_login) return True @@ -87,6 +91,7 @@ def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): # can be skipped entirely. def should_run_checks_for_pr(pr_info): # Consider the labels and whether the user is trusted. 
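# A minimal, hedged sketch (not part of the patch) of the case-insensitive
# trusted-contributor check introduced above: TRUSTED_CONTRIBUTORS is now built as a
# lowercased set, so the login taken from the event payload is lowercased before the
# membership test. The names here are a small illustrative subset of the real list.
TRUSTED_CONTRIBUTORS_EXAMPLE = {e.lower() for e in ["Algunenano", "BohuTANG", "den-crane"]}

def is_trusted_login(pr_user_login: str) -> bool:
    # GitHub logins are case-insensitive, so normalize before comparing
    return pr_user_login.lower() in TRUSTED_CONTRIBUTORS_EXAMPLE

assert is_trusted_login("bohutang")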
+ print("Got labels", pr_info.labels) force_labels = set(['force tests']).intersection(pr_info.labels) if force_labels: return True, "Labeled '{}'".format(', '.join(force_labels)) @@ -102,14 +107,15 @@ def should_run_checks_for_pr(pr_info): return True, "No special conditions apply" + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - pr_info = PRInfo(get_event(), need_orgs=True) + pr_info = PRInfo(need_orgs=True, labels_from_api=True) can_run, description = should_run_checks_for_pr(pr_info) gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) - url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" if not can_run: print("::notice ::Cannot run") commit.create_status(context=NAME, description=description, state="failure", target_url=url) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 7c1ee8ad9ee..753f036a8d7 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -2,10 +2,17 @@ import hashlib import logging import os +import re +import shutil +import time from multiprocessing.dummy import Pool + import boto3 + +from env_helper import S3_TEST_REPORTS_BUCKET, S3_BUILDS_BUCKET, RUNNER_TEMP, CI from compress_files import compress_file_fast + def _md5(fname): hash_md5 = hashlib.md5() with open(fname, "rb") as f: @@ -25,7 +32,7 @@ def _flatten_list(lst): return result -class S3Helper(): +class S3Helper: def __init__(self, host): self.session = boto3.session.Session(region_name='us-east-1') self.client = self.session.client('s3', endpoint_url=host) @@ -49,9 +56,7 @@ class S3Helper(): else: logging.info("No content type provied for %s", file_path) else: - is_log = s3_path.endswith("log") or ".log." in s3_path - is_text = s3_path.endswith("txt") or is_log or s3_path.endswith("err") or s3_path.endswith("out") - if not s3_path.endswith('.gz') and (is_text or is_log): + if re.search(r'\.(txt|log|err|out)$', s3_path) or re.search(r'\.log\..*(? 
tuple: + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.get( + f"https://api.github.com/orgs/{org}/teams/{team_slug}/members", headers=headers + ) + response.raise_for_status() + data = response.json() + return tuple(m["login"] for m in data) + + +def get_members_keys(members: tuple) -> str: + class Worker(Thread): + def __init__(self, request_queue): + Thread.__init__(self) + self.queue = request_queue + self.results = [] + + def run(self): + while True: + m = self.queue.get() + if m == "": + break + response = requests.get(f"https://github.com/{m}.keys") + self.results.append(f"# {m}\n{response.text}") + self.queue.task_done() + + q = Queue() + workers = [] + for m in members: + q.put(m) + # Create workers and add to the queue + worker = Worker(q) + worker.start() + workers.append(worker) + + # Workers keep working till they receive an empty string + for _ in workers: + q.put("") + + # Join workers to wait till they finished + for worker in workers: + worker.join() + + responses = [] + for worker in workers: + responses.extend(worker.results) + return "".join(responses) + + +def get_token_from_aws() -> str: + import boto3 + + secret_name = "clickhouse_robot_token" + session = boto3.session.Session() + client = session.client( + service_name="secretsmanager", + ) + get_secret_value_response = client.get_secret_value(SecretId=secret_name) + data = json.loads(get_secret_value_response["SecretString"]) + return data["clickhouse_robot_token"] + + +def main(token: str, org: str, team_slug: str) -> str: + members = get_org_team_members(token, org, team_slug) + keys = get_members_keys(members) + + return keys + + +def handler(event, context): + token = get_token_from_aws() + result = { + "statusCode": 200, + "headers": { + "Content-Type": "text/html", + }, + "body": main(token, "ClickHouse", "core"), + } + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Get the public SSH keys for members of given org and team" + ) + parser.add_argument("--token", required=True, help="Github PAT") + parser.add_argument( + "--organization", help="GitHub organization name", default="ClickHouse" + ) + parser.add_argument("--team", help="GitHub team name", default="core") + + args = parser.parse_args() + keys = main(args.token, args.organization, args.team) + + print(f"Just shoing off the keys:\n{keys}") diff --git a/tests/ci/team_keys_lambda/requirements.txt b/tests/ci/team_keys_lambda/requirements.txt new file mode 100644 index 00000000000..f2293605cf1 --- /dev/null +++ b/tests/ci/team_keys_lambda/requirements.txt @@ -0,0 +1 @@ +requests diff --git a/tests/ci/unit_tests_check.py b/tests/ci/unit_tests_check.py index abccbcd4512..06faa5704af 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -7,9 +7,10 @@ import subprocess from github import Github +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH from s3_helper import S3Helper from get_robot_token import get_best_robot_token -from pr_info import PRInfo, get_event +from pr_info import PRInfo from build_download_helper import download_unit_tests from upload_result_helper import upload_results from docker_pull_helper import get_image_with_version @@ -94,16 +95,16 @@ if __name__ == "__main__": stopwatch = Stopwatch() - temp_path = os.getenv("TEMP_PATH", os.path.abspath(".")) - repo_path = os.getenv("REPO_COPY", os.path.abspath("../../")) - reports_path = os.getenv("REPORTS_PATH", "./reports") + temp_path = 
TEMP_PATH + repo_path = REPO_COPY + reports_path = REPORTS_PATH check_name = sys.argv[1] if not os.path.exists(temp_path): os.makedirs(temp_path) - pr_info = PRInfo(get_event()) + pr_info = PRInfo() gh = Github(get_best_robot_token()) diff --git a/tests/ci/upload_result_helper.py b/tests/ci/upload_result_helper.py index d0705372c44..5a5e8d3f36a 100644 --- a/tests/ci/upload_result_helper.py +++ b/tests/ci/upload_result_helper.py @@ -2,10 +2,12 @@ import os import logging import ast +from env_helper import GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID from report import create_test_html_report + def process_logs(s3_client, additional_logs, s3_path_prefix, test_results, with_raw_logs): - proccessed_logs = {} + processed_logs = {} # Firstly convert paths of logs from test_results to urls to s3. for test_result in test_results: if len(test_result) <= 3 or with_raw_logs: @@ -15,14 +17,14 @@ def process_logs(s3_client, additional_logs, s3_path_prefix, test_results, with_ test_log_paths = ast.literal_eval(test_result[3]) test_log_urls = [] for log_path in test_log_paths: - if log_path in proccessed_logs: - test_log_urls.append(proccessed_logs[log_path]) + if log_path in processed_logs: + test_log_urls.append(processed_logs[log_path]) elif log_path: url = s3_client.upload_test_report_to_s3( log_path, s3_path_prefix + "/" + os.path.basename(log_path)) test_log_urls.append(url) - proccessed_logs[log_path] = url + processed_logs[log_path] = url test_result[3] = test_log_urls @@ -36,18 +38,19 @@ def process_logs(s3_client, additional_logs, s3_path_prefix, test_results, with_ return additional_urls + def upload_results(s3_client, pr_number, commit_sha, test_results, additional_files, check_name, with_raw_logs=True): s3_path_prefix = f"{pr_number}/{commit_sha}/" + check_name.lower().replace(' ', '_').replace('(', '_').replace(')', '_').replace(',', '_') additional_urls = process_logs(s3_client, additional_files, s3_path_prefix, test_results, with_raw_logs) - branch_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/commits/master" + branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commits/master" branch_name = "master" if pr_number != 0: branch_name = f"PR #{pr_number}" - branch_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/pull/{pr_number}" - commit_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/commit/{commit_sha}" + branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/pull/{pr_number}" + commit_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commit/{commit_sha}" - task_url = f"{os.getenv('GITHUB_SERVER_URL')}/{os.getenv('GITHUB_REPOSITORY')}/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + task_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" if additional_urls: raw_log_url = additional_urls[0] diff --git a/tests/ci/worker/dockerhub_proxy_template.sh b/tests/ci/worker/dockerhub_proxy_template.sh new file mode 100644 index 00000000000..5ee63a05125 --- /dev/null +++ b/tests/ci/worker/dockerhub_proxy_template.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +mkdir /home/ubuntu/registrystorage + +sed -i 's/preserve_hostname: false/preserve_hostname: true/g' /etc/cloud/cloud.cfg + +docker run -d --network=host -p 5000:5000 -v /home/ubuntu/registrystorage:/var/lib/registry -e REGISTRY_HTTP_ADDR=0.0.0.0:5000 -e REGISTRY_STORAGE_DELETE_ENABLED=true -e REGISTRY_PROXY_REMOTEURL=https://registry-1.docker.io --restart=always --name registry registry:2 diff --git 
a/tests/ci/worker/init_builder.sh b/tests/ci/worker/init_builder.sh deleted file mode 100644 index 8fd00c1db0a..00000000000 --- a/tests/ci/worker/init_builder.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -set -uo pipefail - -echo "Running init script" -export DEBIAN_FRONTEND=noninteractive -export RUNNER_HOME=/home/ubuntu/actions-runner - -export RUNNER_URL="https://github.com/ClickHouse" -# Funny fact, but metadata service has fixed IP -export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id` - -while true; do - runner_pid=`pgrep run.sh` - echo "Got runner pid $runner_pid" - - cd $RUNNER_HOME - if [ -z "$runner_pid" ]; then - echo "Receiving token" - RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value` - - echo "Will try to remove runner" - sudo -u ubuntu ./config.sh remove --token $RUNNER_TOKEN ||: - - echo "Going to configure runner" - sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,builder' --work _work - - echo "Run" - sudo -u ubuntu ./run.sh & - sleep 15 - else - echo "Runner is working with pid $runner_pid, nothing to do" - sleep 10 - fi -done diff --git a/tests/ci/worker/init_func_tester.sh b/tests/ci/worker/init_func_tester.sh deleted file mode 100644 index d3ee3cb3d7f..00000000000 --- a/tests/ci/worker/init_func_tester.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -set -uo pipefail - -echo "Running init script" -export DEBIAN_FRONTEND=noninteractive -export RUNNER_HOME=/home/ubuntu/actions-runner - -export RUNNER_URL="https://github.com/ClickHouse" -# Funny fact, but metadata service has fixed IP -export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id` - -while true; do - runner_pid=`pgrep run.sh` - echo "Got runner pid $runner_pid" - - cd $RUNNER_HOME - if [ -z "$runner_pid" ]; then - echo "Receiving token" - RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value` - - echo "Will try to remove runner" - sudo -u ubuntu ./config.sh remove --token $RUNNER_TOKEN ||: - - echo "Going to configure runner" - sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,func-tester' --work _work - - echo "Run" - sudo -u ubuntu ./run.sh & - sleep 15 - else - echo "Runner is working with pid $runner_pid, nothing to do" - sleep 10 - fi -done diff --git a/tests/ci/worker/init_fuzzer_unit_tester.sh b/tests/ci/worker/init_fuzzer_unit_tester.sh deleted file mode 100644 index 2fbedba9e40..00000000000 --- a/tests/ci/worker/init_fuzzer_unit_tester.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -set -uo pipefail - -echo "Running init script" -export DEBIAN_FRONTEND=noninteractive -export RUNNER_HOME=/home/ubuntu/actions-runner - -export RUNNER_URL="https://github.com/ClickHouse" -# Funny fact, but metadata service has fixed IP -export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id` - -while true; do - runner_pid=`pgrep run.sh` - echo "Got runner pid $runner_pid" - - cd $RUNNER_HOME - if [ -z "$runner_pid" ]; then - echo "Receiving token" - RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value` - - echo "Will try to remove runner" - sudo -u ubuntu 
./config.sh remove --token $RUNNER_TOKEN ||: - - echo "Going to configure runner" - sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,fuzzer-unit-tester' --work _work - - echo "Run" - sudo -u ubuntu ./run.sh & - sleep 15 - else - echo "Runner is working with pid $runner_pid, nothing to do" - sleep 10 - fi -done diff --git a/tests/ci/worker/init_runner.sh b/tests/ci/worker/init_runner.sh new file mode 100644 index 00000000000..6838d925500 --- /dev/null +++ b/tests/ci/worker/init_runner.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -uo pipefail + +#################################### +# IMPORTANT! # +# EC2 instance should have # +# `github:runner-type` tag # +# set accordingly to a runner role # +#################################### + +echo "Running init script" +export DEBIAN_FRONTEND=noninteractive +export RUNNER_HOME=/home/ubuntu/actions-runner + +export RUNNER_URL="https://github.com/ClickHouse" +# Funny fact, but metadata service has fixed IP +INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) +export INSTANCE_ID + +# combine labels +RUNNER_TYPE=$(/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values=$INSTANCE_ID" | jq '.Tags[] | select(."Key" == "github:runner-type") | .Value' -r) +LABELS="self-hosted,Linux,$(uname -m),$RUNNER_TYPE" +export LABELS + +while true; do + runner_pid=$(pgrep run.sh) + echo "Got runner pid $runner_pid" + + cd $RUNNER_HOME || exit 1 + if [ -z "$runner_pid" ]; then + echo "Receiving token" + RUNNER_TOKEN=$(/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value) + + echo "Will try to remove runner" + sudo -u ubuntu ./config.sh remove --token "$RUNNER_TOKEN" ||: + + echo "Going to configure runner" + sudo -u ubuntu ./config.sh --url $RUNNER_URL --token "$RUNNER_TOKEN" --name "$INSTANCE_ID" --runnergroup Default --labels "$LABELS" --work _work + + echo "Run" + sudo -u ubuntu ./run.sh & + sleep 15 + else + echo "Runner is working with pid $runner_pid, nothing to do" + sleep 10 + fi +done diff --git a/tests/ci/worker/init_stress_tester.sh b/tests/ci/worker/init_stress_tester.sh deleted file mode 100644 index 234f035e1ea..00000000000 --- a/tests/ci/worker/init_stress_tester.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -set -uo pipefail - -echo "Running init script" -export DEBIAN_FRONTEND=noninteractive -export RUNNER_HOME=/home/ubuntu/actions-runner - -export RUNNER_URL="https://github.com/ClickHouse" -# Funny fact, but metadata service has fixed IP -export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id` - -while true; do - runner_pid=`pgrep run.sh` - echo "Got runner pid $runner_pid" - - cd $RUNNER_HOME - if [ -z "$runner_pid" ]; then - echo "Receiving token" - RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value` - - echo "Will try to remove runner" - sudo -u ubuntu ./config.sh remove --token $RUNNER_TOKEN ||: - - echo "Going to configure runner" - sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,stress-tester' --work _work - - echo "Run" - sudo -u ubuntu ./run.sh & - sleep 15 - else - echo "Runner is working with pid $runner_pid, nothing to do" - sleep 10 - fi -done diff --git a/tests/ci/worker/init_style_checker.sh 
b/tests/ci/worker/init_style_checker.sh deleted file mode 100644 index 77cf66b5262..00000000000 --- a/tests/ci/worker/init_style_checker.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/bash -set -euo pipefail - -echo "Running init script" -export DEBIAN_FRONTEND=noninteractive -export RUNNER_HOME=/home/ubuntu/actions-runner - -echo "Receiving token" -export RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value` -export RUNNER_URL="https://github.com/ClickHouse" -# Funny fact, but metadata service has fixed IP -export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id` - -cd $RUNNER_HOME - -echo "Going to configure runner" -sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,style-checker' --work _work - -echo "Run" -sudo -u ubuntu ./run.sh diff --git a/tests/ci/worker/ubuntu_ami_for_ci.sh b/tests/ci/worker/ubuntu_ami_for_ci.sh new file mode 100644 index 00000000000..3fabbb1f8a4 --- /dev/null +++ b/tests/ci/worker/ubuntu_ami_for_ci.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +echo "Running prepare script" +export DEBIAN_FRONTEND=noninteractive +export RUNNER_VERSION=2.285.1 +export RUNNER_HOME=/home/ubuntu/actions-runner + +deb_arch() { + case $(uname -m) in + x86_64 ) + echo amd64;; + aarch64 ) + echo arm64;; + esac +} + +runner_arch() { + case $(uname -m) in + x86_64 ) + echo x64;; + aarch64 ) + echo arm64;; + esac +} + +apt-get update + +apt-get install --yes --no-install-recommends \ + apt-transport-https \ + build-essential \ + ca-certificates \ + curl \ + gnupg \ + jq \ + lsb-release \ + pigz \ + python3-dev \ + python3-pip \ + unzip + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg + +echo "deb [arch=$(deb_arch) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null + +apt-get update + +apt-get install --yes --no-install-recommends docker-ce docker-ce-cli containerd.io + +usermod -aG docker ubuntu + +# enable ipv6 in containers (fixed-cidr-v6 is some random network mask) +cat < /etc/docker/daemon.json +{ + "ipv6": true, + "fixed-cidr-v6": "2001:db8:1::/64", + "insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"], + "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"] +} +EOT + +systemctl restart docker + +pip install boto3 pygithub requests urllib3 unidiff dohq-artifactory + +mkdir -p $RUNNER_HOME && cd $RUNNER_HOME + +RUNNER_ARCHIVE="actions-runner-linux-$(runner_arch)-$RUNNER_VERSION.tar.gz" + +curl -O -L "https://github.com/actions/runner/releases/download/v$RUNNER_VERSION/$RUNNER_ARCHIVE" + +tar xzf "./$RUNNER_ARCHIVE" +rm -f "./$RUNNER_ARCHIVE" +./bin/installdependencies.sh + +chown -R ubuntu:ubuntu $RUNNER_HOME + +cd /home/ubuntu +curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" +unzip awscliv2.zip +./aws/install + +rm -rf /home/ubuntu/awscliv2.zip /home/ubuntu/aws + +# SSH keys of core team +mkdir -p /home/ubuntu/.ssh + +# ~/.ssh/authorized_keys is cleaned out, so we use deprecated but working ~/.ssh/authorized_keys2 +aws lambda invoke --region us-east-1 --function-name team-keys-lambda /tmp/core.keys +jq < /tmp/core.keys -r '.body' > /home/ubuntu/.ssh/authorized_keys2 +chown ubuntu: 
/home/ubuntu/.ssh -R +chmod 0700 /home/ubuntu/.ssh diff --git a/tests/ci/worker/ubuntu_style_check.sh b/tests/ci/worker/ubuntu_style_check.sh deleted file mode 100644 index bf5c6057bed..00000000000 --- a/tests/ci/worker/ubuntu_style_check.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -echo "Running prepare script" -export DEBIAN_FRONTEND=noninteractive -export RUNNER_VERSION=2.283.1 -export RUNNER_HOME=/home/ubuntu/actions-runner - -apt-get update - -apt-get install --yes --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg \ - lsb-release \ - python3-pip \ - unzip - -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - -echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - -apt-get update - -apt-get install --yes --no-install-recommends docker-ce docker-ce-cli containerd.io - -usermod -aG docker ubuntu - -# enable ipv6 in containers (fixed-cidr-v6 is some random network mask) -cat < /etc/docker/daemon.json -{ - "ipv6": true, - "fixed-cidr-v6": "2001:db8:1::/64" -} -EOT - -systemctl restart docker - -pip install boto3 pygithub requests urllib3 unidiff - -mkdir -p $RUNNER_HOME && cd $RUNNER_HOME - -curl -O -L https://github.com/actions/runner/releases/download/v$RUNNER_VERSION/actions-runner-linux-x64-$RUNNER_VERSION.tar.gz - -tar xzf ./actions-runner-linux-x64-$RUNNER_VERSION.tar.gz -rm -f ./actions-runner-linux-x64-$RUNNER_VERSION.tar.gz -./bin/installdependencies.sh - -chown -R ubuntu:ubuntu $RUNNER_HOME - -cd /home/ubuntu -curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" -unzip awscliv2.zip -./aws/install - -rm -rf /home/ubuntu/awscliv2.zip /home/ubuntu/aws diff --git a/tests/ci/cancel_workflow_lambda/Dockerfile b/tests/ci/workflow_approve_rerun_lambda/Dockerfile similarity index 100% rename from tests/ci/cancel_workflow_lambda/Dockerfile rename to tests/ci/workflow_approve_rerun_lambda/Dockerfile diff --git a/tests/ci/approve_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py similarity index 77% rename from tests/ci/approve_lambda/app.py rename to tests/ci/workflow_approve_rerun_lambda/app.py index 619c80ce299..f2502f605af 100644 --- a/tests/ci/approve_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -23,10 +23,11 @@ SUSPICIOUS_PATTERNS = [ ] MAX_RETRY = 5 +MAX_WORKFLOW_RERUN = 7 WorkflowDescription = namedtuple('WorkflowDescription', - ['name', 'action', 'run_id', 'event', 'workflow_id', - 'fork_owner_login', 'fork_branch']) + ['name', 'action', 'run_id', 'event', 'workflow_id', 'conclusion', 'status', 'api_url', + 'fork_owner_login', 'fork_branch', 'rerun_url', 'jobs_url', 'attempt', 'url']) TRUSTED_WORKFLOW_IDS = { 14586616, # Cancel workflows, always trusted @@ -38,10 +39,18 @@ TRUSTED_ORG_IDS = { 54801242, # clickhouse } +NEED_RERUN_WORKFLOWS = { + 13241696, # PR + 15834118, # Docs + 15522500, # MasterCI + 15516108, # ReleaseCI + 15797242, # BackportPR +} + # Individual trusted contirbutors who are not in any trusted organization. # Can be changed in runtime: we will append users that we learned to be in # a trusted org, to save GitHub API calls. 
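# A hedged sketch of how the constants introduced above (MAX_WORKFLOW_RERUN,
# NEED_RERUN_WORKFLOWS) and the new WorkflowDescription fields (conclusion, attempt,
# workflow_id, rerun_url) combine in the rerun path added later in this file. The
# plain requests.post call is a simplified stand-in for the lambda's retrying helpers,
# and the real code additionally inspects the failed jobs' steps before rerunning.
import requests

MAX_WORKFLOW_RERUN_EXAMPLE = 7
NEED_RERUN_IDS_EXAMPLE = {13241696, 15834118, 15522500, 15516108, 15797242}

def maybe_rerun(workflow, token):
    if workflow.conclusion != 'failure':
        return False  # only failed runs are candidates for a rerun
    if workflow.workflow_id not in NEED_RERUN_IDS_EXAMPLE:
        return False  # only the whitelisted workflows are rerun automatically
    if workflow.attempt >= MAX_WORKFLOW_RERUN_EXAMPLE:
        return False  # give up after too many attempts
    headers = {"Authorization": f"token {token}",
               "Accept": "application/vnd.github.v3+json"}
    # the workflow_run event payload already carries a ready-made rerun_url
    requests.post(workflow.rerun_url, headers=headers).raise_for_status()
    return True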
-TRUSTED_CONTRIBUTORS = { +TRUSTED_CONTRIBUTORS = {e.lower() for e in [ "achimbab", "adevyatova ", # DOCSUP "Algunenano", # Raúl Marín, Tinybird @@ -53,6 +62,7 @@ TRUSTED_CONTRIBUTORS = { "bharatnc", # Newbie, but already with many contributions. "bobrik", # Seasoned contributor, CloundFlare "BohuTANG", + "cwurm", # Employee "damozhaeva", # DOCSUP "den-crane", "gyuton", # DOCSUP @@ -81,8 +91,8 @@ TRUSTED_CONTRIBUTORS = { "vdimir", # Employee "vzakaznikov", "YiuRULE", - "zlobober" # Developer of YT -} + "zlobober", # Developer of YT +]} def get_installation_id(jwt_token): @@ -119,7 +129,7 @@ def get_key_and_app_from_aws(): def is_trusted_contributor(pr_user_login, pr_user_orgs): - if pr_user_login in TRUSTED_CONTRIBUTORS: + if pr_user_login.lower() in TRUSTED_CONTRIBUTORS: print(f"User '{pr_user_login}' is trusted") return True @@ -180,6 +190,13 @@ def get_workflow_description_from_event(event): fork_branch = event['workflow_run']['head_branch'] name = event['workflow_run']['name'] workflow_id = event['workflow_run']['workflow_id'] + conclusion = event['workflow_run']['conclusion'] + attempt = event['workflow_run']['run_attempt'] + status = event['workflow_run']['status'] + jobs_url = event['workflow_run']['jobs_url'] + rerun_url = event['workflow_run']['rerun_url'] + url = event['workflow_run']['html_url'] + api_url = event['workflow_run']['url'] return WorkflowDescription( name=name, action=action, @@ -188,6 +205,13 @@ def get_workflow_description_from_event(event): fork_owner_login=fork_owner, fork_branch=fork_branch, workflow_id=workflow_id, + conclusion=conclusion, + attempt=attempt, + status=status, + jobs_url=jobs_url, + rerun_url=rerun_url, + url=url, + api_url=api_url ) def get_pr_author_and_orgs(pull_request): @@ -255,12 +279,63 @@ def get_token_from_aws(): installation_id = get_installation_id(encoded_jwt) return get_access_token(encoded_jwt, installation_id) +def get_workflow_jobs(workflow_description): + jobs_url = workflow_description.api_url + f"/attempts/{workflow_description.attempt}/jobs" + jobs = [] + i = 1 + while True: + got_jobs = _exec_get_with_retry(jobs_url + f"?page={i}") + if len(got_jobs['jobs']) == 0: + break + + jobs += got_jobs['jobs'] + i += 1 + + return jobs + +def check_need_to_rerun(workflow_description): + if workflow_description.attempt >= MAX_WORKFLOW_RERUN: + print("Not going to rerun workflow because it's already tried more than two times") + return False + print("Going to check jobs") + + jobs = get_workflow_jobs(workflow_description) + print("Got jobs", len(jobs)) + for job in jobs: + if job['conclusion'] not in ('success', 'skipped'): + print("Job", job['name'], "failed, checking steps") + for step in job['steps']: + # always the last job + if step['name'] == 'Complete job': + print("Found Complete job step for job", job['name']) + break + else: + print("Checked all steps and doesn't found Complete job, going to rerun") + return True + + return False + +def rerun_workflow(workflow_description, token): + print("Going to rerun workflow") + _exec_post_with_retry(workflow_description.rerun_url, token) + def main(event): token = get_token_from_aws() event_data = json.loads(event['body']) workflow_description = get_workflow_description_from_event(event_data) print("Got workflow description", workflow_description) + if workflow_description.action == 'completed' and workflow_description.conclusion == 'failure': + print("Workflow", workflow_description.url, "completed and failed, let's check for rerun") + + if workflow_description.workflow_id not in 
NEED_RERUN_WORKFLOWS: + print("Workflow", workflow_description.workflow_id, "not in list of rerunable workflows") + return + + if check_need_to_rerun(workflow_description): + rerun_workflow(workflow_description, token) + return + if workflow_description.action != "requested": print("Exiting, event action is", workflow_description.action) return diff --git a/tests/ci/cancel_workflow_lambda/requirements.txt b/tests/ci/workflow_approve_rerun_lambda/requirements.txt similarity index 100% rename from tests/ci/cancel_workflow_lambda/requirements.txt rename to tests/ci/workflow_approve_rerun_lambda/requirements.txt diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 8a87227519f..cb8d5914362 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -17,6 +17,8 @@ import math import http.client import urllib.parse import json +# for crc32 +import zlib from argparse import ArgumentParser from typing import Tuple, Union, Optional, Dict, Set, List @@ -57,6 +59,13 @@ MAX_RETRIES = 3 TEST_FILE_EXTENSIONS = ['.sql', '.sql.j2', '.sh', '.py', '.expect'] + +def stringhash(s): + # default hash() function consistent + # only during process invocation https://stackoverflow.com/a/42089311 + return zlib.crc32(s.encode('utf-8')) + + class HTTPError(Exception): def __init__(self, message=None, code=None): self.message = message @@ -309,6 +318,7 @@ class FailureReason(enum.Enum): EXCEPTION = "having having exception in stdout: " RESULT_DIFF = "result differs with reference: " TOO_LONG = "Test runs too long (> 60s). Make it faster." + INTERNAL_QUERY_FAIL = "Internal query (CREATE/DROP DATABASE) failed:" # SKIPPED reasons DISABLED = "disabled" @@ -346,6 +356,14 @@ class TestResult: class TestCase: + @staticmethod + def get_description_from_exception_info(exc_info): + exc_type, exc_value, tb = exc_info + exc_name = exc_type.__name__ + traceback_str = "\n".join(traceback.format_tb(tb, 10)) + description = f"\n{exc_name}\n{exc_value}\n{traceback_str}" + return description + @staticmethod def get_reference_file(suite_dir, name): """ @@ -564,7 +582,7 @@ class TestCase: database = args.testcase_database # This is for .sh tests - os.environ["CLICKHOUSE_LOG_COMMENT"] = self.case_file + os.environ["CLICKHOUSE_LOG_COMMENT"] = args.testcase_basename params = { 'client': client + ' --database=' + database, @@ -653,13 +671,21 @@ class TestCase: return result except KeyboardInterrupt as e: raise e + except HTTPError: + return TestResult(self.name, TestStatus.FAIL, + FailureReason.INTERNAL_QUERY_FAIL, + 0., + self.get_description_from_exception_info(sys.exc_info())) + except (ConnectionRefusedError, ConnectionResetError): + return TestResult(self.name, TestStatus.FAIL, + FailureReason.SERVER_DIED, + 0., + self.get_description_from_exception_info(sys.exc_info())) except: - exc_type, exc_value, tb = sys.exc_info() - exc_name = exc_type.__name__ - traceback_str = "\n".join(traceback.format_tb(tb, 10)) - description = f"{exc_name}\n{exc_value}\n{traceback_str}" - return TestResult(self.name, TestStatus.UNKNOWN, FailureReason.INTERNAL_ERROR, 0., description) - + return TestResult(self.name, TestStatus.UNKNOWN, + FailureReason.INTERNAL_ERROR, + 0., + self.get_description_from_exception_info(sys.exc_info())) class TestSuite: @staticmethod @@ -756,7 +782,15 @@ class TestSuite: self.suite_tmp_path: str = suite_tmp_path self.suite: str = suite - self.all_tests: List[str] = self.get_tests_list(self.tests_in_suite_key_func) + filter_func = lambda x: True + + if args.run_by_hash_num is not None and args.run_by_hash_total is 
not None: + if args.run_by_hash_num > args.run_by_hash_total: + raise Exception(f"Incorrect run by hash, value {args.run_by_hash_num} bigger than total {args.run_by_hash_total}") + + filter_func = lambda x: stringhash(x) % args.run_by_hash_total == args.run_by_hash_num + + self.all_tests: List[str] = self.get_tests_list(self.tests_in_suite_key_func, filter_func) self.all_tags: Dict[str, Set[str]] = self.read_test_tags(self.suite_path, self.all_tests) self.sequential_tests = [] @@ -777,17 +811,17 @@ class TestSuite: return ('no-parallel' in self.all_tags[test_name]) or ('sequential' in self.all_tags[test_name]) - def get_tests_list(self, sort_key): + def get_tests_list(self, sort_key, filter_func): """ Return list of tests file names to run """ - all_tests = list(self.get_selected_tests()) + all_tests = list(self.get_selected_tests(filter_func)) all_tests = all_tests * self.args.test_runs all_tests.sort(key=sort_key) return all_tests - def get_selected_tests(self): + def get_selected_tests(self, filter_func): """ Find all files with tests, filter, render templates """ @@ -804,11 +838,13 @@ class TestSuite: continue if USE_JINJA and test_name.endswith(".gen.sql"): continue + if not filter_func(test_name): + continue test_name = self.render_test_template(j2env, self.suite_path, test_name) yield test_name @staticmethod - def readTestSuite(args, suite_dir_name: str): + def read_test_suite(args, suite_dir_name: str): def is_data_present(): return int(clickhouse_execute(args, 'EXISTS TABLE test.hits')) @@ -1192,7 +1228,7 @@ def main(args): if server_died.is_set(): break - test_suite = TestSuite.readTestSuite(args, suite) + test_suite = TestSuite.read_test_suite(args, suite) if test_suite is None: continue @@ -1325,6 +1361,9 @@ if __name__ == '__main__': parser.add_argument('--print-time', action='store_true', dest='print_time', help='Print test time') parser.add_argument('--check-zookeeper-session', action='store_true', help='Check ZooKeeper session uptime to determine if failed test should be retried') + parser.add_argument('--run-by-hash-num', type=int, help='Run tests matching crc32(test_name) % run_by_hash_total == run_by_hash_num') + parser.add_argument('--run-by-hash-total', type=int, help='Total test groups for crc32(test_name) % run_by_hash_total == run_by_hash_num') + group = parser.add_mutually_exclusive_group(required=False) group.add_argument('--zookeeper', action='store_true', default=None, dest='zookeeper', help='Run zookeeper related tests') group.add_argument('--no-zookeeper', action='store_false', default=None, dest='zookeeper', help='Do not run zookeeper related tests') diff --git a/tests/config/executable_pool_dictionary.xml b/tests/config/executable_pool_dictionary.xml index 13f34f0048e..212552a6776 100644 --- a/tests/config/executable_pool_dictionary.xml +++ b/tests/config/executable_pool_dictionary.xml @@ -61,10 +61,11 @@ - + TabSeparated while read read_data; do printf "$read_data\tvalue a\tvalue b\n"; done - + 5 + diff --git a/tests/config/test_function.xml b/tests/config/test_function.xml index 2e31c9677ec..928cbd75c78 100644 --- a/tests/config/test_function.xml +++ b/tests/config/test_function.xml @@ -11,6 +11,6 @@ TabSeparated cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y FROM table" - 0 + 0 diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 25d09a8c4c5..830b8e149f6 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ 
-10,6 +10,8 @@ from collections import defaultdict import random import json import csv +# for crc32 +import zlib MAX_RETRY = 3 @@ -26,6 +28,9 @@ MAX_TIME_SECONDS = 3600 MAX_TIME_IN_SANDBOX = 20 * 60 # 20 minutes TASK_TIMEOUT = 8 * 60 * 60 # 8 hours +def stringhash(s): + return zlib.crc32(s.encode('utf-8')) + def get_tests_to_run(pr_info): result = set([]) changed_files = pr_info['changed_files'] @@ -183,6 +188,13 @@ class ClickhouseIntegrationTestsRunner: self.start_time = time.time() self.soft_deadline_time = self.start_time + (TASK_TIMEOUT - MAX_TIME_IN_SANDBOX) + if 'run_by_hash_total' in self.params: + self.run_by_hash_total = self.params['run_by_hash_total'] + self.run_by_hash_num = self.params['run_by_hash_num'] + else: + self.run_by_hash_total = 0 + self.run_by_hash_num = 0 + def path(self): return self.result_path @@ -576,6 +588,15 @@ class ClickhouseIntegrationTestsRunner: self._install_clickhouse(build_path) logging.info("Dump iptables before run %s", subprocess.check_output("sudo iptables -L", shell=True)) all_tests = self._get_all_tests(repo_path) + + if self.run_by_hash_total != 0: + grouped_tests = self.group_test_by_file(all_tests) + all_filtered_by_hash_tests = [] + for group, tests_in_group in grouped_tests.items(): + if stringhash(group) % self.run_by_hash_total == self.run_by_hash_num: + all_filtered_by_hash_tests += tests_in_group + all_tests = all_filtered_by_hash_tests + parallel_skip_tests = self._get_parallel_tests_skip_list(repo_path) logging.info("Found %s tests first 3 %s", len(all_tests), ' '.join(all_tests[:3])) filtered_sequential_tests = list(filter(lambda test: test in all_tests, parallel_skip_tests)) @@ -609,7 +630,7 @@ class ClickhouseIntegrationTestsRunner: random.shuffle(items_to_run) for group, tests in items_to_run: - logging.info("Running test group %s countaining %s tests", group, len(tests)) + logging.info("Running test group %s containing %s tests", group, len(tests)) group_counters, group_test_times, log_paths = self.try_run_test_group(repo_path, group, tests, MAX_RETRY, NUM_WORKERS) total_tests = 0 for counter, value in group_counters.items(): diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 9461f4a81c5..4b0a9a2835b 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -8,12 +8,12 @@ from helpers.network import _NetworkManager @pytest.fixture(autouse=True, scope="session") def cleanup_environment(): try: - if int(os.environ.get("PYTEST_CLEANUP_CONTAINERS")) == 1: + if int(os.environ.get("PYTEST_CLEANUP_CONTAINERS", 0)) == 1: logging.debug(f"Cleaning all iptables rules") _NetworkManager.clean_all_user_iptables_rules() result = run_and_check(['docker ps | wc -l'], shell=True) if int(result) > 1: - if int(os.environ.get("PYTEST_CLEANUP_CONTAINERS")) != 1: + if int(os.environ.get("PYTEST_CLEANUP_CONTAINERS", 0)) != 1: logging.warning(f"Docker containters({int(result)}) are running before tests run. 
They can be left from previous pytest run and cause test failures.\n"\ "You can set env PYTEST_CLEANUP_CONTAINERS=1 or use runner with --cleanup-containers argument to enable automatic containers cleanup.") else: diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 0817cc882b4..0c513f68c32 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -285,6 +285,8 @@ class ClickHouseCluster: self.minio_redirect_ip = None self.minio_redirect_port = 8080 + self.with_azurite = False + # available when with_hdfs == True self.hdfs_host = "hdfs1" self.hdfs_ip = None @@ -725,6 +727,8 @@ class ClickHouseCluster: env_variables['MONGO_HOST'] = self.mongo_host env_variables['MONGO_EXTERNAL_PORT'] = str(self.mongo_port) env_variables['MONGO_INTERNAL_PORT'] = "27017" + env_variables['MONGO_EXTERNAL_PORT_2'] = "27018" + env_variables['MONGO_INTERNAL_PORT_2'] = "27017" self.base_cmd.extend(['--file', p.join(docker_compose_yml_dir, 'docker_compose_mongo.yml')]) self.base_mongo_cmd = ['docker-compose', '--env-file', instance.env_file, '--project-name', self.project_name, '--file', p.join(docker_compose_yml_dir, 'docker_compose_mongo.yml')] @@ -742,6 +746,13 @@ class ClickHouseCluster: '--file', p.join(docker_compose_yml_dir, 'docker_compose_minio.yml')] return self.base_minio_cmd + def setup_azurite_cmd(self, instance, env_variables, docker_compose_yml_dir): + self.with_azurite = True + self.base_cmd.extend(['--file', p.join(docker_compose_yml_dir, 'docker_compose_azurite.yml')]) + self.base_azurite_cmd = ['docker-compose', '--env-file', instance.env_file, '--project-name', self.project_name, + '--file', p.join(docker_compose_yml_dir, 'docker_compose_azurite.yml')] + return self.base_azurite_cmd + def setup_cassandra_cmd(self, instance, env_variables, docker_compose_yml_dir): self.with_cassandra = True env_variables['CASSANDRA_PORT'] = str(self.cassandra_port) @@ -773,7 +784,7 @@ class ClickHouseCluster: with_kafka=False, with_kerberized_kafka=False, with_rabbitmq=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_postgres_cluster=False, with_hdfs=False, with_kerberized_hdfs=False, with_mongo=False, with_mongo_secure=False, with_nginx=False, - with_redis=False, with_minio=False, with_cassandra=False, with_jdbc_bridge=False, + with_redis=False, with_minio=False, with_azurite=False, with_cassandra=False, with_jdbc_bridge=False, hostname=None, env_variables=None, image="clickhouse/integration-test", tag=None, stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False, external_dirs=None, tmpfs=None, zookeeper_docker_compose_path=None, minio_certs_dir=None, use_keeper=True, @@ -829,6 +840,7 @@ class ClickHouseCluster: with_mongo=with_mongo or with_mongo_secure, with_redis=with_redis, with_minio=with_minio, + with_azurite=with_azurite, with_cassandra=with_cassandra, with_jdbc_bridge=with_jdbc_bridge, server_bin_path=self.server_bin_path, @@ -932,6 +944,9 @@ class ClickHouseCluster: if with_minio and not self.with_minio: cmds.append(self.setup_minio_cmd(instance, env_variables, docker_compose_yml_dir)) + if with_azurite and not self.with_azurite: + cmds.append(self.setup_azurite_cmd(instance, env_variables, docker_compose_yml_dir)) + if minio_certs_dir is not None: if self.minio_certs_dir is None: self.minio_certs_dir = minio_certs_dir @@ -1086,15 +1101,13 @@ class ClickHouseCluster: info = self.mysql_client_container.client.api.inspect_container(self.mysql_client_container.name) 
if info['State']['Health']['Status'] == 'healthy': logging.debug("Mysql Client Container Started") - break + return time.sleep(1) - - return except Exception as ex: errors += [str(ex)] time.sleep(1) - run_and_check(['docker-compose', 'ps', '--services', '--all']) + run_and_check(['docker', 'ps', '--all']) logging.error("Can't connect to MySQL Client:{}".format(errors)) raise Exception("Cannot wait MySQL Client container") @@ -1383,6 +1396,23 @@ class ClickHouseCluster: raise Exception("Can't wait Minio to start") + def wait_azurite_to_start(self, timeout=180): + from azure.storage.blob import BlobServiceClient + connection_string = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" + time.sleep(1) + start = time.time() + while time.time() - start < timeout: + try: + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + logging.debug(blob_service_client.get_account_information()) + self.blob_service_client = blob_service_client + return + except Exception as ex: + logging.debug("Can't connect to Azurite: %s", str(ex)) + time.sleep(1) + + raise Exception("Can't wait Azurite to start") + def wait_schema_registry_to_start(self, timeout=180): sr_client = CachedSchemaRegistryClient({"url":'http://localhost:{}'.format(self.schema_registry_port)}) start = time.time() @@ -1446,6 +1476,18 @@ class ClickHouseCluster: common_opts = ['--verbose', 'up', '-d'] + images_pull_cmd = self.base_cmd + ['pull'] + # sometimes dockerhub/proxy can be flaky + for i in range(5): + try: + run_and_check(images_pull_cmd) + break + except Exception as ex: + if i == 4: + raise ex + logging.info("Got exception pulling images: %s", ex) + time.sleep(i * 3) + if self.with_zookeeper_secure and self.base_zookeeper_cmd: logging.debug('Setup ZooKeeper Secure') logging.debug(f'Creating internal ZooKeeper dirs: {self.zookeeper_dirs_to_create}') @@ -1497,7 +1539,7 @@ class ClickHouseCluster: if os.path.exists(self.mysql_dir): shutil.rmtree(self.mysql_dir) os.makedirs(self.mysql_logs_dir) - os.chmod(self.mysql_logs_dir, stat.S_IRWXO) + os.chmod(self.mysql_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_mysql_cmd + common_opts) self.up_called = True self.wait_mysql_to_start() @@ -1507,7 +1549,7 @@ class ClickHouseCluster: if os.path.exists(self.mysql8_dir): shutil.rmtree(self.mysql8_dir) os.makedirs(self.mysql8_logs_dir) - os.chmod(self.mysql8_logs_dir, stat.S_IRWXO) + os.chmod(self.mysql8_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_mysql8_cmd + common_opts) self.wait_mysql8_to_start() @@ -1516,7 +1558,7 @@ class ClickHouseCluster: if os.path.exists(self.mysql_cluster_dir): shutil.rmtree(self.mysql_cluster_dir) os.makedirs(self.mysql_cluster_logs_dir) - os.chmod(self.mysql_cluster_logs_dir, stat.S_IRWXO) + os.chmod(self.mysql_cluster_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_mysql_cluster_cmd + common_opts) self.up_called = True @@ -1527,7 +1569,7 @@ class ClickHouseCluster: if os.path.exists(self.postgres_dir): shutil.rmtree(self.postgres_dir) os.makedirs(self.postgres_logs_dir) - os.chmod(self.postgres_logs_dir, stat.S_IRWXO) + os.chmod(self.postgres_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_postgres_cmd + common_opts) self.up_called = True @@ -1536,11 +1578,11 @@ class ClickHouseCluster: if self.with_postgres_cluster and 
self.base_postgres_cluster_cmd: print('Setup Postgres') os.makedirs(self.postgres2_logs_dir) - os.chmod(self.postgres2_logs_dir, stat.S_IRWXO) + os.chmod(self.postgres2_logs_dir, stat.S_IRWXU | stat.S_IRWXO) os.makedirs(self.postgres3_logs_dir) - os.chmod(self.postgres3_logs_dir, stat.S_IRWXO) + os.chmod(self.postgres3_logs_dir, stat.S_IRWXU | stat.S_IRWXO) os.makedirs(self.postgres4_logs_dir) - os.chmod(self.postgres4_logs_dir, stat.S_IRWXO) + os.chmod(self.postgres4_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_postgres_cluster_cmd + common_opts) self.up_called = True self.wait_postgres_cluster_to_start() @@ -1561,7 +1603,7 @@ class ClickHouseCluster: if self.with_rabbitmq and self.base_rabbitmq_cmd: logging.debug('Setup RabbitMQ') os.makedirs(self.rabbitmq_logs_dir) - os.chmod(self.rabbitmq_logs_dir, stat.S_IRWXO) + os.chmod(self.rabbitmq_logs_dir, stat.S_IRWXU | stat.S_IRWXO) for i in range(5): subprocess_check_call(self.base_rabbitmq_cmd + common_opts + ['--renew-anon-volumes']) @@ -1574,7 +1616,7 @@ class ClickHouseCluster: if self.with_hdfs and self.base_hdfs_cmd: logging.debug('Setup HDFS') os.makedirs(self.hdfs_logs_dir) - os.chmod(self.hdfs_logs_dir, stat.S_IRWXO) + os.chmod(self.hdfs_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_hdfs_cmd + common_opts) self.up_called = True self.make_hdfs_api() @@ -1583,7 +1625,7 @@ class ClickHouseCluster: if self.with_kerberized_hdfs and self.base_kerberized_hdfs_cmd: logging.debug('Setup kerberized HDFS') os.makedirs(self.hdfs_kerberized_logs_dir) - os.chmod(self.hdfs_kerberized_logs_dir, stat.S_IRWXO) + os.chmod(self.hdfs_kerberized_logs_dir, stat.S_IRWXU | stat.S_IRWXO) run_and_check(self.base_kerberized_hdfs_cmd + common_opts) self.up_called = True self.make_hdfs_api(kerberized=True) @@ -1624,6 +1666,14 @@ class ClickHouseCluster: logging.info("Trying to connect to Minio...") self.wait_minio_to_start(secure=self.minio_certs_dir is not None) + if self.with_azurite and self.base_azurite_cmd: + azurite_start_cmd = self.base_azurite_cmd + common_opts + logging.info("Trying to create Azurite instance by command %s", ' '.join(map(str, azurite_start_cmd))) + run_and_check(azurite_start_cmd) + self.up_called = True + logging.info("Trying to connect to Azurite") + self.wait_azurite_to_start() + if self.with_cassandra and self.base_cassandra_cmd: subprocess_check_call(self.base_cassandra_cmd + ['up', '-d']) self.up_called = True @@ -1631,7 +1681,7 @@ class ClickHouseCluster: if self.with_jdbc_bridge and self.base_jdbc_bridge_cmd: os.makedirs(self.jdbc_driver_logs_dir) - os.chmod(self.jdbc_driver_logs_dir, stat.S_IRWXO) + os.chmod(self.jdbc_driver_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_jdbc_bridge_cmd + ['up', '-d']) self.up_called = True @@ -1843,7 +1893,7 @@ class ClickHouseInstance: self, cluster, base_path, name, base_config_dir, custom_main_configs, custom_user_configs, custom_dictionaries, macros, with_zookeeper, zookeeper_config_path, with_mysql_client, with_mysql, with_mysql8, with_mysql_cluster, with_kafka, with_kerberized_kafka, - with_rabbitmq, with_nginx, with_kerberized_hdfs, with_mongo, with_redis, with_minio, with_jdbc_bridge, + with_rabbitmq, with_nginx, with_kerberized_hdfs, with_mongo, with_redis, with_minio, with_azurite, with_jdbc_bridge, with_cassandra, server_bin_path, odbc_bridge_bin_path, library_bridge_bin_path, clickhouse_path_dir, with_odbc_drivers, with_postgres, with_postgres_cluster, clickhouse_start_command=CLICKHOUSE_START_COMMAND, 
main_config_name="config.xml", users_config_name="users.xml", copy_common_configs=True, @@ -1887,6 +1937,7 @@ class ClickHouseInstance: self.with_mongo = with_mongo self.with_redis = with_redis self.with_minio = with_minio + self.with_azurite = with_azurite self.with_cassandra = with_cassandra self.with_jdbc_bridge = with_jdbc_bridge @@ -2004,7 +2055,8 @@ class ClickHouseInstance: user=user, password=password, database=database) # Connects to the instance via HTTP interface, sends a query and returns the answer - def http_query(self, sql, data=None, params=None, user=None, password=None, expect_fail_and_get_error=False): + def http_query(self, sql, data=None, params=None, user=None, password=None, expect_fail_and_get_error=False, + port=8123, timeout=None, retry_strategy=None): logging.debug(f"Executing query {sql} on {self.name} via HTTP interface") if params is None: params = {} @@ -2018,12 +2070,19 @@ class ClickHouseInstance: auth = requests.auth.HTTPBasicAuth(user, password) elif user: auth = requests.auth.HTTPBasicAuth(user, '') - url = "http://" + self.ip_address + ":8123/?" + urllib.parse.urlencode(params) + url = f"http://{self.ip_address}:{port}/?" + urllib.parse.urlencode(params) - if data: - r = requests.post(url, data, auth=auth) + if retry_strategy is None: + requester = requests else: - r = requests.get(url, auth=auth) + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + requester = requests.Session() + requester.mount("https://", adapter) + requester.mount("http://", adapter) + if data: + r = requester.post(url, data, auth=auth, timeout=timeout) + else: + r = requester.get(url, auth=auth, timeout=timeout) def http_code_and_message(): code = r.status_code @@ -2107,7 +2166,7 @@ class ClickHouseInstance: except Exception as e: logging.warning(f"Current start attempt failed. Will kill {pid} just in case.") self.exec_in_container(["bash", "-c", f"kill -9 {pid}"], user='root', nothrow=True) - time.sleep(time_to_sleep) + time.sleep(time_to_sleep) raise Exception("Cannot start ClickHouse, see additional info in logs") @@ -2115,7 +2174,7 @@ class ClickHouseInstance: def wait_start(self, start_wait_sec): start_time = time.time() last_err = None - while time.time() <= start_time + start_wait_sec: + while True: try: pid = self.get_process_pid("clickhouse") if pid is None: @@ -2129,6 +2188,8 @@ class ClickHouseInstance: logging.warning(f"ERROR {err}") else: raise Exception("ClickHouse server is not running. Check logs.") + if time.time() > start_time + start_wait_sec: + break logging.error(f"No time left to start. But process is still running. Will dump threads.") ps_clickhouse = self.exec_in_container(["bash", "-c", "ps -C clickhouse"], nothrow=True, user='root') logging.info(f"PS RESULT:\n{ps_clickhouse}") @@ -2550,6 +2611,9 @@ class ClickHouseInstance: if self.with_minio: depends_on.append("minio1") + if self.with_azurite: + depends_on.append("azurite1") + self.cluster.env_variables.update(self.env_variables) odbc_ini_path = "" diff --git a/tests/integration/helpers/utility.py b/tests/integration/helpers/utility.py new file mode 100644 index 00000000000..69dfa53cd3e --- /dev/null +++ b/tests/integration/helpers/utility.py @@ -0,0 +1,43 @@ +import string +import random +import threading + + +# By default the exceptions that was throwed in threads will be ignored +# (they will not mark the test as failed, only printed to stderr). 
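# A brief, hedged usage sketch for the SafeThread wrapper defined just below: join()
# re-raises whatever the target callable raised, so a failure in a background thread
# fails the test instead of only being printed to stderr. Assumes the new
# helpers/utility.py module from this patch is importable.
from helpers.utility import SafeThread

def writer():
    raise RuntimeError("insert failed")

thread = SafeThread(target=writer)
thread.start()
thread.join()  # re-raises RuntimeError("insert failed") instead of swallowing it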
+# Wrap thrading.Thread and re-throw exception on join() +class SafeThread(threading.Thread): + def __init__(self, target): + super().__init__() + self.target = target + self.exception = None + def run(self): + try: + self.target() + except Exception as e: # pylint: disable=broad-except + self.exception = e + def join(self, timeout=None): + super().join(timeout) + if self.exception: + raise self.exception + + +def random_string(length): + letters = string.ascii_letters + return ''.join(random.choice(letters) for i in range(length)) + + +def generate_values(date_str, count, sign=1): + data = [[date_str, sign * (i + 1), random_string(10)] for i in range(count)] + data.sort(key=lambda tup: tup[1]) + return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) + + +def replace_config(config_path, old, new): + config = open(config_path, 'r') + config_lines = config.readlines() + config.close() + config_lines = [line.replace(old, new) for line in config_lines] + config = open(config_path, 'w') + config.writelines(config_lines) + config.close() diff --git a/tests/integration/pytest.ini b/tests/integration/pytest.ini index 4593fc8c4d8..2a57ea5a229 100644 --- a/tests/integration/pytest.ini +++ b/tests/integration/pytest.ini @@ -1,7 +1,7 @@ [pytest] python_files = test*.py norecursedirs = _instances* -timeout = 1800 +timeout = 900 junit_duration_report = call junit_suite_name = integration log_level = DEBUG @@ -15,3 +15,7 @@ log_file = pytest.log log_file_level = DEBUG log_file_format = %(asctime)s [ %(process)d ] %(levelname)s : %(message)s (%(filename)s:%(lineno)s, %(funcName)s) log_file_date_format = %Y-%m-%d %H:%M:%S +markers = + long_run: marks tests which run for a long time +addopts = + -m 'not long_run' diff --git a/tests/integration/test_async_drain_connection/test.py b/tests/integration/test_async_drain_connection/test.py index 21f9b142e7a..40d78ebbe7c 100644 --- a/tests/integration/test_async_drain_connection/test.py +++ b/tests/integration/test_async_drain_connection/test.py @@ -1,21 +1,21 @@ -import os -import sys -import time -from multiprocessing.dummy import Pool +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + import pytest from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) -node = cluster.add_instance("node", main_configs=["configs/config.xml"]) +node = cluster.add_instance('node', main_configs=['configs/config.xml']) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def started_cluster(): try: cluster.start() - node.query( - 'create table t (number UInt64) engine = Distributed(test_cluster_two_shards, system, numbers);' - ) + node.query(""" + create table t (number UInt64) + engine = Distributed(test_cluster_two_shards, system, numbers) + """) yield cluster finally: @@ -23,14 +23,14 @@ def started_cluster(): def test_filled_async_drain_connection_pool(started_cluster): - busy_pool = Pool(10) - - def execute_query(i): + def execute_queries(_): for _ in range(100): - node.query('select * from t where number = 0 limit 2;', - settings={ - "sleep_in_receive_cancel_ms": 10000000, - "max_execution_time": 5 - }) + node.query('select * from t where number = 0 limit 2', settings={ + 'sleep_in_receive_cancel_ms': int(10e6), + 'max_execution_time': 5, + # decrease drain_timeout to make test more stable + # (another way is to increase max_execution_time, but this will make test slower) + 'drain_timeout': 1, + }) - p = busy_pool.map(execute_query, range(10)) + any(map(execute_queries, range(10))) diff --git 
a/tests/integration/test_azure_blob_storage_zero_copy_replication/__init__.py b/tests/integration/test_azure_blob_storage_zero_copy_replication/__init__.py new file mode 100644 index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml b/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml
new file mode 100644
index 00000000000..4235083f5ca
--- /dev/null
+++ b/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml
@@ -0,0 +1,46 @@
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <blob_storage_disk>
+                <type>azure_blob_storage</type>
+                <storage_account_url>http://azurite1:10000/devstoreaccount1</storage_account_url>
+                <container_name>cont</container_name>
+                <skip_access_check>false</skip_access_check>
+                <!-- default credentials for the Azurite storage account -->
+                <account_name>devstoreaccount1</account_name>
+                <account_key>Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==</account_key>
+            </blob_storage_disk>
+        </disks>
+        <policies>
+            <blob_storage_policy>
+                <volumes>
+                    <main>
+                        <disk>blob_storage_disk</disk>
+                    </main>
+                </volumes>
+            </blob_storage_policy>
+        </policies>
+    </storage_configuration>
+
+    <remote_servers>
+        <test_cluster>
+            <shard>
+                <replica>
+                    <host>node1</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+
+            <shard>
+                <replica>
+                    <host>node2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </test_cluster>
+    </remote_servers>
+
+    <macros>
+        <cluster>test_cluster</cluster>
+    </macros>
+</clickhouse>
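The storage configuration above points blob_storage_disk at a local Azurite emulator rather than a real Azure account, using Azurite's well-known devstoreaccount1 development credentials. For local debugging of the test below, the same container can be inspected with the standard azure-storage-blob v12 client. A minimal sketch, not part of the patch: the connection string is assembled from the values above, count_large_blobs is a hypothetical helper mirroring get_large_objects_count() from the test, and the azurite1 host name resolves only inside the docker-compose network of the integration-test cluster (the patch itself exposes a ready-made cluster.blob_service_client instead).

from azure.storage.blob import BlobServiceClient

# Azurite's default development-storage credentials, matching the account_name/account_key
# values in storage_conf.xml above (assumed connection-string form for the emulator).
AZURITE_CONNECTION_STRING = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;"
    "BlobEndpoint=http://azurite1:10000/devstoreaccount1;"
)


def count_large_blobs(container_name="cont", threshold=100):
    # Hypothetical helper: count blobs larger than `threshold` bytes.  With zero-copy
    # replication a data part written by one replica is reused by the other, so the
    # test below expects such parts to appear in the container only once.
    service = BlobServiceClient.from_connection_string(AZURITE_CONNECTION_STRING)
    container = service.get_container_client(container_name)
    return sum(1 for blob in container.list_blobs() if blob.size > threshold)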
diff --git a/tests/integration/test_azure_blob_storage_zero_copy_replication/test.py b/tests/integration/test_azure_blob_storage_zero_copy_replication/test.py new file mode 100644 index 00000000000..08fb6e53e7b --- /dev/null +++ b/tests/integration/test_azure_blob_storage_zero_copy_replication/test.py @@ -0,0 +1,83 @@ +import logging +import pytest +from helpers.cluster import ClickHouseCluster + + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + +NODE1 = "node1" +NODE2 = "node2" +TABLE_NAME = "blob_storage_table" +CONTAINER_NAME = "cont" +CLUSTER_NAME = "test_cluster" + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance(NODE1, main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '1'}, + with_azurite=True, + with_zookeeper=True) + cluster.add_instance(NODE2, main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '2'}, + with_azurite=True, + with_zookeeper=True) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def create_table(node, table_name, replica, **additional_settings): + settings = { + "storage_policy": "blob_storage_policy", + "old_parts_lifetime": 1, + } + settings.update(additional_settings) + + create_table_statement = f""" + CREATE TABLE {table_name} ON CLUSTER {CLUSTER_NAME} ( + id Int64, + data String + ) ENGINE=ReplicatedMergeTree('/clickhouse/tables/{table_name}', '{{replica}}') + ORDER BY id + SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))}""" + + node.query(f"DROP TABLE IF EXISTS {table_name}") + node.query(create_table_statement) + assert node.query(f"SELECT COUNT(*) FROM {table_name} FORMAT Values") == "(0)" + + +def get_large_objects_count(blob_container_client, large_size_threshold=100): + return sum(blob['size'] > large_size_threshold for blob in blob_container_client.list_blobs()) + + +def test_zero_copy_replication(cluster): + node1 = cluster.instances[NODE1] + node2 = cluster.instances[NODE2] + create_table(node1, TABLE_NAME, 1) + + blob_container_client = cluster.blob_service_client.get_container_client(CONTAINER_NAME) + + values1 = "(0,'data'),(1,'data')" + values2 = "(2,'data'),(3,'data')" + + node1.query(f"INSERT INTO {TABLE_NAME} VALUES {values1}") + node2.query(f"SYSTEM SYNC REPLICA {TABLE_NAME}") + assert node1.query(f"SELECT * FROM {TABLE_NAME} order by id FORMAT Values") == values1 + assert node2.query(f"SELECT * FROM {TABLE_NAME} order by id FORMAT Values") == values1 + + # Based on version 21.x - should be only one file with size 100+ (checksums.txt), used by both nodes + assert get_large_objects_count(blob_container_client) == 1 + + node2.query(f"INSERT INTO {TABLE_NAME} VALUES {values2}") + node1.query(f"SYSTEM SYNC REPLICA {TABLE_NAME}") + + assert node2.query(f"SELECT * FROM {TABLE_NAME} order by id FORMAT Values") == values1 + "," + values2 + assert node1.query(f"SELECT * FROM {TABLE_NAME} order by id FORMAT Values") == values1 + "," + values2 + + assert get_large_objects_count(blob_container_client) == 2 diff --git a/tests/integration/test_cluster_discovery/__init__.py b/tests/integration/test_cluster_discovery/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_cluster_discovery/config/config.xml b/tests/integration/test_cluster_discovery/config/config.xml new file mode 100644 index 00000000000..70cb010fe0e --- /dev/null +++ 
b/tests/integration/test_cluster_discovery/config/config.xml @@ -0,0 +1,23 @@ + + 1 + + + + /clickhouse/discovery/test_auto_cluster + + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_cluster_discovery/config/config_shard1.xml b/tests/integration/test_cluster_discovery/config/config_shard1.xml new file mode 100644 index 00000000000..06a77a37263 --- /dev/null +++ b/tests/integration/test_cluster_discovery/config/config_shard1.xml @@ -0,0 +1,24 @@ + + 1 + + + + /clickhouse/discovery/test_auto_cluster + 1 + + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_cluster_discovery/config/config_shard3.xml b/tests/integration/test_cluster_discovery/config/config_shard3.xml new file mode 100644 index 00000000000..ab66fdc2ab7 --- /dev/null +++ b/tests/integration/test_cluster_discovery/config/config_shard3.xml @@ -0,0 +1,24 @@ + + 1 + + + + /clickhouse/discovery/test_auto_cluster + 3 + + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_cluster_discovery/test.py b/tests/integration/test_cluster_discovery/test.py new file mode 100644 index 00000000000..acddd855040 --- /dev/null +++ b/tests/integration/test_cluster_discovery/test.py @@ -0,0 +1,81 @@ +import pytest + +import functools +import time + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +shard_configs = { + i: f'config/config_shard{i}.xml' + for i in [1, 3] +} + +nodes = [ + cluster.add_instance( + f'node{i}', + main_configs=[shard_configs.get(i, 'config/config.xml')], + stay_alive=True, + with_zookeeper=True + ) for i in range(5) +] + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def check_on_cluster(nodes, expected, *, what, cluster_name='test_auto_cluster', msg=None, retries=5): + """ + Select data from `system.clusters` on specified nodes and check the result + """ + assert 1 <= retries <= 6 + + for retry in range(1, retries + 1): + nodes_res = { + node.name: int(node.query(f"SELECT {what} FROM system.clusters WHERE cluster = '{cluster_name}'")) + for node in nodes + } + if all(actual == expected for actual in nodes_res.values()): + break + + if retry != retries: + time.sleep(2 ** retry) + else: + msg = msg or f"Wrong '{what}' result" + raise Exception(f'{msg}: {nodes_res}, expected: {expected} (after {retries} retries)') + + +def test_cluster_discovery_startup_and_stop(start_cluster): + """ + Start cluster, check nodes count in system.clusters, + then stop/start some nodes and check that it (dis)appeared in cluster. 
+ """ + + check_nodes_count = functools.partial(check_on_cluster, what='count()', msg='Wrong nodes count in cluster') + check_shard_num = functools.partial(check_on_cluster, what='count(DISTINCT shard_num)', msg='Wrong shard_num count in cluster') + + total_shards = len(shard_configs) + 1 + check_nodes_count([nodes[0], nodes[2]], len(nodes)) + check_shard_num([nodes[0], nodes[2]], total_shards) + + nodes[1].stop_clickhouse(kill=True) + check_nodes_count([nodes[0], nodes[2]], len(nodes) - 1) + check_shard_num([nodes[0], nodes[2]], total_shards - 1) + + nodes[3].stop_clickhouse() + check_nodes_count([nodes[0], nodes[2]], len(nodes) - 2) + + nodes[1].start_clickhouse() + check_nodes_count([nodes[0], nodes[2]], len(nodes) - 1) + + nodes[3].start_clickhouse() + check_nodes_count([nodes[0], nodes[2]], len(nodes)) + + check_nodes_count([nodes[1], nodes[2]], 2, cluster_name='two_shards', retries=1) diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/__init__.py b/tests/integration/test_concurrent_queries_restriction_by_query_kind/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml new file mode 100644 index 00000000000..7753c579902 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml @@ -0,0 +1,3 @@ + + 2 + diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml new file mode 100644 index 00000000000..c8f081e6804 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml @@ -0,0 +1,3 @@ + + 2 + diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py b/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py new file mode 100644 index 00000000000..2d16d9157f6 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py @@ -0,0 +1,77 @@ +import time +from multiprocessing.dummy import Pool + +import pytest +from helpers.cluster import ClickHouseCluster + + +cluster = ClickHouseCluster(__file__) +node_insert = cluster.add_instance('node_insert', main_configs=['configs/concurrent_insert_restriction.xml']) +node_select = cluster.add_instance('node_select', main_configs=['configs/concurrent_select_restriction.xml']) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + node_select.query("create table test_concurrent_insert (x UInt64) ENGINE = MergeTree() order by tuple()") + node_insert.query("create table test_concurrent_insert (x UInt64) ENGINE = MergeTree() order by tuple()") + yield cluster + finally: + cluster.shutdown() + + +def execute_with_background(node, sql, background_sql, background_times, wait_times=3): + r = None + for _ in range(wait_times): + r = node.query('show processlist', stdin='') + if not r.strip(): + break + time.sleep(1) + else: + assert False, "there are unknown background queries: {}".format(r) + for _ in range(background_times): + node.get_query_request(background_sql, stdin='') + time.sleep(0.5) # wait background to start. 
+ return node.query(sql, stdin='') + + +def common_pattern(node, query_kind, restricted_sql, normal_sql, limit, wait_times): + # restriction is working + with pytest.raises(Exception, match=r".*Too many simultaneous {} queries.*".format(query_kind)): + execute_with_background(node, restricted_sql, restricted_sql, limit, wait_times) + + # different query kind is independent + execute_with_background(node, normal_sql, restricted_sql, limit, wait_times) + + # normal + execute_with_background(node, restricted_sql, '', 0, wait_times) + + +def test_select(started_cluster): + common_pattern( + node_select, 'select', + 'select sleep(3)', + 'insert into test_concurrent_insert values (0)', + 2, + 10 + ) + + # subquery is not counted + execute_with_background( + node_select, + 'select sleep(3)', + 'insert into test_concurrent_insert select sleep(3)', + 2, + 10 + ) + + +def test_insert(started_cluster): + common_pattern( + node_insert, 'insert', + 'insert into test_concurrent_insert select sleep(3)', + 'select 1', + 2, + 10 + ) diff --git a/tests/integration/test_executable_dictionary/__init__.py b/tests/integration/test_executable_dictionary/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_executable_dictionary/config/dictionaries_config.xml b/tests/integration/test_executable_dictionary/config/dictionaries_config.xml new file mode 100644 index 00000000000..3cbf717bb67 --- /dev/null +++ b/tests/integration/test_executable_dictionary/config/dictionaries_config.xml @@ -0,0 +1,2 @@ + + diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml new file mode 100644 index 00000000000..ddbb8e95abb --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_argument_python + + + TabSeparated + input_argument.py 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_argument_pool_python + + + TabSeparated + input_argument.py 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_argument_python + + + TabSeparated + input_implicit_argument.py 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_argument_pool_python + + + TabSeparated + input_implicit_argument.py 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml new file mode 100644 index 00000000000..488a12de115 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_bash + + + TabSeparated + input.sh + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_pool_bash + + + TabSeparated + input.sh + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_bash + + + TabSeparated + input_implicit.sh + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_pool_bash + + + TabSeparated + input_implicit.sh + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git 
a/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml new file mode 100644 index 00000000000..5b551e51951 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_python + + + TabSeparated + input.py + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_pool_python + + + TabSeparated + input.py + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_python + + + TabSeparated + input_implicit.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_pool_python + + + TabSeparated + input_implicit.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml new file mode 100644 index 00000000000..816cb0db2c5 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_send_chunk_header_python + + + TabSeparated + input_chunk_header.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_send_chunk_header_pool_python + + + TabSeparated + input_chunk_header.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_send_chunk_header_python + + + TabSeparated + input_implicit_chunk_header.py + 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_send_chunk_header_pool_python + + + TabSeparated + input_implicit_chunk_header.py + 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml new file mode 100644 index 00000000000..71f8873b20e --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_signalled_python + + + TabSeparated + input_signalled.py + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_input_signalled_pool_python + + + TabSeparated + input_signalled.py + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_implicit_input_signalled_python + + + TabSeparated + input_implicit_signalled.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_implicit_input_signalled_pool_python + + + TabSeparated + input_implicit_signalled.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml new file mode 100644 index 00000000000..dee161a9b78 --- /dev/null +++ 
b/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_slow_python + + + TabSeparated + input_slow.py + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_input_slow_pool_python + + + TabSeparated + input_slow.py + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_slow_python + + + TabSeparated + input_implicit_slow.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_slow_pool_python + + + TabSeparated + input_implicit_slow.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml new file mode 100644 index 00000000000..3f63e7b8671 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml @@ -0,0 +1,128 @@ + + + executable_input_sum_python + + + TabSeparated + input_sum.py + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + executable_input_sum_pool_python + + + TabSeparated + input_sum.py + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + + executable_implicit_input_sum_python + + + TabSeparated + input_implicit_sum.py + 1 + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + executable_implicit_input_sum_pool_python + + + TabSeparated + input_implicit_sum.py + 1 + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml new file mode 100644 index 00000000000..3f77dae1ac6 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml @@ -0,0 +1,95 @@ + + + executable_input_non_direct_bash + + + TabSeparated + while read read_data; do printf "$read_data\tKey $read_data\n"; done + + + + + + + + input + + + result + String + + + + + + + executable_input_non_direct_pool_bash + + + TabSeparated + while read read_data; do printf "$read_data\tKey $read_data\n"; done + + + + + + + + input + + + result + String + + + + + + + executable_input_implicit_non_direct_bash + + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_implicit_non_direct_pool_bash + + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml new file mode 100644 index 00000000000..3173eb5500d --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml @@ -0,0 +1,54 @@ + + + executable_source_simple_key_argument_python + + + 
TabSeparated + source_argument.py 1 + 1 + + + + + + 0 + + + input + + + result + String + + + + + + + executable_source_complex_key_argument_python + + + TabSeparated + source_argument.py 1 + 1 + + + + + + 0 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml new file mode 100644 index 00000000000..a2036fc67bb --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml @@ -0,0 +1,54 @@ + + + executable_source_simple_key_python + + + TabSeparated + source.py + 1 + + + + + + 0 + + + input + + + result + String + + + + + + + executable_source_complex_key_python + + + TabSeparated + source.py + 1 + + + + + + 0 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml new file mode 100644 index 00000000000..10d1b1ca0c6 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml @@ -0,0 +1,56 @@ + + + executable_source_simple_key_update_python + + + TabSeparated + source_update.py + 1 + 1 + + + + + + 5 + + + input + + + result + String + + + + + + + executable_source_complex_key_update_python + + + TabSeparated + source_update.py + 1 + 1 + + + + + + 5 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/test.py b/tests/integration/test_executable_dictionary/test.py new file mode 100644 index 00000000000..5e50a092a29 --- /dev/null +++ b/tests/integration/test_executable_dictionary/test.py @@ -0,0 +1,175 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', stay_alive=True, main_configs=[]) + + +def skip_test_msan(instance): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with vfork") + +def copy_file_to_container(local_path, dist_path, container_id): + os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path)) + +config = ''' + /etc/clickhouse-server/dictionaries/*_dictionary.xml +''' + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node.replace_config("/etc/clickhouse-server/config.d/dictionaries_config.xml", config) + + copy_file_to_container(os.path.join(SCRIPT_DIR, 'dictionaries/.'), '/etc/clickhouse-server/dictionaries', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + + node.restart_clickhouse() + + yield cluster + + finally: + cluster.shutdown() + +def test_executable_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_pool_bash', 'result', toUInt64(1))") 
== 'Key 1\n' + +def test_executable_implicit_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_input_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_send_chunk_header_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_send_chunk_header_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_send_chunk_header_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_input_send_chunk_header_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_send_chunk_header_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_send_chunk_header_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_sum_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + assert node.query("SELECT dictGet('executable_input_sum_pool_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + +def test_executable_implicit_input_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_sum_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + assert node.query("SELECT dictGet('executable_implicit_input_sum_pool_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + +def test_executable_input_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_argument_python', 'result', toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT dictGet('executable_input_argument_pool_python', 'result', toUInt64(1))") == 'Key 1 1\n' + +def test_executable_implicit_input_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_argument_python', 'result', toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_argument_pool_python', 'result', toUInt64(1))") == 'Key 1 1\n' + +def test_executable_input_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_signalled_python', 'result', toUInt64(1))") == 'Default result\n' + assert node.query("SELECT dictGet('executable_input_signalled_pool_python', 'result', toUInt64(1))") == 'Default result\n' + +def test_executable_implicit_input_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_signalled_python', 'result', toUInt64(1))") == 'Default result\n' + assert node.query("SELECT 
dictGet('executable_implicit_input_signalled_pool_python', 'result', toUInt64(1))") == 'Default result\n' + +def test_executable_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_implicit_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_implicit_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_non_direct_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_non_direct_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_non_direct_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_non_direct_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_implicit_non_direct_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_implicit_non_direct_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_source_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_python) ORDER BY input") == '1\tValue 1\n2\tValue 2\n3\tValue 3\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(1))") == 'Value 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(2))") == 'Value 2\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(3))") == 'Value 3\n' + + assert node.query("SELECT * FROM dictionary('executable_source_complex_key_python') ORDER BY input") == '1\tValue 1\n2\tValue 2\n3\tValue 3\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(1)))") == 'Value 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(2)))") == 'Value 2\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(3)))") == 'Value 3\n' + +def test_executable_source_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_argument_python) ORDER BY input") == '1\tValue 1 1\n2\tValue 1 2\n3\tValue 1 3\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(1))") == 'Value 1 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 
'result', toUInt64(2))") == 'Value 1 2\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(3))") == 'Value 1 3\n' + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_argument_python) ORDER BY input") == '1\tValue 1 1\n2\tValue 1 2\n3\tValue 1 3\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(1))") == 'Value 1 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(2))") == 'Value 1 2\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(3))") == 'Value 1 3\n' + +def test_executable_source_updated_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_update_python) ORDER BY input") == '1\tValue 0 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_update_python', 'result', toUInt64(1))") == 'Value 0 1\n' + + time.sleep(10) + + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_update_python) ORDER BY input") == '1\tValue 1 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_update_python', 'result', toUInt64(1))") == 'Value 1 1\n' + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_update_python) ORDER BY input") == '1\tValue 0 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_update_python', 'result', toUInt64(1))") == 'Value 0 1\n' + + time.sleep(10) + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_update_python) ORDER BY input") == '1\tValue 1 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_update_python', 'result', toUInt64(1))") == 'Value 1 1\n' + diff --git a/tests/integration/test_executable_dictionary/user_scripts/input.py b/tests/integration/test_executable_dictionary/user_scripts/input.py new file mode 100755 index 00000000000..e711dd8e306 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input.py @@ -0,0 +1,11 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for line in sys.stdin: + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input.sh b/tests/integration/test_executable_dictionary/user_scripts/input.sh new file mode 100755 index 00000000000..7712c392951 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "$read_data\tKey $read_data\n"; +done diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_argument.py b/tests/integration/test_executable_dictionary/user_scripts/input_argument.py new file mode 100755 index 00000000000..163f9c4183f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_argument.py @@ -0,0 +1,11 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + str(arg) + " " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py b/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py new 
file mode 100755 index 00000000000..4eb00f64eb3 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + updated_line = line.replace('\n', '') + chunk_length -= 1 + print(updated_line + '\t' + "Key " + updated_line, end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh new file mode 100755 index 00000000000..aea51b82b1f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key $read_data\n"; +done diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py new file mode 100755 index 00000000000..27c8bc4840e --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.signal(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py new file mode 100755 index 00000000000..648a9eac918 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal 
+import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py b/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py new file mode 100755 index 00000000000..a3a99f1e71e --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.signal(os.getpid(), signal.SIGTERM) + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_slow.py b/tests/integration/test_executable_dictionary/user_scripts/input_slow.py new file mode 100755 index 00000000000..a3b8c484b29 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_slow.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_sum.py b/tests/integration/test_executable_dictionary/user_scripts/input_sum.py new file mode 100755 index 00000000000..e9ec5028701 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_sum.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + updated_line = line.replace('\n', '') + line_split = re.split(r'\t+', line) + sum = int(line_split[0]) + int(line_split[1]) + print(updated_line + '\t' + str(sum), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source.py b/tests/integration/test_executable_dictionary/user_scripts/source.py new file mode 100755 index 00000000000..e105773c467 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + print('1' + '\t' + 'Value 1', end='\n') + print('2' + '\t' + 'Value 2', end='\n') + print('3' + '\t' + 'Value 3', end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source_argument.py b/tests/integration/test_executable_dictionary/user_scripts/source_argument.py new file mode 100755 index 00000000000..881e73adc97 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source_argument.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + print('1' + '\t' + 'Value ' + str(arg) + ' 1', end='\n') + print('2' + '\t' + 
'Value ' + str(arg) + ' 2', end='\n') + print('3' + '\t' + 'Value ' + str(arg) + ' 3', end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source_update.py b/tests/integration/test_executable_dictionary/user_scripts/source_update.py new file mode 100755 index 00000000000..99388f9ada3 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source_update.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + update_field_value = 0 + + if len(sys.argv) >= 2: + update_field_value = int(sys.argv[1]) + + print('1' + '\t' + 'Value ' + str(update_field_value) + ' 1', end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/test.py b/tests/integration/test_executable_table_function/test.py index f5537e26b94..7820396d20f 100644 --- a/tests/integration/test_executable_table_function/test.py +++ b/tests/integration/test_executable_table_function/test.py @@ -1,6 +1,5 @@ import os import sys -import time import pytest @@ -30,69 +29,353 @@ def started_cluster(): copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) node.restart_clickhouse() + node.query("CREATE TABLE test_data_table (id UInt64) ENGINE=TinyLog;") + node.query("INSERT INTO test_data_table VALUES (0), (1), (2);") + yield cluster finally: cluster.shutdown() -def test_executable_function_no_input(started_cluster): +def test_executable_function_no_input_bash(started_cluster): skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_no_input.sh', 'TabSeparated', 'value UInt64')") == '1\n' + assert node.query("SELECT * FROM executable('no_input.sh', 'TabSeparated', 'value String')") == 'Key 0\nKey 1\nKey 2\n' -def test_executable_function_input(started_cluster): +def test_executable_function_no_input_python(started_cluster): skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_input.sh', 'TabSeparated', 'value String', (SELECT 1))") == 'Key 1\n' + assert node.query("SELECT * FROM executable('no_input.py', 'TabSeparated', 'value String')") == 'Key 0\nKey 1\nKey 2\n' -def test_executable_function_input_multiple_pipes(started_cluster): +def test_executable_function_input_bash(started_cluster): skip_test_msan(node) - actual = node.query("SELECT * FROM executable('test_input_multiple_pipes.sh', 'TabSeparated', 'value String', (SELECT 1), (SELECT 2), (SELECT 3))") + + query = "SELECT * FROM executable('input.sh', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 0\nKey 1\nKey 2\n' + +def test_executable_function_input_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input.py', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 0\nKey 1\nKey 2\n' + +def test_executable_function_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_sum.py', 'TabSeparated', 'value UInt64', {source})" + assert node.query(query.format(source='(SELECT 1, 1)')) == '2\n' + assert node.query(query.format(source='(SELECT id, id FROM test_data_table)')) == '0\n2\n4\n' + +def test_executable_function_input_argument_python(started_cluster): + skip_test_msan(node) + + query 
= "SELECT * FROM executable('input_argument.py 1', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 1 0\nKey 1 1\nKey 1 2\n' + +def test_executable_function_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_signalled.py', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == '' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == '' + +def test_executable_function_input_slow_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_slow.py', 'TabSeparated', 'value String', {source})" + assert node.query_and_get_error(query.format(source='(SELECT 1)')) + assert node.query_and_get_error(query.format(source='(SELECT id FROM test_data_table)')) + +def test_executable_function_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + query = "SELECT * FROM executable('input_multiple_pipes.py', 'TabSeparated', 'value String', {source})" + actual = node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' assert actual == expected -def test_executable_function_argument(started_cluster): - skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_argument.sh 1', 'TabSeparated', 'value String')") == 'Key 1\n' - -def test_executable_storage_no_input(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value UInt64) ENGINE=Executable('test_no_input.sh', 'TabSeparated')") - assert node.query("SELECT * FROM test_table") == '1\n' - node.query("DROP TABLE test_table") - -def test_executable_storage_input(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_input.sh', 'TabSeparated', (SELECT 1))") - assert node.query("SELECT * FROM test_table") == 'Key 1\n' - node.query("DROP TABLE test_table") - -def test_executable_storage_input_multiple_pipes(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_input_multiple_pipes.sh', 'TabSeparated', (SELECT 1), (SELECT 2), (SELECT 3))") - actual = node.query("SELECT * FROM test_table") - expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + actual = node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' assert actual == expected - node.query("DROP TABLE test_table") -def test_executable_storage_argument(started_cluster): +def test_executable_storage_no_input_bash(started_cluster): skip_test_msan(node) node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_argument.sh 1', 'TabSeparated')") + node.query("CREATE TABLE test_table (value String) ENGINE=Executable('no_input.sh', 'TabSeparated')") + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_no_input_python(started_cluster): + skip_test_msan(node) + node.query("DROP TABLE IF EXISTS test_table") + 
node.query("CREATE TABLE test_table (value String) ENGINE=Executable('no_input.py', 'TabSeparated')") + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_bash(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input.sh', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) assert node.query("SELECT * FROM test_table") == 'Key 1\n' node.query("DROP TABLE test_table") -def test_executable_pool_storage(started_cluster): + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_python(started_cluster): skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input.py', 'TabSeparated', {source})" + node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=ExecutablePool('test_input_process_pool.sh', 'TabSeparated', (SELECT 1))") + node.query(query.format(source='(SELECT 1)')) assert node.query("SELECT * FROM test_table") == 'Key 1\n' node.query("DROP TABLE test_table") -def test_executable_pool_storage_multiple_pipes(started_cluster): + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_send_chunk_header_python(started_cluster): skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_chunk_header.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1" + node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=ExecutablePool('test_input_process_pool_multiple_pipes.sh', 'TabSeparated', (SELECT 1), (SELECT 2), (SELECT 3))") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value UInt64) ENGINE=Executable('input_sum.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1, 1)')) + assert node.query("SELECT * FROM test_table") == '2\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id, id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_argument.py 1', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + 
node.query("DROP TABLE test_table") + +def test_executable_storage_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_signalled.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == '' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == '' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_slow_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_slow.py', 'TabSeparated', {source}) SETTINGS command_read_timeout=2500" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query_and_get_error("SELECT * FROM test_table") + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query_and_get_error("SELECT * FROM test_table") + node.query("DROP TABLE test_table") + +def test_executable_function_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_multiple_pipes.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value UInt64) ENGINE=ExecutablePool('input_sum_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1, 1)')) + + assert node.query("SELECT * FROM test_table") == '2\n' + assert node.query("SELECT * FROM test_table") == '2\n' + assert node.query("SELECT * FROM test_table") == '2\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id, id FROM test_data_table)')) + + assert 
node.query("SELECT * FROM test_table") == '0\n2\n4\n' + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_argument_pool.py 1', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_signalled_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_slow_python(started_cluster): + skip_test_msan(node) + + query = """CREATE TABLE test_table (value String) + ENGINE=ExecutablePool('input_slow_pool.py', 'TabSeparated', {source}) + SETTINGS send_chunk_header=1, pool_size=1, command_read_timeout=2500""" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_multiple_pipes_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) + + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + assert node.query("SELECT * FROM test_table") == 
'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_count_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_count_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == '1\n' + assert node.query("SELECT * FROM test_table") == '1\n' + assert node.query("SELECT * FROM test_table") == '1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT number FROM system.numbers LIMIT 250000)')) + + assert node.query("SELECT * FROM test_table") == '250000\n' + assert node.query("SELECT * FROM test_table") == '250000\n' + assert node.query("SELECT * FROM test_table") == '250000\n' + + node.query("DROP TABLE test_table") diff --git a/tests/integration/test_executable_table_function/user_scripts/input.py b/tests/integration/test_executable_table_function/user_scripts/input.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input.sh b/tests/integration/test_executable_table_function/user_scripts/input.sh similarity index 100% rename from tests/integration/test_executable_table_function/user_scripts/test_input.sh rename to tests/integration/test_executable_table_function/user_scripts/input.sh diff --git a/tests/integration/test_executable_table_function/user_scripts/input_argument.py b/tests/integration/test_executable_table_function/user_scripts/input_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py new file mode 100755 index 00000000000..378a6ef4391 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py @@ -0,0 +1,17 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while 
chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + str(arg) + " " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py b/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py new file mode 100755 index 00000000000..8b744168a82 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(1, end='\n') + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py new file mode 100755 index 00000000000..64590cbc16a --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py @@ -0,0 +1,19 @@ +#!/usr/bin/python3 + +import sys +import os + +if __name__ == '__main__': + fd3 = os.fdopen(3) + fd4 = os.fdopen(4) + + for line in fd4: + print("Key from 4 fd " + line, end='') + + for line in fd3: + print("Key from 3 fd " + line, end='') + + for line in sys.stdin: + print("Key from 0 fd " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py new file mode 100755 index 00000000000..a3a515899f9 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 + +import sys +import os + +if __name__ == '__main__': + fd3 = os.fdopen(3) + fd4 = os.fdopen(4) + + lines = [] + + for chunk_header_fd4 in fd4: + fd4_chunk_length = int(chunk_header_fd4) + + while fd4_chunk_length != 0: + line = fd4.readline() + fd4_chunk_length -= 1 + lines.append("Key from 4 fd " + line) + + for chunk_header_fd3 in fd3: + fd3_chunk_length = int(chunk_header_fd3) + + while fd3_chunk_length != 0: + line = fd3.readline() + fd3_chunk_length -= 1 + lines.append("Key from 3 fd " + line) + + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + lines.append("Key from 0 fd " + line) + + break + break + + print(str(len(lines)), end='\n') + + for line in lines: + print(line, end='') + lines.clear() + + sys.stdout.flush() \ No newline at end of file diff --git a/tests/integration/test_executable_table_function/user_scripts/input_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_pool.py new file mode 100755 index 
00000000000..ec4e9af23cd --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_pool.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_signalled.py b/tests/integration/test_executable_table_function/user_scripts/input_signalled.py new file mode 100755 index 00000000000..93ce20fa8e7 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_signalled.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for line in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py new file mode 100755 index 00000000000..1ea0eddbd8d --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py @@ -0,0 +1,19 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for chunk_header in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_slow.py b/tests/integration/test_executable_table_function/user_scripts/input_slow.py new file mode 100755 index 00000000000..4c2abe89e33 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_slow.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(25) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py new file mode 100755 index 00000000000..c8df7e18c4c --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py @@ -0,0 +1,18 @@ +#!/usr/bin/python3 + +import sys +import time + +if __name__ == '__main__': + for chunk_header in sys.stdin: + time.sleep(25) + + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_sum.py b/tests/integration/test_executable_table_function/user_scripts/input_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py 
b/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py new file mode 100755 index 00000000000..cd0de25fe87 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py @@ -0,0 +1,17 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + chunk_length -= 1 + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/no_input.py b/tests/integration/test_executable_table_function/user_scripts/no_input.py new file mode 100755 index 00000000000..65b78f3d755 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/no_input.py @@ -0,0 +1,9 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + print("Key 0") + print("Key 1") + print("Key 2") + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/no_input.sh b/tests/integration/test_executable_table_function/user_scripts/no_input.sh new file mode 100755 index 00000000000..13d172a5be4 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/no_input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +printf "Key 0\n"; +printf "Key 1\n"; +printf "Key 2\n"; diff --git a/tests/integration/test_executable_table_function/user_scripts/test_argument.sh b/tests/integration/test_executable_table_function/user_scripts/test_argument.sh deleted file mode 100755 index 89634031d2b..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_argument.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "Key $1" diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh b/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh deleted file mode 100755 index 1e53e3211dc..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -while read -t 250 -u 4 read_data; do printf "Key from 4 fd $read_data\n"; done -while read -t 250 -u 3 read_data; do printf "Key from 3 fd $read_data\n"; done -while read -t 250 read_data; do printf "Key from 0 fd $read_data\n"; done diff --git a/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh b/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh deleted file mode 100755 index 9e8b3be63d6..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "1" diff --git a/tests/integration/test_executable_user_defined_function/__init__.py b/tests/integration/test_executable_user_defined_function/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml b/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml new file mode 100644 index 00000000000..3cbf717bb67 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml @@ -0,0 +1,2 @@ + + diff --git 
a/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml new file mode 100644 index 00000000000..d8f81a588a2 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml @@ -0,0 +1,196 @@ + + + executable + test_function_bash + String + + UInt64 + + TabSeparated + input.sh + + + + executable_pool + test_function_pool_bash + String + + UInt64 + + TabSeparated + input.sh + + + + executable + test_function_python + String + + UInt64 + + TabSeparated + input.py + + + + executable_pool + test_function_pool_python + String + + UInt64 + + TabSeparated + input.py + + + + executable + test_function_send_chunk_header_python + String + + UInt64 + + TabSeparated + 1 + input_chunk_header.py + + + + executable_pool + test_function_send_chunk_header_pool_python + String + + UInt64 + + TabSeparated + 1 + input_chunk_header.py + + + + executable + test_function_sum_python + String + + UInt64 + + + UInt64 + + TabSeparated + input_sum.py + + + + executable_pool + test_function_sum_pool_python + String + + UInt64 + + + UInt64 + + TabSeparated + input_sum.py + + + + executable + test_function_argument_python + String + + UInt64 + + TabSeparated + input_argument.py 1 + + + + executable_pool + test_function_argument_pool_python + String + + UInt64 + + TabSeparated + input_argument.py 1 + + + + executable + test_function_slow_python + String + + UInt64 + + TabSeparated + input_slow.py + 1 + 1000 + + + + executable_pool + test_function_slow_pool_python + String + + UInt64 + + TabSeparated + input_slow.py + 1 + 1000 + + + + executable + test_function_signalled_python + String + + UInt64 + + TabSeparated + input_signalled.py + 1 + 1000 + + + + executable_pool + test_function_signalled_pool_python + String + + UInt64 + + TabSeparated + input_signalled.py + 1 + 1000 + + + + executable + test_function_non_direct_bash + String + + UInt64 + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 0 + + + + executable_pool + test_function_non_direct_pool_bash + String + + UInt64 + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 0 + + + diff --git a/tests/integration/test_executable_user_defined_function/test.py b/tests/integration/test_executable_user_defined_function/test.py new file mode 100644 index 00000000000..94afdf8d8a9 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/test.py @@ -0,0 +1,106 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', stay_alive=True, main_configs=[]) + + +def skip_test_msan(instance): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with vfork") + +def copy_file_to_container(local_path, dist_path, container_id): + os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path)) + +config = ''' + /etc/clickhouse-server/functions/test_function_config.xml +''' + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node.replace_config("/etc/clickhouse-server/config.d/executable_user_defined_functions_config.xml", config) + + 
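# Both the function XML and the user scripts must be inside the container before the restart: the server loads the definitions from /etc/clickhouse-server/functions and resolves the script commands under /var/lib/clickhouse/user_scripts. +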
copy_file_to_container(os.path.join(SCRIPT_DIR, 'functions/.'), '/etc/clickhouse-server/functions', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + + node.restart_clickhouse() + + yield cluster + + finally: + cluster.shutdown() + +def test_executable_function_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_bash(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_pool_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_pool_bash(1)") == 'Key 1\n' + +def test_executable_function_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_python(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_pool_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_pool_python(1)") == 'Key 1\n' + +def test_executable_function_send_chunk_header_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_send_chunk_header_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_send_chunk_header_python(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_send_chunk_header_pool_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_send_chunk_header_pool_python(1)") == 'Key 1\n' + +def test_executable_function_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_sum_python(toUInt64(1), toUInt64(1))") == '2\n' + assert node.query("SELECT test_function_sum_python(1, 1)") == '2\n' + + assert node.query("SELECT test_function_sum_pool_python(toUInt64(1), toUInt64(1))") == '2\n' + assert node.query("SELECT test_function_sum_pool_python(1, 1)") == '2\n' + +def test_executable_function_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_argument_python(toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT test_function_argument_python(1)") == 'Key 1 1\n' + + assert node.query("SELECT test_function_argument_pool_python(toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT test_function_argument_pool_python(1)") == 'Key 1 1\n' + +def test_executable_function_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT test_function_signalled_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_signalled_python(1)") + + assert node.query_and_get_error("SELECT test_function_signalled_pool_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_signalled_pool_python(1)") + +def test_executable_function_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT test_function_slow_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_slow_python(1)") + + assert node.query_and_get_error("SELECT test_function_slow_pool_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_slow_pool_python(1)") + +def test_executable_function_non_direct_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_non_direct_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_non_direct_bash(1)") == 'Key 1\n' + + assert node.query("SELECT 
test_function_non_direct_pool_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_non_direct_pool_bash(1)") == 'Key 1\n' diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input.py b/tests/integration/test_executable_user_defined_function/user_scripts/input.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input.sh b/tests/integration/test_executable_user_defined_function/user_scripts/input.sh new file mode 100755 index 00000000000..aea51b82b1f --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key $read_data\n"; +done diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py new file mode 100755 index 00000000000..27c8bc4840e --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py new file mode 100755 index 00000000000..648a9eac918 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py new file 
mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml index f2a7d6e67b1..d0bd6e5ab88 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml +++ b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml @@ -7,8 +7,7 @@ UInt64 TabSeparated - while read read_data; do printf "Key_1 $read_data\n"; done - 0 + test_input_1.sh
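The override above swaps the inline shell command for an on-disk script (test_input_1.sh, added below) that the server now runs from its user_scripts directory. A Python counterpart of that script, shown here purely as an illustration and not part of the patch, would follow the same contract: read one TabSeparated value per line from stdin, write one prefixed result line, and flush so the server is not left waiting.

#!/usr/bin/python3
# Illustrative equivalent of test_input_1.sh (hypothetical file, not added by this patch).
import sys

if __name__ == '__main__':
    for line in sys.stdin:
        print("Key_1 " + line, end='')
        sys.stdout.flush()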
diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml index fe02146a6b8..80ae21a086d 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml +++ b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml @@ -7,8 +7,7 @@ UInt64 TabSeparated - while read read_data; do printf "Key_2 $read_data\n"; done - 0 + test_input_2.sh
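The reload test below ships these scripts into the container with the same copy_file_to_container helper used by the other suites, which wraps docker cp in os.system and therefore ignores a failed copy. A stricter variant, sketched here as an alternative rather than what the tests actually use, would surface the failure immediately:

import subprocess

def copy_file_to_container_strict(local_path, dist_path, container_id):
    # Same docker cp semantics as the helper in these tests, but subprocess.check_call
    # raises CalledProcessError on a non-zero exit code instead of silently ignoring it.
    subprocess.check_call(['docker', 'cp', local_path, '{}:{}'.format(container_id, dist_path)])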
diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/test.py b/tests/integration/test_executable_user_defined_functions_config_reload/test.py index 3117b3e72b1..629c426a28c 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/test.py +++ b/tests/integration/test_executable_user_defined_functions_config_reload/test.py @@ -28,6 +28,8 @@ def started_cluster(): cluster.start() copy_file_to_container(os.path.join(SCRIPT_DIR, 'functions/.'), '/etc/clickhouse-server/functions', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + node.restart_clickhouse() yield cluster diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh new file mode 100755 index 00000000000..a6cffe83bba --- /dev/null +++ b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key_1 $read_data\n"; +done diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh new file mode 100755 index 00000000000..a673cfd18fb --- /dev/null +++ b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key_2 $read_data\n"; +done diff --git a/tests/integration/test_groupBitmapAnd_on_distributed/__init__.py b/tests/integration/test_groupBitmapAnd_on_distributed/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_groupBitmapAnd_on_distributed/configs/clusters.xml b/tests/integration/test_groupBitmapAnd_on_distributed/configs/clusters.xml new file mode 100644 index 00000000000..5ac57bed6a6 --- /dev/null +++ b/tests/integration/test_groupBitmapAnd_on_distributed/configs/clusters.xml @@ -0,0 +1,32 @@ + + + + + + node1 + 9000 + + + + + node2 + 9000 + + + + + + + node3 + 9000 + + + + + node4 + 9000 + + + + + diff --git a/tests/integration/test_groupBitmapAnd_on_distributed/test.py b/tests/integration/test_groupBitmapAnd_on_distributed/test.py new file mode 100644 index 00000000000..b0fb55b13ff --- /dev/null +++ b/tests/integration/test_groupBitmapAnd_on_distributed/test.py @@ -0,0 +1,82 @@ +import pytest + +from helpers.cluster import ClickHouseCluster +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance('node1', main_configs=["configs/clusters.xml"], with_zookeeper=True) +node2 = cluster.add_instance('node2', main_configs=["configs/clusters.xml"], with_zookeeper=True) +node3 = cluster.add_instance('node3', main_configs=["configs/clusters.xml"], with_zookeeper=True) +node4 = cluster.add_instance('node4', main_configs=["configs/clusters.xml"], image='yandex/clickhouse-server', tag='21.5', with_zookeeper=True) + +def insert_data(node, table_name): + node.query("""INSERT INTO {} + VALUES (bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32))));""".format(table_name)) + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_groupBitmapAnd_on_distributed_table(start_cluster): + local_table_name = "bitmap_column_expr_test" + 
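# Each node in 'awesome_cluster' gets an identical local MergeTree table plus a Distributed wrapper over it, so groupBitmapAnd on the Distributed table has to merge the per-shard bitmap states into a single result. +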
distributed_table_name = "bitmap_column_expr_test_dst" + cluster_name = "awesome_cluster" + + for node in (node1, node2): + node.query("""CREATE TABLE {} + ( + z AggregateFunction(groupBitmap, UInt32) + ) + ENGINE = MergeTree() + ORDER BY tuple()""".format(local_table_name)) + + node.query("""CREATE TABLE {} + ( + z AggregateFunction(groupBitmap, UInt32) + ) + ENGINE = Distributed('{}', 'default', '{}')""".format(distributed_table_name, cluster_name, local_table_name)) + insert_data(node1, local_table_name) + + expected = "10" + + for node in (node1, node2): + result = node.query("select groupBitmapAnd(z) FROM {};".format(distributed_table_name)).strip() + assert(result == expected) + +def test_groupBitmapAnd_function_versioning(start_cluster): + local_table_name = "bitmap_column_expr_versioning_test" + distributed_table_name = "bitmap_column_expr_versioning_test_dst" + cluster_name = "test_version_cluster" + + for node in (node3, node4): + node.query("""CREATE TABLE {} + ( + z AggregateFunction(groupBitmap, UInt32) + ) + ENGINE = MergeTree() + ORDER BY tuple()""".format(local_table_name)) + + node.query("""CREATE TABLE {} + ( + z AggregateFunction(groupBitmap, UInt32) + ) + ENGINE = Distributed('{}', 'default', '{}')""".format(distributed_table_name, cluster_name, local_table_name)) + + node.query("""INSERT INTO {} VALUES + (bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32))));""".format(local_table_name)) + + expected = "10" + new_version_distributed_result = node3.query("select groupBitmapAnd(z) FROM {};".format(distributed_table_name)).strip() + old_version_distributed_result = node4.query("select groupBitmapAnd(z) FROM {};".format(distributed_table_name)).strip() + assert(new_version_distributed_result == expected) + assert(old_version_distributed_result == expected) + + result_from_old_to_new_version = node3.query("select groupBitmapAnd(z) FROM remote('node4', default.{})".format(local_table_name)).strip() + assert(result_from_old_to_new_version == expected) + + result_from_new_to_old_version = node4.query("select groupBitmapAnd(z) FROM remote('node3', default.{})".format(local_table_name)).strip() + assert(result_from_new_to_old_version == expected) diff --git a/tests/integration/test_grpc_protocol/test.py b/tests/integration/test_grpc_protocol/test.py index c892fc94712..e17ed0d9c8e 100644 --- a/tests/integration/test_grpc_protocol/test.py +++ b/tests/integration/test_grpc_protocol/test.py @@ -5,6 +5,8 @@ import time import grpc from helpers.cluster import ClickHouseCluster, run_and_check from threading import Thread +import gzip +import lz4.frame GRPC_PORT = 9100 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -365,3 +367,67 @@ def test_result_compression(): stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(main_channel) result = stub.ExecuteQuery(query_info) assert result.output == (b'0\n')*1000000 + +def test_compressed_output(): + query_info = clickhouse_grpc_pb2.QueryInfo(query="SELECT 0 FROM numbers(1000)", compression_type="lz4") + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(main_channel) + result = stub.ExecuteQuery(query_info) + assert lz4.frame.decompress(result.output) == (b'0\n')*1000 + +def test_compressed_output_streaming(): + query_info = clickhouse_grpc_pb2.QueryInfo(query="SELECT 0 FROM numbers(100000)", compression_type="lz4") + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(main_channel) + d_context = lz4.frame.create_decompression_context() + data = b'' + for result in stub.ExecuteQueryWithStreamOutput(query_info): + d1, _, _ = 
lz4.frame.decompress_chunk(d_context, result.output) + data += d1 + assert data == (b'0\n')*100000 + +def test_compressed_output_gzip(): + query_info = clickhouse_grpc_pb2.QueryInfo(query="SELECT 0 FROM numbers(1000)", compression_type="gzip", compression_level=6) + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(main_channel) + result = stub.ExecuteQuery(query_info) + assert gzip.decompress(result.output) == (b'0\n')*1000 + +def test_compressed_totals_and_extremes(): + query("CREATE TABLE t (x UInt8, y UInt8) ENGINE = Memory") + query("INSERT INTO t VALUES (1, 2), (2, 4), (3, 2), (3, 3), (3, 4)") + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(main_channel) + query_info = clickhouse_grpc_pb2.QueryInfo(query="SELECT sum(x), y FROM t GROUP BY y WITH TOTALS", compression_type="lz4") + result = stub.ExecuteQuery(query_info) + assert lz4.frame.decompress(result.totals) == b'12\t0\n' + query_info = clickhouse_grpc_pb2.QueryInfo(query="SELECT x, y FROM t", settings={"extremes": "1"}, compression_type="lz4") + result = stub.ExecuteQuery(query_info) + assert lz4.frame.decompress(result.extremes) == b'1\t2\n3\t4\n' + +def test_compressed_insert_query_streaming(): + query("CREATE TABLE t (a UInt8) ENGINE = Memory") + data = lz4.frame.compress(b'(1),(2),(3),(5),(4),(6),(7),(8),(9)') + sz1 = len(data) // 3 + sz2 = len(data) // 3 + d1 = data[:sz1] + d2 = data[sz1:sz1+sz2] + d3 = data[sz1+sz2:] + def send_query_info(): + yield clickhouse_grpc_pb2.QueryInfo(query="INSERT INTO t VALUES", input_data=d1, compression_type="lz4", next_query_info=True) + yield clickhouse_grpc_pb2.QueryInfo(input_data=d2, next_query_info=True) + yield clickhouse_grpc_pb2.QueryInfo(input_data=d3) + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(main_channel) + stub.ExecuteQueryWithStreamInput(send_query_info()) + assert query("SELECT a FROM t ORDER BY a") == "1\n2\n3\n4\n5\n6\n7\n8\n9\n" + +def test_compressed_external_table(): + columns = [clickhouse_grpc_pb2.NameAndType(name='UserID', type='UInt64'), clickhouse_grpc_pb2.NameAndType(name='UserName', type='String')] + d1 = lz4.frame.compress(b'1\tAlex\n2\tBen\n3\tCarl\n') + d2 = gzip.compress(b'4,Daniel\n5,Ethan\n') + ext1 = clickhouse_grpc_pb2.ExternalTable(name='ext1', columns=columns, data=d1, format='TabSeparated', compression_type="lz4") + ext2 = clickhouse_grpc_pb2.ExternalTable(name='ext2', columns=columns, data=d2, format='CSV', compression_type="gzip") + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(main_channel) + query_info = clickhouse_grpc_pb2.QueryInfo(query="SELECT * FROM (SELECT * FROM ext1 UNION ALL SELECT * FROM ext2) ORDER BY UserID", external_tables=[ext1, ext2]) + result = stub.ExecuteQuery(query_info) + assert result.output == b"1\tAlex\n"\ + b"2\tBen\n"\ + b"3\tCarl\n"\ + b"4\tDaniel\n"\ + b"5\tEthan\n" diff --git a/tests/integration/test_http_handlers_config/test.py b/tests/integration/test_http_handlers_config/test.py index 818a1e54640..01872a1d0c3 100644 --- a/tests/integration/test_http_handlers_config/test.py +++ b/tests/integration/test_http_handlers_config/test.py @@ -58,9 +58,9 @@ def test_predefined_query_handler(): 'test_predefined_handler_get?max_threads=1&setting_name=max_threads', method='GET', headers={'XXX': 'xxx'}).content - assert b'max_threads\t1\nmax_alter_threads\t1\n' == cluster.instance.http_request( - 'query_param_with_url/max_threads?max_threads=1&max_alter_threads=1', - headers={'XXX': 'max_alter_threads'}).content + assert b'max_final_threads\t1\nmax_threads\t1\n' == cluster.instance.http_request( + 
'query_param_with_url/max_threads?max_threads=1&max_final_threads=1', + headers={'XXX': 'max_final_threads'}).content def test_fixed_static_handler(): diff --git a/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml b/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml index 3e4c885d1f6..3adba1d402a 100644 --- a/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml +++ b/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml @@ -1,4 +1,23 @@ - - 3000000000 + + 4000000000 + + + + + + + + + + + + + + diff --git a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py index bc7f32bf544..1c686c7982e 100644 --- a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py +++ b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py @@ -24,16 +24,13 @@ def start_cluster(): # max_memory_usage_for_user cannot be used, since the memory for user accounted -# correctly, only total is not +# correctly, only total is not (it is set via conf.xml) def test_memory_tracking_total(): - instance.query(''' - CREATE TABLE null (row String) ENGINE=Null; - ''') + instance.query('CREATE TABLE null (row String) ENGINE=Null') instance.exec_in_container(['bash', '-c', 'clickhouse local -q "SELECT arrayStringConcat(arrayMap(x->toString(cityHash64(x)), range(1000)), \' \') from numbers(10000)" > data.json']) for it in range(0, 20): # the problem can be triggered only via HTTP, # since clickhouse-client parses the data by itself. assert instance.exec_in_container(['curl', '--silent', '--show-error', '--data-binary', '@data.json', - 'http://127.1:8123/?query=INSERT%20INTO%20null%20FORMAT%20TSV']) == '', 'Failed on {} iteration'.format( - it) + 'http://127.1:8123/?query=INSERT%20INTO%20null%20FORMAT%20TSV']) == '', f'Failed on {it} iteration' diff --git a/tests/integration/test_jemalloc_percpu_arena/__init__.py b/tests/integration/test_jemalloc_percpu_arena/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_jemalloc_percpu_arena/test.py b/tests/integration/test_jemalloc_percpu_arena/test.py new file mode 100755 index 00000000000..bdd0ada966f --- /dev/null +++ b/tests/integration/test_jemalloc_percpu_arena/test.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# pylint: disable=line-too-long + +import os +import subprocess +import multiprocessing +from tempfile import NamedTemporaryFile +import pytest + + +CPU_ID = 4 + + +def run_command_in_container(cmd, *args): + # /clickhouse is mounted by interation tests runner + alternative_binary = os.getenv('CLICKHOUSE_BINARY', '/clickhouse') + if alternative_binary: + args+=( + '--volume', f'{alternative_binary}:/usr/bin/clickhouse', + ) + + return subprocess.check_output(['docker', 'run', '--rm', + *args, + 'ubuntu:20.04', + 'sh', '-c', cmd, + ]) + + +def run_with_cpu_limit(cmd, *args): + with NamedTemporaryFile() as online_cpu: + # NOTE: this is not the number of CPUs, but specific CPU ID + online_cpu.write(f'{CPU_ID}'.encode()) + online_cpu.flush() + + # replace /sys/devices/system/cpu/online to full _SC_NPROCESSORS_ONLN + # like LXD/LXC from [1] does. 
+ # + # [1]: https://github.com/ClickHouse/ClickHouse/issues/32806 + args+=( + '--volume', f'{online_cpu.name}:/sys/devices/system/cpu/online', + ) + + return run_command_in_container(cmd, *args) + + +def skip_if_jemalloc_disabled(): + output = run_command_in_container("""clickhouse local -q " + SELECT value FROM system.build_options WHERE name = 'USE_JEMALLOC'" + """).strip() + if output != b'ON' and output != b'1': + pytest.skip(f'Compiled w/o jemalloc (USE_JEMALLOC={output})') + +# Ensure that clickhouse works even when number of online CPUs +# (_SC_NPROCESSORS_ONLN) is smaller then available (_SC_NPROCESSORS_CONF). +# +# Refs: https://github.com/jemalloc/jemalloc/pull/2181 +def test_jemalloc_percpu_arena(): + skip_if_jemalloc_disabled() + + assert multiprocessing.cpu_count() > CPU_ID + + online_cpus = int(run_with_cpu_limit('getconf _NPROCESSORS_ONLN')) + assert online_cpus == 1, online_cpus + + all_cpus = int(run_with_cpu_limit('getconf _NPROCESSORS_CONF')) + assert all_cpus == multiprocessing.cpu_count(), all_cpus + + # implicitly disable percpu arena + result = run_with_cpu_limit('clickhouse local -q "select 1"', + # NOTE: explicitly disable, since it is enabled by default in debug build + # (and even though debug builds are not in CI let's state this). + '--env', 'MALLOC_CONF=abort_conf:false') + assert int(result) == int(1), result + + # should fail because of abort_conf:true + with pytest.raises(subprocess.CalledProcessError): + run_with_cpu_limit('clickhouse local -q "select 1"', + '--env', 'MALLOC_CONF=abort_conf:true') + + # should not fail even with abort_conf:true, due to explicit narenas + # NOTE: abort:false to make it compatible with debug build + run_with_cpu_limit('clickhouse local -q "select 1"', + '--env', f'MALLOC_CONF=abort_conf:true,abort:false,narenas:{all_cpus}') + +# For manual run. +if __name__ == '__main__': + test_jemalloc_percpu_arena() diff --git a/tests/integration/test_keeper_auth/test.py b/tests/integration/test_keeper_auth/test.py index 276fe3d8518..2df08cc94b7 100644 --- a/tests/integration/test_keeper_auth/test.py +++ b/tests/integration/test_keeper_auth/test.py @@ -43,12 +43,11 @@ def test_digest_auth_basic(started_cluster, get_zk): auth_connection.create("/test_no_acl", b"") auth_connection.create("/test_all_acl", b"data", acl=[make_acl("auth", "", all=True)]) - # for some reason original zookeeper accepts this ACL, but doesn't allow to do anything with this node - # even with correct credentials. 
- auth_connection.create("/test_all_digest_acl", b"dataX", acl=[make_acl("digest", "user1:password1", all=True)]) + # Consistent with zookeeper, accept generated digest + auth_connection.create("/test_all_digest_acl", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", all=True)]) assert auth_connection.get("/test_all_acl")[0] == b"data" - #assert auth_connection.get("/test_all_digest_acl")[0] == b"dataX" + assert auth_connection.get("/test_all_digest_acl")[0] == b"dataX" no_auth_connection = get_zk() no_auth_connection.set("/test_no_acl", b"hello") diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 6c50da23d9a..7265105c8df 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -624,7 +624,7 @@ def err_sync_user_privs_with_materialized_mysql_database(clickhouse_node, mysql_ service_name)) assert "priv_err_db" in clickhouse_node.query("SHOW DATABASES") assert "test_table_1" not in clickhouse_node.query("SHOW TABLES FROM priv_err_db") - clickhouse_node.query("DETACH DATABASE priv_err_db") + clickhouse_node.query_with_retry("DETACH DATABASE priv_err_db") mysql_node.query("REVOKE SELECT ON priv_err_db.* FROM 'test'@'%'") time.sleep(3) @@ -743,7 +743,7 @@ def mysql_kill_sync_thread_restore_test(clickhouse_node, mysql_node, service_nam time.sleep(sleep_time) clickhouse_node.query("SELECT * FROM test_database.test_table") - clickhouse_node.query("DETACH DATABASE test_database") + clickhouse_node.query_with_retry("DETACH DATABASE test_database") clickhouse_node.query("ATTACH DATABASE test_database") check_query(clickhouse_node, "SELECT * FROM test_database.test_table ORDER BY id FORMAT TSV", '1\n2\n') @@ -784,7 +784,7 @@ def mysql_killed_while_insert(clickhouse_node, mysql_node, service_name): mysql_node.alloc_connection() - clickhouse_node.query("DETACH DATABASE kill_mysql_while_insert") + clickhouse_node.query_with_retry("DETACH DATABASE kill_mysql_while_insert") clickhouse_node.query("ATTACH DATABASE kill_mysql_while_insert") result = mysql_node.query_and_get_data("SELECT COUNT(1) FROM kill_mysql_while_insert.test") @@ -1053,3 +1053,96 @@ def table_table(clickhouse_node, mysql_node, service_name): mysql_node.query("DROP DATABASE table_test") clickhouse_node.query("DROP DATABASE table_test") + +def table_overrides(clickhouse_node, mysql_node, service_name): + mysql_node.query("DROP DATABASE IF EXISTS table_overrides") + clickhouse_node.query("DROP DATABASE IF EXISTS table_overrides") + mysql_node.query("CREATE DATABASE table_overrides") + mysql_node.query("CREATE TABLE table_overrides.t1 (sensor_id INT UNSIGNED, timestamp DATETIME, temperature FLOAT, PRIMARY KEY(timestamp, sensor_id))") + for id in range(10): + mysql_node.query("BEGIN") + for day in range(100): + mysql_node.query(f"INSERT INTO table_overrides.t1 VALUES({id}, TIMESTAMP('2021-01-01') + INTERVAL {day} DAY, (RAND()*20)+20)") + mysql_node.query("COMMIT") + clickhouse_node.query(f""" + CREATE DATABASE table_overrides ENGINE=MaterializeMySQL('{service_name}:3306', 'table_overrides', 'root', 'clickhouse') + TABLE OVERRIDE t1 (COLUMNS (sensor_id UInt64, temp_f Nullable(Float32) ALIAS if(isNull(temperature), NULL, (temperature * 9 / 5) + 32))) + """) + check_query( + clickhouse_node, + "SELECT type FROM system.columns WHERE database = 'table_overrides' AND table = 't1' AND name = 
'sensor_id'", + "UInt64\n") + check_query( + clickhouse_node, + "SELECT type, default_kind FROM system.columns WHERE database = 'table_overrides' AND table = 't1' AND name = 'temp_f'", + "Nullable(Float32)\tALIAS\n") + check_query(clickhouse_node, "SELECT count() FROM table_overrides.t1", "1000\n") + mysql_node.query("INSERT INTO table_overrides.t1 VALUES(1001, '2021-10-01 00:00:00', 42.0)") + check_query(clickhouse_node, "SELECT count() FROM table_overrides.t1", "1001\n") + clickhouse_node.query("DROP DATABASE IF EXISTS table_overrides") + mysql_node.query("DROP DATABASE IF EXISTS table_overrides") + +def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, mysql_node, service_name): + mysql_node.query("DROP DATABASE IF EXISTS test_database_datatype") + clickhouse_node.query("DROP DATABASE IF EXISTS test_database_datatype") + mysql_node.query("CREATE DATABASE test_database_datatype DEFAULT CHARACTER SET 'utf8'") + mysql_node.query(""" + CREATE TABLE test_database_datatype.t1 ( + `v1` int(10) unsigned AUTO_INCREMENT, + `v2` TINYINT, + `v3` SMALLINT, + `v4` BIGINT, + `v5` int, + `v6` TINYINT unsigned, + `v7` SMALLINT unsigned, + `v8` BIGINT unsigned, + `v9` FLOAT, + `v10` FLOAT unsigned, + `v11` DOUBLE, + `v12` DOUBLE unsigned, + `v13` DECIMAL(5,4), + `v14` date, + `v15` TEXT, + `v16` varchar(100) , + `v17` BLOB, + `v18` datetime DEFAULT CURRENT_TIMESTAMP, + `v19` datetime(6) DEFAULT CURRENT_TIMESTAMP(6), + `v20` TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + `v21` TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6), + /* todo support */ + # `v22` YEAR, + # `v23` TIME, + # `v24` TIME(3), + # `v25` GEOMETRY, + `v26` bit(4), + # `v27` JSON DEFAULT NULL, + # `v28` set('a', 'c', 'f', 'd', 'e', 'b'), + `v29` mediumint(4) unsigned NOT NULL DEFAULT '0', + `v30` varbinary(255) DEFAULT NULL COMMENT 'varbinary support', + `v31` binary(200) DEFAULT NULL, + `v32` ENUM('RED','GREEN','BLUE'), + PRIMARY KEY (`v1`) + ) ENGINE=InnoDB; + """) + + mysql_node.query(""" + INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v31, v32) values + (1, 11, 9223372036854775807, -1, 1, 11, 18446744073709551615, -1.1, 1.1, -1.111, 1.111, 1.1111, '2021-10-06', 'text', 'varchar', 'BLOB', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', b'1010', 11, 'varbinary', 'binary', 'RED'); + """) + clickhouse_node.query( + "CREATE DATABASE test_database_datatype ENGINE = MaterializeMySQL('{}:3306', 'test_database_datatype', 'root', 'clickhouse')".format( + service_name)) + + check_query(clickhouse_node, "SELECT name FROM system.tables WHERE database = 'test_database_datatype'", "t1\n") + # full synchronization check + check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v32 FROM test_database_datatype.t1 FORMAT TSV", + "1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t10\t11\tvarbinary\tRED\n") + + mysql_node.query(""" + INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v31, v32) values + (2, 22, 9223372036854775807, -2, 2, 22, 18446744073709551615, -2.2, 2.2, -2.22, 2.222, 2.2222, '2021-10-07', 'text', 'varchar', 
'BLOB', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', b'1011', 22, 'varbinary', 'binary', 'GREEN' ); + """) + # increment synchronization check + check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v32 FROM test_database_datatype.t1 ORDER BY v1 FORMAT TSV", + "1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t10\t11\tvarbinary\tRED\n" + + "2\t2\t22\t9223372036854775807\t-2\t2\t22\t18446744073709551615\t-2.2\t2.2\t-2.22\t2.222\t2.2222\t2021-10-07\ttext\tvarchar\tBLOB\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t11\t22\tvarbinary\tGREEN\n") diff --git a/tests/integration/test_materialized_mysql_database/test.py b/tests/integration/test_materialized_mysql_database/test.py index 59c5a08e0f5..501c0cd78fa 100644 --- a/tests/integration/test_materialized_mysql_database/test.py +++ b/tests/integration/test_materialized_mysql_database/test.py @@ -249,3 +249,11 @@ def test_large_transaction(started_cluster, started_mysql_8_0, started_mysql_5_7 def test_table_table(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): materialize_with_ddl.table_table(clickhouse_node, started_mysql_8_0, "mysql80") materialize_with_ddl.table_table(clickhouse_node, started_mysql_5_7, "mysql57") + +def test_table_overrides(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): + materialize_with_ddl.table_overrides(clickhouse_node, started_mysql_5_7, "mysql57") + materialize_with_ddl.table_overrides(clickhouse_node, started_mysql_8_0, "mysql80") + +def test_materialized_database_support_all_kinds_of_mysql_datatype(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): + materialize_with_ddl.materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, started_mysql_8_0, "mysql80") + materialize_with_ddl.materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, started_mysql_5_7, "mysql57") diff --git a/tests/integration/test_merge_tree_azure_blob_storage/__init__.py b/tests/integration/test_merge_tree_azure_blob_storage/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/bg_processing_pool_conf.xml b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/bg_processing_pool_conf.xml new file mode 100644 index 00000000000..e45b647fd24 --- /dev/null +++ b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/bg_processing_pool_conf.xml @@ -0,0 +1,5 @@ + + 0.5 + 0.5 + 0.5 + diff --git a/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml new file mode 100644 index 00000000000..09fa0d6c767 --- /dev/null +++ b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml @@ -0,0 +1,33 @@ + + + + + azure_blob_storage + http://azurite1:10000/devstoreaccount1 + cont + false + false + + devstoreaccount1 + Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== + 33554432 + + + local + / + + + + + +
+ blob_storage_disk +
+ + hdd + +
+
+
+
+
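The storage_conf.xml above registers an azure_blob_storage disk (blob_storage_disk) and a local disk (hdd), and combines them into the blob_storage_policy storage policy used by the tests that follow. A minimal sketch, not part of this change, of how such a configuration could be sanity-checked from a test, assuming the standard node.query helper from helpers.cluster; the helper name below is illustrative only:

def check_blob_storage_configuration(node):
    # The disk and policy names come from storage_conf.xml above;
    # system.disks and system.storage_policies are standard system tables.
    disks = node.query("SELECT name FROM system.disks FORMAT TSV")
    assert "blob_storage_disk" in disks
    policies = node.query("SELECT policy_name FROM system.storage_policies FORMAT TSV")
    assert "blob_storage_policy" in policies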
diff --git a/tests/integration/test_merge_tree_azure_blob_storage/configs/config.xml b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.xml new file mode 100644 index 00000000000..59829ffdb56 --- /dev/null +++ b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.xml @@ -0,0 +1,20 @@ + + + 9000 + 127.0.0.1 + + + + true + none + + AcceptCertificateHandler + + + + + 500 + 5368709120 + ./clickhouse/ + users.xml + diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py new file mode 100644 index 00000000000..92b9d52cf86 --- /dev/null +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -0,0 +1,372 @@ +import logging +import time +import os + +import pytest +from helpers.cluster import ClickHouseCluster, get_instances_dir +from helpers.utility import generate_values, replace_config, SafeThread + + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/node/configs/config.d/storage_conf.xml'.format(get_instances_dir())) + +NODE_NAME = "node" +TABLE_NAME = "blob_storage_table" +AZURE_BLOB_STORAGE_DISK = "blob_storage_disk" +LOCAL_DISK = "hdd" +CONTAINER_NAME = "cont" + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance(NODE_NAME, + main_configs=["configs/config.d/storage_conf.xml", "configs/config.d/bg_processing_pool_conf.xml"], + with_azurite=True) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + +# Note: use this for SELECT, INSERT and CREATE TABLE queries. +# For inserts there is no guarantee that retries will not result in duplicates. +# But it is better to retry anyway, because the 'Connection was closed by the server' error +# in practice happens only for inserts; reads already have built-in retries in the code. +def azure_query(node, query, try_num=3): + for i in range(try_num): + try: + return node.query(query) + except Exception as ex: + retriable_errors = [ + 'DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response', + ] + retry = False + for error in retriable_errors: + if error in str(ex): + retry = True + logging.info(f"Try num: {i}. 
Having retriable error: {ex}") + break + if not retry or i == try_num - 1: + raise Exception(ex) + continue + +def create_table(node, table_name, **additional_settings): + settings = { + "storage_policy": "blob_storage_policy", + "old_parts_lifetime": 1, + "index_granularity": 512 + } + settings.update(additional_settings) + + create_table_statement = f""" + CREATE TABLE {table_name} ( + dt Date, + id Int64, + data String, + INDEX min_max (id) TYPE minmax GRANULARITY 3 + ) ENGINE=MergeTree() + PARTITION BY dt + ORDER BY (dt, id) + SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))}""" + + node.query(f"DROP TABLE IF EXISTS {table_name}") + azure_query(node, create_table_statement) + assert azure_query(node, f"SELECT COUNT(*) FROM {table_name} FORMAT Values") == "(0)" + + +def test_create_table(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + +def test_read_after_cache_is_wiped(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + values = "('2021-11-13',3,'hello'),('2021-11-14',4,'heyo')" + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values}") + + # Wipe cache + cluster.exec_in_container(cluster.get_container_id(NODE_NAME), ["rm", "-rf", "/var/lib/clickhouse/disks/blob_storage_disk/cache/"]) + + # After cache is populated again, only .bin files should be accessed from Blob Storage. + assert azure_query(node, f"SELECT * FROM {TABLE_NAME} order by dt, id FORMAT Values") == values + + +def test_simple_insert_select(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + values = "('2021-11-13',3,'hello')" + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values}") + assert azure_query(node, f"SELECT dt, id, data FROM {TABLE_NAME} FORMAT Values") == values + blob_container_client = cluster.blob_service_client.get_container_client(CONTAINER_NAME) + assert len(list(blob_container_client.list_blobs())) >= 12 # 1 format file + 2 skip index files + 9 regular MergeTree files + leftovers from other tests + + +def test_inserts_selects(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + values1 = generate_values('2020-01-03', 4096) + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values1}") + assert azure_query(node, f"SELECT * FROM {TABLE_NAME} order by dt, id FORMAT Values") == values1 + + values2 = generate_values('2020-01-04', 4096) + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values2}") + assert azure_query(node, f"SELECT * FROM {TABLE_NAME} ORDER BY dt, id FORMAT Values") == values1 + "," + values2 + + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} where id = 1 FORMAT Values") == "(2)" + + +@pytest.mark.parametrize( + "merge_vertical", [ + (True), + (False), +]) +def test_insert_same_partition_and_merge(cluster, merge_vertical): + settings = {} + if merge_vertical: + settings['vertical_merge_algorithm_min_rows_to_activate'] = 0 + settings['vertical_merge_algorithm_min_columns_to_activate'] = 0 + + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME, **settings) + + node.query(f"SYSTEM STOP MERGES {TABLE_NAME}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 1024)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 2048)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 1024, -1)}") + 
azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 2048, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096, -1)}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(distinct(id)) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + + node.query(f"SYSTEM START MERGES {TABLE_NAME}") + + # Wait for merges and old parts deletion + for attempt in range(0, 10): + parts_count = azure_query(node, f"SELECT COUNT(*) FROM system.parts WHERE table = '{TABLE_NAME}' FORMAT Values") + if parts_count == "(1)": + break + + if attempt == 9: + assert parts_count == "(1)" + + time.sleep(1) + + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(distinct(id)) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + + +def test_alter_table_columns(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096, -1)}") + + node.query(f"ALTER TABLE {TABLE_NAME} ADD COLUMN col1 UInt64 DEFAULT 1") + # To ensure parts have been merged + node.query(f"OPTIMIZE TABLE {TABLE_NAME}") + + assert azure_query(node, f"SELECT sum(col1) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT sum(col1) FROM {TABLE_NAME} WHERE id > 0 FORMAT Values") == "(4096)" + + node.query(f"ALTER TABLE {TABLE_NAME} MODIFY COLUMN col1 String", settings={"mutations_sync": 2}) + + assert azure_query(node, f"SELECT distinct(col1) FROM {TABLE_NAME} FORMAT Values") == "('1')" + + +def test_attach_detach_partition(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + + node.query(f"ALTER TABLE {TABLE_NAME} DETACH PARTITION '2020-01-03'") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(4096)" + + node.query(f"ALTER TABLE {TABLE_NAME} ATTACH PARTITION '2020-01-03'") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + + node.query(f"ALTER TABLE {TABLE_NAME} DROP PARTITION '2020-01-03'") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(4096)" + + node.query(f"ALTER TABLE {TABLE_NAME} DETACH PARTITION '2020-01-04'") + node.query(f"ALTER TABLE {TABLE_NAME} DROP DETACHED PARTITION '2020-01-04'", settings={"allow_drop_detached": 1}) + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(0)" + + +def test_move_partition_to_another_disk(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + + node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-04' TO DISK '{LOCAL_DISK}'") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + 
+ node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-04' TO DISK '{AZURE_BLOB_STORAGE_DISK}'") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + + +def test_table_manipulations(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + renamed_table = TABLE_NAME + "_renamed" + + node.query_with_retry(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + node.query_with_retry(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + + node.query(f"RENAME TABLE {TABLE_NAME} TO {renamed_table}") + assert azure_query(node, f"SELECT count(*) FROM {renamed_table} FORMAT Values") == "(8192)" + + node.query(f"RENAME TABLE {renamed_table} TO {TABLE_NAME}") + assert node.query(f"CHECK TABLE {TABLE_NAME} FORMAT Values") == "(1)" + + node.query(f"DETACH TABLE {TABLE_NAME}") + node.query(f"ATTACH TABLE {TABLE_NAME}") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + + node.query(f"TRUNCATE TABLE {TABLE_NAME}") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(0)" + + +@pytest.mark.long_run +def test_move_replace_partition_to_another_table(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + table_clone_name = TABLE_NAME + "_clone" + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 256)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 256)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 256, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-06', 256, -1)}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + + create_table(node, table_clone_name) + + node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-03' TO TABLE {table_clone_name}") + node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-05' TO TABLE {table_clone_name}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(512)" + assert azure_query(node, f"SELECT sum(id) FROM {table_clone_name} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {table_clone_name} FORMAT Values") == "(512)" + + # Add new partitions to source table, but with different values and replace them from copied table. 
+ azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 256, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 256)}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + + node.query(f"ALTER TABLE {TABLE_NAME} REPLACE PARTITION '2020-01-03' FROM {table_clone_name}") + node.query(f"ALTER TABLE {TABLE_NAME} REPLACE PARTITION '2020-01-05' FROM {table_clone_name}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + assert azure_query(node, f"SELECT sum(id) FROM {table_clone_name} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {table_clone_name} FORMAT Values") == "(512)" + + node.query(f"DROP TABLE {table_clone_name} NO DELAY") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + + node.query(f"ALTER TABLE {TABLE_NAME} FREEZE") + + node.query(f"DROP TABLE {TABLE_NAME} NO DELAY") + + +def test_freeze_unfreeze(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + backup1 = 'backup1' + backup2 = 'backup2' + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + node.query(f"ALTER TABLE {TABLE_NAME} FREEZE WITH NAME '{backup1}'") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + node.query(f"ALTER TABLE {TABLE_NAME} FREEZE WITH NAME '{backup2}'") + + azure_query(node, f"TRUNCATE TABLE {TABLE_NAME}") + + # Unfreeze single partition from backup1. + node.query(f"ALTER TABLE {TABLE_NAME} UNFREEZE PARTITION '2020-01-03' WITH NAME '{backup1}'") + # Unfreeze all partitions from backup2. + node.query(f"ALTER TABLE {TABLE_NAME} UNFREEZE WITH NAME '{backup2}'") + + +def test_apply_new_settings(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + + # Force multi-part upload mode. + replace_config( + CONFIG_PATH, + "33554432", + "4096") + + node.query("SYSTEM RELOAD CONFIG") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096, -1)}") + + +# NOTE: this test takes a couple of minutes when run together with other tests +@pytest.mark.long_run +def test_restart_during_load(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + + # Force multi-part upload mode. 
+ replace_config(CONFIG_PATH, "false", "") + + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 4096, -1)}") + + + def read(): + for ii in range(0, 5): + logging.info(f"Executing {ii} query") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + logging.info(f"Query {ii} executed") + time.sleep(0.2) + + def restart_disk(): + for iii in range(0, 2): + logging.info(f"Restarting disk, attempt {iii}") + node.query(f"SYSTEM RESTART DISK {AZURE_BLOB_STORAGE_DISK}") + logging.info(f"Disk restarted, attempt {iii}") + time.sleep(0.5) + + threads = [] + for _ in range(0, 4): + threads.append(SafeThread(target=read)) + + threads.append(SafeThread(target=restart_disk)) + + for thread in threads: + thread.start() + + for thread in threads: + thread.join() + + +def test_big_insert(cluster): + node = cluster.instances[NODE_NAME] + create_table(node, TABLE_NAME) + azure_query(node, f"INSERT INTO {TABLE_NAME} select '2020-01-03', number, toString(number) from numbers(5000000)") + assert int(azure_query(node, f"SELECT count() FROM {TABLE_NAME}")) == 5000000 diff --git a/tests/integration/test_merge_tree_hdfs/test.py b/tests/integration/test_merge_tree_hdfs/test.py index d26692a0d93..d6e3315e45d 100644 --- a/tests/integration/test_merge_tree_hdfs/test.py +++ b/tests/integration/test_merge_tree_hdfs/test.py @@ -1,12 +1,10 @@ import logging -import random -import string import time -import threading import os import pytest from helpers.cluster import ClickHouseCluster +from helpers.utility import generate_values from pyhdfs import HdfsClient @@ -43,17 +41,6 @@ FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1 FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1 -def random_string(length): - letters = string.ascii_letters - return ''.join(random.choice(letters) for i in range(length)) - - -def generate_values(date_str, count, sign=1): - data = [[date_str, sign * (i + 1), random_string(10)] for i in range(count)] - data.sort(key=lambda tup: tup[1]) - return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) - - @pytest.fixture(scope="module") def cluster(): try: diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index b577d4a1405..04981523432 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -1,47 +1,16 @@ import logging -import random -import string import time -import threading import os import pytest from helpers.cluster import ClickHouseCluster, get_instances_dir +from helpers.utility import generate_values, replace_config, SafeThread -# By default the exceptions that was throwed in threads will be ignored -# (they will not mark the test as failed, only printed to stderr). 
-# -# Wrap thrading.Thread and re-throw exception on join() -class SafeThread(threading.Thread): - def __init__(self, target): - super().__init__() - self.target = target - self.exception = None - def run(self): - try: - self.target() - except Exception as e: # pylint: disable=broad-except - self.exception = e - def join(self, timeout=None): - super().join(timeout) - if self.exception: - raise self.exception - SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/node/configs/config.d/storage_conf.xml'.format(get_instances_dir())) -def replace_config(old, new): - config = open(CONFIG_PATH, 'r') - config_lines = config.readlines() - config.close() - config_lines = [line.replace(old, new) for line in config_lines] - config = open(CONFIG_PATH, 'w') - config.writelines(config_lines) - config.close() - - @pytest.fixture(scope="module") def cluster(): try: @@ -66,17 +35,6 @@ FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1 FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1 -def random_string(length): - letters = string.ascii_letters - return ''.join(random.choice(letters) for i in range(length)) - - -def generate_values(date_str, count, sign=1): - data = [[date_str, sign * (i + 1), random_string(10)] for i in range(count)] - data.sort(key=lambda tup: tup[1]) - return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) - - def create_table(node, table_name, **additional_settings): settings = { "storage_policy": "s3", @@ -442,8 +400,9 @@ def test_s3_disk_apply_new_settings(cluster, node_name): s3_requests_to_write_partition = get_s3_requests() - s3_requests_before # Force multi-part upload mode. - replace_config("33554432", - "0") + replace_config(CONFIG_PATH, + "33554432", + "0") node.query("SYSTEM RELOAD CONFIG") @@ -497,3 +456,16 @@ def test_s3_disk_reads_on_unstable_connection(cluster, node_name): for i in range(30): print(f"Read sequence {i}") assert node.query("SELECT sum(id) FROM s3_test").splitlines() == ["40499995500000"] + + +@pytest.mark.parametrize("node_name", ["node"]) +def test_lazy_seek_optimization_for_async_read(cluster, node_name): + node = cluster.instances[node_name] + node.query("DROP TABLE IF EXISTS s3_test NO DELAY") + node.query("CREATE TABLE s3_test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3';") + node.query("INSERT INTO s3_test SELECT * FROM generateRandom('key UInt32, value String') LIMIT 10000000") + node.query("SELECT * FROM s3_test WHERE value LIKE '%abc%' ORDER BY value LIMIT 10") + node.query("DROP TABLE IF EXISTS s3_test NO DELAY") + minio = cluster.minio_client + for obj in list(minio.list_objects(cluster.minio_bucket, 'data/')): + minio.remove_object(cluster.minio_bucket, obj.object_name) diff --git a/tests/integration/test_merge_tree_s3_failover/test.py b/tests/integration/test_merge_tree_s3_failover/test.py index b6b47417523..44e7e0ae5ad 100644 --- a/tests/integration/test_merge_tree_s3_failover/test.py +++ b/tests/integration/test_merge_tree_s3_failover/test.py @@ -37,7 +37,6 @@ def fail_request(cluster, request): ["curl", "-s", "http://resolver:8080/fail_request/{}".format(request)]) assert response == 'OK', 'Expected "OK", but got "{}"'.format(response) - def throttle_request(cluster, request): response = cluster.exec_in_container(cluster.get_container_id('resolver'), ["curl", "-s", "http://resolver:8080/throttle_request/{}".format(request)]) diff --git a/tests/integration/test_merge_tree_s3_restore/test.py 
b/tests/integration/test_merge_tree_s3_restore/test.py index babbea2beba..e12b69cdf17 100644 --- a/tests/integration/test_merge_tree_s3_restore/test.py +++ b/tests/integration/test_merge_tree_s3_restore/test.py @@ -7,6 +7,7 @@ import time import pytest from helpers.cluster import ClickHouseCluster, get_instances_dir + SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) NOT_RESTORABLE_CONFIG_PATH = os.path.join(SCRIPT_DIR, './{}/node_not_restorable/configs/config.d/storage_conf_not_restorable.xml'.format(get_instances_dir())) COMMON_CONFIGS = ["configs/config.d/bg_processing_pool_conf.xml", "configs/config.d/clusters.xml"] diff --git a/tests/integration/test_merge_tree_s3_with_cache/test.py b/tests/integration/test_merge_tree_s3_with_cache/test.py index e15eaf61812..be3d2709873 100644 --- a/tests/integration/test_merge_tree_s3_with_cache/test.py +++ b/tests/integration/test_merge_tree_s3_with_cache/test.py @@ -36,7 +36,6 @@ def get_query_stat(instance, hint): result[ev[0]] = int(ev[1]) return result - @pytest.mark.parametrize("min_rows_for_wide_part,read_requests", [(0, 2), (8192, 1)]) def test_write_is_cached(cluster, min_rows_for_wide_part, read_requests): node = cluster.instances["node"] diff --git a/tests/integration/test_mysql_database_engine/test.py b/tests/integration/test_mysql_database_engine/test.py index 49206ab1abe..ff1c955d78b 100644 --- a/tests/integration/test_mysql_database_engine/test.py +++ b/tests/integration/test_mysql_database_engine/test.py @@ -6,9 +6,10 @@ import pymysql.cursors import pytest from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) -clickhouse_node = cluster.add_instance('node1', main_configs=['configs/remote_servers.xml', 'configs/named_collections.xml'], with_mysql=True) +clickhouse_node = cluster.add_instance('node1', main_configs=['configs/remote_servers.xml', 'configs/named_collections.xml'], with_mysql=True, stay_alive=True) @pytest.fixture(scope="module") @@ -27,6 +28,7 @@ class MySQLNodeInstance: self.hostname = hostname self.password = password self.mysql_connection = None # lazy init + self.ip_address = hostname def query(self, execution_query): if self.mysql_connection is None: @@ -424,3 +426,24 @@ def test_predefined_connection_configuration(started_cluster): clickhouse_node.query("CREATE DATABASE test_database ENGINE = MySQL(mysql1, port=3306)") assert clickhouse_node.query("SELECT count() FROM `test_database`.`test_table`").rstrip() == '100' + + +def test_restart_server(started_cluster): + with contextlib.closing(MySQLNodeInstance('root', 'clickhouse', started_cluster.mysql_ip, started_cluster.mysql_port)) as mysql_node: + mysql_node.query("DROP DATABASE IF EXISTS test_restart") + clickhouse_node.query("DROP DATABASE IF EXISTS test_restart") + clickhouse_node.query_and_get_error("CREATE DATABASE test_restart ENGINE = MySQL('mysql57:3306', 'test_restart', 'root', 'clickhouse')") + assert 'test_restart' not in clickhouse_node.query('SHOW DATABASES') + + mysql_node.query("CREATE DATABASE test_restart DEFAULT CHARACTER SET 'utf8'") + mysql_node.query("CREATE TABLE `test_restart`.`test_table` ( `id` int(11) NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB;") + clickhouse_node.query("CREATE DATABASE test_restart ENGINE = MySQL('mysql57:3306', 'test_restart', 'root', 'clickhouse')") + + assert 'test_restart' in clickhouse_node.query('SHOW DATABASES') + assert 'test_table' in clickhouse_node.query('SHOW TABLES FROM 
test_restart') + + with PartitionManager() as pm: + pm.partition_instances(clickhouse_node, mysql_node, action='REJECT --reject-with tcp-reset') + clickhouse_node.restart_clickhouse() + clickhouse_node.query_and_get_error('SHOW TABLES FROM test_restart') + assert 'test_table' in clickhouse_node.query('SHOW TABLES FROM test_restart') diff --git a/tests/integration/test_parts_delete_zookeeper/test.py b/tests/integration/test_parts_delete_zookeeper/test.py index 8a4aafaa55c..62e14b68bd1 100644 --- a/tests/integration/test_parts_delete_zookeeper/test.py +++ b/tests/integration/test_parts_delete_zookeeper/test.py @@ -49,12 +49,16 @@ def test_merge_doesnt_work_without_zookeeper(start_cluster): node1.query("INSERT INTO test_table VALUES ('2018-10-01', 1), ('2018-10-02', 2), ('2018-10-03', 3)") node1.query("INSERT INTO test_table VALUES ('2018-10-01', 4), ('2018-10-02', 5), ('2018-10-03', 6)") - assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "2\n" + assert node1.query("SELECT count(*) from system.parts where table = 'test_table' and active") == "2\n" with PartitionManager() as pm: node1.query("OPTIMIZE TABLE test_table FINAL") pm.drop_instance_zk_connections(node1) - time.sleep(10) # > old_parts_lifetime - assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "3\n" + # unfortunately we can be too fast and delete node before partition with ZK + if node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "1\n": + print("We were too fast and deleted parts before partition with ZK") + else: + time.sleep(10) # > old_parts_lifetime + assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "3\n" assert_eq_with_retry(node1, "SELECT count(*) from system.parts where table = 'test_table' and active = 1", "1") diff --git a/tests/integration/test_postgresql_replica_database_engine_1/test.py b/tests/integration/test_postgresql_replica_database_engine_1/test.py index 0dd36d64516..cba9e93c056 100644 --- a/tests/integration/test_postgresql_replica_database_engine_1/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_1/test.py @@ -985,18 +985,29 @@ def test_abrupt_server_restart_while_heavy_replication(started_cluster): cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) -def test_quoting(started_cluster): - table_name = 'user' - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) +def test_quoting_1(started_cluster): + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) cursor = conn.cursor() + table_name = 'user' create_postgres_table(cursor, table_name); - instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(50)".format(table_name)) + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from numbers(50)") create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) check_tables_are_synchronized(table_name); - drop_postgres_table(cursor, table_name) drop_materialized_db() + drop_postgres_table(cursor, table_name) + + +def test_quoting_2(started_cluster): + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) + cursor = conn.cursor() + table_name = 'user' + create_postgres_table(cursor, table_name); + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from 
numbers(50)") + create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=[f"materialized_postgresql_tables_list = '{table_name}'"]) + check_tables_are_synchronized(table_name); + drop_materialized_db() + drop_postgres_table(cursor, table_name) def test_user_managed_slots(started_cluster): diff --git a/tests/integration/test_postgresql_replica_database_engine_2/test.py b/tests/integration/test_postgresql_replica_database_engine_2/test.py index 99f2facbaf6..7aee454c4a9 100644 --- a/tests/integration/test_postgresql_replica_database_engine_2/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_2/test.py @@ -34,6 +34,10 @@ postgres_table_template_4 = """ CREATE TABLE IF NOT EXISTS "{}"."{}" ( key Integer NOT NULL, value Integer, PRIMARY KEY(key)) """ +postgres_table_template_5 = """ + CREATE TABLE IF NOT EXISTS "{}" ( + key Integer NOT NULL, value UUID, PRIMARY KEY(key)) + """ def get_postgres_conn(ip, port, database=False, auto_commit=True, database_name='postgres_database', replication=False): if database == True: @@ -93,7 +97,7 @@ def drop_clickhouse_postgres_db(name='postgres_database'): def create_materialized_db(ip, port, materialized_database='test_database', postgres_database='postgres_database', - settings=[]): + settings=[], table_overrides=''): instance.query(f"DROP DATABASE IF EXISTS {materialized_database}") create_query = f"CREATE DATABASE {materialized_database} ENGINE = MaterializedPostgreSQL('{ip}:{port}', '{postgres_database}', 'postgres', 'mysecretpassword')" if len(settings) > 0: @@ -102,6 +106,7 @@ def create_materialized_db(ip, port, if i != 0: create_query += ', ' create_query += settings[i] + create_query += table_overrides instance.query(create_query) assert materialized_database in instance.query('SHOW DATABASES') @@ -173,7 +178,7 @@ def assert_number_of_columns(expected, table_name, database_name='test_database' def check_tables_are_synchronized(table_name, order_by='key', postgres_database='postgres_database', materialized_database='test_database', schema_name=''): assert_nested_table_is_created(table_name, materialized_database, schema_name) - print("Checking table is synchronized:", table_name) + print(f"Checking table is synchronized. Table name: {table_name}, table schema: {schema_name}") expected = instance.query('select * from {}.{} order by {};'.format(postgres_database, table_name, order_by)) if len(schema_name) == 0: result = instance.query('select * from {}.{} order by {};'.format(materialized_database, table_name, order_by)) @@ -351,6 +356,11 @@ def test_remove_table_from_replication(started_cluster): for i in range(NUM_TABLES): cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + # Removing from replication table which does not exist in PostgreSQL must be ok. 
+ instance.query('DETACH TABLE test_database.postgresql_replica_0'); + assert instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + drop_materialized_db() + def test_predefined_connection_configuration(started_cluster): drop_materialized_db() @@ -374,6 +384,7 @@ def test_database_with_single_non_default_schema(started_cluster): NUM_TABLES=5 schema_name = 'test_schema' + materialized_db = 'test_database' clickhouse_postgres_db = 'postgres_database_with_schema' global insert_counter insert_counter = 0 @@ -425,6 +436,14 @@ def test_database_with_single_non_default_schema(started_cluster): instance.query(f"INSERT INTO {clickhouse_postgres_db}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(5000, 1000)") assert_number_of_columns(3, f'postgresql_replica_{altered_table}') check_tables_are_synchronized(f"postgresql_replica_{altered_table}", postgres_database=clickhouse_postgres_db); + + print('DETACH-ATTACH') + detached_table_name = "postgresql_replica_1" + instance.query(f"DETACH TABLE {materialized_db}.{detached_table_name}") + assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + instance.query(f"ATTACH TABLE {materialized_db}.{detached_table_name}") + check_tables_are_synchronized(detached_table_name, postgres_database=clickhouse_postgres_db); + drop_materialized_db() @@ -435,6 +454,7 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): NUM_TABLES = 5 schema_name = 'test_schema' clickhouse_postgres_db = 'postgres_database_with_schema' + materialized_db = 'test_database' publication_tables = '' global insert_counter insert_counter = 0 @@ -489,6 +509,15 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): instance.query(f"INSERT INTO {clickhouse_postgres_db}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(5000, 1000)") assert_number_of_columns(3, f'{schema_name}.postgresql_replica_{altered_table}') check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=schema_name, postgres_database=clickhouse_postgres_db); + + print('DETACH-ATTACH') + detached_table_name = "postgresql_replica_1" + instance.query(f"DETACH TABLE {materialized_db}.`{schema_name}.{detached_table_name}`") + assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + instance.query(f"ATTACH TABLE {materialized_db}.`{schema_name}.{detached_table_name}`") + assert_show_tables("test_schema.postgresql_replica_0\ntest_schema.postgresql_replica_1\ntest_schema.postgresql_replica_2\ntest_schema.postgresql_replica_3\ntest_schema.postgresql_replica_4\n") + check_tables_are_synchronized(detached_table_name, schema_name=schema_name, postgres_database=clickhouse_postgres_db); + drop_materialized_db() @@ -499,6 +528,7 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): NUM_TABLES = 2 schemas_num = 2 schema_list = 'schema0, schema1' + materialized_db = 'test_database' global insert_counter insert_counter = 0 @@ -552,14 +582,50 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): print('ALTER') altered_schema = random.randint(0, schemas_num-1) altered_table = random.randint(0, NUM_TABLES-1) + clickhouse_postgres_db = f'clickhouse_postgres_db{altered_schema}' cursor.execute(f"ALTER TABLE schema{altered_schema}.postgresql_replica_{altered_table} ADD COLUMN value2 integer") instance.query(f"INSERT INTO 
clickhouse_postgres_db{altered_schema}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(1000 * {insert_counter}, 1000)") assert_number_of_columns(3, f'schema{altered_schema}.postgresql_replica_{altered_table}') - check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=schema_name, postgres_database=clickhouse_postgres_db); + check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=f"schema{altered_schema}", postgres_database=clickhouse_postgres_db); + + print('DETACH-ATTACH') + detached_table_name = "postgresql_replica_1" + detached_table_schema = "schema0" + clickhouse_postgres_db = f'clickhouse_postgres_db0' + instance.query(f"DETACH TABLE {materialized_db}.`{detached_table_schema}.{detached_table_name}`") + assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + instance.query(f"ATTACH TABLE {materialized_db}.`{detached_table_schema}.{detached_table_name}`") + assert_show_tables("schema0.postgresql_replica_0\nschema0.postgresql_replica_1\nschema1.postgresql_replica_0\nschema1.postgresql_replica_1\n") + check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=detached_table_schema, postgres_database=clickhouse_postgres_db); + drop_materialized_db() +def test_table_override(started_cluster): + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) + cursor = conn.cursor() + table_name = 'table_override' + materialized_database = 'test_database' + create_postgres_table(cursor, table_name, template=postgres_table_template_5); + instance.query(f"create table {table_name}(key Int32, value UUID) engine = PostgreSQL (postgres1, table={table_name})") + instance.query(f"insert into {table_name} select number, generateUUIDv4() from numbers(10)") + table_overrides = f" TABLE OVERRIDE {table_name} (COLUMNS (key Int32, value UUID))" + create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=[f"materialized_postgresql_tables_list = '{table_name}'"], table_overrides=table_overrides) + assert_nested_table_is_created(table_name, materialized_database) + result = instance.query(f"show create table {materialized_database}.{table_name}") + print(result) + expected = "CREATE TABLE test_database.table_override\\n(\\n `key` Int32,\\n `value` UUID,\\n `_sign` Int8() MATERIALIZED 1,\\n `_version` UInt64() MATERIALIZED 1\\n)\\nENGINE = ReplacingMergeTree(_version)\\nORDER BY tuple(key)" + assert(result.strip() == expected) + time.sleep(5) + query = f"select * from {materialized_database}.{table_name} order by key" + expected = instance.query(f"select * from {table_name} order by key") + assert_eq_with_retry(instance, query, expected) + drop_materialized_db() + drop_postgres_table(cursor, table_name) + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_prometheus_endpoint/test.py b/tests/integration/test_prometheus_endpoint/test.py index 06276803c3d..60d9164acd2 100644 --- a/tests/integration/test_prometheus_endpoint/test.py +++ b/tests/integration/test_prometheus_endpoint/test.py @@ -30,7 +30,7 @@ def parse_response_line(line): if line.startswith("#"): return {} - match = re.match('^([a-zA-Z_:][a-zA-Z0-9_:]+)(\{.*\})? (\d)', line) + match = re.match('^([a-zA-Z_:][a-zA-Z0-9_:]+)(\{.*\})? 
-?(\d)', line) assert match, line name, _, val = match.groups() return {name: int(val)} diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py index 793abc53566..edf39969b47 100644 --- a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py @@ -65,7 +65,6 @@ def create_table(cluster, additional_settings=None): list(cluster.instances.values())[0].query(create_table_statement) - @pytest.fixture(autouse=True) def drop_table(cluster): yield diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index d9f7cca4a3a..1c3713c02a2 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -46,6 +46,22 @@ def wait_for_large_objects_count(cluster, expected, size=100, timeout=30): assert get_large_objects_count(cluster, size=size) == expected +def wait_for_active_parts(node, num_expected_parts, table_name, timeout=30): + deadline = time.monotonic() + timeout + num_parts = 0 + while time.monotonic() < deadline: + num_parts_str = node.query("select count() from system.parts where table = '{}' and active".format(table_name)) + num_parts = int(num_parts_str.strip()) + if num_parts == num_expected_parts: + return + + time.sleep(0.2) + + assert num_parts == num_expected_parts + + +# Result of `get_large_objects_count` can be changed in other tests, so run this case at the beginning +@pytest.mark.order(0) @pytest.mark.parametrize( "policy", ["s3"] ) @@ -68,7 +84,7 @@ def test_s3_zero_copy_replication(cluster, policy): assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" - # Based on version 20.x - should be only one file with size 100+ (checksums.txt), used by both nodes + # Based on version 21.x - should be only 1 file with size 100+ (checksums.txt), used by both nodes assert get_large_objects_count(cluster) == 1 node2.query("INSERT INTO s3_test VALUES (2,'data'),(3,'data')") @@ -77,15 +93,15 @@ def test_s3_zero_copy_replication(cluster, policy): assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" - # Based on version 20.x - two parts + # Based on version 21.x - two parts wait_for_large_objects_count(cluster, 2) node1.query("OPTIMIZE TABLE s3_test FINAL") - # Based on version 20.x - after merge, two old parts and one merged + # Based on version 21.x - after merge, two old parts and one merged wait_for_large_objects_count(cluster, 3) - # Based on version 20.x - after cleanup - only one merged part + # Based on version 21.x - after cleanup - only one merged part wait_for_large_objects_count(cluster, 1, timeout=60) node1.query("DROP TABLE IF EXISTS s3_test NO DELAY") @@ -248,3 +264,50 @@ def test_s3_zero_copy_with_ttl_delete(cluster, large_data, iterations): node1.query("DROP TABLE IF EXISTS ttl_delete_test NO DELAY") node2.query("DROP TABLE IF EXISTS ttl_delete_test NO DELAY") + + +def test_s3_zero_copy_concurrent_merge(cluster): + node1 = cluster.instances["node1"] + node2 = cluster.instances["node2"] + + node1.query("DROP TABLE IF EXISTS concurrent_merge NO DELAY") + 
node2.query("DROP TABLE IF EXISTS concurrent_merge NO DELAY") + + for node in (node1, node2): + node.query( + """ + CREATE TABLE concurrent_merge (id UInt64) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/concurrent_merge', '{replica}') + ORDER BY id + SETTINGS index_granularity=2, storage_policy='s3', remote_fs_execute_merges_on_single_replica_time_threshold=1 + """ + ) + + node1.query("system stop merges") + node2.query("system stop merges") + + # This will generate two parts with 20 granules each + node1.query("insert into concurrent_merge select number from numbers(40)") + node1.query("insert into concurrent_merge select number + 1 from numbers(40)") + + wait_for_active_parts(node2, 2, 'concurrent_merge') + + # Merge will materialize default column, it should sleep every granule and take 20 * 2 * 0.1 = 4 sec. + node1.query("alter table concurrent_merge add column x UInt32 default sleep(0.1)") + + node1.query("system start merges") + node2.query("system start merges") + + # Now, the merge should start. + # Because of remote_fs_execute_merges_on_single_replica_time_threshold=1, + # only one replica will start merge instantly. + # The other replica should wait for 1 sec and also start it. + # That should probably cause a data race at s3 storage. + # For now, it does not happen (every blob has a random name, and we just have a duplicating data) + node1.query("optimize table concurrent_merge final") + + wait_for_active_parts(node1, 1, 'concurrent_merge') + wait_for_active_parts(node2, 1, 'concurrent_merge') + + for node in (node1, node2): + assert node.query('select sum(id) from concurrent_merge').strip() == '1600' diff --git a/tests/integration/test_server_reload/.gitignore b/tests/integration/test_server_reload/.gitignore new file mode 100644 index 00000000000..edf565ec632 --- /dev/null +++ b/tests/integration/test_server_reload/.gitignore @@ -0,0 +1 @@ +_gen diff --git a/tests/integration/test_server_reload/__init__.py b/tests/integration/test_server_reload/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_server_reload/configs/default_passwd.xml b/tests/integration/test_server_reload/configs/default_passwd.xml new file mode 100644 index 00000000000..5c23be0dcb0 --- /dev/null +++ b/tests/integration/test_server_reload/configs/default_passwd.xml @@ -0,0 +1,13 @@ + + + + + + + + + + 123 + + + diff --git a/tests/integration/test_server_reload/configs/dhparam.pem b/tests/integration/test_server_reload/configs/dhparam.pem new file mode 100644 index 00000000000..fb935b9c898 --- /dev/null +++ b/tests/integration/test_server_reload/configs/dhparam.pem @@ -0,0 +1,8 @@ +-----BEGIN DH PARAMETERS----- +MIIBCAKCAQEAkPGhfLY5nppeQkFBKYRpiisxzrRQfyyTUu6aabZP2CbAMAuoYzaC +Z+iqeWSQZKRYeA21SZXkC9xE1e5FJsc5IWzCRiMNZeLuj4ApUNysMu89DpX8/b91 ++Ka6wRJnaO43ZqHj/9FpU4JiYtxoIpXDC9HeiSAnwLwJc3L+nkYfnSGgvzWIxhGV +gCoVmVBoTe7wrqCyVlM5nrNZSjhlSugvXmu2bSK3MwYF08QLKvlF68eedbs0PMWh +WC0bFM/X7gMBEqL4DiINufAShbZPKxD6eL2APiHPUo6xun3ed/Po/5j8QBmiku0c +5Jb12ZhOTRTQjaRg2aFF8LPdW2tDE7HmewIBAg== +-----END DH PARAMETERS----- diff --git a/tests/integration/test_server_reload/configs/ports_from_zk.xml b/tests/integration/test_server_reload/configs/ports_from_zk.xml new file mode 100644 index 00000000000..ae3435a3d3c --- /dev/null +++ b/tests/integration/test_server_reload/configs/ports_from_zk.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/tests/integration/test_server_reload/configs/server.crt 
b/tests/integration/test_server_reload/configs/server.crt new file mode 100644 index 00000000000..6f4deca038f --- /dev/null +++ b/tests/integration/test_server_reload/configs/server.crt @@ -0,0 +1,18 @@ +-----BEGIN CERTIFICATE----- +MIIC+zCCAeOgAwIBAgIJAIhI9ozZJ+TWMA0GCSqGSIb3DQEBCwUAMBQxEjAQBgNV +BAMMCWxvY2FsaG9zdDAeFw0xOTA0MjIwNDMyNTJaFw0yMDA0MjEwNDMyNTJaMBQx +EjAQBgNVBAMMCWxvY2FsaG9zdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC +ggEBAK+wVUEdqF2uXvN0MJBgnAHyXi6JTi4p/F6igsrCjSNjJWzHH0vQmK8ujfcF +CkifW88i+W5eHctuEtQqNHK+t9x9YiZtXrj6m/XkOXs20mYgENSmbbbHbriTPnZB +zZrq6UqMlwIHNNAa+I3NMORQxVRaI0ybXnGVO5elr70xHpk03xL0JWKHpEqYp4db +2aBQgF6y3Ww4khxjIYqpUYXWXGFnVIRU7FKVEAM1xyKqvQzXjQ5sVM/wyHknveEF +3b/X4ggN+KNl5KOc0cWDh1/XaatJAPaUUPqZcq76tynLbP64Xm3dxHcj+gtRkO67 +ef6MSg6l63m3XQP6Qb+MIkd06OsCAwEAAaNQME4wHQYDVR0OBBYEFDmODTO8QLDN +ykR3x0LIOnjNhrKhMB8GA1UdIwQYMBaAFDmODTO8QLDNykR3x0LIOnjNhrKhMAwG +A1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAAwaiJc7uqEpnH3aukbftDwX +m8GfEnj1HVdgg+9GGNq+9rvUYBF6gdPmjRCX9dO0cclLFx8jc2org0rTSq9WoOhX +E6qL4Eqrmc5SE3Y9jZM0h6GRD4oXK014FmtZ3T6ddZU3dQLj3BS2r1XrvmubTvGN +ZuTJNY8nx8Hh6H5XINmsEjUF9E5hog+PwCE03xt2adIdYL+gsbxASeNYyeUFpZv5 +zcXR3VoakBWnAaOVgCHq2qh96QAnL7ZKzFkGf/MdwV10KU3dmb+ICbQUUdf9Gc17 +aaDCIRws312F433FdXBkGs2UkB7ZZme9dfn6O1QbeTNvex2VLMqYx/CTkfFbOQA= +-----END CERTIFICATE----- diff --git a/tests/integration/test_server_reload/configs/server.key b/tests/integration/test_server_reload/configs/server.key new file mode 100644 index 00000000000..6eddb3295db --- /dev/null +++ b/tests/integration/test_server_reload/configs/server.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCvsFVBHahdrl7z +dDCQYJwB8l4uiU4uKfxeooLKwo0jYyVsxx9L0JivLo33BQpIn1vPIvluXh3LbhLU +KjRyvrfcfWImbV64+pv15Dl7NtJmIBDUpm22x264kz52Qc2a6ulKjJcCBzTQGviN +zTDkUMVUWiNMm15xlTuXpa+9MR6ZNN8S9CVih6RKmKeHW9mgUIBest1sOJIcYyGK +qVGF1lxhZ1SEVOxSlRADNcciqr0M140ObFTP8Mh5J73hBd2/1+IIDfijZeSjnNHF +g4df12mrSQD2lFD6mXKu+rcpy2z+uF5t3cR3I/oLUZDuu3n+jEoOpet5t10D+kG/ +jCJHdOjrAgMBAAECggEARF66zrxb6RkSmmt8+rKeA6PuQu3sHsr4C1vyyjUr97l9 +tvdGlpp20LWtSZQMjHZ3pARYTTsTHTeY3DgQcRcHNicVKx8k3ZepWeeW9vw+pL+V +zSt3RsoVrH6gsCSrfr4sS3aqzX9AbjwQvh48CJ3mLQ1m70kHV+xbZIh1+4pB/hyP +1wKyUE18ZkOptXvO/TtoHzLQCecpkXtWzmry1Eh2isvXA+NMrAtLibGsyM1mtm7i +5ozevzHabvvCDBEe+KgZdONgVhhhvm2eOd+/s4w3rw4ETud4fI/ZAJyWXhiIKFnA +VJbElWruSAoVBW7p2bsF5PbmVzvo8vXL+VylxYD+AQKBgQDhLoRKTVhNkn/QjKxq +sdOh+QZra0LzjVpAmkQzu7wZMSHEz9qePQciDQQrYKrmRF1vNcIRCVUTqWYheJ/1 +lKRrCGa0ab6k96zkWMqLHD5u+UeJV7r1dJIx08ME9kNJ+x/XtB8klRIji16NiQUS +qc6p8z0M2AnbJzsRfWZRH8FeYwKBgQDHu8dzdtVGI7MtxfPOE/bfajiopDg8BdTC +pdug2T8XofRHRq7Q+0vYjTAZFT/slib91Pk6VvvPdo9VBZiL4omv4dAq6mOOdX/c +U14mJe1X5GCrr8ExZ8BfNJ3t/6sV1fcxyJwAw7iBguqxA2JqdM/wFk10K8XqvzVn +CD6O9yGt2QKBgFX1BMi8N538809vs41S7l9hCQNOQZNo/O+2M5yv6ECRkbtoQKKw +1x03bMUGNJaLuELweXE5Z8GGo5bZTe5X3F+DKHlr+DtO1C+ieUaa9HY2MAmMdLCn +2/qrREGLo+oEs4YKmuzC/taUp/ZNPKOAMISNdluFyFVg51pozPrgrVbTAoGBAKkE +LBl3O67o0t0vH8sJdeVFG8EJhlS0koBMnfgVHqC++dm+5HwPyvTrNQJkyv1HaqNt +r6FArkG3ED9gRuBIyT6+lctbIPgSUip9mbQqcBfqOCvQxGksZMur2ODncz09HLtS +CUFUXjOqNzOnq4ZuZu/Bz7U4vXiSaXxQq6+LTUKxAoGAFZU/qrI06XxnrE9A1X0W +l7DSkpZaDcu11NrZ473yONih/xOZNh4SSBpX8a7F6Pmh9BdtGqphML8NFPvQKcfP +b9H2iid2tc292uyrUEb5uTMmv61zoTwtitqLzO0+tS6PT3fXobX+eyeEWKzPBljL +HFtxG5CCXpkdnWRmaJnhTzA= +-----END PRIVATE KEY----- diff --git a/tests/integration/test_server_reload/configs/ssl_conf.xml b/tests/integration/test_server_reload/configs/ssl_conf.xml new file mode 100644 index 00000000000..43b25032059 --- /dev/null +++ 
b/tests/integration/test_server_reload/configs/ssl_conf.xml @@ -0,0 +1,18 @@ + + + + + + + /etc/clickhouse-server/config.d/server.crt + /etc/clickhouse-server/config.d/server.key + + /etc/clickhouse-server/config.d/dhparam.pem + none + true + true + sslv2,sslv3 + true + + + diff --git a/tests/integration/test_server_reload/protos/clickhouse_grpc.proto b/tests/integration/test_server_reload/protos/clickhouse_grpc.proto new file mode 100644 index 00000000000..c6cafaf6e40 --- /dev/null +++ b/tests/integration/test_server_reload/protos/clickhouse_grpc.proto @@ -0,0 +1,174 @@ +/* This file describes gRPC protocol supported in ClickHouse. + * + * To use this protocol a client should send one or more messages of the QueryInfo type + * and then receive one or more messages of the Result type. + * According to that the service provides four methods for that: + * ExecuteQuery(QueryInfo) returns (Result) + * ExecuteQueryWithStreamInput(stream QueryInfo) returns (Result) + * ExecuteQueryWithStreamOutput(QueryInfo) returns (stream Result) + * ExecuteQueryWithStreamIO(stream QueryInfo) returns (stream Result) + * It's up to the client to choose which method to use. + * For example, ExecuteQueryWithStreamInput() allows the client to add data multiple times + * while executing a query, which is suitable for inserting many rows. + */ + +syntax = "proto3"; + +package clickhouse.grpc; + +message NameAndType { + string name = 1; + string type = 2; +} + +// Describes an external table - a table which will exists only while a query is executing. +message ExternalTable { + // Name of the table. If omitted, "_data" is used. + string name = 1; + + // Columns of the table. Types are required, names can be omitted. If the names are omitted, "_1", "_2", ... is used. + repeated NameAndType columns = 2; + + // Data to insert to the external table. + // If a method with streaming input (i.e. ExecuteQueryWithStreamInput() or ExecuteQueryWithStreamIO()) is used, + // then data for insertion to the same external table can be split between multiple QueryInfos. + bytes data = 3; + + // Format of the data to insert to the external table. + string format = 4; + + // Settings for executing that insertion, applied after QueryInfo.settings. + map settings = 5; +} + +enum CompressionAlgorithm { + NO_COMPRESSION = 0; + DEFLATE = 1; + GZIP = 2; + STREAM_GZIP = 3; +} + +enum CompressionLevel { + COMPRESSION_NONE = 0; + COMPRESSION_LOW = 1; + COMPRESSION_MEDIUM = 2; + COMPRESSION_HIGH = 3; +} + +message Compression { + CompressionAlgorithm algorithm = 1; + CompressionLevel level = 2; +} + +// Information about a query which a client sends to a ClickHouse server. +// The first QueryInfo can set any of the following fields. Extra QueryInfos only add extra data. +// In extra QueryInfos only `input_data`, `external_tables`, `next_query_info` and `cancel` fields can be set. +message QueryInfo { + string query = 1; + string query_id = 2; + map settings = 3; + + // Default database. + string database = 4; + + // Input data, used both as data for INSERT query and as data for the input() function. + bytes input_data = 5; + + // Delimiter for input_data, inserted between input_data from adjacent QueryInfos. + bytes input_data_delimiter = 6; + + // Default output format. If not specified, 'TabSeparated' is used. + string output_format = 7; + + repeated ExternalTable external_tables = 8; + + string user_name = 9; + string password = 10; + string quota = 11; + + // Works exactly like sessions in the HTTP protocol. 
+ string session_id = 12; + bool session_check = 13; + uint32 session_timeout = 14; + + // Set `cancel` to true to stop executing the query. + bool cancel = 15; + + // If true there will be at least one more QueryInfo in the input stream. + // `next_query_info` is allowed to be set only if a method with streaming input (i.e. ExecuteQueryWithStreamInput() or ExecuteQueryWithStreamIO()) is used. + bool next_query_info = 16; + + /// Controls how a ClickHouse server will compress query execution results before sending back to the client. + /// If not set the compression settings from the configuration file will be used. + Compression result_compression = 17; +} + +enum LogsLevel { + LOG_NONE = 0; + LOG_FATAL = 1; + LOG_CRITICAL = 2; + LOG_ERROR = 3; + LOG_WARNING = 4; + LOG_NOTICE = 5; + LOG_INFORMATION = 6; + LOG_DEBUG = 7; + LOG_TRACE = 8; +} + +message LogEntry { + uint32 time = 1; + uint32 time_microseconds = 2; + uint64 thread_id = 3; + string query_id = 4; + LogsLevel level = 5; + string source = 6; + string text = 7; +} + +message Progress { + uint64 read_rows = 1; + uint64 read_bytes = 2; + uint64 total_rows_to_read = 3; + uint64 written_rows = 4; + uint64 written_bytes = 5; +} + +message Stats { + uint64 rows = 1; + uint64 blocks = 2; + uint64 allocated_bytes = 3; + bool applied_limit = 4; + uint64 rows_before_limit = 5; +} + +message Exception { + int32 code = 1; + string name = 2; + string display_text = 3; + string stack_trace = 4; +} + +// Result of execution of a query which is sent back by the ClickHouse server to the client. +message Result { + // Output of the query, represented in the `output_format` or in a format specified in `query`. + bytes output = 1; + bytes totals = 2; + bytes extremes = 3; + + repeated LogEntry logs = 4; + Progress progress = 5; + Stats stats = 6; + + // Set by the ClickHouse server if there was an exception thrown while executing. + Exception exception = 7; + + // Set by the ClickHouse server if executing was cancelled by the `cancel` field in QueryInfo. + bool cancelled = 8; +} + +service ClickHouse { + rpc ExecuteQuery(QueryInfo) returns (Result) {} + rpc ExecuteQueryWithStreamInput(stream QueryInfo) returns (Result) {} + rpc ExecuteQueryWithStreamOutput(QueryInfo) returns (stream Result) {} + rpc ExecuteQueryWithStreamIO(stream QueryInfo) returns (stream Result) {} +} diff --git a/tests/integration/test_server_reload/test.py b/tests/integration/test_server_reload/test.py new file mode 100644 index 00000000000..3c22b476f64 --- /dev/null +++ b/tests/integration/test_server_reload/test.py @@ -0,0 +1,284 @@ +import contextlib +import grpc +import psycopg2 +import pymysql.connections +import pymysql.err +import pytest +import sys +import time +from helpers.cluster import ClickHouseCluster, run_and_check +from helpers.client import Client, QueryRuntimeException +from kazoo.exceptions import NodeExistsError +from pathlib import Path +from requests.exceptions import ConnectionError +from urllib3.util.retry import Retry + +cluster = ClickHouseCluster(__file__) +instance = cluster.add_instance( + "instance", + main_configs=[ + "configs/ports_from_zk.xml", "configs/ssl_conf.xml", "configs/dhparam.pem", "configs/server.crt", "configs/server.key" + ], + user_configs=["configs/default_passwd.xml"], + with_zookeeper=True) + + +LOADS_QUERY = "SELECT value FROM system.events WHERE event = 'MainConfigLoads'" + + +# Use grpcio-tools to generate *pb2.py files from *.proto. 
+ +proto_dir = Path(__file__).parent / "protos" +gen_dir = Path(__file__).parent / "_gen" +gen_dir.mkdir(exist_ok=True) +run_and_check( + f"python3 -m grpc_tools.protoc -I{proto_dir!s} --python_out={gen_dir!s} --grpc_python_out={gen_dir!s} \ + {proto_dir!s}/clickhouse_grpc.proto", shell=True) + +sys.path.append(str(gen_dir)) +import clickhouse_grpc_pb2 +import clickhouse_grpc_pb2_grpc + + +@pytest.fixture(name="cluster", scope="module") +def fixture_cluster(): + try: + cluster.add_zookeeper_startup_command(configure_ports_from_zk) + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(name="zk", scope="module") +def fixture_zk(cluster): + return cluster.get_kazoo_client("zoo1") + + +def get_client(cluster, port): + return Client(host=cluster.get_instance_ip("instance"), port=port, command=cluster.client_bin_path) + + +def get_mysql_client(cluster, port): + start_time = time.monotonic() + while True: + try: + return pymysql.connections.Connection( + host=cluster.get_instance_ip("instance"), user="default", password="", database="default", port=port) + except pymysql.err.OperationalError: + if time.monotonic() - start_time > 10: + raise + time.sleep(0.1) + + +def get_pgsql_client(cluster, port): + start_time = time.monotonic() + while True: + try: + return psycopg2.connect( + host=cluster.get_instance_ip("instance"), user="postgresql", password="123", database="default", port=port) + except psycopg2.OperationalError: + if time.monotonic() - start_time > 10: + raise + time.sleep(0.1) + + +def get_grpc_channel(cluster, port): + host_port = cluster.get_instance_ip("instance") + f":{port}" + channel = grpc.insecure_channel(host_port) + grpc.channel_ready_future(channel).result(timeout=10) + return channel + + +def grpc_query(channel, query_text): + query_info = clickhouse_grpc_pb2.QueryInfo(query=query_text) + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(channel) + result = stub.ExecuteQuery(query_info) + if result and result.HasField("exception"): + raise Exception(result.exception.display_text) + return result.output.decode() + + +def configure_ports_from_zk(zk, querier=None): + default_config = [ + ("/clickhouse/listen_hosts", b"0.0.0.0"), + ("/clickhouse/ports/tcp", b"9000"), + ("/clickhouse/ports/http", b"8123"), + ("/clickhouse/ports/mysql", b"9004"), + ("/clickhouse/ports/postgresql", b"9005"), + ("/clickhouse/ports/grpc", b"9100"), + ] + for path, value in default_config: + if querier is not None: + loads_before = querier(LOADS_QUERY) + has_changed = False + try: + zk.create(path=path, value=value, makepath=True) + has_changed = True + except NodeExistsError: + if zk.get(path) != value: + zk.set(path=path, value=value) + has_changed = True + if has_changed and querier is not None: + wait_loaded_config_changed(loads_before, querier) + + +@contextlib.contextmanager +def sync_loaded_config(querier): + # Depending on whether we test a change on tcp or http + # we monitor canges using the other, untouched, protocol + loads_before = querier(LOADS_QUERY) + yield + wait_loaded_config_changed(loads_before, querier) + + +def wait_loaded_config_changed(loads_before, querier): + loads_after = None + start_time = time.monotonic() + while time.monotonic() - start_time < 10: + try: + loads_after = querier(LOADS_QUERY) + if loads_after != loads_before: + return + except (QueryRuntimeException, ConnectionError): + pass + time.sleep(0.1) + assert loads_after is not None and loads_after != loads_before + + +@contextlib.contextmanager +def default_client(cluster, zk, 
restore_via_http=False):
+    client = get_client(cluster, port=9000)
+    try:
+        yield client
+    finally:
+        querier = instance.http_query if restore_via_http else client.query
+        configure_ports_from_zk(zk, querier)
+
+
+def test_change_tcp_port(cluster, zk):
+    with default_client(cluster, zk, restore_via_http=True) as client:
+        assert client.query("SELECT 1") == "1\n"
+        with sync_loaded_config(instance.http_query):
+            zk.set("/clickhouse/ports/tcp", b"9090")
+        with pytest.raises(QueryRuntimeException, match="Connection refused"):
+            client.query("SELECT 1")
+        client_on_new_port = get_client(cluster, port=9090)
+        assert client_on_new_port.query("SELECT 1") == "1\n"
+
+
+def test_change_http_port(cluster, zk):
+    with default_client(cluster, zk) as client:
+        retry_strategy = Retry(total=10, backoff_factor=0.1)
+        assert instance.http_query("SELECT 1", retry_strategy=retry_strategy) == "1\n"
+        with sync_loaded_config(client.query):
+            zk.set("/clickhouse/ports/http", b"9090")
+        with pytest.raises(ConnectionError, match="Connection refused"):
+            instance.http_query("SELECT 1")
+        assert instance.http_query("SELECT 1", port=9090) == "1\n"
+
+
+def test_change_mysql_port(cluster, zk):
+    with default_client(cluster, zk) as client:
+        mysql_client = get_mysql_client(cluster, port=9004)
+        assert mysql_client.query("SELECT 1") == 1
+        with sync_loaded_config(client.query):
+            zk.set("/clickhouse/ports/mysql", b"9090")
+        with pytest.raises(pymysql.err.OperationalError, match="Lost connection"):
+            mysql_client.query("SELECT 1")
+        mysql_client_on_new_port = get_mysql_client(cluster, port=9090)
+        assert mysql_client_on_new_port.query("SELECT 1") == 1
+
+
+def test_change_postgresql_port(cluster, zk):
+    with default_client(cluster, zk) as client:
+        pgsql_client = get_pgsql_client(cluster, port=9005)
+        cursor = pgsql_client.cursor()
+        cursor.execute("SELECT 1")
+        assert cursor.fetchall() == [(1,)]
+        with sync_loaded_config(client.query):
+            zk.set("/clickhouse/ports/postgresql", b"9090")
+        with pytest.raises(psycopg2.OperationalError, match="closed"):
+            cursor.execute("SELECT 1")
+        pgsql_client_on_new_port = get_pgsql_client(cluster, port=9090)
+        cursor = pgsql_client_on_new_port.cursor()
+        cursor.execute("SELECT 1")
+        assert cursor.fetchall() == [(1,)]
+
+
+def test_change_grpc_port(cluster, zk):
+    with default_client(cluster, zk) as client:
+        grpc_channel = get_grpc_channel(cluster, port=9100)
+        assert grpc_query(grpc_channel, "SELECT 1") == "1\n"
+        with sync_loaded_config(client.query):
+            zk.set("/clickhouse/ports/grpc", b"9090")
+        with pytest.raises(grpc._channel._InactiveRpcError, match="StatusCode.UNAVAILABLE"):
+            grpc_query(grpc_channel, "SELECT 1")
+        grpc_channel_on_new_port = get_grpc_channel(cluster, port=9090)
+        assert grpc_query(grpc_channel_on_new_port, "SELECT 1") == "1\n"
+
+
+def test_remove_tcp_port(cluster, zk):
+    with default_client(cluster, zk, restore_via_http=True) as client:
+        assert client.query("SELECT 1") == "1\n"
+        with sync_loaded_config(instance.http_query):
+            zk.delete("/clickhouse/ports/tcp")
+        with pytest.raises(QueryRuntimeException, match="Connection refused"):
+            client.query("SELECT 1")
+
+
+def test_remove_http_port(cluster, zk):
+    with default_client(cluster, zk) as client:
+        assert instance.http_query("SELECT 1") == "1\n"
+        with sync_loaded_config(client.query):
+            zk.delete("/clickhouse/ports/http")
+        with pytest.raises(ConnectionError, match="Connection refused"):
+            instance.http_query("SELECT 1")
+
+
+def test_remove_mysql_port(cluster, zk):
+    with default_client(cluster, zk) as client:
+
mysql_client = get_mysql_client(cluster, port=9004) + assert mysql_client.query("SELECT 1") == 1 + with sync_loaded_config(client.query): + zk.delete("/clickhouse/ports/mysql") + with pytest.raises(pymysql.err.OperationalError, match="Lost connection"): + mysql_client.query("SELECT 1") + + +def test_remove_postgresql_port(cluster, zk): + with default_client(cluster, zk) as client: + pgsql_client = get_pgsql_client(cluster, port=9005) + cursor = pgsql_client.cursor() + cursor.execute("SELECT 1") + assert cursor.fetchall() == [(1,)] + with sync_loaded_config(client.query): + zk.delete("/clickhouse/ports/postgresql") + with pytest.raises(psycopg2.OperationalError, match="closed"): + cursor.execute("SELECT 1") + + +def test_remove_grpc_port(cluster, zk): + with default_client(cluster, zk) as client: + grpc_channel = get_grpc_channel(cluster, port=9100) + assert grpc_query(grpc_channel, "SELECT 1") == "1\n" + with sync_loaded_config(client.query): + zk.delete("/clickhouse/ports/grpc") + with pytest.raises(grpc._channel._InactiveRpcError, match="StatusCode.UNAVAILABLE"): + grpc_query(grpc_channel, "SELECT 1") + + +def test_change_listen_host(cluster, zk): + localhost_client = Client(host="127.0.0.1", port=9000, command="/usr/bin/clickhouse") + localhost_client.command = ["docker", "exec", "-i", instance.docker_id] + localhost_client.command + try: + client = get_client(cluster, port=9000) + with sync_loaded_config(localhost_client.query): + zk.set("/clickhouse/listen_hosts", b"127.0.0.1") + with pytest.raises(QueryRuntimeException, match="Connection refused"): + client.query("SELECT 1") + assert localhost_client.query("SELECT 1") == "1\n" + finally: + with sync_loaded_config(localhost_client.query): + configure_ports_from_zk(zk) + diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index ede1dafefb1..f317fb5429a 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -310,6 +310,7 @@ def test_seekable_formats(started_cluster): result = node1.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) + def test_read_table_with_default(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -322,6 +323,44 @@ def test_read_table_with_default(started_cluster): "select * from hdfs('hdfs://hdfs1:9000/simple_table_function', 'TSVWithNames', 'n UInt32, m UInt32 DEFAULT n * 2') FORMAT TSVWithNames") == output +def test_schema_inference(started_cluster): + node1.query(f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000)") + + result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert(int(result) == 5000000) + + node1.query(f"create table schema_inference engine=HDFS('hdfs://hdfs1:9000/native', 'Native')") + result = node1.query(f"desc schema_inference") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + + +def test_hdfsCluster(started_cluster): + hdfs_api = started_cluster.hdfs_api + fs = HdfsClient(hosts=started_cluster.hdfs_ip) + dir = '/test_hdfsCluster' + exists = fs.exists(dir) + if exists: + fs.delete(dir, recursive=True) + fs.mkdirs(dir) + hdfs_api.write_data("/test_hdfsCluster/file1", "1\n") + 
hdfs_api.write_data("/test_hdfsCluster/file2", "2\n") + hdfs_api.write_data("/test_hdfsCluster/file3", "3\n") + + actual = node1.query("select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id") + expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n" + assert actual == expected + + actual = node1.query("select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id") + expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n" + assert actual == expected + fs.delete(dir, recursive=True) + if __name__ == '__main__': cluster.start() diff --git a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/message_with_repeated.proto b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/message_with_repeated.proto new file mode 100644 index 00000000000..791a5086866 --- /dev/null +++ b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/message_with_repeated.proto @@ -0,0 +1,19 @@ +syntax = "proto3"; +option optimize_for = SPEED; +message Message { + uint32 tnow = 1; + string server = 2; + string clien = 3; + uint32 sPort = 4; + uint32 cPort = 5; + repeated dd r = 6; + string method = 7; +} + +message dd { + string name = 1; + uint32 class = 2; + uint32 type = 3; + uint64 ttl = 4; + bytes data = 5; +} \ No newline at end of file diff --git a/tests/integration/test_storage_kafka/message_with_repeated_pb2.py b/tests/integration/test_storage_kafka/message_with_repeated_pb2.py new file mode 100644 index 00000000000..69702307e7f --- /dev/null +++ b/tests/integration/test_storage_kafka/message_with_repeated_pb2.py @@ -0,0 +1,180 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: clickhouse_path/format_schemas/message_with_repeated.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='clickhouse_path/format_schemas/message_with_repeated.proto', + package='', + syntax='proto3', + serialized_options=_b('H\001'), + serialized_pb=_b('\n:clickhouse_path/format_schemas/message_with_repeated.proto\"t\n\x07Message\x12\x0c\n\x04tnow\x18\x01 \x01(\r\x12\x0e\n\x06server\x18\x02 \x01(\t\x12\r\n\x05\x63lien\x18\x03 \x01(\t\x12\r\n\x05sPort\x18\x04 \x01(\r\x12\r\n\x05\x63Port\x18\x05 \x01(\r\x12\x0e\n\x01r\x18\x06 \x03(\x0b\x32\x03.dd\x12\x0e\n\x06method\x18\x07 \x01(\t\"J\n\x02\x64\x64\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05\x63lass\x18\x02 \x01(\r\x12\x0c\n\x04type\x18\x03 \x01(\r\x12\x0b\n\x03ttl\x18\x04 \x01(\x04\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\x42\x02H\x01\x62\x06proto3') +) + + + + +_MESSAGE = _descriptor.Descriptor( + name='Message', + full_name='Message', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='tnow', full_name='Message.tnow', index=0, + number=1, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='server', full_name='Message.server', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='clien', full_name='Message.clien', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='sPort', full_name='Message.sPort', index=3, + number=4, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='cPort', full_name='Message.cPort', index=4, + number=5, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='r', full_name='Message.r', index=5, + number=6, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='method', full_name='Message.method', index=6, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, 
containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=62, + serialized_end=178, +) + + +_DD = _descriptor.Descriptor( + name='dd', + full_name='dd', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='dd.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='class', full_name='dd.class', index=1, + number=2, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='type', full_name='dd.type', index=2, + number=3, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='ttl', full_name='dd.ttl', index=3, + number=4, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='data', full_name='dd.data', index=4, + number=5, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=180, + serialized_end=254, +) + +_MESSAGE.fields_by_name['r'].message_type = _DD +DESCRIPTOR.message_types_by_name['Message'] = _MESSAGE +DESCRIPTOR.message_types_by_name['dd'] = _DD +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +Message = _reflection.GeneratedProtocolMessageType('Message', (_message.Message,), dict( + DESCRIPTOR = _MESSAGE, + __module__ = 'clickhouse_path.format_schemas.message_with_repeated_pb2' + # @@protoc_insertion_point(class_scope:Message) + )) +_sym_db.RegisterMessage(Message) + +dd = _reflection.GeneratedProtocolMessageType('dd', (_message.Message,), dict( + DESCRIPTOR = _DD, + __module__ = 'clickhouse_path.format_schemas.message_with_repeated_pb2' + # @@protoc_insertion_point(class_scope:dd) + )) +_sym_db.RegisterMessage(dd) + + +DESCRIPTOR._options = None +# @@protoc_insertion_point(module_scope) diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index c909926d8f0..1ee7f3cf125 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -36,6 +36,7 @@ from kafka.admin import NewTopic from . import kafka_pb2 from . import social_pb2 +from . import message_with_repeated_pb2 # TODO: add test for run-time offset update in CH, if we manually update it on Kafka side. 
@@ -3219,6 +3220,124 @@ def test_kafka_predefined_configuration(kafka_cluster): kafka_check_result(result, True) +# https://github.com/ClickHouse/ClickHouse/issues/26643 +def test_issue26643(kafka_cluster): + + # for backporting: + # admin_client = KafkaAdminClient(bootstrap_servers="localhost:9092") + admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) + producer = KafkaProducer(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port), value_serializer=producer_serializer) + + topic_list = [] + topic_list.append(NewTopic(name="test_issue26643", num_partitions=4, replication_factor=1)) + admin_client.create_topics(new_topics=topic_list, validate_only=False) + + msg = message_with_repeated_pb2.Message( + tnow=1629000000, + server='server1', + clien='host1', + sPort=443, + cPort=50000, + r=[ + message_with_repeated_pb2.dd(name='1', type=444, ttl=123123, data=b'adsfasd'), + message_with_repeated_pb2.dd(name='2') + ], + method='GET' + ) + + data = b'' + serialized_msg = msg.SerializeToString() + data = data + _VarintBytes(len(serialized_msg)) + serialized_msg + + msg = message_with_repeated_pb2.Message( + tnow=1629000002 + ) + + serialized_msg = msg.SerializeToString() + data = data + _VarintBytes(len(serialized_msg)) + serialized_msg + + producer.send(topic="test_issue26643", value=data) + + data = _VarintBytes(len(serialized_msg)) + serialized_msg + producer.send(topic="test_issue26643", value=data) + producer.flush() + + instance.query(''' + CREATE TABLE IF NOT EXISTS test.test_queue + ( + `tnow` UInt32, + `server` String, + `client` String, + `sPort` UInt16, + `cPort` UInt16, + `r.name` Array(String), + `r.class` Array(UInt16), + `r.type` Array(UInt16), + `r.ttl` Array(UInt32), + `r.data` Array(String), + `method` String + ) + ENGINE = Kafka + SETTINGS + kafka_broker_list = 'kafka1:19092', + kafka_topic_list = 'test_issue26643', + kafka_group_name = 'test_issue26643_group', + kafka_format = 'Protobuf', + kafka_schema = 'message_with_repeated.proto:Message', + kafka_num_consumers = 4, + kafka_skip_broken_messages = 10000; + + SET allow_suspicious_low_cardinality_types=1; + + CREATE TABLE test.log + ( + `tnow` DateTime CODEC(DoubleDelta, LZ4), + `server` LowCardinality(String), + `client` LowCardinality(String), + `sPort` LowCardinality(UInt16), + `cPort` UInt16 CODEC(T64, LZ4), + `r.name` Array(String), + `r.class` Array(LowCardinality(UInt16)), + `r.type` Array(LowCardinality(UInt16)), + `r.ttl` Array(LowCardinality(UInt32)), + `r.data` Array(String), + `method` LowCardinality(String) + ) + ENGINE = MergeTree + PARTITION BY toYYYYMMDD(tnow) + ORDER BY (tnow, server) + TTL toDate(tnow) + toIntervalMonth(1000) + SETTINGS index_granularity = 16384, merge_with_ttl_timeout = 7200; + + CREATE MATERIALIZED VIEW test.test_consumer TO test.log AS + SELECT + toDateTime(a.tnow) AS tnow, + a.server AS server, + a.client AS client, + a.sPort AS sPort, + a.cPort AS cPort, + a.`r.name` AS `r.name`, + a.`r.class` AS `r.class`, + a.`r.type` AS `r.type`, + a.`r.ttl` AS `r.ttl`, + a.`r.data` AS `r.data`, + a.method AS method + FROM test.test_queue AS a; + ''') + + instance.wait_for_log_line("Committed offset") + result = instance.query('SELECT * FROM test.log') + + expected = '''\ +2021-08-15 07:00:00 server1 443 50000 ['1','2'] [0,0] [444,0] [123123,0] ['adsfasd',''] GET +2021-08-15 07:00:02 0 0 [] [] [] [] [] +2021-08-15 07:00:02 0 0 [] [] [] [] [] +''' + assert TSV(result) == TSV(expected) + + # kafka_cluster.open_bash_shell('instance') + + if 
__name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_kerberized_kafka/test.py b/tests/integration/test_storage_kerberized_kafka/test.py index 5f5063a879f..567a9b7184d 100644 --- a/tests/integration/test_storage_kerberized_kafka/test.py +++ b/tests/integration/test_storage_kerberized_kafka/test.py @@ -97,6 +97,22 @@ def test_kafka_json_as_string(kafka_cluster): assert instance.contains_in_log("Parsing of message (topic: kafka_json_as_string, partition: 0, offset: 1) return no rows") def test_kafka_json_as_string_no_kdc(kafka_cluster): + # When the test is run alone (not preceded by any other kerberized kafka test), + # we need a ticket to + # assert instance.contains_in_log("Ticket expired") + instance.query(''' + CREATE TABLE test.kafka_no_kdc_warm_up (field String) + ENGINE = Kafka + SETTINGS kafka_broker_list = 'kerberized_kafka1:19092', + kafka_topic_list = 'kafka_json_as_string_no_kdc_warm_up', + kafka_group_name = 'kafka_json_as_string_no_kdc_warm_up', + kafka_commit_on_select = 1, + kafka_format = 'JSONAsString', + kafka_flush_interval_ms=1000; + ''') + + instance.query('SELECT * FROM test.kafka_no_kdc_warm_up;') + kafka_produce(kafka_cluster, 'kafka_json_as_string_no_kdc', ['{"t": 123, "e": {"x": "woof"} }', '', '{"t": 124, "e": {"x": "test"} }', '{"F1":"V1","F2":{"F21":"V21","F22":{},"F23":"V23","F24":"2019-12-24T16:28:04"},"F3":"V3"}']) kafka_cluster.pause_container('kafka_kerberos') diff --git a/tests/integration/test_storage_mongodb/configs/named_collections.xml b/tests/integration/test_storage_mongodb/configs/named_collections.xml index feb6b55af02..5f7db390982 100644 --- a/tests/integration/test_storage_mongodb/configs/named_collections.xml +++ b/tests/integration/test_storage_mongodb/configs/named_collections.xml @@ -6,7 +6,7 @@ mongo1 27017 test -
simple_table
+ simple_table diff --git a/tests/integration/test_storage_mongodb/test.py b/tests/integration/test_storage_mongodb/test.py index 1a5de353d7d..16358ed4cad 100644 --- a/tests/integration/test_storage_mongodb/test.py +++ b/tests/integration/test_storage_mongodb/test.py @@ -20,8 +20,12 @@ def started_cluster(request): cluster.shutdown() -def get_mongo_connection(started_cluster, secure=False): - connection_str = 'mongodb://root:clickhouse@localhost:{}'.format(started_cluster.mongo_port) +def get_mongo_connection(started_cluster, secure=False, with_credentials=True): + connection_str = '' + if with_credentials: + connection_str = 'mongodb://root:clickhouse@localhost:{}'.format(started_cluster.mongo_port) + else: + connection_str = 'mongodb://localhost:27018' if secure: connection_str += '/?tls=true&tlsAllowInvalidCertificates=true' return pymongo.MongoClient(connection_str) @@ -138,4 +142,44 @@ def test_predefined_connection_configuration(started_cluster): node = started_cluster.instances['node'] node.query("create table simple_mongo_table(key UInt64, data String) engine = MongoDB(mongo1)") + assert node.query("SELECT count() FROM simple_mongo_table") == '100\n' + simple_mongo_table.drop() + +@pytest.mark.parametrize('started_cluster', [False], indirect=['started_cluster']) +def test_no_credentials(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) + db = mongo_connection['test'] + simple_mongo_table = db['simple_table'] + data = [] + for i in range(0, 100): + data.append({'key': i, 'data': hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances['node'] + node.query("create table simple_mongo_table_2(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', '', '')") + assert node.query("SELECT count() FROM simple_mongo_table_2") == '100\n' + simple_mongo_table.drop() + +@pytest.mark.parametrize('started_cluster', [False], indirect=['started_cluster']) +def test_auth_source(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) + admin_db = mongo_connection['admin'] + admin_db.add_user('root', 'clickhouse', roles=[{ 'role': "userAdminAnyDatabase", 'db': "admin" }, "readWriteAnyDatabase"]) + simple_mongo_table = admin_db['simple_table'] + data = [] + for i in range(0, 50): + data.append({'key': i, 'data': hex(i * i)}) + simple_mongo_table.insert_many(data) + db = mongo_connection['test'] + simple_mongo_table = db['simple_table'] + data = [] + for i in range(0, 100): + data.append({'key': i, 'data': hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances['node'] + node.query("create table simple_mongo_table_fail(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse')") + node.query_and_get_error("SELECT count() FROM simple_mongo_table_fail") + node.query("create table simple_mongo_table_ok(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse', 'authSource=admin')") + assert node.query("SELECT count() FROM simple_mongo_table_ok") == '100\n' simple_mongo_table.drop() diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index 6f43036e64d..b6ac121cd0c 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -424,6 +424,21 @@ def test_predefined_connection_configuration(started_cluster): 
cursor.execute(f'DROP TABLE test_table ') +def test_where_false(started_cluster): + cursor = started_cluster.postgres_conn.cursor() + cursor.execute("DROP TABLE IF EXISTS test") + cursor.execute('CREATE TABLE test (a Integer)') + cursor.execute("INSERT INTO test SELECT 1") + + result = node1.query("SELECT count() FROM postgresql('postgres1:5432', 'postgres', 'test', 'postgres', 'mysecretpassword') WHERE 1=0") + assert(int(result) == 0) + result = node1.query("SELECT count() FROM postgresql('postgres1:5432', 'postgres', 'test', 'postgres', 'mysecretpassword') WHERE 0") + assert(int(result) == 0) + result = node1.query("SELECT count() FROM postgresql('postgres1:5432', 'postgres', 'test', 'postgres', 'mysecretpassword') WHERE 1=1") + assert(int(result) == 1) + cursor.execute("DROP TABLE test") + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 66ec97ac027..a3d99159cb2 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -35,6 +35,17 @@ def rabbitmq_check_result(result, check=False, ref_file='test_rabbitmq_json.refe else: return TSV(result) == TSV(reference) +def wait_rabbitmq_to_start(rabbitmq_docker_id, timeout=180): + start = time.time() + while time.time() - start < timeout: + try: + if instance.cluster.check_rabbitmq_is_available(rabbitmq_docker_id): + logging.debug("RabbitMQ is available") + return + time.sleep(0.5) + except Exception as ex: + logging.debug("Can't connect to RabbitMQ " + str(ex)) + time.sleep(0.5) def kill_rabbitmq(rabbitmq_id): p = subprocess.Popen(('docker', 'stop', rabbitmq_id), stdout=subprocess.PIPE) @@ -45,7 +56,7 @@ def kill_rabbitmq(rabbitmq_id): def revive_rabbitmq(rabbitmq_id): p = subprocess.Popen(('docker', 'start', rabbitmq_id), stdout=subprocess.PIPE) p.communicate() - return p.returncode == 0 + wait_rabbitmq_to_start(rabbitmq_id) # Fixtures @@ -67,8 +78,8 @@ def rabbitmq_cluster(): def rabbitmq_setup_teardown(): print("RabbitMQ is available - running test") yield # run test - for table_name in ['view', 'consumer', 'rabbitmq']: - instance.query(f'DROP TABLE IF EXISTS test.{table_name}') + instance.query('DROP DATABASE test NO DELAY') + instance.query('CREATE DATABASE test') # Tests @@ -284,6 +295,12 @@ def test_rabbitmq_materialized_view(rabbitmq_cluster): ORDER BY key; CREATE MATERIALIZED VIEW test.consumer TO test.view AS SELECT * FROM test.rabbitmq; + + CREATE TABLE test.view2 (key UInt64, value UInt64) + ENGINE = MergeTree() + ORDER BY key; + CREATE MATERIALIZED VIEW test.consumer2 TO test.view2 AS + SELECT * FROM test.rabbitmq group by (key, value); ''') credentials = pika.PlainCredentials('root', 'clickhouse') @@ -297,14 +314,26 @@ def test_rabbitmq_materialized_view(rabbitmq_cluster): for message in messages: channel.basic_publish(exchange='mv', routing_key='', body=message) - while True: + time_limit_sec = 60 + deadline = time.monotonic() + time_limit_sec + + while time.monotonic() < deadline: result = instance.query('SELECT * FROM test.view ORDER BY key') if (rabbitmq_check_result(result)): break - connection.close() rabbitmq_check_result(result, True) + deadline = time.monotonic() + time_limit_sec + + while time.monotonic() < deadline: + result = instance.query('SELECT * FROM test.view2 ORDER BY key') + if (rabbitmq_check_result(result)): + break + + rabbitmq_check_result(result, True) + connection.close() + def 
test_rabbitmq_materialized_view_with_subquery(rabbitmq_cluster): instance.query(''' diff --git a/tests/integration/test_storage_s3/configs/named_collections.xml b/tests/integration/test_storage_s3/configs/named_collections.xml index dfcbeeb2d4a..efadedc1bde 100644 --- a/tests/integration/test_storage_s3/configs/named_collections.xml +++ b/tests/integration/test_storage_s3/configs/named_collections.xml @@ -15,5 +15,10 @@ minio minio123 + + http://minio1:9001/root/test_native + minio + minio123 + diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index a4ba7a95dc7..885a37f875c 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -11,6 +11,7 @@ import helpers.client import pytest from helpers.cluster import ClickHouseCluster, ClickHouseInstance, get_instances_dir from helpers.network import PartitionManager +from helpers.test_tools import exec_query_with_retry MINIO_INTERNAL_PORT = 9001 @@ -125,7 +126,7 @@ def run_query(instance, query, stdin=None, settings=None): pytest.param("'wrongid','wrongkey',", False, 'xz', id="xz"), pytest.param("'wrongid','wrongkey',", False, 'zstd', id="zstd") ]) -def test_put(started_cluster, maybe_auth, positive, compression): +def _test_put(started_cluster, maybe_auth, positive, compression): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -147,7 +148,7 @@ def test_put(started_cluster, maybe_auth, positive, compression): assert values_csv == get_s3_file_content(started_cluster, bucket, filename) -def test_partition_by(started_cluster): +def _test_partition_by(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -172,7 +173,7 @@ def test_partition_by(started_cluster): assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test2_45.csv") -def test_partition_by_string_column(started_cluster): +def _test_partition_by_string_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "col_num UInt32, col_str String" @@ -190,7 +191,7 @@ def test_partition_by_string_column(started_cluster): assert '78,"你好"\n' == get_s3_file_content(started_cluster, bucket, "test_你好.csv") -def test_partition_by_const_column(started_cluster): +def _test_partition_by_const_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -211,7 +212,7 @@ def test_partition_by_const_column(started_cluster): "space", "plus" ]) -def test_get_file_with_special(started_cluster, special): +def _test_get_file_with_special(started_cluster, special): symbol = {"space": " ", "plus": "+"}[special] urlsafe_symbol = {"space": "%20", "plus": "%2B"}[special] auth = "'minio','minio123'," @@ -238,7 +239,7 @@ def test_get_file_with_special(started_cluster, special): "plus", "plus2" ]) -def test_get_path_with_special(started_cluster, special): +def _test_get_path_with_special(started_cluster, special): symbol = {"space": "%20", "plus": "%2B", "plus2": "%2B"}[special] safe_symbol = {"space": "%20", "plus": "+", "plus2": "%2B"}[special] auth = "'minio','minio123'," @@ -252,7 +253,7 @@ def test_get_path_with_special(started_cluster, special): 
@pytest.mark.parametrize("auth", [ pytest.param("'minio','minio123',", id="minio") ]) -def test_empty_put(started_cluster, auth): +def _test_empty_put(started_cluster, auth): # type: (ClickHouseCluster, str) -> None bucket = started_cluster.minio_bucket @@ -290,7 +291,7 @@ def test_empty_put(started_cluster, auth): pytest.param("'minio','minio123',", True, id="auth_positive"), pytest.param("'wrongid','wrongkey',", False, id="negative"), ]) -def test_put_csv(started_cluster, maybe_auth, positive): +def _test_put_csv(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster, bool, str) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -312,7 +313,7 @@ def test_put_csv(started_cluster, maybe_auth, positive): # Test put and get with S3 server redirect. -def test_put_get_with_redirect(started_cluster): +def _test_put_get_with_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -339,7 +340,7 @@ def test_put_get_with_redirect(started_cluster): # Test put with restricted S3 server redirect. -def test_put_with_zero_redirect(started_cluster): +def _test_put_with_zero_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -366,7 +367,7 @@ def test_put_with_zero_redirect(started_cluster): assert exception_raised -def test_put_get_with_globs(started_cluster): +def _test_put_get_with_globs(started_cluster): # type: (ClickHouseCluster) -> None unique_prefix = random.randint(1,10000) bucket = started_cluster.minio_bucket @@ -398,7 +399,7 @@ def test_put_get_with_globs(started_cluster): pytest.param("'wrongid','wrongkey'", False, id="negative"), # ("'minio','minio123',",True), Redirect with credentials not working with nginx. 
]) -def test_multipart_put(started_cluster, maybe_auth, positive): +def _test_multipart_put(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -438,7 +439,7 @@ def test_multipart_put(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) -def test_remote_host_filter(started_cluster): +def _test_remote_host_filter(started_cluster): instance = started_cluster.instances["restricted_dummy"] format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -456,7 +457,7 @@ def test_remote_host_filter(started_cluster): pytest.param("''", id="1_argument"), pytest.param("'','','','','',''", id="6_arguments"), ]) -def test_wrong_s3_syntax(started_cluster, s3_storage_args): +def _test_wrong_s3_syntax(started_cluster, s3_storage_args): instance = started_cluster.instances["dummy"] # type: ClickHouseInstance expected_err_msg = "Code: 42" # NUMBER_OF_ARGUMENTS_DOESNT_MATCH @@ -465,7 +466,7 @@ def test_wrong_s3_syntax(started_cluster, s3_storage_args): # https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights -def test_s3_glob_scheherazade(started_cluster): +def _test_s3_glob_scheherazade(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -534,7 +535,7 @@ def replace_config(old, new): config.close() -def test_custom_auth_headers(started_cluster): +def _test_custom_auth_headers(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format( @@ -565,7 +566,7 @@ def test_custom_auth_headers(started_cluster): instance.query("DROP TABLE test") -def test_custom_auth_headers_exclusion(started_cluster): +def _test_custom_auth_headers_exclusion(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')" @@ -579,7 +580,7 @@ def test_custom_auth_headers_exclusion(started_cluster): assert 'Forbidden Error' in ei.value.stderr -def test_infinite_redirect(started_cluster): +def _test_infinite_redirect(started_cluster): bucket = "redirected" table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" @@ -597,7 +598,7 @@ def test_infinite_redirect(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz"), ]) -def test_storage_s3_get_gzip(started_cluster, extension, method): +def _test_storage_s3_get_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_get_gzip.{extension}" @@ -637,7 +638,7 @@ def test_storage_s3_get_gzip(started_cluster, extension, method): run_query(instance, f"DROP TABLE {name}") -def test_storage_s3_get_unstable(started_cluster): +def _test_storage_s3_get_unstable(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64" @@ -646,7 +647,7 @@ def test_storage_s3_get_unstable(started_cluster): assert result.splitlines() == ["500001,500000,0"] -def 
test_storage_s3_put_uncompressed(started_cluster): +def _test_storage_s3_put_uncompressed(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = "test_put_uncompressed.bin" @@ -683,7 +684,7 @@ def test_storage_s3_put_uncompressed(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz") ]) -def test_storage_s3_put_gzip(started_cluster, extension, method): +def _test_storage_s3_put_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_put_gzip.{extension}" @@ -720,7 +721,7 @@ def test_storage_s3_put_gzip(started_cluster, extension, method): assert sum([ int(i.split(',')[1]) for i in uncompressed_content.splitlines() ]) == 708 -def test_truncate_table(started_cluster): +def _test_truncate_table(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "truncate" @@ -744,7 +745,7 @@ def test_truncate_table(started_cluster): assert instance.query("SELECT * FROM {}".format(name)) == "" -def test_predefined_connection_configuration(started_cluster): +def _test_predefined_connection_configuration(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "test_table" @@ -761,7 +762,7 @@ def test_predefined_connection_configuration(started_cluster): result = "" -def test_url_reconnect_in_the_middle(started_cluster): +def _test_url_reconnect_in_the_middle(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "id String, data String" @@ -782,7 +783,7 @@ def test_url_reconnect_in_the_middle(started_cluster): f"""select sum(cityHash64(x)) from (select toUInt64(id) + sleep(0.1) as x from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}') settings http_max_tries = 10, http_retry_max_backoff_ms=2000, http_send_timeout=1, http_receive_timeout=1)""") - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) thread = threading.Thread(target=select) thread.start() @@ -795,10 +796,10 @@ def test_url_reconnect_in_the_middle(started_cluster): thread.join() - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) -def test_seekable_formats(started_cluster): +def _test_seekable_formats(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance @@ -809,17 +810,18 @@ def test_seekable_formats(started_cluster): assert(int(result) == 5000000) table_function = f"s3(s3_orc, structure='a Int32, b String', format='ORC')" - instance.query(f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)") + exec_query_with_retry(instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)") result = instance.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) + instance.query("SYSTEM FLUSH LOGS") result = instance.query(f"SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM s3') AND memory_usage > 0 ORDER BY event_time desc") print(result[:3]) assert(int(result[:3]) < 200) -def test_seekable_formats_url(started_cluster): +def _test_seekable_formats_url(started_cluster): 
bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] @@ -831,12 +833,85 @@ def test_seekable_formats_url(started_cluster): assert(int(result) == 5000000) table_function = f"s3(s3_orc, structure='a Int32, b String', format='ORC')" - instance.query(f"insert into table function {table_function} select number, randomString(100) from numbers(5000000)") + exec_query_with_retry(instance, f"insert into table function {table_function} select number, randomString(100) from numbers(5000000)") table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_orc', 'ORC', 'a Int32, b String')" result = instance.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) + instance.query("SYSTEM FLUSH LOGS") result = instance.query(f"SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM url') AND memory_usage > 0 ORDER BY event_time desc") print(result[:3]) assert(int(result[:3]) < 200) + + +def test_empty_file(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + name = "empty" + url = f'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{name}' + + minio = started_cluster.minio_client + minio.put_object(bucket, name, io.BytesIO(b""), 0) + + table_function = f"s3('{url}', 'CSV', 'id Int32')" + result = instance.query(f"SELECT count() FROM {table_function}") + assert(int(result) == 0) + + +def test_insert_with_path_with_globs(started_cluster): + instance = started_cluster.instances["dummy"] + + table_function_3 = f"s3('http://minio1:9001/root/test_parquet*', 'minio', 'minio123', 'Parquet', 'a Int32, b String')" + instance.query_and_get_error(f"insert into table function {table_function_3} SELECT number, randomString(100) FROM numbers(500)") + + +def test_s3_schema_inference(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query(f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from numbers(5000000)") + result = instance.query(f"desc s3(s3_native, format='Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from s3(s3_native, format='Native')") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference engine=S3(s3_native, format='Native')") + result = instance.query(f"desc schema_inference") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + + + table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')" + result = instance.query(f"desc {table_function}") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from {table_function}") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference_2 engine=URL('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')") + result = instance.query(f"desc schema_inference_2") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference_2") + assert(int(result) == 5000000) + + +def test_empty_file(started_cluster): + bucket = started_cluster.minio_bucket + instance 
= started_cluster.instances["dummy"] + + name = "empty" + url = f'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{name}' + + minio = started_cluster.minio_client + minio.put_object(bucket, name, io.BytesIO(b""), 0) + + table_function = f"s3('{url}', 'CSV', 'id Int32')" + result = instance.query(f"SELECT count() FROM {table_function}") + assert(int(result) == 0) + diff --git a/tests/integration/test_system_flush_logs/test.py b/tests/integration/test_system_flush_logs/test.py index a4d70339c09..407e66d56a7 100644 --- a/tests/integration/test_system_flush_logs/test.py +++ b/tests/integration/test_system_flush_logs/test.py @@ -54,3 +54,11 @@ def test_system_logs_non_empty_queue(): 'log_queries_min_type': 'QUERY_START', }) node.query('SYSTEM FLUSH LOGS') + + +def test_system_suspend(): + node.query("CREATE TABLE t (x DateTime) ENGINE=Memory;") + node.query("INSERT INTO t VALUES (now());") + node.query("SYSTEM SUSPEND FOR 1 SECOND;") + node.query("INSERT INTO t VALUES (now());") + assert "1\n" == node.query("SELECT max(x) - min(x) >= 1 FROM t;") diff --git a/tests/integration/test_system_metrics/test.py b/tests/integration/test_system_metrics/test.py index 9e8eac162f6..efcc6f88a24 100644 --- a/tests/integration/test_system_metrics/test.py +++ b/tests/integration/test_system_metrics/test.py @@ -59,3 +59,32 @@ def test_readonly_metrics(start_cluster): node1.query("ATTACH TABLE test.test_table") assert_eq_with_retry(node1, "SELECT value FROM system.metrics WHERE metric = 'ReadonlyReplica'", "0\n", retry_count=300, sleep_time=1) +#For LowCardinality-columns, the bytes for N rows is not N*size of 1 row. +def test_metrics_storage_buffer_size(start_cluster): + node1.query(''' + CREATE TABLE test.test_mem_table + ( + `str` LowCardinality(String) + ) + ENGINE = Memory; + + CREATE TABLE test.buffer_table + ( + `str` LowCardinality(String) + ) + ENGINE = Buffer('test', 'test_mem_table', 1, 600, 600, 1000, 100000, 100000, 10000000); + ''') + + #before flush + node1.query("INSERT INTO test.buffer_table VALUES('hello');") + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferRows'") == "1\n" + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferBytes'") == "24\n" + + node1.query("INSERT INTO test.buffer_table VALUES('hello');") + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferRows'") == "2\n" + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferBytes'") == "25\n" + + #flush + node1.query("OPTIMIZE TABLE test.buffer_table") + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferRows'") == "0\n" + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferBytes'") == "0\n" diff --git a/tests/integration/test_table_functions_access_rights/test.py b/tests/integration/test_table_functions_access_rights/test.py index 16f18407960..90106303315 100644 --- a/tests/integration/test_table_functions_access_rights/test.py +++ b/tests/integration/test_table_functions_access_rights/test.py @@ -39,7 +39,7 @@ def test_merge(): instance.query("GRANT CREATE TEMPORARY TABLE ON *.* TO A") assert "no tables in database matches" in instance.query_and_get_error(select_query, user = 'A') - + instance.query("GRANT SELECT ON default.table1 TO A") assert instance.query(select_query, user = 'A') == "1\n" diff --git a/tests/performance/questdb_sum_int32.xml b/tests/performance/questdb_sum_int32.xml index 613ef3dc058..ba1eed6b074 100644 --- 
a/tests/performance/questdb_sum_int32.xml +++ b/tests/performance/questdb_sum_int32.xml @@ -25,7 +25,7 @@ CREATE TABLE `zz_{type}_{engine}` (x {type}) ENGINE {engine} - INSERT INTO `zz_{type}_{engine}` SELECT rand() FROM numbers_mt(1000000000) SETTINGS max_insert_threads = 8 + INSERT INTO `zz_{type}_{engine}` SELECT rand() FROM numbers_mt(300000000) SETTINGS max_insert_threads = 8 OPTIMIZE TABLE `zz_{type}_MergeTree ORDER BY tuple()` FINAL SELECT sum(x) FROM `zz_{type}_{engine}` diff --git a/tests/performance/reinterpret_as.xml b/tests/performance/reinterpret_as.xml index 79ce167a363..dbf6df160ed 100644 --- a/tests/performance/reinterpret_as.xml +++ b/tests/performance/reinterpret_as.xml @@ -191,7 +191,7 @@ toInt256(number) as d, toString(number) as f, toFixedString(f, 20) as g - FROM numbers_mt(200000000) + FROM numbers_mt(100000000) SETTINGS max_threads = 8 FORMAT Null diff --git a/tests/performance/set_index.xml b/tests/performance/set_index.xml index 1fb7cf967f3..631cad9986e 100644 --- a/tests/performance/set_index.xml +++ b/tests/performance/set_index.xml @@ -3,17 +3,17 @@ INSERT INTO test_in SELECT number FROM numbers(500000000) - SELECT count() FROM test_in WHERE a IN (SELECT rand(1) FROM numbers(200000)) SETTINGS max_rows_to_read = 1, read_overflow_mode = 'break' + SELECT count() FROM test_in WHERE a IN (SELECT rand(1) FROM numbers(200000)) SETTINGS max_rows_to_read = 200001, read_overflow_mode = 'break' - SELECT count() FROM test_in WHERE toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' - SELECT count() FROM test_in WHERE -toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE -toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' - SELECT count() FROM test_in WHERE -toInt64(a) NOT IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE -toInt64(a) NOT IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' SELECT (rand(), rand()) IN ((17258, 93148), (4508, 52749), (68660, 70017), (77797, 23528), (1136, 37393), (53237, 15379), (68370, 73211), (15782, 54962), (59432, 45415), (68396, 920), (96154, 21016), (12700, 26887), (88016, 43191), (68153, 51575), (91315, 40005), (18070, 73178), (86, 631), (77717, 20324), (3227, 76188), (74960, 43147), (77538, 19628), (82292, 6525), (24293, 12566), (85244, 96287), (93982, 1329), (38064, 54723), (83999, 45810), (71921, 53673), (88638, 9669), (1959, 39535), (82235, 95796), (27907, 90975), (42383, 91015), (9948, 91514), (81712, 47309), (400, 25808), (31791, 46948), (39740, 36098), (25943, 84598), (99598, 52939), (77134, 15845), (40313, 72174), (85017, 94036), (36595, 14303), (83961, 68078), (55792, 72759), (73574, 43606), (9853, 63560), (28580, 56721), (74804, 41025), (32095, 55657), (52881, 63416), (91368, 90310), (23922, 38883), (30592, 10758), (66448, 61183), (31880, 96697), (11362, 20633), (75331, 2015), (71129, 8785), (1115, 70955), (7886, 83698), (18961, 84556), (16677, 43028), (37347, 70220), (31699, 71244), (10578, 96159), (67600, 39041), (78791, 86687), (21545, 54174), (68774, 37637), (46132, 81768), (98413, 20605), 
(2960, 23665), (31507, 35719), (96209, 18368), (60558, 38035), (21952, 3264), (11834, 86458), (21651, 17650), (86276, 36087), (18818, 24849), (61951, 3390), (59637, 62545), (30346, 72253), (36281, 2992), (78340, 49872), (94326, 93723), (3416, 94405), (12272, 8741), (22600, 22095), (57636, 37106), (38702, 14889), (70238, 11276), (17325, 60648), (16492, 41271), (52100, 1304), (93416, 7795), (57209, 71008), (48010, 36078), (20384, 74420), (77440, 34439), (69224, 45099), (30374, 33884), (49038, 90140), (1154, 84725), (64926, 86985), (91746, 73472), (59757, 75755), (45860, 71557), (45833, 36526), (74618, 73598), (91360, 65168), (58029, 30793), (56332, 14973), (99943, 96877), (97454, 6450), (64502, 77301), (73182, 31853), (76809, 83964), (82916, 86188), (78736, 65427), (36495, 7422), (76196, 2804), (96117, 61093), (9177, 26099), (52942, 63007), (48578, 47876), (50638, 89903), (7113, 97316), (35301, 12750), (47807, 7254), (38217, 55418), (56970, 41687), (20527, 62886), (358, 14021), (64018, 18582), (91740, 21683), (81967, 53589), (45437, 38450), (45476, 67752), (76851, 72072), (7304, 60091), (40097, 12897), (39906, 29247), (84262, 58734), (30857, 43791), (56087, 78929), (20498, 45954), (48726, 500), (62723, 43763), (28368, 30756), (74048, 52403), (15045, 95926), (75542, 55384), (52543, 22525), (56001, 6935), (11431, 46745), (77731, 7310), (36718, 59909), (32235, 91254), (92417, 25917), (21782, 79277), (46378, 87536), (35324, 26075), (6310, 76915), (1551, 69473), (50642, 68865), (55190, 72934), (49780, 21873), (99466, 29686), (90761, 13179), (72959, 57033), (20020, 90200), (46186, 79105), (73871, 52382), (59559, 38801), (59916, 16082), (33610, 94966), (46001, 45225), (86679, 26469), (77245, 91929), (32887, 36623), (11179, 46898), (87881, 68087), (45438, 47991), (24950, 94525), (91664, 51656), (43914, 47805), (15736, 96156), (56346, 20283), (85053, 48931), (17790, 26179), (96195, 55728), (43765, 54807), (44988, 89269), (55911, 99411), (52446, 47397), (28346, 65442), (96669, 68226), (66194, 26848), (37276, 55864), (14116, 41583), (18058, 16317), (93136, 85318), (35616, 86252), (29222, 29969), (33386, 85372), (71094, 44238), (27733, 31838), (64626, 16692), (52904, 97899), (97619, 12663), (50165, 4688), (67557, 44053), (69184, 66269), (73164, 89705), (39822, 15169), (65499, 72808), (30068, 63697), (30154, 64235), (97016, 58716), (94366, 36592), (1592, 16261), (87985, 52102), (12554, 23652), (15909, 25292), (2527, 91531), (92139, 36031), (28986, 30032), (3038, 56314), (32239, 26707), (15973, 34901), (70246, 39680), (82529, 38132), (45827, 74783), (53665, 64111), (55218, 84170), (20466, 16130), (55734, 71203), (31438, 96906), (66338, 85858), (35988, 68511), (78391, 15191), (80747, 59213), (5357, 11546), (16822, 16607), (36607, 41106), (74949, 30739), (45726, 64887), (1524, 54847), (37371, 89195), (28726, 27788), (22600, 44777), (53999, 63625), (84304, 98338), (49260, 76480), (74564, 53907), (89867, 97096), (60157, 61299), (17165, 10146), (56334, 36268), (62114, 49222), (22715, 23620), (42830, 11539), (41091, 69151), (75471, 68364), (18681, 43249), (42738, 63219), (35474, 98454), (76815, 46024), (66310, 36521), (86095, 77013), (63693, 77319), (80731, 63031), (95478, 92387), (23787, 63724), (46299, 68994), (4800, 2460), (9663, 80639), (77231, 85814), (81615, 11311), (35638, 27340), (13598, 14322), (30657, 17238), (90957, 96846), (69962, 52140), (41681, 65962), (96836, 58177), (36190, 11623), (4231, 40500), (43049, 41949), (71177, 98492), (30193, 39750), (19744, 33204), (63358, 30210), (45638, 58918), 
(43641, 38741), (35598, 40932), (33238, 36236), (50835, 20968), (25099, 34071), (84986, 88456), (35333, 1529), (79771, 23985), (647, 61658), (9424, 11743), (77766, 31528), (77811, 86973), (76403, 74377), (55568, 79251), (68858, 20762), (68520, 66773), (93598, 89823), (8080, 82539), (87760, 52247), (25191, 16905), (17837, 8339), (85177, 59050), (51680, 77374), (3287, 43018), (43479, 62141), (34909, 46322), (11869, 5885), (96193, 58417), (101, 47460), (34937, 88582), (83216, 88388), (28571, 15292), (66683, 62613), (34478, 8924), (2680, 89973), (62438, 44460), (11724, 4791), (5383, 72888), (88206, 67586), (8124, 21690), (28779, 75789), (66791, 4757), (6176, 47760), (6403, 78084), (78122, 35446), (99494, 73608), (39691, 89098), (59182, 19484), (25389, 98963), (96487, 3692), (76222, 67381), (21199, 50358), (95998, 58137), (28777, 43913), (14176, 60117), (52257, 81703), (14604, 13438), (71301, 14401), (19758, 66914), (15506, 29873), (87205, 29449), (93295, 15930), (63651, 11287), (19785, 15966), (30795, 75112), (69462, 37655), (18793, 85764), (36240, 31236), (98153, 73724), (72491, 4223), (66930, 35048), (25686, 13269), (13940, 13259), (69163, 11235), (1183, 86961), (54323, 67315), (85044, 60872), (48875, 3683), (43052, 92861), (87574, 32969), (92552, 80564), (94832, 47682), (72011, 80994), (60182, 917), (97788, 34169), (66432, 47940), (87468, 80954), (35385, 68758), (50555, 63710), (55311, 44337), (87065, 26514), (84581, 98736), (23212, 56499), (75120, 72447), (56087, 38285), (58171, 45629), (28401, 44319), (70432, 27883), (18891, 14646), (26206, 49924), (79957, 44914), (56064, 27529), (99090, 29197), (49435, 340), (53525, 65601), (76998, 88349), (50416, 70860), (42506, 75290), (34024, 13295), (86663, 46523), (88814, 231), (57809, 21), (84914, 84771), (43042, 66892), (17288, 33908), (4934, 63195), (50590, 1516), (97843, 80208), (20091, 86717), (71566, 15929), (19531, 23634), (41646, 45549), (89226, 82902), (96683, 63386), (31072, 53788), (51135, 41099), (78912, 65609), (36094, 23603), (88403, 51455), (73795, 47066), (26448, 82852), (22829, 2894), (30041, 92548), (27733, 20608), (70180, 19892), (51650, 63440), (76328, 13666), (40514, 6677), (2786, 51059), (40809, 16499), (10857, 82541), (78221, 61067), (17982, 51969), (85369, 66965), (47153, 47149), (43965, 75796), (82725, 60767), (42407, 97249), (51475, 81224), (60957, 89414), (33065, 21663), (36601, 5290), (95842, 67301), (64630, 60398), (55212, 35638), (41750, 44235), (75260, 82400), (91291, 25843), (6477, 8311), (14919, 52306), (66220, 33180), (45736, 2313), (37450, 64444), (98614, 61344), (75007, 50946), (56701, 28117), (66632, 5174), (92323, 76613), (6796, 73695), (33696, 76280), (86876, 5614), (50863, 67993), (36068, 17049), (91912, 34271), (70706, 1904), (97798, 41117), (68154, 72483), (83862, 25578), (61643, 17204), (69974, 64232), (77926, 19637), (64901, 88988), (71424, 91703), (91655, 17147), (46872, 56530), (44189, 98087), (95939, 54420), (72651, 68785), (67624, 84875), (92587, 87663), (65275, 81256), (53798, 2506), (14702, 3638), (71291, 50452), (14909, 13903), (66965, 26606), (14127, 60345), (35306, 1738), (77234, 10468), (53521, 41218), (80681, 82583), (44227, 26521), (32263, 21482), (82270, 56963), (50580, 80567), (11593, 22346), (20074, 26867), (73126, 28667), (62996, 24317), (20295, 57163), (1506, 57668), (69567, 45236), (43366, 26001), (88052, 40181), (1599, 89349), (36789, 1579), (39895, 46673), (30381, 3206), (31723, 5625), (19252, 31317), (16932, 77149), (48794, 34409), (55986, 30328), (47551, 75088), (57363, 78365), 
(95221, 63385), (26449, 5733), (96588, 53077), (52980, 41140), (8187, 85947), (36723, 26520), (23579, 38909), (33350, 19275), (63930, 19357), (43536, 59941), (31117, 77322), (44638, 94812), (44730, 99097), (95108, 48170), (57813, 49503), (79959, 89436), (86980, 62031), (8275, 44009), (36666, 94645), (22064, 38882), (40471, 16939), (31156, 11337), (13101, 96977), (17906, 26835), (89861, 51405), (73369, 67946), (99141, 58572), (27131, 98703), (15900, 43412), (51768, 93125), (78579, 46689), (23029, 13895), (60870, 55830), (22553, 8236), (76449, 96207), (83766, 51024), (27630, 50614), (53484, 90104), (77626, 21944), (46755, 41583), (53616, 34240), (94159, 44415), (13914, 90059), (44387, 89012), (27499, 64579), (83415, 30809), (77558, 82619), (88880, 9814), (8466, 4424), (43598, 91921), (24695, 3349), (46295, 65208), (51256, 82461), (49126, 93012), (16186, 96585), (43284, 22655), (93130, 90393), (77495, 34372), (85509, 65856), (86662, 61906), (50988, 44393), (29828, 17737), (91651, 35308), (29796, 49716), (14019, 87751), (29688, 71207), (82845, 19100), (11989, 50132), (21158, 99905), (54732, 42547), (32314, 12851), (46405, 43794), (87849, 45643), (53524, 21212), (61925, 75491), (12498, 21937), (30185, 69475), (48421, 52487), (15112, 90935), (33187, 17801), (61704, 25514), (17889, 23917), (18758, 57197), (7693, 47232), (47905, 24618), (11494, 78950), (95662, 54561), (8075, 33909), (90427, 46065), (73962, 19821), (50691, 79400), (58218, 4881), (94106, 2509), (60633, 55169), (49600, 83054), (23339, 13270), (70262, 58946), (48417, 97266), (27629, 46905), (74465, 75514), (41687, 2564), (12814, 19492), (78899, 30168), (17745, 35206), (37972, 35296), (22288, 80001), diff --git a/tests/performance/sparse_column.xml b/tests/performance/sparse_column.xml new file mode 100644 index 00000000000..6523d37df44 --- /dev/null +++ b/tests/performance/sparse_column.xml @@ -0,0 +1,58 @@ + + + + serialization + + sparse + full + + + + ratio + + 10 + 100 + 1000 + + + + + + CREATE TABLE test_full_{ratio} (id UInt64, u8 UInt8, u64 UInt64, str String) + ENGINE = MergeTree ORDER BY id + + + + CREATE TABLE test_sparse_{ratio} (id UInt64, u8 UInt8, u64 UInt64, str String) + ENGINE = MergeTree ORDER BY id + SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9 + + + SYSTEM STOP MERGES test_{serialization}_{ratio} + + + INSERT INTO test_{serialization}_{ratio} SELECT + number, + number % {ratio} = 0 ? rand(1) : 0, + number % {ratio} = 0 ? rand(2) : 0, + number % {ratio} = 0 ? 
randomPrintableASCII(64, 3) : '' + FROM numbers(100000000) + + + SELECT u8 FROM test_{serialization}_{ratio} FORMAT Null + SELECT u64 FROM test_{serialization}_{ratio} FORMAT Null + SELECT str FROM test_{serialization}_{ratio} FORMAT Null + + SELECT erf(u64) FROM test_{serialization}_{ratio} FORMAT Null + SELECT lower(str) FROM test_{serialization}_{ratio} FORMAT Null + + SELECT id FROM test_{serialization}_{ratio} ORDER BY u64 DESC LIMIT 100 FORMAT Null + SELECT id FROM test_{serialization}_{ratio} ORDER BY str DESC LIMIT 100 FORMAT Null + SELECT id FROM test_{serialization}_{ratio} ORDER BY u8, u64 DESC LIMIT 100 FORMAT Null + SELECT * FROM test_{serialization}_{ratio} ORDER BY u8, u64 DESC LIMIT 100 FORMAT Null + + SELECT sum(u64) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null + SELECT uniq(str) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null + + + diff --git a/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.reference b/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.reference index 1cc42544311..f757a86aeee 100644 --- a/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.reference +++ b/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.reference @@ -35,6 +35,8 @@ slice [2,NULL,4,5] ['b','c','d'] ['b',NULL,'d'] +[] 1 +[] 1 push back \N [1,1] diff --git a/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.sql b/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.sql index 8f2f0811193..c87d52d2478 100644 --- a/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.sql +++ b/tests/queries/0_stateless/00498_array_functions_concat_slice_push_pop.sql @@ -36,6 +36,7 @@ select arraySlice([1, 2, 3, 4, 5, 6], 10, 1); select arraySlice([1, 2, Null, 4, 5, 6], 2, 4); select arraySlice(['a', 'b', 'c', 'd', 'e'], 2, 3); select arraySlice([Null, 'b', Null, 'd', 'e'], 2, 3); +select arraySlice([], materialize(NULL), NULL), 1 from numbers(2); select 'push back'; select arrayPushBack(Null, 1); diff --git a/tests/queries/0_stateless/00646_url_engine.python b/tests/queries/0_stateless/00646_url_engine.python index 85ae3e776ed..4f47e819328 100644 --- a/tests/queries/0_stateless/00646_url_engine.python +++ b/tests/queries/0_stateless/00646_url_engine.python @@ -156,6 +156,7 @@ def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,do if table_name: get_ch_answer("drop table if exists {}".format(table_name)) + def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]): with open(CSV_DATA, 'w') as f: # flush test file f.write('') diff --git a/tests/queries/0_stateless/00700_decimal_math.reference b/tests/queries/0_stateless/00700_decimal_math.reference index eb556ac49b8..389b428e27b 100644 --- a/tests/queries/0_stateless/00700_decimal_math.reference +++ b/tests/queries/0_stateless/00700_decimal_math.reference @@ -4,7 +4,7 @@ 42.42 6.513 42.419169 42.42 3.4875 42.417263671875 1 0.8427007929497149 0.15729920705028513 -42.42 115.60113124678627 1.6029995567009473e50 +42.42 115.601131 1.603 0 0 1 0 3.14159265 0 -1 -0 1 1.5707963267948966 0 0.7853981633974483 @@ -14,7 +14,7 @@ 42.42 6.513 42.419169 42.42 3.4875 42.417263671875 1 0.8427007929497149 0.15729920705028513 -42.42 115.60113124678627 1.6029995567009473e50 +42.42 115.601131 1.603 0 0 1 0 3.14159265358979328 0 -1 -0 1 1.5707963267948966 0 0.7853981633974483 @@ -24,7 +24,7 @@ 42.42 6.513 
42.419169 42.42 3.4875 42.417263671875 1 0.8427007929497149 0.15729920705028513 -42.42 115.60113124678627 1.6029995567009473e50 +42.42 115.601131 1.603 0 0 1 0 3.14159265358979 0 -1 -0 1 1.5707963267948966 0 0.7853981633974483 diff --git a/tests/queries/0_stateless/00700_decimal_math.sql b/tests/queries/0_stateless/00700_decimal_math.sql index 237bee1c691..cefbf2fd604 100644 --- a/tests/queries/0_stateless/00700_decimal_math.sql +++ b/tests/queries/0_stateless/00700_decimal_math.sql @@ -5,7 +5,7 @@ SELECT toDecimal32('42.42', 4) AS x, toDecimal32(log10(x), 4) AS y, round(exp10( SELECT toDecimal32('42.42', 4) AS x, toDecimal32(sqrt(x), 3) AS y, y * y; SELECT toDecimal32('42.42', 4) AS x, toDecimal32(cbrt(x), 4) AS y, toDecimal64(y, 4) * y * y; SELECT toDecimal32('1.0', 5) AS x, erf(x), erfc(x); -SELECT toDecimal32('42.42', 4) AS x, lgamma(x), tgamma(x); +SELECT toDecimal32('42.42', 4) AS x, round(lgamma(x), 6), round(tgamma(x) / 1e50, 6); SELECT toDecimal32('0.0', 2) AS x, round(sin(x), 8), round(cos(x), 8), round(tan(x), 8); SELECT toDecimal32(pi(), 8) AS x, round(sin(x), 8), round(cos(x), 8), round(tan(x), 8); @@ -19,7 +19,7 @@ SELECT toDecimal64('42.42', 4) AS x, toDecimal32(log10(x), 4) AS y, round(exp10( SELECT toDecimal64('42.42', 4) AS x, toDecimal32(sqrt(x), 3) AS y, y * y; SELECT toDecimal64('42.42', 4) AS x, toDecimal32(cbrt(x), 4) AS y, toDecimal64(y, 4) * y * y; SELECT toDecimal64('1.0', 5) AS x, erf(x), erfc(x); -SELECT toDecimal64('42.42', 4) AS x, lgamma(x), tgamma(x); +SELECT toDecimal64('42.42', 4) AS x, round(lgamma(x), 6), round(tgamma(x) / 1e50, 6); SELECT toDecimal64('0.0', 2) AS x, round(sin(x), 8), round(cos(x), 8), round(tan(x), 8); SELECT toDecimal64(pi(), 17) AS x, round(sin(x), 8), round(cos(x), 8), round(tan(x), 8); @@ -33,7 +33,7 @@ SELECT toDecimal128('42.42', 4) AS x, toDecimal32(log10(x), 4) AS y, round(exp10 SELECT toDecimal128('42.42', 4) AS x, toDecimal32(sqrt(x), 3) AS y, y * y; SELECT toDecimal128('42.42', 4) AS x, toDecimal32(cbrt(x), 4) AS y, toDecimal64(y, 4) * y * y; SELECT toDecimal128('1.0', 5) AS x, erf(x), erfc(x); -SELECT toDecimal128('42.42', 4) AS x, lgamma(x), tgamma(x); +SELECT toDecimal128('42.42', 4) AS x, round(lgamma(x), 6), round(tgamma(x) / 1e50, 6); SELECT toDecimal128('0.0', 2) AS x, round(sin(x), 8), round(cos(x), 8), round(tan(x), 8); SELECT toDecimal128(pi(), 14) AS x, round(sin(x), 8), round(cos(x), 8), round(tan(x), 8); diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh index 2731e4bcce3..8d9e2689e26 100755 --- a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh +++ b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh @@ -25,15 +25,15 @@ SELECT * FROM enum_mapping_protobuf_00825; EOF BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_enum_mapping.XXXXXX.binary") -$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" > "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" > "$BINARY_FILE_PATH" # Check the output in the protobuf format echo -$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_enum_mapping:Message" --input "$BINARY_FILE_PATH" 
+$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage" --input "$BINARY_FILE_PATH" # Check the input in the protobuf format (now the table contains the same data twice). echo -$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" < "$BINARY_FILE_PATH" $CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825" rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary new file mode 100644 index 00000000000..4b7b97a300f Binary files /dev/null and b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary differ diff --git a/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.reference b/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.reference index 1a80e6401db..12550ffbf28 100644 --- a/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.reference +++ b/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.reference @@ -25,3 +25,6 @@ modules { } Binary representation is as expected + +e4048ead-30a2-45e5-90be-2af1c7137523 [1] [50639] [58114] [[5393]] [[1]] [[]] [[17811]] [[(0,20)]] +e4048ead-30a2-45e5-90be-2af1c7137523 dummy [1] [50639] [58114] [[5393]] [[1]] [[3411]] [[17811]] [[(10,20)]] diff --git a/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.sh b/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.sh index b413385fb77..ed35df5e98b 100755 --- a/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.sh +++ b/tests/queries/0_stateless/00825_protobuf_format_skipped_column_in_nested.sh @@ -47,9 +47,9 @@ echo $CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_skipped_column_in_nested:UpdateMessage" --input "$BINARY_FILE_PATH" # Check the input in the protobuf format (now the table contains the same data twice). 
-#echo -#$CLICKHOUSE_CLIENT --query "INSERT INTO table_skipped_column_in_nested_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_skipped_column_in_nested:UpdateMessage'" < "$BINARY_FILE_PATH" -#$CLICKHOUSE_CLIENT --query "SELECT * FROM table_skipped_column_in_nested_00825" +echo +$CLICKHOUSE_CLIENT --query "INSERT INTO table_skipped_column_in_nested_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_skipped_column_in_nested:UpdateMessage'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM table_skipped_column_in_nested_00825 ORDER BY unused1" rm "$BINARY_FILE_PATH" $CLICKHOUSE_CLIENT --query "DROP TABLE table_skipped_column_in_nested_00825" diff --git a/tests/queries/0_stateless/00900_long_parquet_load.reference b/tests/queries/0_stateless/00900_long_parquet_load.reference index 421def88e41..89da3c6fa43 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load.reference @@ -89,7 +89,7 @@ idx10 ['This','is','a','test'] 23 24 === Try load data from datapage_v2.snappy.parquet -Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Not yet implemented: Unsupported encoding.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA) +Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA) === Try load data from datatype-date32.parquet 1925-01-01 diff --git a/tests/queries/0_stateless/00945_bloom_filter_index.sql b/tests/queries/0_stateless/00945_bloom_filter_index.sql index f45c4c04290..d72f5ad1c6d 100644 --- a/tests/queries/0_stateless/00945_bloom_filter_index.sql +++ b/tests/queries/0_stateless/00945_bloom_filter_index.sql @@ -14,10 +14,10 @@ SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (1, 2) SETTINGS max_ SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN ((1, 2), (2, 3)) SETTINGS max_rows_to_read = 6; SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN ((1, 1), (2, 2)) SETTINGS max_rows_to_read = 6; SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN ((1, (1, 1)), (2, (2, 2))) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (SELECT arrayJoin([toInt32(1), toInt32(2)])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN (SELECT arrayJoin([(toInt32(1), toInt32(2)), (toInt32(2), toInt32(3))])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN (SELECT arrayJoin([(toInt32(1), toUInt64(1)), (toInt32(2), toUInt64(2))])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN (SELECT arrayJoin([(toUInt64(1), (toUInt64(1), toInt32(1))), (toUInt64(2), (toUInt64(2), toInt32(2)))])) SETTINGS max_rows_to_read = 6; +SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (SELECT arrayJoin([toInt32(1), toInt32(2)])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN (SELECT arrayJoin([(toInt32(1), toInt32(2)), (toInt32(2), toInt32(3))])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN (SELECT 
arrayJoin([(toInt32(1), toUInt64(1)), (toInt32(2), toUInt64(2))])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN (SELECT arrayJoin([(toUInt64(1), (toUInt64(1), toInt32(1))), (toUInt64(2), (toUInt64(2), toInt32(2)))])) SETTINGS max_rows_to_read = 7; WITH (1, 2) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, 2), (2, 3)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, 1), (2, 2)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN liter_prepared_set SETTINGS max_rows_to_read = 6; diff --git a/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference b/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference index 19ebe5e0dbc..77f48f2832c 100644 --- a/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference +++ b/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference @@ -1,22 +1,22 @@ ---TUMBLE--- ||---WINDOW COLUMN NAME--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, 
Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 ||---PARTITION--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `WINDOW_ID(____timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY `WINDOW_ID(____timestamp, toIntervalSecond(\'1\'))`\nORDER BY `WINDOW_ID(____timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(____timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY `windowID(____timestamp, toIntervalSecond(\'1\'))`\nORDER BY `windowID(____timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 ---HOP--- ||---WINDOW COLUMN NAME--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, `plus(a, b)`)\nSETTINGS 
index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 ||---PARTITION--- -CREATE TABLE test_01047.`.inner.wv`\n(\n `WINDOW_ID(____timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY `WINDOW_ID(____timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY `WINDOW_ID(____timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(____timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY `windowID(____timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY `windowID(____timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql b/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql index f332ec57b7f..777c5ae2a5a 100644 --- a/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql +++ b/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql @@ -12,31 +12,31 @@ SELECT '---TUMBLE---'; SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY TUMBLE(timestamp, INTERVAL '1' SECOND) AS SELECT count(a), TUMBLE_END(wid) AS count FROM test_01047.mt GROUP BY TUMBLE(timestamp, INTERVAL '1' SECOND) as wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY tumble(timestamp, INTERVAL '1' SECOND) AS SELECT count(a), tumbleEnd(wid) AS count FROM test_01047.mt GROUP BY tumble(timestamp, INTERVAL '1' SECOND) as wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '||---WINDOW COLUMN ALIAS---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid FROM test_01047.mt GROUP BY wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, tumble(timestamp, INTERVAL '1' SECOND) AS wid FROM test_01047.mt GROUP BY wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY (TUMBLE(timestamp, INTERVAL '1' SECOND), b) PRIMARY KEY TUMBLE(timestamp, INTERVAL '1' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY b, TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY (tumble(timestamp, INTERVAL '1' SECOND), b) PRIMARY KEY tumble(timestamp, INTERVAL '1' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY b, tumble(timestamp, INTERVAL '1' SECOND) AS wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT 
'||---FUNCTION---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY (TUMBLE(timestamp, INTERVAL '1' SECOND), plus(a, b)) PRIMARY KEY TUMBLE(timestamp, INTERVAL '1' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY plus(a, b) as _type, TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY (tumble(timestamp, INTERVAL '1' SECOND), plus(a, b)) PRIMARY KEY tumble(timestamp, INTERVAL '1' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY plus(a, b) as _type, tumble(timestamp, INTERVAL '1' SECOND) AS wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '||---PARTITION---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid PARTITION BY wid AS SELECT count(a) AS count, TUMBLE(now(), INTERVAL '1' SECOND) AS wid FROM test_01047.mt GROUP BY wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid PARTITION BY wid AS SELECT count(a) AS count, tumble(now(), INTERVAL '1' SECOND) AS wid FROM test_01047.mt GROUP BY wid; SHOW CREATE TABLE test_01047.`.inner.wv`; @@ -44,31 +44,31 @@ SELECT '---HOP---'; SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS SELECT count(a) AS count, HOP_END(wid) FROM test_01047.mt GROUP BY HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) as wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS SELECT count(a) AS count, hopEnd(wid) FROM test_01047.mt GROUP BY hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) as wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '||---WINDOW COLUMN ALIAS---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid FROM test_01047.mt GROUP BY wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid FROM test_01047.mt GROUP BY wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY (HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND), b) PRIMARY KEY HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY b, HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY (hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND), b) PRIMARY KEY hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY b, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '||---FUNCTION---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY 
(HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND), plus(a, b)) PRIMARY KEY HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY plus(a, b) as _type, HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY (hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND), plus(a, b)) PRIMARY KEY hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS SELECT count(a) AS count FROM test_01047.mt GROUP BY plus(a, b) as _type, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '||---PARTITION---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; -CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid PARTITION BY wid AS SELECT count(a) AS count, HOP_END(wid) FROM test_01047.mt GROUP BY HOP(now(), INTERVAL '1' SECOND, INTERVAL '3' SECOND) as wid; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid PARTITION BY wid AS SELECT count(a) AS count, hopEnd(wid) FROM test_01047.mt GROUP BY hop(now(), INTERVAL '1' SECOND, INTERVAL '3' SECOND) as wid; SHOW CREATE TABLE test_01047.`.inner.wv`; DROP TABLE test_01047.wv; diff --git a/tests/queries/0_stateless/01048_window_view_parser.reference b/tests/queries/0_stateless/01048_window_view_parser.reference index 47ed39fc1d8..6625313f572 100644 --- a/tests/queries/0_stateless/01048_window_view_parser.reference +++ b/tests/queries/0_stateless/01048_window_view_parser.reference @@ -1,26 +1,26 @@ ---TUMBLE--- ||---WINDOW COLUMN NAME--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(1))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(1))`\nORDER BY tuple(`WINDOW_ID(timestamp, toIntervalSecond(1))`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(1))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(1))`\nORDER BY tuple(`windowID(timestamp, toIntervalSecond(1))`)\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY tuple(`WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY tuple(`windowID(timestamp, toIntervalSecond(\'1\'))`)\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `b` Int32,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `b` Int32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = 
AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `b` Int32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `plus(a, b)` Int64,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `plus(a, b)` Int64,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 ||---TimeZone--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), \'Asia/Shanghai\')` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), \'Asia/Shanghai\')`\nORDER BY tuple(`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), \'Asia/Shanghai\')`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), \'Asia/Shanghai\')` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), \'Asia/Shanghai\')`\nORDER BY tuple(`windowID(timestamp, toIntervalSecond(\'1\'), \'Asia/Shanghai\')`)\nSETTINGS index_granularity = 8192 ---HOP--- ||---WINDOW COLUMN NAME--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(1), toIntervalSecond(3))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(1), toIntervalSecond(3))`\nORDER BY tuple(`WINDOW_ID(timestamp, toIntervalSecond(1), 
toIntervalSecond(3))`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(1), toIntervalSecond(3))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(1), toIntervalSecond(3))`\nORDER BY tuple(`windowID(timestamp, toIntervalSecond(1), toIntervalSecond(3))`)\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY tuple(`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY tuple(`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`)\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `b` Int32,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `b` Int32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `b` Int32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `windowID(timestamp, 
toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 ||---TimeZone--- -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(1), toIntervalSecond(3), \'Asia/Shanghai\')` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(1), toIntervalSecond(3), \'Asia/Shanghai\')`\nORDER BY tuple(`WINDOW_ID(timestamp, toIntervalSecond(1), toIntervalSecond(3), \'Asia/Shanghai\')`)\nSETTINGS index_granularity = 8192 -CREATE TABLE test_01048.`.inner.wv`\n(\n `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `plus(a, b)` Int64,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`WINDOW_ID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(1), toIntervalSecond(3), \'Asia/Shanghai\')` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(1), toIntervalSecond(3), \'Asia/Shanghai\')`\nORDER BY tuple(`windowID(timestamp, toIntervalSecond(1), toIntervalSecond(3), \'Asia/Shanghai\')`)\nSETTINGS index_granularity = 8192 +CREATE TABLE test_01048.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `plus(a, b)` Int64,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, `plus(a, b)`)\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/01048_window_view_parser.sql b/tests/queries/0_stateless/01048_window_view_parser.sql index e7dc4b324f6..3f57f6fbd91 100644 --- a/tests/queries/0_stateless/01048_window_view_parser.sql +++ b/tests/queries/0_stateless/01048_window_view_parser.sql @@ -11,71 +11,71 @@ CREATE TABLE test_01048.mt(a Int32, b Int32, timestamp DateTime) ENGINE=MergeTre SELECT '---TUMBLE---'; SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, TUMBLE_END(wid) as wend FROM test_01048.mt GROUP BY TUMBLE(timestamp, INTERVAL 1 SECOND) as wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, tumbleEnd(wid) as wend FROM test_01048.mt GROUP BY tumble(timestamp, INTERVAL 1 SECOND) as wid; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '||---WINDOW COLUMN ALIAS---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid FROM test_01048.mt GROUP BY wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, tumble(timestamp, INTERVAL '1' SECOND) AS wid FROM test_01048.mt GROUP BY wid; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY b, 
TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY b, tumble(timestamp, INTERVAL '1' SECOND) AS wid; SHOW CREATE TABLE test_01048.`.inner.wv`; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid, b; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY tumble(timestamp, INTERVAL '1' SECOND) AS wid, b; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '||---FUNCTION---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY plus(a, b) as _type, TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY plus(a, b) as _type, tumble(timestamp, INTERVAL '1' SECOND) AS wid; SHOW CREATE TABLE test_01048.`.inner.wv`; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY TUMBLE(timestamp, INTERVAL '1' SECOND) AS wid, plus(a, b); +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY tumble(timestamp, INTERVAL '1' SECOND) AS wid, plus(a, b); SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '||---TimeZone---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, TUMBLE(timestamp, INTERVAL '1' SECOND, 'Asia/Shanghai') AS wid FROM test_01048.mt GROUP BY wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, tumble(timestamp, INTERVAL '1' SECOND, 'Asia/Shanghai') AS wid FROM test_01048.mt GROUP BY wid; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '---HOP---'; SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, HOP_END(wid) as wend FROM test_01048.mt GROUP BY HOP(timestamp, INTERVAL 1 SECOND, INTERVAL 3 SECOND) as wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, hopEnd(wid) as wend FROM test_01048.mt GROUP BY hop(timestamp, INTERVAL 1 SECOND, INTERVAL 3 SECOND) as wid; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '||---WINDOW COLUMN ALIAS---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid FROM test_01048.mt GROUP BY wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid FROM test_01048.mt GROUP BY wid; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY b, HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY b, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; SHOW CREATE TABLE test_01048.`.inner.wv`; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid, b; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid, b; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT 
'||---FUNCTION---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY plus(a, b) as _type, HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY plus(a, b) as _type, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; SHOW CREATE TABLE test_01048.`.inner.wv`; SELECT '||---TimeZone---'; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, HOP_END(wid) as wend FROM test_01048.mt GROUP BY HOP(timestamp, INTERVAL 1 SECOND, INTERVAL 3 SECOND, 'Asia/Shanghai') as wid; +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count, hopEnd(wid) as wend FROM test_01048.mt GROUP BY hop(timestamp, INTERVAL 1 SECOND, INTERVAL 3 SECOND, 'Asia/Shanghai') as wid; SHOW CREATE TABLE test_01048.`.inner.wv`; DROP TABLE IF EXISTS test_01048.wv; -CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY HOP(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid, plus(a, b); +CREATE WINDOW VIEW test_01048.wv AS SELECT count(a) AS count FROM test_01048.mt GROUP BY hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid, plus(a, b); SHOW CREATE TABLE test_01048.`.inner.wv`; DROP TABLE test_01048.wv; diff --git a/tests/queries/0_stateless/01049_window_view_window_functions.reference b/tests/queries/0_stateless/01049_window_view_window_functions.reference index e8813db5a7d..2d49664b280 100644 --- a/tests/queries/0_stateless/01049_window_view_window_functions.reference +++ b/tests/queries/0_stateless/01049_window_view_window_functions.reference @@ -1,69 +1,69 @@ -- { echo } -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, 'US/Samoa'); ('2020-01-09 12:00:01','2020-01-09 12:00:02') -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MINUTE, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MINUTE, 'US/Samoa'); ('2020-01-09 12:00:00','2020-01-09 12:01:00') -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' HOUR, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' HOUR, 'US/Samoa'); ('2020-01-09 12:00:00','2020-01-09 13:00:00') -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); ('2020-01-09 00:00:00','2020-01-10 00:00:00') -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, 'US/Samoa'); ('2020-01-06','2020-01-13') -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' MONTH, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' MONTH, 'US/Samoa'); ('2020-01-01','2020-02-01') -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' QUARTER, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' QUARTER, 'US/Samoa'); ('2020-01-01','2020-04-01') -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' YEAR, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' YEAR, 'US/Samoa'); ('2020-01-01','2021-01-01') 
-SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); ('2020-01-09 00:00:00','2020-01-10 00:00:00') -SELECT TUMBLE_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT tumbleStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); 2020-01-09 00:00:00 -SELECT toDateTime(TUMBLE_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(tumbleStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-09 00:00:00 -SELECT toDateTime(TUMBLE_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(tumbleStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-09 00:00:00 -SELECT TUMBLE_START(TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); +SELECT tumbleStart(tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); 2020-01-09 00:00:00 -SELECT TUMBLE_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT tumbleEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); 2020-01-10 00:00:00 -SELECT toDateTime(TUMBLE_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(tumbleEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-10 00:00:00 -SELECT toDateTime(TUMBLE_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(tumbleEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-10 00:00:00 -SELECT TUMBLE_END(TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); +SELECT tumbleEnd(tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); 2020-01-10 00:00:00 -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, INTERVAL 3 SECOND, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, INTERVAL 3 SECOND, 'US/Samoa'); ('2020-01-09 11:59:59','2020-01-09 12:00:02') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MINUTE, INTERVAL 3 MINUTE, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MINUTE, INTERVAL 3 MINUTE, 'US/Samoa'); ('2020-01-09 11:58:00','2020-01-09 12:01:00') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 HOUR, INTERVAL 3 HOUR, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 HOUR, INTERVAL 3 HOUR, 'US/Samoa'); ('2020-01-09 10:00:00','2020-01-09 13:00:00') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 DAY, INTERVAL 3 DAY, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 DAY, INTERVAL 3 DAY, 'US/Samoa'); ('2020-01-07 00:00:00','2020-01-10 00:00:00') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, INTERVAL 3 WEEK, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, INTERVAL 3 WEEK, 'US/Samoa'); ('2019-12-23','2020-01-13') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MONTH, INTERVAL 3 MONTH, 
'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MONTH, INTERVAL 3 MONTH, 'US/Samoa'); ('2019-11-01','2020-02-01') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 QUARTER, INTERVAL 3 QUARTER, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 QUARTER, INTERVAL 3 QUARTER, 'US/Samoa'); ('2019-07-01','2020-04-01') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 YEAR, INTERVAL 3 YEAR, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 YEAR, INTERVAL 3 YEAR, 'US/Samoa'); ('2018-01-01','2021-01-01') -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); ('2020-01-07 00:00:00','2020-01-10 00:00:00') -SELECT HOP_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); +SELECT hopStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); 2020-01-07 00:00:00 -SELECT toDateTime(HOP_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(hopStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-07 00:00:00 -SELECT toDateTime(HOP_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(hopStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-07 00:00:00 -SELECT HOP_START(HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); +SELECT hopStart(hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); 2020-01-07 00:00:00 -SELECT HOP_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); +SELECT hopEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); 2020-01-10 00:00:00 -SELECT toDateTime(HOP_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(hopEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-10 00:00:00 -SELECT toDateTime(HOP_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(hopEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); 2020-01-10 00:00:00 -SELECT HOP_END(HOP(toDateTime('2019-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); +SELECT hopEnd(hop(toDateTime('2019-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); 2019-01-10 00:00:00 diff --git a/tests/queries/0_stateless/01049_window_view_window_functions.sql b/tests/queries/0_stateless/01049_window_view_window_functions.sql index 4c98f9445e1..617019bd2c6 100644 --- a/tests/queries/0_stateless/01049_window_view_window_functions.sql +++ b/tests/queries/0_stateless/01049_window_view_window_functions.sql @@ -1,38 +1,38 @@ -- { echo } -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, 'US/Samoa'); -SELECT 
TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MINUTE, 'US/Samoa'); -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' HOUR, 'US/Samoa'); -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, 'US/Samoa'); -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' MONTH, 'US/Samoa'); -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' QUARTER, 'US/Samoa'); -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' YEAR, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MINUTE, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' HOUR, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' MONTH, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' QUARTER, 'US/Samoa'); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' YEAR, 'US/Samoa'); -SELECT TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); -SELECT TUMBLE_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); -SELECT toDateTime(TUMBLE_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT toDateTime(TUMBLE_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT TUMBLE_START(TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); -SELECT TUMBLE_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); -SELECT toDateTime(TUMBLE_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT toDateTime(TUMBLE_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT TUMBLE_END(TUMBLE(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); +SELECT tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT tumbleStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT toDateTime(tumbleStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(tumbleStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT tumbleStart(tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); +SELECT tumbleEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'); +SELECT toDateTime(tumbleEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(tumbleEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT tumbleEnd(tumble(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, 'US/Samoa')); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, INTERVAL 3 SECOND, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 
MINUTE, INTERVAL 3 MINUTE, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 HOUR, INTERVAL 3 HOUR, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 DAY, INTERVAL 3 DAY, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, INTERVAL 3 WEEK, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MONTH, INTERVAL 3 MONTH, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 QUARTER, INTERVAL 3 QUARTER, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 YEAR, INTERVAL 3 YEAR, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 SECOND, INTERVAL 3 SECOND, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MINUTE, INTERVAL 3 MINUTE, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 HOUR, INTERVAL 3 HOUR, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 DAY, INTERVAL 3 DAY, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 WEEK, INTERVAL 3 WEEK, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 MONTH, INTERVAL 3 MONTH, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 QUARTER, INTERVAL 3 QUARTER, 'US/Samoa'); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL 1 YEAR, INTERVAL 3 YEAR, 'US/Samoa'); -SELECT HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); -SELECT HOP_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); -SELECT toDateTime(HOP_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT toDateTime(HOP_START(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT HOP_START(HOP(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); -SELECT HOP_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); -SELECT toDateTime(HOP_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT toDateTime(HOP_END(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); -SELECT HOP_END(HOP(toDateTime('2019-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); +SELECT hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); +SELECT hopStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); +SELECT toDateTime(hopStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT toDateTime(hopStart(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT hopStart(hop(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); +SELECT hopEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'); +SELECT toDateTime(hopEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT 
toDateTime(hopEnd(toDateTime('2020-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa'), 'US/Samoa'); +SELECT hopEnd(hop(toDateTime('2019-01-09 12:00:01', 'US/Samoa'), INTERVAL '1' DAY, INTERVAL '3' DAY, 'US/Samoa')); diff --git a/tests/queries/0_stateless/01050_window_view_parser_tumble.sql b/tests/queries/0_stateless/01050_window_view_parser_tumble.sql index 6837036263c..54f9ed00cbe 100644 --- a/tests/queries/0_stateless/01050_window_view_parser_tumble.sql +++ b/tests/queries/0_stateless/01050_window_view_parser_tumble.sql @@ -6,28 +6,28 @@ CREATE TABLE mt(a Int32, timestamp DateTime) ENGINE=MergeTree ORDER BY tuple(); SELECT '---WATERMARK---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv WATERMARK=INTERVAL '1' SECOND AS SELECT count(a), TUMBLE_START(wid) AS w_start, TUMBLE_END(wid) AS w_end FROM mt GROUP BY TUMBLE(timestamp, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW wv WATERMARK=INTERVAL '1' SECOND AS SELECT count(a), tumbleStart(wid) AS w_start, tumbleEnd(wid) AS w_end FROM mt GROUP BY tumble(timestamp, INTERVAL '3' SECOND) AS wid; SELECT '---With w_end---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), TUMBLE_START(TUMBLE(timestamp, INTERVAL '3' SECOND)) AS w_start, TUMBLE_END(wid) AS w_end FROM mt GROUP BY TUMBLE(timestamp, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), tumbleStart(tumble(timestamp, INTERVAL '3' SECOND)) AS w_start, tumbleEnd(wid) AS w_end FROM mt GROUP BY tumble(timestamp, INTERVAL '3' SECOND) AS wid; SELECT '---WithOut w_end---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), TUMBLE_START(wid) AS w_start FROM mt GROUP BY TUMBLE(timestamp, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), tumbleStart(wid) AS w_start FROM mt GROUP BY tumble(timestamp, INTERVAL '3' SECOND) AS wid; SELECT '---WITH---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS WITH toDateTime('2018-01-01 00:00:00') AS date_time SELECT count(a), TUMBLE_START(wid) AS w_start, TUMBLE_END(wid) AS w_end, date_time FROM mt GROUP BY TUMBLE(timestamp, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW wv AS WITH toDateTime('2018-01-01 00:00:00') AS date_time SELECT count(a), tumbleStart(wid) AS w_start, tumbleEnd(wid) AS w_end, date_time FROM mt GROUP BY tumble(timestamp, INTERVAL '3' SECOND) AS wid; SELECT '---WHERE---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), TUMBLE_START(wid) AS w_start FROM mt WHERE a != 1 GROUP BY TUMBLE(timestamp, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), tumbleStart(wid) AS w_start FROM mt WHERE a != 1 GROUP BY tumble(timestamp, INTERVAL '3' SECOND) AS wid; SELECT '---ORDER_BY---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), TUMBLE_START(wid) AS w_start FROM mt WHERE a != 1 GROUP BY TUMBLE(timestamp, INTERVAL '3' SECOND) AS wid ORDER BY w_start; +CREATE WINDOW VIEW wv AS SELECT count(a), tumbleStart(wid) AS w_start FROM mt WHERE a != 1 GROUP BY tumble(timestamp, INTERVAL '3' SECOND) AS wid ORDER BY w_start; SELECT '---With now---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), TUMBLE_START(wid) AS w_start, TUMBLE_END(TUMBLE(now(), INTERVAL '3' SECOND)) AS w_end FROM mt GROUP BY TUMBLE(now(), INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), tumbleStart(wid) AS w_start, tumbleEnd(tumble(now(), INTERVAL '3' SECOND)) AS w_end FROM mt GROUP BY tumble(now(), INTERVAL '3' 
SECOND) AS wid; diff --git a/tests/queries/0_stateless/01051_window_view_parser_hop.sql b/tests/queries/0_stateless/01051_window_view_parser_hop.sql index df0729108d0..0f705d5c911 100644 --- a/tests/queries/0_stateless/01051_window_view_parser_hop.sql +++ b/tests/queries/0_stateless/01051_window_view_parser_hop.sql @@ -6,28 +6,28 @@ CREATE TABLE mt(a Int32, timestamp DateTime) ENGINE=MergeTree ORDER BY tuple(); SELECT '---WATERMARK---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv WATERMARK=INTERVAL '1' SECOND AS SELECT count(a), HOP_START(wid) AS w_start, HOP_END(wid) AS w_end FROM mt GROUP BY HOP(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; +CREATE WINDOW VIEW wv WATERMARK=INTERVAL '1' SECOND AS SELECT count(a), hopStart(wid) AS w_start, hopEnd(wid) AS w_end FROM mt GROUP BY hop(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; SELECT '---With w_end---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), HOP_START(wid) AS w_start, HOP_END(wid) AS w_end FROM mt GROUP BY HOP(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), hopStart(wid) AS w_start, hopEnd(wid) AS w_end FROM mt GROUP BY hop(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; SELECT '---WithOut w_end---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), HOP_START(wid) AS w_start FROM mt GROUP BY HOP(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), hopStart(wid) AS w_start FROM mt GROUP BY hop(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; SELECT '---WITH---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS WITH toDateTime('2018-01-01 00:00:00') AS date_time SELECT count(a), HOP_START(wid) AS w_start, HOP_END(wid) AS w_end, date_time FROM mt GROUP BY HOP(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; +CREATE WINDOW VIEW wv AS WITH toDateTime('2018-01-01 00:00:00') AS date_time SELECT count(a), hopStart(wid) AS w_start, hopEnd(wid) AS w_end, date_time FROM mt GROUP BY hop(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; SELECT '---WHERE---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), HOP_START(wid) AS w_start FROM mt WHERE a != 1 GROUP BY HOP(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), hopStart(wid) AS w_start FROM mt WHERE a != 1 GROUP BY hop(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid; SELECT '---ORDER_BY---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), HOP_START(wid) AS w_start FROM mt WHERE a != 1 GROUP BY HOP(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid ORDER BY w_start; +CREATE WINDOW VIEW wv AS SELECT count(a), hopStart(wid) AS w_start FROM mt WHERE a != 1 GROUP BY hop(timestamp, INTERVAL '3' SECOND, INTERVAL '5' SECOND) AS wid ORDER BY w_start; SELECT '---With now---'; DROP TABLE IF EXISTS wv NO DELAY; -CREATE WINDOW VIEW wv AS SELECT count(a), HOP_START(wid) AS w_start, HOP_END(HOP(now(), INTERVAL '1' SECOND, INTERVAL '3' SECOND)) as w_end FROM mt GROUP BY HOP(now(), INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; +CREATE WINDOW VIEW wv AS SELECT count(a), hopStart(wid) AS w_start, hopEnd(hop(now(), INTERVAL '1' SECOND, INTERVAL '3' SECOND)) as w_end FROM mt GROUP BY hop(now(), INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid; diff --git 
a/tests/queries/0_stateless/01052_window_view_proc_tumble_to_now.reference b/tests/queries/0_stateless/01052_window_view_proc_tumble_to_now.reference index 0d66ea1aee9..d00491fd7e5 100644 --- a/tests/queries/0_stateless/01052_window_view_proc_tumble_to_now.reference +++ b/tests/queries/0_stateless/01052_window_view_proc_tumble_to_now.reference @@ -1,2 +1 @@ -0 1 diff --git a/tests/queries/0_stateless/01052_window_view_proc_tumble_to_now.sh b/tests/queries/0_stateless/01052_window_view_proc_tumble_to_now.sh new file mode 100755 index 00000000000..033568b6077 --- /dev/null +++ b/tests/queries/0_stateless/01052_window_view_proc_tumble_to_now.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --multiquery < arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), polygonsUnionCartesian([[[(2., 100.0000991821289), (0., 3.), (1., 2.9), (2., 2.6), (2.6, 2.), (2.9, 1), (3., 0.), (100.0000991821289, 2.)]]], [[[(1., 1.), (1000.0001220703125, nan), (4., 4.), (4., 1.), (1., 1.)]]])); -- { serverError 43 } -select polygonsUnionSpherical([[[(4.3613577, 50.8651821), (4.349556, 50.8535879), (4.3602419, 50.8435626), (4.3830299, 50.8428851), (4.3904543, 50.8564867), (4.3613148, 50.8651279)]]], [[[(4.346693, 50.858306), (4.367945, 50.852455), (4.366227, 50.840809), (4.344961, 50.833264), (4.338074, 50.848677), (4.346693, 50.858306)]]]); +select arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), polygonsUnionSpherical([[[(4.3613577, 50.8651821), (4.349556, 50.8535879), (4.3602419, 50.8435626), (4.3830299, 50.8428851), (4.3904543, 50.8564867), (4.3613148, 50.8651279)]]], [[[(4.346693, 50.858306), (4.367945, 50.852455), (4.366227, 50.840809), (4.344961, 50.833264), (4.338074, 50.848677), (4.346693, 50.858306)]]])); select '-------- MultiPolygon with Polygon'; -select wkt(polygonsUnionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], 
[[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], [[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], 
[[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]])) format TSV; +select wkt(arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), +polygonsUnionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], 
[[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], [[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], 
[[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]]))) format TSV; select '-------- MultiPolygon with Polygon with Holes'; -select wkt(polygonsUnionSpherical([[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], 
[[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], [[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]], 
[[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], [(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], [(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]])) format TSV; +select wkt(arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), 
+polygonsUnionSpherical([[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], [[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], 
[[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]], [[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], 
[(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], [(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]]))) format TSV; select '-------- Polygon with Polygon with Holes'; -select wkt(polygonsUnionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], 
[[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], [(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], [(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]])) format TSV; +select wkt(arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), 
+polygonsUnionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], [[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], [(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], 
[(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]]))) format TSV; diff --git a/tests/queries/0_stateless/01306_polygons_intersection.reference b/tests/queries/0_stateless/01306_polygons_intersection.reference index 43ee975913e..99b26f7acc7 100644 --- a/tests/queries/0_stateless/01306_polygons_intersection.reference +++ b/tests/queries/0_stateless/01306_polygons_intersection.reference @@ -1,9 +1,9 @@ [[[(1,2.9),(2,2.6),(2.6,2),(2.9,1),(1,1),(1,2.9)]]] [] [] -[[[(4.3666052904432435,50.84337386140151),(4.3602419,50.8435626),(4.349556,50.8535879),(4.3526804582393535,50.856658100365976),(4.367945,50.852455),(4.3666052904432435,50.84337386140151)]]] +[[[(4.366605,50.843374),(4.360242,50.843563),(4.349556,50.853588),(4.35268,50.856658),(4.367945,50.852455),(4.366605,50.843374)]]] -------- MultiPolygon with Polygon -MULTIPOLYGON(((35.5408 58.9593,36.9725 59.0149,36.4989 58.7512,35.3712 58.8556,35.5408 58.9593)),((34.4816 56.8232,36.1999 57.0022,35.4083 56.5254,34.3867 56.7596,34.4816 56.8232)),((35.9179 57.7512,36.0848 57.855,37.1608 58.0478,36.5949 58.1673,37.8553 58.9075,38.5813 58.7446,37.4026 58.3187,38.0535 58.0542,36.4587 57.1544,35.7705 57.2554,37.0097 57.4998,35.9179 57.7512)),((36.8709 53.2765,37.4328 52.9552,36.5022 53.0008,36.8709 53.2765)),((36.1528 53.6763,35.3645 53.076,34.9611 53.9765,36.0472 54.7217,36.6985 54.0791,36.3552 53.8269,35.9216 53.8026,36.1528 53.6763)),((37.0035 54.2999,36.7074 54.6506,38.1601 55.1091,37.0035 54.2999)),((38.1688 56.0758,38.2186 56.0594,38.1319 56.0534,38.1688 56.0758)),((37.6238 55.7402,38.0373 55.6523,37.2824 55.5258,37.6238 55.7402)),((37.06 55.3843,37.7653 55.1891,36.151 54.791,37.06 55.3843)),((38.2312 56.9795,36.5334 56.6753,37.4489 57.1909,38.2699 57.0021,38.2312 56.9795)),((37.2281 56.3799,36.193 55.7319,35.3188 55.9582,35.6571 56.1619,36.7074 56.211,36.0233 56.3789,36.4446 56.6242,37.2281 56.3799)),((34.9952 58.6226,36.1498 58.553,36.0877 58.5174,34.6028 58.3749,34.9952 58.6226)),((34.3593 58.2189,35.4314 58.1349,35.1134 57.9454,33.7581 57.8255,34.3593 58.2189)),((33.6325 57.7419,34.6332 57.6538,34.2274 57.4023,33.1712 57.337,34.0208 57.2724,33.5602 56.9781,32.9596 56.9434,33.3418 56.8364,31.7782 55.7778,31.5088 55.9411,31.6069 
56.3194,33.6325 57.7419)),((36.403 58.0507,36.4354 58.0478,36.3932 58.0447,36.403 58.0507)),((35.613 57.5595,36.1936 57.4998,35.4682 57.4674,35.613 57.5595)),((35.0338 57.1875,36.0727 57.0915,34.8098 57.0409,35.0338 57.1875)),((34.1885 56.6259,35.2273 56.414,35.0485 56.303,34.5917 56.2949,33.7222 56.3063,34.1885 56.6259)),((33.5244 56.1686,34.4996 55.9565,34.2598 55.8023,33.1204 55.8832,33.5244 56.1686)),((32.9547 55.7645,33.5036 55.3785,33.6125 55.3778,31.8748 54.1736,31.4182 54.4227,31.7439 54.8677,32.9547 55.7645)),((34.7279 53.8116,34.7731 53.7847,34.7731 52.9188,33.4048 52.8423,34.7279 53.8116)),((34.7231 54.7576,32.5275 53.1741,32.0831 53.408,32.476 53.8383,32.2523 53.964,34.3709 55.3709,35.0149 55.3613,34.2593 54.9642,34.7231 54.7576)),((34.9706 54.9262,34.8335 55.0162,35.2275 55.0993,34.9706 54.9262)),((35.7505 55.4454,35.1358 55.5327,35.9817 55.5958,35.7505 55.4454)),((35.0954 55.822,35.6798 55.6863,34.9721 55.7463,35.0954 55.822)),((34.7331 56.1049,34.7126 56.11,34.744 56.1118,34.7331 56.1049)),((40.2143 54.467,38.5511 53.2922,38.3395 53.2817,38.4609 53.226,38.0214 52.8989,37.8559 52.9188,37.135 53.4711,39.8151 55.3187,39.8205 55.2753,40.3948 55.2408,40.3948 54.8773,39.5485 54.8773,39.5485 54.5631,40.2143 54.467)),((40.5716 55.8007,40.5761 55.7884,40.5504 55.7875,40.5716 55.8007)),((40.4543 56.5923,40.2529 56.4682,39.7903 56.4121,39.8102 56.1914,38.2609 55.1775,37.7955 55.3956,38.4907 55.5327,38.1884 55.8564,38.944 56.0594,38.4339 56.2361,39.7863 57.025,39.7903 56.9929,40.3343 56.9599,40.4543 56.5923)),((40.1389 58.048,38.4915 57.1308,38.2186 57.2717,38.7325 57.4835,38.3737 57.6908,39.6392 58.3427,39.6392 58.0478,40.1389 58.048)),((37.5054 56.5484,37.463 56.5623,37.565 56.5843,37.5054 56.5484)),((38.0744 57.5312,38.128 57.516,37.9669 57.4734,38.0744 57.5312)),((40.4136 58.7241,40.3343 58.3821,39.7184 58.3823,40.4136 58.7241)),((39.8163 58.9766,39.4085 58.7696,38.5209 59.119,39.8163 58.9766)),((38.432 58.2584,38.3698 58.2869,38.7465 58.4255,38.432 58.2584)),((32.2175 58.3664,32.5691 58.5924,33.4734 58.8542,34.7428 59.5659,33.8361 58.6819,34.0496 58.6717,31.6514 57.1258,31.5088 57.4998,32.1738 58.0318,32.2175 58.3664)),((39.9942 53.358,40.0019 53.354,39.9877 53.3534,39.9942 53.358)),((39.2704 52.8471,39.5787 52.6996,39.1456 52.7573,39.2704 52.8471))) +MULTIPOLYGON(((35.5408 58.9593,36.9725 59.0149,36.4989 58.7512,35.3712 58.8556,35.5408 58.9593)),((34.4816 56.8232,36.1999 57.0022,35.4083 56.5254,34.3867 56.7596,34.4816 56.8232)),((35.9179 57.7512,36.0848 57.855,37.1608 58.0478,36.5949 58.1673,37.8553 58.9075,38.5813 58.7446,37.4026 58.3187,38.0535 58.0542,36.4587 57.1544,35.7705 57.2554,37.0097 57.4998,35.9179 57.7512)),((36.8709 53.2765,37.4328 52.9552,36.5022 53.0008,36.8709 53.2765)),((36.1528 53.6763,35.3645 53.076,34.9611 53.9765,36.0472 54.7217,36.6985 54.0791,36.3552 53.8269,35.9216 53.8026,36.1528 53.6763)),((37.0035 54.2999,36.7074 54.6506,38.1601 55.1091,37.0035 54.2999)),((38.1688 56.0758,38.2186 56.0594,38.1319 56.0534,38.1688 56.0758)),((37.6238 55.7402,38.0373 55.6523,37.2824 55.5258,37.6238 55.7402)),((37.06 55.3843,37.7653 55.1891,36.151 54.791,37.06 55.3843)),((38.2312 56.9795,36.5334 56.6753,37.4489 57.1909,38.2699 57.0021,38.2312 56.9795)),((37.2281 56.3799,36.193 55.7319,35.3188 55.9582,35.6571 56.1619,36.7074 56.211,36.0233 56.3789,36.4446 56.6242,37.2281 56.3799)),((34.9952 58.6226,36.1498 58.553,36.0877 58.5174,34.6028 58.3749,34.9952 58.6226)),((34.3593 58.2189,35.4314 58.1349,35.1134 57.9454,33.7581 57.8255,34.3593 58.2189)),((33.6325 57.7419,34.6332 
57.6538,34.2274 57.4023,33.1712 57.337,34.0208 57.2724,33.5602 56.9781,32.9596 56.9434,33.3418 56.8364,31.7782 55.7778,31.5088 55.9411,31.6069 56.3194,33.6325 57.7419)),((36.403 58.0507,36.4354 58.0478,36.3932 58.0447,36.403 58.0507)),((35.613 57.5595,36.1936 57.4998,35.4682 57.4674,35.613 57.5595)),((35.0338 57.1875,36.0727 57.0915,34.8098 57.0409,35.0338 57.1875)),((34.1885 56.6259,35.2273 56.414,35.0485 56.303,34.5917 56.2949,33.7222 56.3063,34.1885 56.6259)),((33.5244 56.1686,34.4996 55.9565,34.2598 55.8023,33.1204 55.8832,33.5244 56.1686)),((32.9547 55.7645,33.5036 55.3785,33.6125 55.3778,31.8748 54.1736,31.4182 54.4227,31.7439 54.8677,32.9547 55.7645)),((34.7279 53.8116,34.7731 53.7847,34.7731 52.9188,33.4048 52.8423,34.7279 53.8116)),((34.7231 54.7576,32.5275 53.1741,32.0831 53.408,32.476 53.8383,32.2523 53.964,34.3709 55.3709,35.0149 55.3613,34.2593 54.9642,34.7231 54.7576)),((34.9706 54.9262,34.8335 55.0162,35.2275 55.0993,34.9706 54.9262)),((35.7505 55.4454,35.1358 55.5327,35.9817 55.5958,35.7505 55.4454)),((35.0954 55.822,35.6798 55.6863,34.9721 55.7463,35.0954 55.822)),((34.7331 56.1049,34.7126 56.11,34.744 56.1118,34.7331 56.1049)),((40.2143 54.467,38.5511 53.2922,38.3395 53.2817,38.4609 53.226,38.0214 52.8989,37.8559 52.9188,37.135 53.4711,39.8151 55.3187,39.8205 55.2753,40.3948 55.2408,40.3948 54.8773,39.5485 54.8773,39.5485 54.5631,40.2143 54.467)),((40.5716 55.8007,40.5761 55.7884,40.5504 55.7875,40.5716 55.8007)),((40.4543 56.5923,40.2529 56.4682,39.7903 56.4121,39.8102 56.1914,38.2609 55.1775,37.7955 55.3956,38.4907 55.5327,38.1884 55.8564,38.944 56.0594,38.4339 56.2361,39.7863 57.025,39.7903 56.9929,40.3343 56.9599,40.4543 56.5923)),((40.1389 58.048,38.4915 57.1308,38.2186 57.2717,38.7325 57.4835,38.3737 57.6908,39.6392 58.3427,39.6392 58.0478,40.1389 58.048)),((37.5054 56.5484,37.463 56.5623,37.565 56.5843,37.5054 56.5484)),((38.0744 57.5312,38.128 57.516,37.9669 57.4734,38.0744 57.5312)),((40.4136 58.7241,40.3343 58.3821,39.7184 58.3823,40.4136 58.7241)),((39.8163 58.9766,39.4085 58.7696,38.5209 59.119,39.8163 58.9766)),((38.432 58.2584,38.3698 58.2869,38.7465 58.4255,38.432 58.2584)),((32.2175 58.3664,32.5691 58.5924,33.4734 58.8542,34.7428 59.5659,33.8361 58.6819,34.0496 58.6717,31.6514 57.1258,31.5088 57.4998,32.1738 58.0318,32.2175 58.3664)),((39.9942 53.358,40.0019 53.354,39.9877 53.3534,39.9942 53.358)),((39.2704 52.8471,39.5787 52.6996,39.1456 52.7572,39.2704 52.8471))) -------- MultiPolygon with Polygon with Holes MULTIPOLYGON(((33.1079 56.9523,32.9596 56.9434,33.1392 56.8934,33.2007 56.7768,33.7182 56.7292,33.8361 56.6953,35.71 56.3117,34.5917 56.2949,32.8387 56.3117,35.6798 55.6863,32.748 55.9072,33.5036 55.3785,35.0149 55.3613,34.2593 54.9642,35.0753 54.5981,34.1081 54.1757,34.7731 53.7847,34.7731 53.3243,33.1128 54.0852,31.627 54.7093,31.8413 54.9989,32.204 55.5156,31.5088 55.9411,31.7506 56.8609,31.5088 57.4998,32.1738 58.0318,32.2342 58.4928,32.25 58.4976,33.1079 56.9523)),((35.1489 56.5859,36.6724 56.4139,36.8799 56.4895,38.2186 56.0594,36.647 55.9411,38.0262 55.6546,37.9482 55.6376,36.8283 55.4471,36.9508 55.414,36.5845 55.3291,36.8822 54.975,36.0123 54.7554,36.919 53.8561,35.9216 53.8026,37.2165 53.0798,37.0604 52.9744,35.3776 53.0462,34.894 54.1226,35.6193 54.4929,34.8335 55.0162,36.4354 55.3441,35.1358 55.5327,36.5563 55.6352,34.7126 56.11,36.7074 56.211,35.1489 56.5859)),((37.2327 59.0233,37.3119 59.0258,38.0944 58.8545,37.2876 58.7226,37.2327 59.0233)),((37.4471 53.2343,36.9794 53.5878,37.3119 53.9273,36.7074 54.6506,37.0572 54.7635,37.9907 
53.5925,37.4471 53.2343)),((34.7731 53.1793,34.7731 52.9188,33.1712 52.8276,32.4808 53.1989,34.7731 53.1793)),((40.4412 56.1511,38.3184 55.7179,38.1884 55.8564,38.944 56.0594,37.463 56.5623,38.9742 56.8774,38.5798 57.0849,39.0894 57.2553,39.7379 57.4051,39.7903 56.9929,40.3343 56.9599,40.4855 56.4957,39.7903 56.4121,39.8205 56.0763,40.425 56.1942,40.4412 56.1511)),((38.3092 56.9929,38.3093 56.9929,38.309 56.9928,38.3092 56.9929)),((40.3237 57.5365,40.3343 57.4673,40.0149 57.4677,40.3237 57.5365)),((39.2792 59.0373,38.8838 58.9777,38.5209 59.119,39.2792 59.0373))) -------- Polygon with Polygon with Holes diff --git a/tests/queries/0_stateless/01306_polygons_intersection.sql b/tests/queries/0_stateless/01306_polygons_intersection.sql index 144408ca0ae..5bfba6124cd 100644 --- a/tests/queries/0_stateless/01306_polygons_intersection.sql +++ b/tests/queries/0_stateless/01306_polygons_intersection.sql @@ -1,14 +1,17 @@ select polygonsIntersectionCartesian([[[(0., 0.),(0., 3.),(1., 2.9),(2., 2.6),(2.6, 2.),(2.9, 1.),(3., 0.),(0., 0.)]]], [[[(1., 1.),(1., 4.),(4., 4.),(4., 1.),(1., 1.)]]]); select polygonsIntersectionCartesian([[[(0., 0.),(0., 3.),(1., 2.9),(2., 2.6),(2.6, 2.),(2.9, 1.),(3., 0.),(0., 0.)]]], [[[(3., 3.),(3., 4.),(4., 4.),(4., 3.),(3., 3.)]]]); -select polygonsIntersectionSpherical([[[(4.346693, 50.858306), (4.367945, 50.852455), (4.366227, 50.840809), (4.344961, 50.833264), (4.338074, 50.848677), (4.346693, 50.858306)]]], [[[(25.0010, 136.9987), (17.7500, 142.5000), (11.3733, 142.5917)]]]); -select polygonsIntersectionSpherical([[[(4.3613577, 50.8651821), (4.349556, 50.8535879), (4.3602419, 50.8435626), (4.3830299, 50.8428851), (4.3904543, 50.8564867), (4.3613148, 50.8651279)]]], [[[(4.346693, 50.858306), (4.367945, 50.852455), (4.366227, 50.840809), (4.344961, 50.833264), (4.338074, 50.848677), (4.346693, 50.858306)]]]); +select arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), polygonsIntersectionSpherical([[[(4.346693, 50.858306), (4.367945, 50.852455), (4.366227, 50.840809), (4.344961, 50.833264), (4.338074, 50.848677), (4.346693, 50.858306)]]], [[[(25.0010, 136.9987), (17.7500, 142.5000), (11.3733, 142.5917)]]])); +select arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a),polygonsIntersectionSpherical([[[(4.3613577, 50.8651821), (4.349556, 50.8535879), (4.3602419, 50.8435626), (4.3830299, 50.8428851), (4.3904543, 50.8564867), (4.3613148, 50.8651279)]]], [[[(4.346693, 50.858306), (4.367945, 50.852455), (4.366227, 50.840809), (4.344961, 50.833264), (4.338074, 50.848677), (4.346693, 50.858306)]]])); select '-------- MultiPolygon with Polygon'; -select 
wkt(polygonsIntersectionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], [[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], 
[[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], [[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]])) format TSV; +select wkt(arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), 
+polygonsIntersectionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], [[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], 
[[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], [[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]]))) format TSV; select '-------- MultiPolygon with Polygon with Holes'; -select 
wkt(polygonsIntersectionSpherical([[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], [[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], 
[[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]], [[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], 
[(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], [(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]])) format TSV; +select wkt(arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), +polygonsIntersectionSpherical([[[(33.473420586689336,58.85424941916091),(32.23422397806246,58.492830557036),(32.173775363007486,58.03176922751564),(31.508840597402823,57.499784781503735),(31.750635057622702,56.86092686957355),(31.508840597402823,55.941082594334574),(32.20399967053497,55.515591939372456),(31.84130798020516,54.998862226280465),(31.418167674820367,54.422670886434275),(32.47601843828233,53.83826377018255),(32.08310244042503,53.408048308050866),(33.171177511414484,52.82758702113742),(34.77306581037117,52.91880107773494),(34.77306581037117,53.784726518357985),(34.108131044766516,54.17574726780569),(35.07530888564602,54.59813930694554),(34.25925258240394,54.96417435716029),(35.01486027059106,55.361278263643584),(33.50364489421682,55.37845402950552),(32.7480372060297,55.90721384574556),(35.67979503619571,55.68634475630185),(32.83871012861215,56.311688992608396),(34.591719965206266,56.29492065473883),(35.7100193437232,56.311688992608396),(33.83611227701915,56.695333481003644),(32.95960735872209,56.9434497616887),(36.072711034053015,57.091531913901434),(33.171177511414484,57.33702717078384),(36.193608264162954,57.499784781503735),(33.23162612646945,57.77481561306047),(36.43540272438284,58.04776787540811),(33.62454212432676,58.27099811968307),(36.344729801800376,58.54018474404165),(33.83611227701915,58.68186423448108),(34.74284150284369,59.565911441555244),(33.473420586689336,58.85424941916091)]], 
[[(34.65216858026123,58.91672306881671),(37.19101041256995,58.68186423448108),(36.01226241899805,58.28688958537609),(37.16078610504247,58.04776787540811),(35.74024365125068,57.79092907387934),(37.009664567405046,57.499784781503735),(35.77046795877817,57.25537683364851),(36.979440259877556,57.07510745541089),(34.22902827487645,56.794777197297435),(36.7074214921302,56.210968525786996),(34.712617195316206,56.10998276812964),(36.55629995449277,55.63519693782703),(35.13575750070099,55.53270067649592),(36.43540272438284,55.34409504165558),(34.83351442542614,55.01619492319591),(35.61934642114075,54.49294870011772),(34.89396304048112,54.12264226523038),(35.37755196092087,53.046178687628185),(37.43280487278982,52.95523300597458),(35.92158949641559,53.80257986695776),(36.91899164482259,53.856094327816805),(36.01226241899805,54.75541714463799),(37.765272255592166,55.189110239786885),(36.828318722240134,55.44708256557195),(38.03729102333953,55.652253637168315),(36.64697287707522,55.941082594334574),(38.21863686850443,56.05939028508024),(36.37495410932787,56.64551287174558),(38.30930979108689,56.992876013526654),(37.16078610504247,57.25537683364851),(38.127963945921984,57.516020773674256),(37.43280487278982,57.710289827306724),(38.33953409861437,57.935626886818994),(37.40258056526235,58.31865112960426),(38.58132855883426,58.744648733419496),(37.31190764267989,59.02578062465136),(34.65216858026123,58.91672306881671)]], [[(38.52087994377928,59.11898412389468),(39.54850639971376,58.713270635642914),(38.369758406141855,58.28688958537609),(38.85334732658162,58.06375936407028),(38.33953409861437,57.710289827306724),(38.73245009647167,57.48354156434209),(38.21863686850443,57.271721400459285),(38.97424455669155,56.87744603722649),(37.463029180317314,56.5623320541159),(38.94402024916407,56.05939028508024),(38.18841256097694,55.856355210835915),(38.490655636251795,55.53270067649592),(37.795496563119656,55.39562234093384),(38.30930979108689,55.154587013355666),(36.7074214921302,54.65063295250911),(37.31190764267989,53.92734063371401),(36.979440259877556,53.58783775557231),(37.855945178174615,52.91880107773497),(39.57873070724124,52.69956490610895),(38.33953409861437,53.281741738901104),(40.00187101262603,53.35396273604752),(39.54850639971376,53.58783775557231),(40.24366547284591,53.58783775557231),(39.97164670509855,53.98069568468355),(40.60635716317572,54.03398248547225),(40.39478701048334,54.44025165268903),(39.54850639971376,54.56310590284329),(39.54850639971376,54.87732350170489),(40.39478701048334,54.87732350170489),(40.39478701048334,55.24083903654295),(39.82052516746112,55.2752875586599),(39.760076552406154,55.75443792473942),(40.57613285564824,55.78844000174894),(40.425011318010824,56.19415599955667),(39.82052516746112,56.07626182891758),(39.79030085993364,56.41214455508424),(40.48545993306579,56.495655446714636),(40.33433839542836,56.95993246553937),(39.79030085993364,56.992876013526654),(39.72985224487867,57.46729112028032),(40.33433839542836,57.46729112028032),(40.24366547284591,58.04776787540811),(39.63917932229622,58.04776787540811),(39.63917932229622,58.382088724871295),(40.33433839542836,58.382088724871295),(40.45523562553831,58.9011152358548),(38.52087994377928,59.11898412389468)]]], 
[[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], [(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], [(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]]))) format TSV; select '-------- Polygon with Polygon with Holes'; -select 
wkt(polygonsIntersectionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], [[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], [(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], 
[(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]])) format TSV; +select wkt(arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a), +polygonsIntersectionSpherical([[(29.453587685533865,59.779570356240356),(29.393139070478895,52.276266797422124),(40.636581470703206,59.38168915000267),(41.21084331372543,59.103467777099866),(29.786055068336193,52.146627480315004),(31.23682182965546,52.16517054781818),(41.69443223416517,58.85424941916091),(42.51048853740727,58.47703162291134),(32.59691566839227,52.22075341251539),(34.289476889931414,52.22075341251539),(43.02430176537451,58.07974369546071),(43.02430176537451,57.25537683364851),(35.468224883503325,52.2022335126388),(37.16078610504247,52.23926559241349),(43.02430176537451,56.26136189644947),(43.02430176537451,55.326904361850836),(38.33953409861437,52.16517054781818),(40.09254393520848,52.16517054781818),(44.4146199116388,55.3097062225408),(44.47506852669377,59.80998197603594),(39.72985224487867,59.931351417569715),(30.23941968124846,53.67744677450975),(30.20919537372098,54.63314259659509),(38.73245009647167,59.94649146557819),(37.2816833351524,59.97675082987618),(30.23941968124846,55.2752875586599),(30.33009260383092,56.19415599955667),(36.28428118674541,59.96162460231375),(34.863738732953635,59.97675082987618),(30.178971066193498,56.97640788219866),(30.178971066193498,57.91957806959033),(33.65476643185424,59.94649146557819),(32.32489690064491,59.94649146557819),(30.481214141468342,58.85424941916091),(30.571887064050795,59.99187015036608),(29.453587685533865,59.779570356240356)]], 
[[(24.367675781249993,61.45977057029751),(19.577636718749993,58.67693767258692),(19.577636718749993,57.492213666700735),(19.445800781249996,55.87531083569678),(19.445800781249996,54.085173420886775),(17.468261718749996,53.014783245859235),(20.017089843749993,51.563412328675895),(21.203613281249993,50.205033264943324),(26.125488281249993,50.40151532278236),(27.22412109374999,48.980216985374994),(32.80517578124999,49.525208341974405),(35.26611328124999,48.74894534343292),(36.93603515624999,49.66762782262194),(42.56103515625,48.77791275550183),(43.92333984374999,49.8096315635631),(47.17529296875,49.152969656170455),(49.28466796875,50.54136296522162),(48.05419921875,51.17934297928929),(51.39404296875,52.48278022207825),(50.64697265625,53.014783245859235),(52.88818359375,53.93021986394004),(51.65771484374999,54.29088164657006),(52.66845703125,55.825973254619015),(50.25146484375,56.145549500679095),(51.92138671875,57.914847767009206),(49.15283203125,58.17070248348605),(49.59228515625,60.086762746260064),(47.043457031249986,59.88893689676584),(43.57177734375,61.37567331572748),(42.64892578125,60.630101766266705),(36.89208984374999,62.000904713685856),(36.01318359374999,61.143235250840576),(31.398925781249993,62.02152819100766),(30.563964843749996,61.05828537037917),(26.872558593749993,61.71070595883174),(26.652832031249993,61.10078883158897),(24.367675781249993,61.45977057029751)], [(24.455566406249993,59.42272750081452),(21.203613281249993,58.49369382056807),(21.335449218749993,56.89700392127261),(21.599121093749993,55.92458580482949),(25.202636718749993,55.998380955359636),(28.850097656249993,57.06463027327854),(27.09228515625,57.844750992890994),(28.806152343749996,59.17592824927138),(26.257324218749993,59.17592824927138),(24.455566406249993,59.42272750081452)], [(35.13427734375,59.84481485969107),(31.970214843749993,58.97266715450152),(33.20068359374999,56.776808316568406),(36.67236328125,56.41390137600675),(39.08935546874999,57.25528054528888),(42.69287109374999,58.03137242177638),(40.89111328124999,59.26588062825809),(37.28759765625,58.722598828043374),(37.11181640624999,59.66774058164964),(35.13427734375,59.84481485969107)], [(29.157714843749993,55.75184939173528),(22.565917968749993,55.128649068488784),(22.565917968749993,53.54030739150019),(22.038574218749996,51.48138289610097),(26.257324218749993,51.42661449707484),(30.124511718749993,50.54136296522162),(32.18994140624999,51.17934297928929),(30.124511718749993,53.173119202640635),(35.09033203124999,53.173119202640635),(33.11279296875,54.085173420886775),(29.597167968749993,55.50374985927513),(29.157714843749993,55.75184939173528)], [(42.82470703125,56.58369172128337),(36.584472656249986,55.329144408405085),(37.99072265625,53.592504809039355),(34.95849609374999,51.48138289610097),(36.54052734374999,50.40151532278236),(39.66064453124999,50.289339253291786),(39.79248046875,52.13348804077148),(41.77001953125,50.68079714532166),(44.49462890624999,51.97134580885171),(47.30712890624999,52.509534770327264),(44.05517578125,53.54030739150019),(46.60400390625,53.696706475303245),(47.61474609375,55.40406982700608),(45.37353515625,55.40406982700608),(42.82470703125,56.58369172128337)]]))) format TSV; diff --git a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect index fd2de93c39c..9c20b7c517e 100755 --- a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect +++ 
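Every 01306_polygons_intersection query above is rewritten the same way: the polygonsIntersectionSpherical result (or the value handed to wkt) is passed through nested arrayMap calls that round each coordinate to 6 decimal places, so the reference output no longer depends on the last few bits of Float64 noise. Restated on the small two-polygon case from this file, with the expected output (taken from the updated .reference) as a comment; nothing here is new, it only isolates the idiom:

select arrayMap(a -> arrayMap(b -> arrayMap(c -> (round(c.1, 6), round(c.2, 6)), b), a),
    polygonsIntersectionSpherical(
        [[[(4.3613577, 50.8651821), (4.349556, 50.8535879), (4.3602419, 50.8435626), (4.3830299, 50.8428851), (4.3904543, 50.8564867), (4.3613148, 50.8651279)]]],
        [[[(4.346693, 50.858306), (4.367945, 50.852455), (4.366227, 50.840809), (4.344961, 50.833264), (4.338074, 50.848677), (4.346693, 50.858306)]]]));
-- [[[(4.366605,50.843374),(4.360242,50.843563),(4.349556,50.853588),(4.35268,50.856658),(4.367945,50.852455),(4.366605,50.843374)]]]

The nested lambdas walk MultiPolygon -> polygon -> ring -> point, so the shape of the result is preserved and only the coordinate precision changes.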
b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect @@ -1,14 +1,13 @@ #!/usr/bin/expect -f -# Tags: no-fasttest log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] @@ -21,6 +20,7 @@ expect "SET max_distributed" # Wait for suggestions to load, they are loaded in background set is_done 0 +set timeout 1 while {$is_done == 0} { send -- "\t" expect { @@ -28,10 +28,15 @@ while {$is_done == 0} { set is_done 1 } default { - sleep 1 + # expect "_" will wait for the timeout, + # if the completion has not been loaded yet it will fail + # and we will retry, + # but to retry on timeout the default timeout action must be overridden, + # which is what this empty block is for. } } } +set timeout 60 send -- "\3\4" expect eof diff --git a/tests/queries/0_stateless/01410_nullable_key_more_tests.reference b/tests/queries/0_stateless/01410_nullable_key_more_tests.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01410_nullable_key_more_tests.sh b/tests/queries/0_stateless/01410_nullable_key_more_tests.sh new file mode 100755 index 00000000000..03bebed324b --- /dev/null +++ b/tests/queries/0_stateless/01410_nullable_key_more_tests.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +test_func() +{ + engine=$1 + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "drop table if exists table_with_nullable_keys" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "create table table_with_nullable_keys (nullable_int Nullable(UInt32), nullable_str Nullable(String), nullable_lc LowCardinality(Nullable(String)), nullable_ints Array(Nullable(UInt32)), nullable_misc Tuple(Nullable(String), Nullable(UInt32)), nullable_val Map(UInt32, Nullable(String)), value UInt8) engine $engine order by (nullable_int, nullable_str, nullable_lc, nullable_ints, nullable_misc, nullable_val) settings allow_nullable_key = 1, index_granularity = 1" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "insert into table_with_nullable_keys select * replace (cast(nullable_val as Map(UInt32, Nullable(String))) as nullable_val) from generateRandom('nullable_int Nullable(UInt32), nullable_str Nullable(String), nullable_lc Nullable(String), nullable_ints Array(Nullable(UInt32)), nullable_misc Tuple(Nullable(String), Nullable(UInt32)), nullable_val Array(Tuple(UInt32, Nullable(String))), value UInt8', 1, 30, 30) limit 1024" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_str = (select randomPrintableASCII(30)) or nullable_str in (select randomPrintableASCII(30) from numbers(3)) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_lc = (select randomPrintableASCII(30)) or nullable_lc in (select randomPrintableASCII(30) from numbers(3)) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_ints = [1, 2, null] or nullable_ints in (select * from generateRandom('nullable_ints Array(Nullable(UInt32))', 1, 30, 30) limit 3) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_misc = (select (randomPrintableASCII(30), rand())) or 
nullable_misc in (select arrayJoin([(randomPrintableASCII(30), null), (null, rand())]))" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_val = (select map(rand(), randomPrintableASCII(10), rand(2), randomPrintableASCII(20), rand(3), null)) or nullable_val in (select cast(nullable_ints as Map(UInt32, Nullable(String))) from generateRandom('nullable_ints Array(Tuple(UInt32, Nullable(String)))', 1, 30, 30) limit 3) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "drop table table_with_nullable_keys" +} + +test_func MergeTree +test_func AggregatingMergeTree +test_func ReplacingMergeTree diff --git a/tests/queries/0_stateless/01442_date_time_with_params.reference b/tests/queries/0_stateless/01442_date_time_with_params.reference index 19f78c83f82..726e59d4d35 100644 --- a/tests/queries/0_stateless/01442_date_time_with_params.reference +++ b/tests/queries/0_stateless/01442_date_time_with_params.reference @@ -13,6 +13,8 @@ parseDateTimeBestEffort 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') 2020-05-14 06:37:03.253 DateTime64(3, \'Europe/Minsk\') 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') +2021-12-28 00:00:00.123 DateTime64(3, \'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTimeBestEffortOrNull \N Nullable(DateTime64(3)) 2020-05-14 03:37:03.000 Nullable(DateTime64(3, \'UTC\')) @@ -25,6 +27,8 @@ parseDateTimeBestEffortOrNull 2020-05-14 03:37:03.253 Nullable(DateTime64(3, \'UTC\')) 2020-05-14 06:37:03.253 Nullable(DateTime64(3, \'Europe/Minsk\')) 2020-05-14 03:37:03.253 Nullable(DateTime64(3, \'UTC\')) +2021-12-28 00:00:00.123 Nullable(DateTime64(3, \'UTC\')) +2021-12-28 00:00:00 Nullable(DateTime(\'UTC\')) parseDateTimeBestEffortOrZero 1970-01-01 00:00:00.000 DateTime64(3, \'UTC\') 2020-05-14 03:37:03.000 DateTime64(3, \'UTC\') @@ -37,6 +41,8 @@ parseDateTimeBestEffortOrZero 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') 2020-05-14 06:37:03.253 DateTime64(3, \'Europe/Minsk\') 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') +2021-12-28 00:00:00.123 DateTime64(3, \'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTime32BestEffort 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 03:37:03 DateTime(\'UTC\') @@ -48,6 +54,7 @@ parseDateTime32BestEffort 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 06:37:03 DateTime(\'Europe/Minsk\') 2020-05-14 03:37:03 DateTime(\'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTime32BestEffortOrNull \N Nullable(DateTime) 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) @@ -60,6 +67,7 @@ parseDateTime32BestEffortOrNull 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) 2020-05-14 06:37:03 Nullable(DateTime(\'Europe/Minsk\')) 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) +2021-12-28 00:00:00 Nullable(DateTime(\'UTC\')) parseDateTime32BestEffortOrZero 1970-01-01 00:00:00 DateTime(\'UTC\') 2020-05-14 03:37:03 DateTime(\'UTC\') @@ -72,3 +80,4 @@ parseDateTime32BestEffortOrZero 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 06:37:03 DateTime(\'Europe/Minsk\') 2020-05-14 03:37:03 DateTime(\'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') diff --git a/tests/queries/0_stateless/01442_date_time_with_params.sql b/tests/queries/0_stateless/01442_date_time_with_params.sql index 52815460245..5a57aabdb0c 100644 --- a/tests/queries/0_stateless/01442_date_time_with_params.sql +++ b/tests/queries/0_stateless/01442_date_time_with_params.sql @@ -24,6 +24,8 @@ SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184', 3, 'UTC') AS a, toT SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184Z', 3, 'UTC') 
AS a, toTypeName(a); SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffort(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffort('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffort('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTimeBestEffortOrNull'; SELECT parseDateTimeBestEffortOrNull('', 3) AS a, toTypeName(a); @@ -37,6 +39,8 @@ SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184', 3, 'UTC') AS SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrNull(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrNull('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrNull('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTimeBestEffortOrZero'; SELECT parseDateTimeBestEffortOrZero('', 3, 'UTC') AS a, toTypeName(a); @@ -50,6 +54,8 @@ SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184', 3, 'UTC') AS SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrZero(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrZero('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrZero('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffort'; SELECT parseDateTime32BestEffort('') AS a, toTypeName(a); -- {serverError 41} @@ -63,6 +69,7 @@ SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184', 'UTC') AS a, toTy SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffort(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); +SELECT parseDateTime32BestEffort('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffortOrNull'; SELECT parseDateTime32BestEffortOrNull('') AS a, toTypeName(a); @@ -76,6 +83,7 @@ SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184', 'UTC') AS a SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrNull(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); +SELECT parseDateTime32BestEffortOrNull('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffortOrZero'; SELECT parseDateTime32BestEffortOrZero('', 'UTC') AS a, toTypeName(a); @@ -89,6 +97,6 @@ SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184', 'UTC') AS a SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrZero(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); - +SELECT 
parseDateTime32BestEffortOrZero('1640649600123', 'UTC') AS a, toTypeName(a); DROP TABLE IF EXISTS test; diff --git a/tests/queries/0_stateless/01455_opentelemetry_distributed.reference b/tests/queries/0_stateless/01455_opentelemetry_distributed.reference index 1712ca329cf..119642df395 100644 --- a/tests/queries/0_stateless/01455_opentelemetry_distributed.reference +++ b/tests/queries/0_stateless/01455_opentelemetry_distributed.reference @@ -1,9 +1,9 @@ ===http=== {"query":"select 1 from remote('127.0.0.2', system, one) format Null\n","status":"QueryFinish","tracestate":"some custom state","sorted_by_start_time":1} {"query":"DESC TABLE system.one","status":"QueryFinish","tracestate":"some custom state","sorted_by_start_time":1} -{"query":"SELECT 1 FROM system.one","status":"QueryFinish","tracestate":"some custom state","sorted_by_start_time":1} +{"query":"SELECT 1 FROM `system`.`one`","status":"QueryFinish","tracestate":"some custom state","sorted_by_start_time":1} {"query":"DESC TABLE system.one","query_status":"QueryFinish","tracestate":"some custom state","sorted_by_finish_time":1} -{"query":"SELECT 1 FROM system.one","query_status":"QueryFinish","tracestate":"some custom state","sorted_by_finish_time":1} +{"query":"SELECT 1 FROM `system`.`one`","query_status":"QueryFinish","tracestate":"some custom state","sorted_by_finish_time":1} {"query":"select 1 from remote('127.0.0.2', system, one) format Null\n","query_status":"QueryFinish","tracestate":"some custom state","sorted_by_finish_time":1} {"total spans":"3","unique spans":"3","unique non-zero parent spans":"3"} {"initial query spans with proper parent":"1"} diff --git a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference index d00491fd7e5..1eb57a24638 100644 --- a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference +++ b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference @@ -1 +1,4 @@ 1 +: Number of CPUs is not deterministic +: Number of CPUs is not deterministic, but narenas is set. Hope you not what you are doing and you have set narenas to largest possible CPU ID. +1 diff --git a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh index 869e3a1d26d..265ca4a6763 100755 --- a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh +++ b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash +# Tags: no-tsan, no-asan, no-msan, no-ubsan, no-fasttest +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# NOTE: jemalloc is disabled under sanitizers CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -6,9 +9,11 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ncpus="$(getconf _NPROCESSORS_ONLN)" -# to hit possible issues even in unbundled builds: -# (although likiley jemalloc will be compiled with NDEBUG there) -export MALLOC_CONF=percpu_arena:percpu +# In debug build the following settings enabled by default: +# - abort_conf +# - abort +# Disable them explicitly (will enable when required). 
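The new 01442_date_time_with_params cases above feed the string '1640649600123' to the parseDateTimeBestEffort family. Judging by the updated reference output, a long all-digit string of this form is interpreted as a Unix timestamp in milliseconds: 1640649600 seconds is 2021-12-28 00:00:00 UTC, the DateTime64 variants keep the .123 fraction, and the 32-bit variants drop it. Restated from the diff with the expected results as comments:

SELECT parseDateTimeBestEffort('1640649600123', 3, 'UTC') AS a, toTypeName(a);
-- 2021-12-28 00:00:00.123    DateTime64(3, 'UTC')
SELECT parseDateTime32BestEffort('1640649600123', 'UTC') AS a, toTypeName(a);
-- 2021-12-28 00:00:00        DateTime('UTC')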
+export MALLOC_CONF=abort_conf:false,abort:false # Regression for: # @@ -18,3 +23,15 @@ export MALLOC_CONF=percpu_arena:percpu taskset --cpu-list $((ncpus-1)) ${CLICKHOUSE_LOCAL} -q 'select 1' # just in case something more complicated taskset --cpu-list $((ncpus-1)) ${CLICKHOUSE_LOCAL} -q 'select * from numbers_mt(100000000) settings max_threads=100 FORMAT Null' + +# this command should fail because percpu arena will be disabled, +# and with abort_conf:true it is not allowed +( + # subshell is required to suppress "Aborted" message from the shell. + MALLOC_CONF=abort_conf:true,abort:true + taskset --cpu-list $((ncpus-1)) ${CLICKHOUSE_LOCAL} -q 'select 1' +) |& grep -F 'Number of CPUs is not deterministic' + +# this command should not fail because we specify narenas explicitly +# (even with abort_conf:true) +MALLOC_CONF=abort_conf:true,abort:false,narenas:$((ncpus)) taskset --cpu-list $((ncpus-1)) ${CLICKHOUSE_LOCAL} -q 'select 1' 2>&1 diff --git a/tests/queries/0_stateless/01504_rocksdb.sql b/tests/queries/0_stateless/01504_rocksdb.sql index 9f9e6c3b1ac..f79f31139fe 100644 --- a/tests/queries/0_stateless/01504_rocksdb.sql +++ b/tests/queries/0_stateless/01504_rocksdb.sql @@ -34,7 +34,7 @@ INSERT INTO 01504_test_memory SELECT number % 77 AS k, SUM(number) AS value, (1, SELECT A.a = B.a, A.b = B.b, A.c = B.c, A.d = B.d, A.e = B.e FROM ( SELECT 0 AS a, groupBitmapMerge(bm) AS b , SUM(k) AS c, SUM(value) AS d, SUM(dummy.1) AS e FROM 01504_test) A ANY LEFT JOIN (SELECT 0 AS a, groupBitmapMerge(bm) AS b , SUM(k) AS c, SUM(value) AS d, SUM(dummy.1) AS e FROM 01504_test_memory) B USING a ORDER BY a; -CREATE TEMPORARY TABLE keys AS SELECT * FROM numbers(1000); +CREATE TEMPORARY TABLE keys AS SELECT * FROM system.numbers LIMIT 1 OFFSET 4; SET max_rows_to_read = 2; SELECT dummy == (1,1.2) FROM 01504_test WHERE k IN (1, 3) OR k IN (1) OR k IN (3, 1) OR k IN [1] OR k IN [1, 3] ; diff --git a/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh b/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh index 645eaea743c..ff22597c620 100755 --- a/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh +++ b/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh @@ -1,9 +1,14 @@ #!/usr/bin/env bash +# Tags: long CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh +# In debug build abort_conf:true is set by default, disable it explicitly +# to avoid "Number of CPUs is not deterministic" error from jemalloc. +export MALLOC_CONF=abort_conf:false + # Regression for UAF in ThreadPool. 
# (Triggered under TSAN) for _ in {1..10}; do diff --git a/tests/queries/0_stateless/01520_client_print_query_id.expect b/tests/queries/0_stateless/01520_client_print_query_id.expect index 1989ab51aea..8b6e0e17a85 100755 --- a/tests/queries/0_stateless/01520_client_print_query_id.expect +++ b/tests/queries/0_stateless/01520_client_print_query_id.expect @@ -1,14 +1,13 @@ #!/usr/bin/expect -f -# Tags: no-fasttest log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect index 261e46c2111..819450ffd30 100755 --- a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect +++ b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect @@ -1,5 +1,5 @@ #!/usr/bin/expect -f -# Tags: no-fasttest +# Tags: long # This is a separate test, because we want to test the interactive mode. # https://github.com/ClickHouse/ClickHouse/issues/19353 @@ -8,11 +8,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01583_const_column_in_set_index.sql b/tests/queries/0_stateless/01583_const_column_in_set_index.sql index e40249eaf08..b781efb0f13 100644 --- a/tests/queries/0_stateless/01583_const_column_in_set_index.sql +++ b/tests/queries/0_stateless/01583_const_column_in_set_index.sql @@ -3,7 +3,7 @@ drop table if exists insub; create table insub (i int, j int) engine MergeTree order by i settings index_granularity = 1; insert into insub select number a, a + 2 from numbers(10); -SET max_rows_to_read = 2; +SET max_rows_to_read = 12; -- 10 from numbers + 2 from table select * from insub where i in (select toInt32(3) from numbers(10)); drop table if exists insub; diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in.sql b/tests/queries/0_stateless/01585_use_index_for_global_in.sql index a0a5b90ac1f..1dd7609350f 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in.sql +++ b/tests/queries/0_stateless/01585_use_index_for_global_in.sql @@ -8,10 +8,12 @@ create table xp_d as xp engine Distributed(test_shard_localhost, currentDatabase insert into xp select number, number + 2 from numbers(10); -set max_rows_to_read = 2; +set max_rows_to_read = 4; -- 2 from numbers, 2 from tables select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 from numbers, 2 from GLOBAL temp table (pushed from numbers), 2 from local xp select * from xp_d where i global in (select * from numbers(2)); drop table if exists xp; diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference index de0116f9eaa..0cb1993057f 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference +++ 
b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference @@ -14,6 +14,14 @@ 1 3 0 2 1 3 +0 2 +1 3 +0 2 +1 3 +0 2 +1 3 +0 2 +1 3 \N 100 \N 100 \N 100 diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql index 6129c92c888..d4147a445ec 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql +++ b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql @@ -12,17 +12,29 @@ insert into xp select null, 100; optimize table xp final; set max_rows_to_read = 2; +select * from xp where i in [0, 1]; +select * from xp where i global in [0, 1]; +select * from xp_d where i in [0, 1]; +select * from xp_d where i global in [0, 1]; + +set max_rows_to_read = 4; -- 2 in the subquery, 2 in the query itself select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 subquery, 2 from global temp table (GLOBAL IN), 2 from local xp table select * from xp_d where i global in (select * from numbers(2)); set transform_null_in = 1; +set max_rows_to_read = 4; -- 2 in the subquery, 2 in the query itself select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 subquery, 2 from global temp table (GLOBAL IN), 2 from local xp table select * from xp_d where i global in (select * from numbers(2)); +set max_rows_to_read = 0; -- No rows should be read select * from xp where i in (null); select * from xp where i global in (null); select * from xp_d where i in (null); diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 07562557369..4811d0a02ad 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -174,6 +174,8 @@ select groupArray(number) over () from numbers(3); -- Seen errors like 'column `1` not found' from count(1). 
select count(1) over (rows unbounded preceding), max(number + 1) over () from numbers(3); 1 3 +2 3 +3 3 -- Should work in DISTINCT select distinct sum(0) over (rows unbounded preceding) from numbers(2); 0 diff --git a/tests/queries/0_stateless/01615_random_one_shard_insertion.sql b/tests/queries/0_stateless/01615_random_one_shard_insertion.sql index e205a358db1..5a087e1e20f 100644 --- a/tests/queries/0_stateless/01615_random_one_shard_insertion.sql +++ b/tests/queries/0_stateless/01615_random_one_shard_insertion.sql @@ -15,11 +15,11 @@ set insert_distributed_one_random_shard = 1; set max_block_size = 1; set max_insert_block_size = 1; set min_insert_block_size_rows = 1; -insert into distr select number from numbers(20); +insert into distr select number from numbers(100); select count() != 0 from shard_0.tbl; select count() != 0 from shard_1.tbl; -select * from distr order by number; +select * from distr order by number LIMIT 20; drop table if exists shard_0.tbl; drop table if exists shard_1.tbl; diff --git a/tests/queries/0_stateless/01622_defaults_for_url_engine.reference b/tests/queries/0_stateless/01622_defaults_for_url_engine.reference index 7326d960397..c0a49154cd9 100644 --- a/tests/queries/0_stateless/01622_defaults_for_url_engine.reference +++ b/tests/queries/0_stateless/01622_defaults_for_url_engine.reference @@ -1 +1 @@ -Ok +(1,7,8) \ No newline at end of file diff --git a/tests/queries/0_stateless/01622_defaults_for_url_engine.sh b/tests/queries/0_stateless/01622_defaults_for_url_engine.sh index 0a93a3ef479..cf89a624bc9 100755 --- a/tests/queries/0_stateless/01622_defaults_for_url_engine.sh +++ b/tests/queries/0_stateless/01622_defaults_for_url_engine.sh @@ -1,37 +1,13 @@ #!/usr/bin/env bash # Tags: no-fasttest -# Tag no-fasttest: nc - command not found CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh - -PORT="$(($RANDOM%63000+2001))" - -function thread1 -{ - while true; do - echo -e "HTTP/1.1 200 OK\n\n{\"a\": 1}" | nc -l -p $1 -q 1; - done -} - -function thread2 -{ - while true; do - $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 -q "SELECT * FROM url('http://127.0.0.1:$1/', JSONEachRow, 'a int, b int default 7, c default a + b') format Values" | grep -F '(1,7,8)' && break - done -} - -# https://stackoverflow.com/questions/9954794/execute-a-shell-function-with-timeout -export -f thread1; -export -f thread2; - -TIMEOUT=60 - -timeout $TIMEOUT bash -c "thread1 $PORT" > /dev/null 2>&1 & -PID=$! 
- -bash -c "thread2 $PORT" 2> /dev/null | grep -q -F '(1,7,8)' && echo "Ok" && kill -9 $PID - -wait >/dev/null 2>&1 +${CLICKHOUSE_CLIENT} --query " +SELECT * FROM url( + \$\$http://127.0.0.1:${CLICKHOUSE_PORT_HTTP}/?query=SELECT+'{\"a\":1}'\$\$, + JSONEachRow, + 'a int, b int default 7, c default a + b') +FORMAT Values" diff --git a/tests/queries/0_stateless/01634_summap_nullable.reference b/tests/queries/0_stateless/01634_summap_nullable.reference new file mode 100644 index 00000000000..babed7df00d --- /dev/null +++ b/tests/queries/0_stateless/01634_summap_nullable.reference @@ -0,0 +1,2 @@ +(['a'],[1]) +(['a','b'],[1,0]) diff --git a/tests/queries/0_stateless/01634_summap_nullable.sql b/tests/queries/0_stateless/01634_summap_nullable.sql new file mode 100644 index 00000000000..226da645e9f --- /dev/null +++ b/tests/queries/0_stateless/01634_summap_nullable.sql @@ -0,0 +1,2 @@ +SELECT sumMap(['a', 'b'], [1, NULL]); +SELECT sumMap(['a', 'b'], [1, toNullable(0)]); diff --git a/tests/queries/0_stateless/01643_system_suspend.sql b/tests/queries/0_stateless/01643_system_suspend.sql deleted file mode 100644 index c2cd37e6156..00000000000 --- a/tests/queries/0_stateless/01643_system_suspend.sql +++ /dev/null @@ -1,5 +0,0 @@ -CREATE TEMPORARY TABLE t (x DateTime); -INSERT INTO t VALUES (now()); -SYSTEM SUSPEND FOR 1 SECOND; -INSERT INTO t VALUES (now()); -SELECT max(x) - min(x) >= 1 FROM t; diff --git a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference index ebb0b033d5b..05f7d08de7d 100644 --- a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference +++ b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference @@ -1,3 +1,4 @@ +~~~~source parts~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 @@ -10,6 +11,7 @@ 2_ 2_1_1_0 3_ 3_0_0_0 3_ 3_1_1_0 +~~~~parts after deduplication~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 @@ -22,6 +24,7 @@ 2_ 2_1_1_0 3_ 3_0_0_0 3_ 3_1_1_0 +~~~~parts after drop 3_1_1_0~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 @@ -32,6 +35,7 @@ 2_ 2_0_0_0 2_ 2_1_1_0 3_ 3_0_0_0 +~~~~parts after new part without deduplication~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 diff --git a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql index 1aa568c1663..c77f29d89c2 100644 --- a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql +++ b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql @@ -17,25 +17,33 @@ SYSTEM STOP MERGES partitioned_table; INSERT INTO partitioned_table VALUES (1, 1, 'A'), (2, 2, 'B'), (3, 3, 'C'); INSERT INTO partitioned_table VALUES (11, 1, 'AA'), (22, 2, 'BB'), (33, 3, 'CC'); -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~source parts~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; INSERT INTO partitioned_table VALUES (33, 3, 'CC'); -- must be deduplicated -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~parts 
after deduplication~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; ALTER TABLE partitioned_table DROP PART '3_1_1_0'; -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~parts after drop 3_1_1_0~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; INSERT INTO partitioned_table VALUES (33, 3, 'CC'); -- mustn't be deduplicated -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~parts after new part without deduplication~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; diff --git a/tests/queries/0_stateless/01658_read_file_to_stringcolumn.reference b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.reference index ad8df154dcc..5c7d5d4cd6e 100644 --- a/tests/queries/0_stateless/01658_read_file_to_stringcolumn.reference +++ b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.reference @@ -8,11 +8,11 @@ ccccccccc aaaaaaaaa bbbbbbbbb aaaaaaaaa bbbbbbbbb ccccccccc -:233 -:79 +:107 +:74 +:35 :35 :35 -:233 695071 aaaaaaaaa bbbbbbbbb ccccccccc aaaaaaaaa bbbbbbbbb @@ -21,5 +21,9 @@ ccccccccc aaaaaaaaa bbbbbbbbb ccccccccc aaaaaaaaa bbbbbbbbb 695071 0 :0 -:233 -:79 +:107 +:74 +Hello 2 +World 1 +Hello 2 +World 1 diff --git a/tests/queries/0_stateless/01658_read_file_to_stringcolumn.sh b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.sh index 3797b264f26..6f4b4cb8a9a 100755 --- a/tests/queries/0_stateless/01658_read_file_to_stringcolumn.sh +++ b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.sh @@ -8,16 +8,17 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CURDIR"/../shell_config.sh # Data preparation. + # Now we can get the user_files_path by use the table file function for trick. 
also we can get it by query as: # "insert into function file('exist.txt', 'CSV', 'val1 char') values ('aaaa'); select _path from file('exist.txt', 'CSV', 'val1 char')" -user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') -mkdir -p ${user_files_path}/ -echo -n aaaaaaaaa > ${user_files_path}/a.txt -echo -n bbbbbbbbb > ${user_files_path}/b.txt -echo -n ccccccccc > ${user_files_path}/c.txt +mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/ +echo -n aaaaaaaaa > ${CLICKHOUSE_USER_FILES_PATH}/a.txt +echo -n bbbbbbbbb > ${CLICKHOUSE_USER_FILES_PATH}/b.txt +echo -n ccccccccc > ${CLICKHOUSE_USER_FILES_PATH}/c.txt echo -n ccccccccc > /tmp/c.txt -mkdir -p ${user_files_path}/dir +mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/dir ### 1st TEST in CLIENT mode. @@ -26,28 +27,28 @@ ${CLICKHOUSE_CLIENT} --query "create table data (A String, B String) engine=Merg # Valid cases: -${CLICKHOUSE_CLIENT} --query "select file('${user_files_path}/a.txt'), file('${user_files_path}/b.txt');";echo ":"$? -${CLICKHOUSE_CLIENT} --query "insert into data select file('${user_files_path}/a.txt'), file('${user_files_path}/b.txt');";echo ":"$? -${CLICKHOUSE_CLIENT} --query "insert into data select file('${user_files_path}/a.txt'), file('${user_files_path}/b.txt');";echo ":"$? -${CLICKHOUSE_CLIENT} --query "select file('${user_files_path}/c.txt'), * from data";echo ":"$? +${CLICKHOUSE_CLIENT} --query "select file('a.txt'), file('b.txt');";echo ":"$? +${CLICKHOUSE_CLIENT} --query "insert into data select file('a.txt'), file('b.txt');";echo ":"$? +${CLICKHOUSE_CLIENT} --query "insert into data select file('a.txt'), file('b.txt');";echo ":"$? +${CLICKHOUSE_CLIENT} --query "select file('c.txt'), * from data";echo ":"$? ${CLICKHOUSE_CLIENT} --multiquery --query " - create table filenames(name String) engine=MergeTree() order by tuple(); - insert into filenames values ('a.txt'), ('b.txt'), ('c.txt'); - select file(name) from filenames format TSV; - drop table if exists filenames; + create table filenames(name String) engine=MergeTree() order by tuple(); + insert into filenames values ('a.txt'), ('b.txt'), ('c.txt'); + select file(name) from filenames format TSV; + drop table if exists filenames; " # Invalid cases: (Here using sub-shell to catch exception avoiding the test quit) # Test non-exists file -echo "clickhouse-client --query "'"select file('"'nonexist.txt'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null +echo "${CLICKHOUSE_CLIENT} --query "'"select file('"'nonexist.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null # Test isDir -echo "clickhouse-client --query "'"select file('"'${user_files_path}/dir'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null +echo "${CLICKHOUSE_CLIENT} --query "'"select file('"'dir'), file('b.txt')"'";echo :$?' | bash 2>/dev/null # Test path out of the user_files directory. It's not allowed in client mode -echo "clickhouse-client --query "'"select file('"'/tmp/c.txt'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null +echo "${CLICKHOUSE_CLIENT} --query "'"select file('"'/tmp/c.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null # Test relative path consists of ".." whose absolute path is out of the user_files directory. 
-echo "clickhouse-client --query "'"select file('"'${user_files_path}/../../../../../../../../../../../../../../../../../../../tmp/c.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null -echo "clickhouse-client --query "'"select file('"'../../../../a.txt'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null +echo "${CLICKHOUSE_CLIENT} --query "'"select file('"'../../../../../../../../../../../../../../../../../../../tmp/c.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null +echo "${CLICKHOUSE_CLIENT} --query "'"select file('"'../../../../a.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null ### 2nd TEST in LOCAL mode. @@ -63,29 +64,36 @@ echo $c_count # Valid cases: # The default dir is the CWD path in LOCAL mode ${CLICKHOUSE_LOCAL} --query " - drop table if exists data; - create table data (A String, B String) engine=MergeTree() order by A; - select file('a.txt'), file('b.txt'); - insert into data select file('a.txt'), file('b.txt'); - insert into data select file('a.txt'), file('b.txt'); - select file('c.txt'), * from data; - select file('/tmp/c.txt'), * from data; - select $c_count, $c_count -length(file('${CURDIR}/01518_nullable_aggregate_states2.reference')) + drop table if exists data; + create table data (A String, B String) engine=MergeTree() order by A; + select file('a.txt'), file('b.txt'); + insert into data select file('a.txt'), file('b.txt'); + insert into data select file('a.txt'), file('b.txt'); + select file('c.txt'), * from data; + select file('/tmp/c.txt'), * from data; + select $c_count, $c_count -length(file('${CURDIR}/01518_nullable_aggregate_states2.reference')) " echo ":"$? # Invalid cases: (Here using sub-shell to catch exception avoiding the test quit) # Test non-exists file -echo "clickhouse-local --query "'"select file('"'nonexist.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null +echo "${CLICKHOUSE_LOCAL} --query "'"select file('"'nonexist.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null # Test isDir -echo "clickhouse-local --query "'"select file('"'dir'), file('b.txt')"'";echo :$?' | bash 2>/dev/null +echo "${CLICKHOUSE_LOCAL} --query "'"select file('"'dir'), file('b.txt')"'";echo :$?' | bash 2>/dev/null + +# Test that the function is not injective + +echo -n Hello > ${CLICKHOUSE_USER_FILES_PATH}/a +echo -n Hello > ${CLICKHOUSE_USER_FILES_PATH}/b +echo -n World > ${CLICKHOUSE_USER_FILES_PATH}/c + +${CLICKHOUSE_CLIENT} --query "SELECT file(arrayJoin(['a', 'b', 'c'])) AS s, count() GROUP BY s ORDER BY s" +${CLICKHOUSE_CLIENT} --query "SELECT s, count() FROM file('?', TSV, 's String') GROUP BY s ORDER BY s" # Restore -rm -rf a.txt b.txt c.txt dir -rm -rf ${user_files_path}/a.txt -rm -rf ${user_files_path}/b.txt -rm -rf ${user_files_path}/c.txt -rm -rf /tmp/c.txt -rm -rf ${user_files_path}/dir +rm ${CLICKHOUSE_USER_FILES_PATH}/{a,b,c}.txt +rm ${CLICKHOUSE_USER_FILES_PATH}/{a,b,c} +rm /tmp/c.txt +rm -rf ${CLICKHOUSE_USER_FILES_PATH}/dir diff --git a/tests/queries/0_stateless/01660_sum_ubsan.reference b/tests/queries/0_stateless/01660_sum_ubsan.reference index c2ff9f590d5..6ac74108e71 100644 --- a/tests/queries/0_stateless/01660_sum_ubsan.reference +++ b/tests/queries/0_stateless/01660_sum_ubsan.reference @@ -1,5 +1,10 @@ +-- { echo } + +-- Aggregate function 'sum' allows overflow with two's complement arithmetics. +-- This contradicts the standard SQL semantic and we are totally fine with it. 
+SELECT sum(-8000000000000000000) FROM numbers(11); 4233720368547758080 -384883669867978000 +SELECT sum(-8000000000000000000) FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10,11}', system.one); 4233720368547758080 -384883669867978000 +SELECT sumKahan(-8000000000000000000) FROM numbers(11); -88000000000000000000 diff --git a/tests/queries/0_stateless/01660_sum_ubsan.sql b/tests/queries/0_stateless/01660_sum_ubsan.sql index 1d544324f77..9a3268563ef 100644 --- a/tests/queries/0_stateless/01660_sum_ubsan.sql +++ b/tests/queries/0_stateless/01660_sum_ubsan.sql @@ -1,9 +1,7 @@ +-- { echo } + -- Aggregate function 'sum' allows overflow with two's complement arithmetics. -- This contradicts the standard SQL semantic and we are totally fine with it. SELECT sum(-8000000000000000000) FROM numbers(11); -SELECT avg(-8000000000000000000) FROM numbers(11); - SELECT sum(-8000000000000000000) FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10,11}', system.one); -SELECT avg(-8000000000000000000) FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10,11}', system.one); - SELECT sumKahan(-8000000000000000000) FROM numbers(11); diff --git a/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert_long.reference b/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert.reference similarity index 61% rename from tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert_long.reference rename to tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert.reference index 343d1f3639f..63972b90eda 100644 --- a/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert_long.reference +++ b/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert.reference @@ -1,2 +1,3 @@ max_delay_to_insert will throw +Too many bytes pending for async INSERT max_delay_to_insert will succeed diff --git a/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert.sh b/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert.sh new file mode 100755 index 00000000000..a02589dde94 --- /dev/null +++ b/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# Tags: long, distributed + +# NOTE: $SECONDS accuracy is second, so we need some delta, hence -1 in time conditions. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +function drop_tables() +{ + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"drop table if exists dist_01675" + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"drop table if exists data_01675" +} + +# +# Case 1: max_delay_to_insert will throw. 
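+# (with bytes_to_delay_insert=1 and distributed sends stopped, the second INSERT exceeds the pending-bytes limit and is rejected with 'Too many bytes pending for async INSERT' after waiting up to max_delay_to_insert seconds)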
+# +function test_max_delay_to_insert_will_throw() +{ + echo "max_delay_to_insert will throw" + + local max_delay_to_insert=2 + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"create table data_01675 (key Int) engine=Null()" + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert" + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"system stop distributed sends dist_01675" + + local start_seconds=$SECONDS + # first batch is always OK, since there are no pending bytes yet + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0" + # second will fail, because of bytes_to_delay_insert=1 and max_delay_to_insert>0, + # while distributed sends is stopped. + # + # (previous block definitely takes more, since it has a header) + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0" |& grep -o 'Too many bytes pending for async INSERT' + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"system flush distributed dist_01675" + local end_seconds=$SECONDS + + if (( (end_seconds-start_seconds)<(max_delay_to_insert-1) )); then + echo "max_delay_to_insert was not satisfied ($end_seconds-$start_seconds)" + fi +} + +# +# Case 2: max_delay_to_insert will finally finish. +# +function test_max_delay_to_insert_will_succeed_once() +{ + local max_delay_to_insert=4 + local flush_delay=2 + + drop_tables + + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"create table data_01675 (key Int) engine=Null()" + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert" + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"system stop distributed sends dist_01675" + + function flush_distributed_worker() + { + sleep $flush_delay + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" -d @- <<<"system flush distributed dist_01675" + } + flush_distributed_worker & + + local start_seconds=$SECONDS + { + # NOTE: + # ignore stderr, since it may produce an exception if the flushing thread is too slow + # (this is possible on CI) + + # first batch is always OK, since there are no pending bytes yet + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&extremes=1" -d @- <<<"insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0" >& /dev/null + # second will succeed, due to SYSTEM FLUSH DISTRIBUTED in background.
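+ # (flush_distributed_worker, started above, runs SYSTEM FLUSH DISTRIBUTED after $flush_delay seconds, clearing the pending bytes before max_delay_to_insert expires)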
+ ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&extremes=1" -d @- <<<"insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0" >& /dev/null + } + local end_seconds=$SECONDS + + wait + + local diff=$(( end_seconds-start_seconds )) + + if (( diff<(flush_delay-1) )); then + # this is a fatal error that should not be retriable + echo "max_delay_to_insert did not wait for flush_delay ($diff)" + exit 1 + fi + + # retry the test until the diff is satisfied + # (since we should not assume that there will be no other lags) + if (( diff>=(max_delay_to_insert-1) )); then + return 1 + fi + + return 0 +} +function test_max_delay_to_insert_will_succeed() +{ + echo "max_delay_to_insert will succeed" + + local retries=20 i=0 + while (( (i++) < retries )); do + if test_max_delay_to_insert_will_succeed_once; then + return + fi + done + + echo failed +} + +function run_test() +{ + local test_case=$1 && shift + + drop_tables + $test_case +} + +function main() +{ + run_test test_max_delay_to_insert_will_throw + run_test test_max_delay_to_insert_will_succeed + + drop_tables +} +main "$@" diff --git a/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert_long.sh b/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert_long.sh deleted file mode 100755 index e373f632155..00000000000 --- a/tests/queries/0_stateless/01675_distributed_bytes_to_delay_insert_long.sh +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env bash -# Tags: long, distributed - -# NOTE: $SECONDS accuracy is second, so we need some delta, hence -1 in time conditions. - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CURDIR"/../shell_config.sh - -function drop_tables() -{ - ${CLICKHOUSE_CLIENT} -nq " - drop table if exists dist_01675; - drop table if exists data_01675; - " -} - -# -# Case 1: max_delay_to_insert will throw. -# -function test_max_delay_to_insert_will_throw() -{ - echo "max_delay_to_insert will throw" - - local max_delay_to_insert=2 - ${CLICKHOUSE_CLIENT} -nq " - create table data_01675 (key Int) engine=Null(); - create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert; - system stop distributed sends dist_01675; - " - - local start_seconds=$SECONDS - ${CLICKHOUSE_CLIENT} --testmode -nq " - -- first batch is always OK, since there is no pending bytes yet - insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0; - -- second will fail, because of bytes_to_delay_insert=1 and max_delay_to_insert>0, - -- while distributed sends is stopped. - -- - -- (previous block definitelly takes more, since it has header) - insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0; -- { serverError 574 } - system flush distributed dist_01675; - " - local end_seconds=$SECONDS - - if (( (end_seconds-start_seconds)<(max_delay_to_insert-1) )); then - echo "max_delay_to_insert was not satisfied ($end_seconds-$start_seconds)" - fi -} - -# -# Case 2: max_delay_to_insert will finally finished.
-# -function test_max_delay_to_insert_will_succeed_once() -{ - local max_delay_to_insert=4 - local flush_delay=2 - - drop_tables - - ${CLICKHOUSE_CLIENT} -nq " - create table data_01675 (key Int) engine=Null(); - create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert; - system stop distributed sends dist_01675; - " - - function flush_distributed_worker() - { - sleep $flush_delay - ${CLICKHOUSE_CLIENT} -q "system flush distributed dist_01675" - } - flush_distributed_worker & - - local start_seconds=$SECONDS - # ignore stderr, since it may produce exception if flushing thread will be too slow - # (this is possible on CI) - ${CLICKHOUSE_CLIENT} --testmode -nq " - -- first batch is always OK, since there is no pending bytes yet - insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0; - -- second will succeed, due to SYSTEM FLUSH DISTRIBUTED in background. - insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0; - " >& /dev/null - local end_seconds=$SECONDS - wait - - local diff=$(( end_seconds-start_seconds )) - - if (( diff<(flush_delay-1) )); then - # this is fatal error, that should not be retriable - echo "max_delay_to_insert was not wait flush_delay ($diff)" - exit 1 - fi - - # retry the test until the diff will be satisfied - # (since we cannot assume that there will be no other lags) - if (( diff>=(max_delay_to_insert-1) )); then - return 1 - fi - - return 0 -} -function test_max_delay_to_insert_will_succeed() -{ - echo "max_delay_to_insert will succeed" - - local retries=20 i=0 - while (( (i++) < retries )); do - if test_max_delay_to_insert_will_succeed_once; then - return - fi - done - - echo failed -} - -function run_test() -{ - local test_case=$1 && shift - - drop_tables - $test_case -} - -function main() -{ - run_test test_max_delay_to_insert_will_throw - run_test test_max_delay_to_insert_will_succeed - - drop_tables -} -main "$@" diff --git a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh index fb2d97b6270..1be082a6aae 100755 --- a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh +++ b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-fasttest +# Tags: long CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -20,11 +20,11 @@ function test_completion_word_client() log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } spawn bash -c "$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT" @@ -104,11 +104,11 @@ function test_completion_word_local() log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } spawn bash -c "$CLICKHOUSE_LOCAL" diff --git a/tests/queries/0_stateless/01643_system_suspend.reference b/tests/queries/0_stateless/01710_projection_detach_part.reference similarity index 100% rename from 
tests/queries/0_stateless/01643_system_suspend.reference rename to tests/queries/0_stateless/01710_projection_detach_part.reference diff --git a/tests/queries/0_stateless/01710_projection_detach_part.sql b/tests/queries/0_stateless/01710_projection_detach_part.sql new file mode 100644 index 00000000000..e3e6c7ac165 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_detach_part.sql @@ -0,0 +1,15 @@ +set allow_experimental_projection_optimization = 1; + +drop table if exists t; + +create table t (i int, j int, projection x (select * order by j)) engine MergeTree partition by i order by i; + +insert into t values (1, 2); + +alter table t detach partition 1; + +alter table t attach partition 1; + +select count() from system.projection_parts where database = currentDatabase() and table = 't'; + +drop table t; diff --git a/tests/queries/0_stateless/01710_projection_optimize_materialize.reference b/tests/queries/0_stateless/01710_projection_optimize_materialize.reference new file mode 100644 index 00000000000..24d24e52797 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_optimize_materialize.reference @@ -0,0 +1 @@ +pp 2021-10-24 474.00 B 1.56 KiB 3.38 100 1 diff --git a/tests/queries/0_stateless/01710_projection_optimize_materialize.sql b/tests/queries/0_stateless/01710_projection_optimize_materialize.sql new file mode 100644 index 00000000000..d8251aabaf6 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_optimize_materialize.sql @@ -0,0 +1,13 @@ +drop table if exists z; + +create table z (pk Int64, d Date, id UInt64, c UInt64) Engine MergeTree partition by d order by pk ; + +insert into z select number, '2021-10-24', intDiv (number, 10000), 1 from numbers(1000000); +optimize table z final; + +alter table z add projection pp (select id, sum(c) group by id); +alter table z materialize projection pp settings mutations_sync=1; + +SELECT name, partition, formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed, formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed, round(usize / size, 2) AS compr_rate, sum(rows) AS rows, count() AS part_count FROM system.projection_parts WHERE database = currentDatabase() and table = 'z' AND active GROUP BY name, partition ORDER BY size DESC; + +drop table z; diff --git a/tests/queries/0_stateless/01720_country_perimeter_and_area.reference b/tests/queries/0_stateless/01720_country_perimeter_and_area.reference index 8a9690791c6..461aea090e7 100644 --- a/tests/queries/0_stateless/01720_country_perimeter_and_area.reference +++ b/tests/queries/0_stateless/01720_country_perimeter_and_area.reference @@ -1,214 +1,214 @@ -Dhekelia Sovereign Base Area 0.0186259930051051 -Kyrgyzstan 0.5868323961091907 +Dhekelia Sovereign Base Area 0.018626 +Kyrgyzstan 0.586832 ------------------------------------- -Dhekelia Sovereign Base Area 0.000003139488070896512 -Kyrgyzstan 0.004895645023822883 +Dhekelia Sovereign Base Area 0.000003 +Kyrgyzstan 0.004896 ------------------------------------- -Aruba 0.011249330810410983 -Afghanistan 0.8199216326776404 -Albania 0.17108622597702605 -Andorra 0.015145740647213184 -Ashmore and Cartier Islands 0.001111472909012953 -Austria 0.3258464621357028 -Burundi 0.1409500621452211 -Belgium 0.1794463601873955 -Benin 0.31426073515874664 -Burkina Faso 0.5144381682226761 -Bulgaria 0.3083164214454252 -Bahrain 0.02137170357214413 -Bosnia and Herzegovina 0.20611959113245232 -Bajo Nuevo Bank (Petrel Is.) 
0.0001254597070361587 -Saint Barthelemy 0.0032990108720812672 -Belarus 0.42899119772830474 -Bolivia 0.9279328001326348 -Barbados 0.014116142490651021 -Bhutan 0.1601735058766338 -Botswana 0.5896697538755427 -Central African Republic 0.7760222837198817 -Switzerland 0.2318851512510408 -Clipperton Island 0.0014072924221565273 -Cameroon 0.8001045813665599 -Republic of Congo 0.6904316055863188 -Coral Sea Islands 0.00011634674137689659 -Curaçao 0.02078862020307983 -Czech Republic 0.2708588915805718 -Djibouti 0.12937731543684822 -Dominica 0.020094439807419574 -Algeria 1.1549683948032776 -Ethiopia 0.8210654364815099 -Georgia 0.26823008017781313 -Ghana 0.4056578143818251 -Gibraltar 0.0014059440610631154 -Guinea 0.6350853755877334 -Gambia 0.19279774895359095 -Guatemala 0.3030953561509038 -Guam 0.020321390076536976 -Heard Island and McDonald Islands 0.017334896920453105 -Hungary 0.2617732480910806 -Isle of Man 0.01875803631141408 -Iraq 0.5469861219502402 -Israel 0.19353851895699914 -Jamaica 0.10055860979159512 -Jersey 0.008427337812134537 -Jordan 0.2642243503964102 -Baykonur Cosmodrome 0.04482995477542441 -Siachen Glacier 0.03872116827341272 -Kosovo 0.08773172991408161 -Laos 0.6899867972760174 -Lebanon 0.09676977254650951 -Liberia 0.2961649538030388 -Libya 0.9538430912224716 -Saint Lucia 0.016786201647759867 -Liechtenstein 0.009288582116863231 -Lesotho 0.12315874900320756 -Luxembourg 0.04125996057810259 -Latvia 0.24488610945731157 -Saint Martin 0.006547834154217771 -Morocco 0.8817924249630141 -Monaco 0.0026049777439637527 -Moldova 0.20765701819586885 -Macedonia 0.1128831074330059 -Mali 1.1385970015559317 -Montenegro 0.11756794062084858 -Mongolia 1.142306166871007 -Montserrat 0.006620100691409788 -Namibia 0.843464957679987 -Niger 0.8780744302377772 -Norfolk Island 0.004912027225339993 -Niue 0.009881892958363517 -Nepal 0.4076113675280835 -Nauru 0.0031205159769295255 -Poland 0.48922069488271314 -Paraguay 0.5475256537493991 -Qatar 0.09362771431858698 -Romania 0.44095021664473105 -Rwanda 0.1293663890297039 -Western Sahara 0.4691920993279596 -Scarborough Reef 0.00019842225207367386 -South Sudan 0.7584190842556537 -Senegal 0.5883247226863264 -Serranilla Bank 0.0002389083935906293 -Singapore 0.015233384733369614 -San Marino 0.004596873449598911 -Somaliland 0.3096791489207226 -Somalia 0.6879915318072617 -Republic of Serbia 0.29677234233404165 -Suriname 0.32255243342976203 -Slovakia 0.19843599488831584 -Slovenia 0.14713148471782736 -Swaziland 0.08434161089555517 -Sint Maarten 0.0037955305365309296 -Syria 0.35675522352394456 -Chad 0.9102578296637189 -Togo 0.2600585482954555 -Uganda 0.38301730108810556 -Uruguay 0.3083564407046887 -Vatican 0.00006702452496391445 -Akrotiri Sovereign Base Area 0.013376747415600219 -Zambia 0.8807923488623808 -Zimbabwe 0.4553903789902945 +Aruba 0.011249 +Afghanistan 0.819922 +Albania 0.171086 +Andorra 0.015146 +Ashmore and Cartier Islands 0.001111 +Austria 0.325846 +Burundi 0.14095 +Belgium 0.179446 +Benin 0.314261 +Burkina Faso 0.514438 +Bulgaria 0.308316 +Bahrain 0.021372 +Bosnia and Herzegovina 0.20612 +Bajo Nuevo Bank (Petrel Is.) 
0.000125 +Saint Barthelemy 0.003299 +Belarus 0.428991 +Bolivia 0.927933 +Barbados 0.014116 +Bhutan 0.160174 +Botswana 0.58967 +Central African Republic 0.776022 +Switzerland 0.231885 +Clipperton Island 0.001407 +Cameroon 0.800105 +Republic of Congo 0.690432 +Coral Sea Islands 0.000116 +Curaçao 0.020789 +Czech Republic 0.270859 +Djibouti 0.129377 +Dominica 0.020094 +Algeria 1.154968 +Ethiopia 0.821065 +Georgia 0.26823 +Ghana 0.405658 +Gibraltar 0.001406 +Guinea 0.635085 +Gambia 0.192798 +Guatemala 0.303095 +Guam 0.020321 +Heard Island and McDonald Islands 0.017335 +Hungary 0.261773 +Isle of Man 0.018758 +Iraq 0.546986 +Israel 0.193539 +Jamaica 0.100559 +Jersey 0.008427 +Jordan 0.264224 +Baykonur Cosmodrome 0.04483 +Siachen Glacier 0.038721 +Kosovo 0.087732 +Laos 0.689987 +Lebanon 0.09677 +Liberia 0.296165 +Libya 0.953843 +Saint Lucia 0.016786 +Liechtenstein 0.009289 +Lesotho 0.123159 +Luxembourg 0.04126 +Latvia 0.244886 +Saint Martin 0.006548 +Morocco 0.881792 +Monaco 0.002605 +Moldova 0.207657 +Macedonia 0.112883 +Mali 1.138597 +Montenegro 0.117568 +Mongolia 1.142306 +Montserrat 0.00662 +Namibia 0.843465 +Niger 0.878074 +Norfolk Island 0.004912 +Niue 0.009882 +Nepal 0.407611 +Nauru 0.003121 +Poland 0.489221 +Paraguay 0.547526 +Qatar 0.093628 +Romania 0.44095 +Rwanda 0.129366 +Western Sahara 0.469192 +Scarborough Reef 0.000198 +South Sudan 0.758419 +Senegal 0.588325 +Serranilla Bank 0.000239 +Singapore 0.015233 +San Marino 0.004597 +Somaliland 0.309679 +Somalia 0.687992 +Republic of Serbia 0.296772 +Suriname 0.322552 +Slovakia 0.198436 +Slovenia 0.147131 +Swaziland 0.084342 +Sint Maarten 0.003796 +Syria 0.356755 +Chad 0.910258 +Togo 0.260059 +Uganda 0.383017 +Uruguay 0.308356 +Vatican 0.000067 +Akrotiri Sovereign Base Area 0.013377 +Zambia 0.880792 +Zimbabwe 0.45539 ------------------------------------- -Aruba 0.0000041986375296795025 -Afghanistan 0.015826481758320493 -Albania 0.0006971811189621746 -Andorra 0.00001112355564980348 -Ashmore and Cartier Islands 6.66668338977609e-8 -Austria 0.0020634744883290235 -Burundi 0.000669169243101558 -Belgium 0.0007529367590741593 -Benin 0.00287239734953164 -Burkina Faso 0.006746218025419332 -Bulgaria 0.0027733372191197786 -Bahrain 0.00001443842547561405 -Bosnia and Herzegovina 0.0012742491201009779 -Bajo Nuevo Bank (Petrel Is.) 
8.864825701897049e-10 -Saint Barthelemy 6.036607210116289e-7 -Belarus 0.005090738074359067 -Bolivia 0.026865324735758436 -Barbados 0.0000109856680212211 -Bhutan 0.0009961026696220909 -Botswana 0.01430200501713062 -Central African Republic 0.015290667187215962 -Switzerland 0.0010181463734151514 -Clipperton Island 1.2373029819547803e-7 -Cameroon 0.011488908713113137 -Republic of Congo 0.008534881807187833 -Coral Sea Islands 5.121674593493771e-10 -Curaçao 0.000011457378136273848 -Czech Republic 0.0019339153549488386 -Djibouti 0.000540370985929321 -Dominica 0.000018056168258583246 -Algeria 0.05696762706232162 -Ethiopia 0.02789047634482515 -Georgia 0.0017113229913929072 -Ghana 0.0059048504621945965 -Gibraltar 9.095456688875715e-8 -Guinea 0.006043151808047173 -Gambia 0.0002596816395280707 -Guatemala 0.0026901925526205263 -Guam 0.000013952443476670549 -Heard Island and McDonald Islands 0.000009688375334192321 -Hungary 0.0022899094702118978 -Isle of Man 0.00001410012284549863 -Iraq 0.010780689598789812 -Israel 0.0005400181032289429 -Jamaica 0.00027268062650994383 -Jersey 0.0000029236161155167853 -Jordan 0.002191215069390572 -Baykonur Cosmodrome 0.00015978303781425133 -Siachen Glacier 0.0000513879615262916 -Kosovo 0.0002684178325412152 -Laos 0.005637555524983489 -Lebanon 0.0002464436461544738 -Liberia 0.002357973807538481 -Libya 0.040072512808839354 -Saint Lucia 0.000014963842166249258 -Liechtenstein 0.0000033722024322722466 -Lesotho 0.0007426290112070925 -Luxembourg 0.00006405006804909529 -Latvia 0.00158313668683266 -Saint Martin 0.00000168759530251474 -Morocco 0.014595589778269167 -Monaco 4.6325700981005285e-7 -Moldova 0.0008158639460823913 -Macedonia 0.0006245180554490506 -Mali 0.03096381132470007 -Montenegro 0.00033762445623993013 -Mongolia 0.038446609480001344 -Montserrat 0.0000024620326175206004 -Namibia 0.020320978539029165 -Niger 0.02919849042641136 -Norfolk Island 0.0000010150641235563077 -Niue 0.000005450796200539049 -Nepal 0.003629565673884544 -Nauru 7.119067469952887e-7 -Poland 0.0076921097527402876 -Paraguay 0.009875843128670564 -Qatar 0.0002752610716836153 -Romania 0.005809479702080411 -Rwanda 0.0006262235765421803 -Western Sahara 0.0022344529652030694 -Scarborough Reef 2.4176335726807567e-9 -South Sudan 0.015509656314462458 -Senegal 0.00485201810074574 -Serranilla Bank 2.6035559945372385e-9 -Singapore 0.000012633505579848072 -San Marino 0.0000014830814619737624 -Somaliland 0.0041412916217828406 -Somalia 0.011674654119996183 -Republic of Serbia 0.001907268740192651 -Suriname 0.0035911641359236534 -Slovakia 0.0011901587428922095 -Slovenia 0.0004995546076509384 -Swaziland 0.00042234053226485263 -Sint Maarten 5.772865969377286e-7 -Syria 0.004581243750467663 -Chad 0.0313064894302088 -Togo 0.0014067991034602252 -Uganda 0.005985159048654327 -Uruguay 0.0043716082436750115 -Vatican 3.002600504657064e-10 -Akrotiri Sovereign Base Area 0.0000024314362587592923 -Zambia 0.018594119224502336 -Zimbabwe 0.009621356779606268 +Aruba 0.000004 +Afghanistan 0.015826 +Albania 0.000697 +Andorra 0.000011 +Ashmore and Cartier Islands 0 +Austria 0.002063 +Burundi 0.000669 +Belgium 0.000753 +Benin 0.002872 +Burkina Faso 0.006746 +Bulgaria 0.002773 +Bahrain 0.000014 +Bosnia and Herzegovina 0.001274 +Bajo Nuevo Bank (Petrel Is.) 
0 +Saint Barthelemy 0.000001 +Belarus 0.005091 +Bolivia 0.026865 +Barbados 0.000011 +Bhutan 0.000996 +Botswana 0.014302 +Central African Republic 0.015291 +Switzerland 0.001018 +Clipperton Island 0 +Cameroon 0.011489 +Republic of Congo 0.008535 +Coral Sea Islands 0 +Curaçao 0.000011 +Czech Republic 0.001934 +Djibouti 0.00054 +Dominica 0.000018 +Algeria 0.056968 +Ethiopia 0.02789 +Georgia 0.001711 +Ghana 0.005905 +Gibraltar 0 +Guinea 0.006043 +Gambia 0.00026 +Guatemala 0.00269 +Guam 0.000014 +Heard Island and McDonald Islands 0.00001 +Hungary 0.00229 +Isle of Man 0.000014 +Iraq 0.010781 +Israel 0.00054 +Jamaica 0.000273 +Jersey 0.000003 +Jordan 0.002191 +Baykonur Cosmodrome 0.00016 +Siachen Glacier 0.000051 +Kosovo 0.000268 +Laos 0.005638 +Lebanon 0.000246 +Liberia 0.002358 +Libya 0.040073 +Saint Lucia 0.000015 +Liechtenstein 0.000003 +Lesotho 0.000743 +Luxembourg 0.000064 +Latvia 0.001583 +Saint Martin 0.000002 +Morocco 0.014596 +Monaco 0 +Moldova 0.000816 +Macedonia 0.000625 +Mali 0.030964 +Montenegro 0.000338 +Mongolia 0.038447 +Montserrat 0.000002 +Namibia 0.020321 +Niger 0.029198 +Norfolk Island 0.000001 +Niue 0.000005 +Nepal 0.00363 +Nauru 0.000001 +Poland 0.007692 +Paraguay 0.009876 +Qatar 0.000275 +Romania 0.005809 +Rwanda 0.000626 +Western Sahara 0.002234 +Scarborough Reef 0 +South Sudan 0.01551 +Senegal 0.004852 +Serranilla Bank 0 +Singapore 0.000013 +San Marino 0.000001 +Somaliland 0.004141 +Somalia 0.011675 +Republic of Serbia 0.001907 +Suriname 0.003591 +Slovakia 0.00119 +Slovenia 0.0005 +Swaziland 0.000422 +Sint Maarten 0.000001 +Syria 0.004581 +Chad 0.031306 +Togo 0.001407 +Uganda 0.005985 +Uruguay 0.004372 +Vatican 0 +Akrotiri Sovereign Base Area 0.000002 +Zambia 0.018594 +Zimbabwe 0.009621 ------------------------------------- diff --git a/tests/queries/0_stateless/01720_country_perimeter_and_area.sh b/tests/queries/0_stateless/01720_country_perimeter_and_area.sh index 75016ee1d1f..0080c9a1a1b 100755 --- a/tests/queries/0_stateless/01720_country_perimeter_and_area.sh +++ b/tests/queries/0_stateless/01720_country_perimeter_and_area.sh @@ -8,9 +8,9 @@ ${CLICKHOUSE_CLIENT} -q "drop table if exists country_polygons;" ${CLICKHOUSE_CLIENT} -q "create table country_polygons(name String, p Array(Array(Tuple(Float64, Float64)))) engine=MergeTree() order by tuple();" cat ${CURDIR}/country_polygons.tsv | ${CLICKHOUSE_CLIENT} -q "insert into country_polygons format TSV" -${CLICKHOUSE_CLIENT} -q "SELECT name, polygonPerimeterSpherical(p) from country_polygons" +${CLICKHOUSE_CLIENT} -q "SELECT name, round(polygonPerimeterSpherical(p), 6) from country_polygons" ${CLICKHOUSE_CLIENT} -q "SELECT '-------------------------------------'" -${CLICKHOUSE_CLIENT} -q "SELECT name, polygonAreaSpherical(p) from country_polygons" +${CLICKHOUSE_CLIENT} -q "SELECT name, round(polygonAreaSpherical(p), 6) from country_polygons" ${CLICKHOUSE_CLIENT} -q "SELECT '-------------------------------------'" ${CLICKHOUSE_CLIENT} -q "drop table if exists country_rings;" @@ -18,9 +18,9 @@ ${CLICKHOUSE_CLIENT} -q "drop table if exists country_rings;" ${CLICKHOUSE_CLIENT} -q "create table country_rings(name String, p Array(Tuple(Float64, Float64))) engine=MergeTree() order by tuple();" cat ${CURDIR}/country_rings.tsv | ${CLICKHOUSE_CLIENT} -q "insert into country_rings format TSV" -${CLICKHOUSE_CLIENT} -q "SELECT name, polygonPerimeterSpherical(p) from country_rings" +${CLICKHOUSE_CLIENT} -q "SELECT name, round(polygonPerimeterSpherical(p), 6) from country_rings" ${CLICKHOUSE_CLIENT} -q "SELECT 
'-------------------------------------'" -${CLICKHOUSE_CLIENT} -q "SELECT name, polygonAreaSpherical(p) from country_rings" +${CLICKHOUSE_CLIENT} -q "SELECT name, round(polygonAreaSpherical(p), 6) from country_rings" ${CLICKHOUSE_CLIENT} -q "SELECT '-------------------------------------'" ${CLICKHOUSE_CLIENT} -q "drop table if exists country_rings;" diff --git a/tests/queries/0_stateless/01748_partition_id_pruning.sql b/tests/queries/0_stateless/01748_partition_id_pruning.sql index 17a405e17ad..e0d45884c60 100644 --- a/tests/queries/0_stateless/01748_partition_id_pruning.sql +++ b/tests/queries/0_stateless/01748_partition_id_pruning.sql @@ -8,12 +8,12 @@ set max_rows_to_read = 3; select * from x where _partition_id = partitionId(1); -set max_rows_to_read = 4; -- one row for subquery +set max_rows_to_read = 5; -- one row for subquery + subquery select * from x where _partition_id in (select partitionId(number + 1) from numbers(1)); -- trivial count optimization test -set max_rows_to_read = 1; -- one row for subquery +set max_rows_to_read = 2; -- one row for subquery + subquery itself select count() from x where _partition_id in (select partitionId(number + 1) from numbers(1)); drop table x; diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect index 922a6914584..022320e2d4b 100755 --- a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect +++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect @@ -1,14 +1,13 @@ #!/usr/bin/expect -f -# Tags: no-fasttest log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 2 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference index 66fbe8a5d1c..9b76ca91780 100644 --- a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference +++ b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference @@ -1,17 +1,17 @@ (0, 2) 0 0 0 0 -WITH _CAST(\'default\', \'Nullable(String)\') AS id_no SELECT one.dummy, ignore(id_no) FROM system.one WHERE dummy IN (0, 2) -WITH _CAST(\'default\', \'Nullable(String)\') AS id_no SELECT one.dummy, ignore(id_no) FROM system.one WHERE dummy IN (0, 2) +WITH _CAST(\'default\', \'Nullable(String)\') AS `id_no` SELECT `one`.`dummy`, ignore(`id_no`) FROM `system`.`one` WHERE `dummy` IN (0, 2) +WITH _CAST(\'default\', \'Nullable(String)\') AS `id_no` SELECT `one`.`dummy`, ignore(`id_no`) FROM `system`.`one` WHERE `dummy` IN (0, 2) optimize_skip_unused_shards_rewrite_in(0, 2) 0 0 -WITH _CAST(\'default\', \'Nullable(String)\') AS id_02 SELECT one.dummy, ignore(id_02) FROM system.one WHERE dummy IN tuple(0) -WITH _CAST(\'default\', \'Nullable(String)\') AS id_02 SELECT one.dummy, ignore(id_02) FROM system.one WHERE dummy IN tuple(2) +WITH _CAST(\'default\', \'Nullable(String)\') AS `id_02` SELECT `one`.`dummy`, ignore(`id_02`) FROM `system`.`one` WHERE `dummy` IN tuple(0) +WITH _CAST(\'default\', \'Nullable(String)\') AS `id_02` SELECT `one`.`dummy`, ignore(`id_02`) FROM `system`.`one` WHERE `dummy` IN tuple(2) 
optimize_skip_unused_shards_rewrite_in(2,) -WITH _CAST(\'default\', \'Nullable(String)\') AS id_2 SELECT one.dummy, ignore(id_2) FROM system.one WHERE dummy IN tuple(2) +WITH _CAST(\'default\', \'Nullable(String)\') AS `id_2` SELECT `one`.`dummy`, ignore(`id_2`) FROM `system`.`one` WHERE `dummy` IN tuple(2) optimize_skip_unused_shards_rewrite_in(0,) 0 0 -WITH _CAST(\'default\', \'Nullable(String)\') AS id_0 SELECT one.dummy, ignore(id_0) FROM system.one WHERE dummy IN tuple(0) +WITH _CAST(\'default\', \'Nullable(String)\') AS `id_0` SELECT `one`.`dummy`, ignore(`id_0`) FROM `system`.`one` WHERE `dummy` IN tuple(0) 0 0 errors diff --git a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql index efde0ac1e60..220d5d91a0b 100644 --- a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql +++ b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql @@ -32,11 +32,11 @@ select '(0, 2)'; with (select currentDatabase()) as id_no select *, ignore(id_no) from dist_01756 where dummy in (0, 2); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and - query not like '%system.query_log%' and - query like concat('WITH%', currentDatabase(), '%AS id_no %') and + query not like '%system%query_log%' and + query like concat('WITH%', currentDatabase(), '%AS `id_no` %') and type = 'QueryFinish' order by query; @@ -51,11 +51,11 @@ select 'optimize_skip_unused_shards_rewrite_in(0, 2)'; with (select currentDatabase()) as id_02 select *, ignore(id_02) from dist_01756 where dummy in (0, 2); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and - query not like '%system.query_log%' and - query like concat('WITH%', currentDatabase(), '%AS id_02 %') and + query not like '%system%query_log%' and + query like concat('WITH%', currentDatabase(), '%AS `id_02` %') and type = 'QueryFinish' order by query; @@ -63,11 +63,11 @@ select 'optimize_skip_unused_shards_rewrite_in(2,)'; with (select currentDatabase()) as id_2 select *, ignore(id_2) from dist_01756 where dummy in (2,); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and - query not like '%system.query_log%' and - query like concat('WITH%', currentDatabase(), '%AS id_2 %') and + query not like '%system%query_log%' and + query like concat('WITH%', currentDatabase(), '%AS `id_2` %') and type = 'QueryFinish' order by query; @@ -75,11 +75,11 @@ select 'optimize_skip_unused_shards_rewrite_in(0,)'; with (select currentDatabase()) as id_0 select *, ignore(id_0) from dist_01756 where dummy in (0,); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and - query not like '%system.query_log%' and - query like concat('WITH%', currentDatabase(), '%AS id_0 %') and + query not like '%system%query_log%' and + query like concat('WITH%', currentDatabase(), '%AS `id_0` %') and type = 'QueryFinish' order by query; @@ -87,6 +87,7 @@ order by query; select * from dist_01756 where dummy in (0); select * from dist_01756 where dummy in ('0'); 
+ -- -- errors -- diff --git a/tests/queries/0_stateless/01780_column_sparse.reference b/tests/queries/0_stateless/01780_column_sparse.reference new file mode 100644 index 00000000000..08aef433172 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse.reference @@ -0,0 +1,182 @@ +-- { echo } + +DROP TABLE IF EXISTS t_sparse; +DROP TABLE IF EXISTS t_sparse_1; +CREATE TABLE t_sparse (id UInt64, u UInt64, s String, arr1 Array(String), arr2 Array(UInt64)) +ENGINE = MergeTree ORDER BY tuple() +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.1; +INSERT INTO t_sparse SELECT + number, + if (number % 10 = 0, number, 0), + if (number % 5 = 0, toString(number), ''), + if (number % 7 = 0, arrayMap(x -> toString(x), range(number % 10)), []), + if (number % 12 = 0, range(number % 10), []) +FROM numbers (200); +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse' AND database = currentDatabase() +ORDER BY column; +arr1 Default +arr2 Default +id Default +s Sparse +u Sparse +SELECT * FROM t_sparse WHERE u != 0 ORDER BY id; +10 10 10 [] [] +20 20 20 [] [] +30 30 30 [] [] +40 40 40 [] [] +50 50 50 [] [] +60 60 60 [] [] +70 70 70 [] [] +80 80 80 [] [] +90 90 90 [] [] +100 100 100 [] [] +110 110 110 [] [] +120 120 120 [] [] +130 130 130 [] [] +140 140 140 [] [] +150 150 150 [] [] +160 160 160 [] [] +170 170 170 [] [] +180 180 180 [] [] +190 190 190 [] [] +SELECT * FROM t_sparse WHERE s != '' ORDER BY id; +0 0 0 [] [] +5 0 5 [] [] +10 10 10 [] [] +15 0 15 [] [] +20 20 20 [] [] +25 0 25 [] [] +30 30 30 [] [] +35 0 35 ['0','1','2','3','4'] [] +40 40 40 [] [] +45 0 45 [] [] +50 50 50 [] [] +55 0 55 [] [] +60 60 60 [] [] +65 0 65 [] [] +70 70 70 [] [] +75 0 75 [] [] +80 80 80 [] [] +85 0 85 [] [] +90 90 90 [] [] +95 0 95 [] [] +100 100 100 [] [] +105 0 105 ['0','1','2','3','4'] [] +110 110 110 [] [] +115 0 115 [] [] +120 120 120 [] [] +125 0 125 [] [] +130 130 130 [] [] +135 0 135 [] [] +140 140 140 [] [] +145 0 145 [] [] +150 150 150 [] [] +155 0 155 [] [] +160 160 160 [] [] +165 0 165 [] [] +170 170 170 [] [] +175 0 175 ['0','1','2','3','4'] [] +180 180 180 [] [] +185 0 185 [] [] +190 190 190 [] [] +195 0 195 [] [] +SELECT * FROM t_sparse WHERE arr1 != [] ORDER BY id; +7 0 ['0','1','2','3','4','5','6'] [] +14 0 ['0','1','2','3'] [] +21 0 ['0'] [] +28 0 ['0','1','2','3','4','5','6','7'] [] +35 0 35 ['0','1','2','3','4'] [] +42 0 ['0','1'] [] +49 0 ['0','1','2','3','4','5','6','7','8'] [] +56 0 ['0','1','2','3','4','5'] [] +63 0 ['0','1','2'] [] +77 0 ['0','1','2','3','4','5','6'] [] +84 0 ['0','1','2','3'] [0,1,2,3] +91 0 ['0'] [] +98 0 ['0','1','2','3','4','5','6','7'] [] +105 0 105 ['0','1','2','3','4'] [] +112 0 ['0','1'] [] +119 0 ['0','1','2','3','4','5','6','7','8'] [] +126 0 ['0','1','2','3','4','5'] [] +133 0 ['0','1','2'] [] +147 0 ['0','1','2','3','4','5','6'] [] +154 0 ['0','1','2','3'] [] +161 0 ['0'] [] +168 0 ['0','1','2','3','4','5','6','7'] [0,1,2,3,4,5,6,7] +175 0 175 ['0','1','2','3','4'] [] +182 0 ['0','1'] [] +189 0 ['0','1','2','3','4','5','6','7','8'] [] +196 0 ['0','1','2','3','4','5'] [] +SELECT * FROM t_sparse WHERE arr2 != [] ORDER BY id; +12 0 [] [0,1] +24 0 [] [0,1,2,3] +36 0 [] [0,1,2,3,4,5] +48 0 [] [0,1,2,3,4,5,6,7] +72 0 [] [0,1] +84 0 ['0','1','2','3'] [0,1,2,3] +96 0 [] [0,1,2,3,4,5] +108 0 [] [0,1,2,3,4,5,6,7] +132 0 [] [0,1] +144 0 [] [0,1,2,3] +156 0 [] [0,1,2,3,4,5] +168 0 ['0','1','2','3','4','5','6','7'] [0,1,2,3,4,5,6,7] +192 0 [] [0,1] +SELECT sum(u) FROM t_sparse; +1900 +SELECT sum(u) FROM t_sparse GROUP BY id % 7; +210 
+360 +300 +240 +190 +330 +270 +SELECT arrayFilter(x -> x % 2 = 1, arr2) FROM t_sparse WHERE arr2 != [] LIMIT 5; +[1] +[1,3] +[1,3,5] +[1,3,5,7] +[1] +CREATE TABLE t_sparse_1 (id UInt64, v Int64) +ENGINE = MergeTree ORDER BY tuple() +SETTINGS ratio_of_defaults_for_sparse_serialization = 0; +INSERT INTO t_sparse_1 VALUES (1, 6), (2, 1), (3, 0), (4, -1), (5, 0), (6, 0), (7, -2), (8, 0), (9, 0), (10, 4), (11, 0); +SELECT * FROM t_sparse_1 ORDER BY v; +7 -2 +4 -1 +3 0 +5 0 +6 0 +8 0 +9 0 +11 0 +2 1 +10 4 +1 6 +SELECT * FROM t_sparse_1 ORDER BY v DESC; +1 6 +10 4 +2 1 +3 0 +5 0 +6 0 +8 0 +9 0 +11 0 +4 -1 +7 -2 +SELECT * FROM t_sparse_1 ORDER BY v, id LIMIT 5; +7 -2 +4 -1 +3 0 +5 0 +6 0 +SELECT * FROM t_sparse_1 ORDER BY v DESC, id LIMIT 5; +1 6 +10 4 +2 1 +3 0 +5 0 +DROP TABLE t_sparse; +DROP TABLE t_sparse_1; diff --git a/tests/queries/0_stateless/01780_column_sparse.sql b/tests/queries/0_stateless/01780_column_sparse.sql new file mode 100644 index 00000000000..480321c6d14 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse.sql @@ -0,0 +1,44 @@ +-- { echo } + +DROP TABLE IF EXISTS t_sparse; +DROP TABLE IF EXISTS t_sparse_1; + +CREATE TABLE t_sparse (id UInt64, u UInt64, s String, arr1 Array(String), arr2 Array(UInt64)) +ENGINE = MergeTree ORDER BY tuple() +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.1; + +INSERT INTO t_sparse SELECT + number, + if (number % 10 = 0, number, 0), + if (number % 5 = 0, toString(number), ''), + if (number % 7 = 0, arrayMap(x -> toString(x), range(number % 10)), []), + if (number % 12 = 0, range(number % 10), []) +FROM numbers (200); + +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse' AND database = currentDatabase() +ORDER BY column; + +SELECT * FROM t_sparse WHERE u != 0 ORDER BY id; +SELECT * FROM t_sparse WHERE s != '' ORDER BY id; +SELECT * FROM t_sparse WHERE arr1 != [] ORDER BY id; +SELECT * FROM t_sparse WHERE arr2 != [] ORDER BY id; + +SELECT sum(u) FROM t_sparse; +SELECT sum(u) FROM t_sparse GROUP BY id % 7; + +SELECT arrayFilter(x -> x % 2 = 1, arr2) FROM t_sparse WHERE arr2 != [] LIMIT 5; + +CREATE TABLE t_sparse_1 (id UInt64, v Int64) +ENGINE = MergeTree ORDER BY tuple() +SETTINGS ratio_of_defaults_for_sparse_serialization = 0; + +INSERT INTO t_sparse_1 VALUES (1, 6), (2, 1), (3, 0), (4, -1), (5, 0), (6, 0), (7, -2), (8, 0), (9, 0), (10, 4), (11, 0); + +SELECT * FROM t_sparse_1 ORDER BY v; +SELECT * FROM t_sparse_1 ORDER BY v DESC; +SELECT * FROM t_sparse_1 ORDER BY v, id LIMIT 5; +SELECT * FROM t_sparse_1 ORDER BY v DESC, id LIMIT 5; + +DROP TABLE t_sparse; +DROP TABLE t_sparse_1; diff --git a/tests/queries/0_stateless/01780_column_sparse_alter.reference b/tests/queries/0_stateless/01780_column_sparse_alter.reference new file mode 100644 index 00000000000..cec7af647b3 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_alter.reference @@ -0,0 +1,7 @@ +id Default +u Sparse +s Sparse +182 155 +id Default +t Sparse +182 diff --git a/tests/queries/0_stateless/01780_column_sparse_alter.sql b/tests/queries/0_stateless/01780_column_sparse_alter.sql new file mode 100644 index 00000000000..444a1f9cf43 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_alter.sql @@ -0,0 +1,26 @@ +SET mutations_sync = 2; + +DROP TABLE IF EXISTS t_sparse_alter; + +CREATE TABLE t_sparse_alter (id UInt64, u UInt64, s String) +ENGINE = MergeTree ORDER BY id +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.5; + +INSERT INTO t_sparse_alter SELECT + number, + if (number % 11 = 0, number, 
0), + if (number % 13 = 0, toString(number), '') +FROM numbers(2000); + +SELECT column, serialization_kind FROM system.parts_columns WHERE database = currentDatabase() AND table = 't_sparse_alter' AND active ORDER BY name; + +SELECT uniqExact(u), uniqExact(s) FROM t_sparse_alter; + +ALTER TABLE t_sparse_alter DROP COLUMN s, RENAME COLUMN u TO t; +ALTER TABLE t_sparse_alter MODIFY COLUMN t UInt16; + +SELECT column, serialization_kind FROM system.parts_columns WHERE database = currentDatabase() AND table = 't_sparse_alter' AND active ORDER BY name; + +SELECT uniqExact(t) FROM t_sparse_alter; + +DROP TABLE t_sparse_alter; diff --git a/tests/queries/0_stateless/01780_column_sparse_distinct.reference b/tests/queries/0_stateless/01780_column_sparse_distinct.reference new file mode 100644 index 00000000000..bb0cebc6540 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_distinct.reference @@ -0,0 +1,7 @@ +all_1_1_0 v Default +all_2_2_0 v Sparse +0 +1 +2 +3 +4 diff --git a/tests/queries/0_stateless/01780_column_sparse_distinct.sql b/tests/queries/0_stateless/01780_column_sparse_distinct.sql new file mode 100644 index 00000000000..502ca7600d4 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_distinct.sql @@ -0,0 +1,20 @@ +DROP TABLE IF EXISTS t_sparse_distinct; + +CREATE TABLE t_sparse_distinct (id UInt32, v UInt64) +ENGINE = MergeTree +ORDER BY id +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9; + +SYSTEM STOP MERGES t_sparse_distinct; + +INSERT INTO t_sparse_distinct SELECT number, number % 5 FROM numbers(100000); +INSERT INTO t_sparse_distinct SELECT number, number % 100 = 0 FROM numbers(100000); + +SELECT name, column, serialization_kind +FROM system.parts_columns +WHERE table = 't_sparse_distinct' AND database = currentDatabase() AND column = 'v' +ORDER BY name; + +SELECT DISTINCT v FROM t_sparse_distinct ORDER BY v; + +DROP TABLE t_sparse_distinct; diff --git a/tests/queries/0_stateless/01780_column_sparse_full.reference b/tests/queries/0_stateless/01780_column_sparse_full.reference new file mode 100644 index 00000000000..4d2d0a58798 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_full.reference @@ -0,0 +1,110 @@ +all_1_1_0 id Default +all_1_1_0 s Sparse +all_1_1_0 u Sparse +all_2_2_0 id Default +all_2_2_0 s Default +all_2_2_0 u Default +0 0 +0 0 +1 0 +1 1 +====== +0 0 +0 0 +1 0 +1 1 +====== +990 990 +980 980 980 +970 970 +====== +990 990 +980 980 980 +970 970 +====== +0 58413 +1 57920 +2 57917 +====== +507 +====== +0 [0,2,1,3] +1 [0,2,1,3] +2 [0,2,1,3] +3 [0,2,1,3] +4 [0,2,1,3] +8 +====== +0 0 0 +0 0 0 +0 0 0 +0 0 0 +1 1 1 +====== +58413 +57920 +57917 + +174250 +====== +174250 +58413 +57920 +57917 +====== +174250 +58413 +57920 +57917 +====== +508413 +57920 +57917 +====== +1075 +====== +1077 +====== +0 +1 +2 +3 +4 +====== +0 0 0 +0 0 0 +1 0 +2 0 +3 0 +====== +0 0 0 +0 0 0 +1 0 +1 1 1 +2 0 +====== +0 0 0 +0 0 0 +1 0 +2 0 +2 2 2 +====== +0 0 0 +0 0 0 +1 1 1 +1 0 +2 2 2 +====== +0 0 0 +0 0 0 +0 0 0 +0 0 0 +0 0 0 +====== +id Default +s Sparse +u Sparse +====== +990 990 +980 980 980 +970 970 diff --git a/tests/queries/0_stateless/01780_column_sparse_full.sql b/tests/queries/0_stateless/01780_column_sparse_full.sql new file mode 100644 index 00000000000..af6fde116d9 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_full.sql @@ -0,0 +1,105 @@ +-- This test checks, that common SQL operations work +-- with mixed columns (sparse and full) in table. 
+ +DROP TABLE IF EXISTS t_sparse_full; + +CREATE TABLE t_sparse_full (id UInt64, u UInt64, s String) +ENGINE = MergeTree ORDER BY id +SETTINGS index_granularity = 32, +ratio_of_defaults_for_sparse_serialization = 0.1; + +SYSTEM STOP MERGES t_sparse_full; + +INSERT INTO t_sparse_full +SELECT + number, + if (number % 10 = 0, number, 0), + if (number % 7 = 0, toString(number), '') +FROM numbers(1000); + +INSERT INTO t_sparse_full +SELECT + number, + number, + toString(number) +FROM numbers(500); + +SELECT name, column, serialization_kind +FROM system.parts_columns WHERE table = 't_sparse_full' AND database = currentDatabase() AND active +ORDER BY name, column; + +SELECT id, u FROM t_sparse_full ORDER BY id, u LIMIT 4; +SELECT '======'; +SELECT id, u FROM t_sparse_full ORDER BY id, u LIMIT 4 SETTINGS optimize_read_in_order = 0; +SELECT '======'; +SELECT id, u, s FROM t_sparse_full ORDER BY u DESC LIMIT 3; +SELECT '======'; +SELECT id, u, s FROM t_sparse_full WHERE u != 0 ORDER BY u DESC LIMIT 3; +SELECT '======'; +SELECT id % 3 AS k, sum(u) FROM t_sparse_full WHERE u != 0 GROUP BY k ORDER BY k; +SELECT '======'; +SELECT uniqExact(u) FROM t_sparse_full WHERE s != ''; +SELECT '======'; +SELECT toUInt32(s) % 5 AS k, groupUniqArray(u % 4) FROM t_sparse_full WHERE s != '' GROUP BY k ORDER BY k; +SELECT max(range(id % 10)[u]) FROM t_sparse_full; +SELECT '======'; +SELECT id, u, s FROM remote('127.0.0.{1,2}', currentDatabase(), t_sparse_full) ORDER BY id LIMIT 5; +SELECT '======'; +SELECT sum(u) FROM t_sparse_full GROUP BY id % 3 AS k WITH TOTALS ORDER BY k; +SELECT '======'; +SELECT sum(u) FROM t_sparse_full GROUP BY id % 3 AS k WITH ROLLUP ORDER BY k; +SELECT '======'; +SELECT sum(u) FROM t_sparse_full GROUP BY id % 3 AS k WITH CUBE ORDER BY k; +SELECT '======'; +SELECT sum(id) FROM t_sparse_full GROUP BY u % 3 AS k ORDER BY k; +SELECT '======'; +SELECT count() FROM t_sparse_full WHERE u % 4 = 0; +SELECT '======'; +SELECT count() FROM t_sparse_full WHERE u IN (SELECT u FROM t_sparse_full WHERE id % 4 = 2); +SELECT '======'; +SELECT DISTINCT u FROM t_sparse_full ORDER BY id LIMIT 5; + +SELECT '======'; + +SELECT id, u, s FROM t_sparse_full INNER JOIN +( + SELECT number * 3 AS u FROM numbers(10) +) AS t1 USING(u) ORDER BY id, u, s LIMIT 5; + +SELECT '======'; + +SELECT id, u, s FROM t_sparse_full FULL JOIN +( + SELECT number * 3 AS u FROM numbers(10) +) AS t1 USING(u) ORDER BY id, u, s LIMIT 5; + +SELECT '======'; + +SELECT id, u, s FROM (SELECT number * 2 AS u FROM numbers(10)) AS t1 +INNER JOIN t_sparse_full USING(u) ORDER BY id, u, s LIMIT 5; + +SELECT '======'; + +SELECT id, u, s FROM (SELECT number * 2 AS u FROM numbers(10)) AS t1 +FULL JOIN t_sparse_full USING(u) ORDER BY id LIMIT 5; + +SELECT '======'; + +SELECT id, u, s FROM (SELECT u FROM t_sparse_full) AS t1 +FULL JOIN t_sparse_full USING(u) ORDER BY id, u, s LIMIT 5; + +SYSTEM START MERGES t_sparse_full; + +OPTIMIZE TABLE t_sparse_full FINAL; + +SELECT '======'; + +SELECT column, serialization_kind +FROM system.parts_columns WHERE table = 't_sparse_full' AND database = currentDatabase() AND active +ORDER BY name, column; + +SELECT '======'; + +SELECT id, u, s FROM t_sparse_full ORDER BY u DESC LIMIT 3; + +DROP TABLE t_sparse_full; diff --git a/tests/queries/0_stateless/01780_column_sparse_pk.reference b/tests/queries/0_stateless/01780_column_sparse_pk.reference new file mode 100644 index 00000000000..11bb0471689 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_pk.reference @@ -0,0 +1,15 @@ +2 +2 e +0 a +0 b +3 f +200 84 
+200 84 +800 167 +800 167 +\N +\N +\N +[] +[] +[] diff --git a/tests/queries/0_stateless/01780_column_sparse_pk.sql b/tests/queries/0_stateless/01780_column_sparse_pk.sql new file mode 100644 index 00000000000..63ed9e99a87 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_pk.sql @@ -0,0 +1,43 @@ +DROP TABLE IF EXISTS t_sparse_pk; +DROP TABLE IF EXISTS t_full_pk; + +CREATE TABLE t_sparse_pk (k UInt64, s String) +ENGINE = MergeTree ORDER BY k +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.0, index_granularity = 1; + +INSERT INTO t_sparse_pk VALUES (0, 'a'), (0, 'b'), (1, ''), (2, ''), (2, 'e'), (3, 'f'), (4, 'g'); + +SET force_primary_key = 1; + +SELECT k, s FROM t_sparse_pk WHERE k = 2 ORDER BY k, s; +SELECT k, s FROM t_sparse_pk WHERE k = 0 OR k = 3 ORDER BY k, s; + +DROP TABLE IF EXISTS t_sparse_pk; + +CREATE TABLE t_sparse_pk (k UInt64, v UInt64 CODEC(NONE)) +ENGINE = MergeTree ORDER BY k +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.0, index_granularity = 30; + +CREATE TABLE t_full_pk (k UInt64, v UInt64) +ENGINE = MergeTree ORDER BY k +SETTINGS ratio_of_defaults_for_sparse_serialization = 1.1, index_granularity = 30; + +INSERT INTO t_sparse_pk SELECT number % 10, number % 4 = 0 FROM numbers(1000); +INSERT INTO t_full_pk SELECT number % 10, number % 4 = 0 FROM numbers(1000); + +INSERT INTO t_sparse_pk SELECT number % 10, number % 6 = 0 FROM numbers(1000); +INSERT INTO t_full_pk SELECT number % 10, number % 6 = 0 FROM numbers(1000); + +SELECT count(v), sum(v) FROM t_sparse_pk WHERE k = 0; +SELECT count(v), sum(v) FROM t_full_pk WHERE k = 0; + +SELECT count(v), sum(v) FROM t_sparse_pk WHERE k = 0 OR k = 3 OR k = 7 OR k = 8; +SELECT count(v), sum(v) FROM t_full_pk WHERE k = 0 OR k = 3 OR k = 7 OR k = 8; + +SET force_primary_key = 0; + +SELECT (k = NULL) OR (k = 1000) FROM t_sparse_pk LIMIT 3; +SELECT range(k) FROM t_sparse_pk LIMIT 3; + +DROP TABLE IF EXISTS t_sparse_pk; +DROP TABLE IF EXISTS t_full_pk; diff --git a/tests/queries/0_stateless/01780_column_sparse_tuple.reference b/tests/queries/0_stateless/01780_column_sparse_tuple.reference new file mode 100644 index 00000000000..22337838cff --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_tuple.reference @@ -0,0 +1,66 @@ +id [] [] [] +t ['a','s'] ['UInt64','String'] ['Sparse','Default'] +(0,'a') +(0,'aa') +(0,'aaa') +(0,'aaaa') +(0,'aaaaa') +(20,'a') +(40,'a') +(60,'a') +(80,'a') +(100,'a') +(20,'a') +(40,'a') +(60,'a') +(80,'a') +(100,'a') +0 +0 +0 +0 +0 +20 +40 +60 +80 +100 +20 +40 +60 +80 +100 +a +aa +aaa +aaaa +aaaaa +a +a +a +a +a +id [] [] [] +t ['a','b','b.u','b.s'] ['UInt64','Tuple(u UInt32, s String)','UInt32','String'] ['Sparse','Default','Sparse','Default'] +0 +0 +0 +60 +0 +a +aa +aaa +aaaa +aaaaa +aaaaaa +a +aaaaaa +a +aaaaaa +id [] [] [] +t ['a','b','b.u','b.s'] ['UInt64','Tuple(u UInt32, s String)','UInt32','String'] ['Sparse','Default','Sparse','Default'] +aaaaaa +a +aaaaaa +a +aaaaaa diff --git a/tests/queries/0_stateless/01780_column_sparse_tuple.sql b/tests/queries/0_stateless/01780_column_sparse_tuple.sql new file mode 100644 index 00000000000..da679f2c7eb --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_tuple.sql @@ -0,0 +1,53 @@ +DROP TABLE IF EXISTS sparse_tuple; + +CREATE TABLE sparse_tuple (id UInt64, t Tuple(a UInt64, s String)) +ENGINE = MergeTree ORDER BY tuple() +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.5; + +INSERT INTO sparse_tuple SELECT number, (if (number % 20 = 0, number, 0), repeat('a', number % 10 + 1)) FROM 
numbers(1000); + +SELECT column, subcolumns.names, subcolumns.types, subcolumns.serializations +FROM system.parts_columns +WHERE table = 'sparse_tuple' AND database = currentDatabase() +ORDER BY column; + +SELECT t FROM sparse_tuple ORDER BY id LIMIT 5; +SELECT t FROM sparse_tuple WHERE t.a != 0 ORDER BY id LIMIT 5; +SELECT t FROM sparse_tuple WHERE t.a != 0 ORDER BY t.a LIMIT 5; + +SELECT t.a FROM sparse_tuple ORDER BY id LIMIT 5; +SELECT t.a FROM sparse_tuple WHERE t.a != 0 ORDER BY id LIMIT 5; +SELECT t.a FROM sparse_tuple WHERE t.a != 0 ORDER BY t.a LIMIT 5; + +SELECT t.s FROM sparse_tuple ORDER BY id LIMIT 5; +SELECT t.s FROM sparse_tuple WHERE t.a != 0 ORDER BY id LIMIT 5; + +DROP TABLE IF EXISTS sparse_tuple; + +CREATE TABLE sparse_tuple (id UInt64, t Tuple(a UInt64, b Tuple(u UInt32, s String))) +ENGINE = MergeTree ORDER BY tuple() +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.5; + +INSERT INTO sparse_tuple SELECT number, (if (number % 20 = 0, number, 0), (if (number % 15 = 0, number, 0), repeat('a', number % 10 + 1))) FROM numbers(1000); + +SELECT column, subcolumns.names, subcolumns.types, subcolumns.serializations +FROM system.parts_columns +WHERE table = 'sparse_tuple' AND database = currentDatabase() +ORDER BY column; + +SELECT t.a FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5; + +SELECT t.b.s FROM sparse_tuple ORDER BY id LIMIT 5; +SELECT t.b.s FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5; + +DETACH TABLE sparse_tuple; +ATTACH TABLE sparse_tuple; + +SELECT column, subcolumns.names, subcolumns.types, subcolumns.serializations +FROM system.parts_columns +WHERE table = 'sparse_tuple' AND database = currentDatabase() +ORDER BY column; + +SELECT t.b.s FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5; + +DROP TABLE IF EXISTS sparse_tuple; diff --git a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh index a2945de5b0c..c5aaa794ac9 100755 --- a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh +++ b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh @@ -20,7 +20,7 @@ $CLICKHOUSE_CLIENT -nm -q """ insert into data_01810 select * from numbers(50); drop table data_01810 settings log_queries=1; system flush logs; - select throwIf(length(thread_ids)<50) from system.query_log where event_date = today() and current_database = currentDatabase() and query = 'drop table data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; + select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; """ # ReplicatedMergeTree @@ -31,7 +31,7 @@ $CLICKHOUSE_CLIENT -nm -q """ insert into rep_data_01810 select * from numbers(50); drop table rep_data_01810 settings log_queries=1; system flush logs; - select throwIf(length(thread_ids)<50) from system.query_log where event_date = today() and current_database = currentDatabase() and query = 'drop table rep_data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; + select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table rep_data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; """ $CLICKHOUSE_CLIENT -nm -q "drop database ordinary_$CLICKHOUSE_DATABASE" diff --git 
a/tests/queries/0_stateless/01814_distributed_push_down_limit.sh b/tests/queries/0_stateless/01814_distributed_push_down_limit.sh index 81ed4568092..1412ea3be65 100755 --- a/tests/queries/0_stateless/01814_distributed_push_down_limit.sh +++ b/tests/queries/0_stateless/01814_distributed_push_down_limit.sh @@ -69,7 +69,7 @@ function test_distributed_push_down_limit_with_query_log() system flush logs; select read_rows from system.query_log where - event_date = today() + event_date >= yesterday() and query_kind = 'Select' /* exclude DESC TABLE */ and initial_query_id = '$query_id' and initial_query_id != query_id; " | xargs # convert new lines to spaces diff --git a/tests/queries/0_stateless/01821_to_date_time_ubsan.reference b/tests/queries/0_stateless/01821_to_date_time_ubsan.reference index e69de29bb2d..0a762ec3b77 100644 --- a/tests/queries/0_stateless/01821_to_date_time_ubsan.reference +++ b/tests/queries/0_stateless/01821_to_date_time_ubsan.reference @@ -0,0 +1,2 @@ +2283-11-11 23:48:05.4775806 +2283-11-11 23:52:48.54775806 diff --git a/tests/queries/0_stateless/01821_to_date_time_ubsan.sql b/tests/queries/0_stateless/01821_to_date_time_ubsan.sql index 74226fc221f..377291e015f 100644 --- a/tests/queries/0_stateless/01821_to_date_time_ubsan.sql +++ b/tests/queries/0_stateless/01821_to_date_time_ubsan.sql @@ -1,2 +1,2 @@ -SELECT toDateTime('9223372036854775806', 7); -- { serverError 407 } -SELECT toDateTime('9223372036854775806', 8); -- { serverError 407 } +SELECT toDateTime('9223372036854775806', 7, 'Europe/Moscow'); +SELECT toDateTime('9223372036854775806', 8, 'Europe/Moscow'); diff --git a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect index 3003a0de42d..d5ce4c3cbf2 100755 --- a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect +++ b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect @@ -1,14 +1,14 @@ #!/usr/bin/expect -f -# Tags: long, no-fasttest +# Tags: long log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01927_query_views_log_current_database.reference b/tests/queries/0_stateless/01927_query_views_log_current_database.reference index ff9eca2d97f..eaa1e98c55c 100644 --- a/tests/queries/0_stateless/01927_query_views_log_current_database.reference +++ b/tests/queries/0_stateless/01927_query_views_log_current_database.reference @@ -1,70 +1,94 @@ Row 1: ────── -stage: Query log rows -read_rows: 100 -written_rows: 201 -databases: ['_table_function','default'] -tables: ['_table_function.numbers','default.table_a','default.table_b','default.table_b_live_view','default.table_c'] -views: ['default.matview_a_to_b','default.matview_b_to_c','default.table_b_live_view'] -sleep_calls: 200 -sleep_us: 298 +stage: Query log rows +read_rows: 400 +written_rows: 201 +databases: ['_table_function','default'] +tables: ['_table_function.numbers','default.table_a','default.table_b','default.table_b_live_view','default.table_c'] +views: ['default.matview_a_to_b','default.matview_b_to_c','default.table_b_live_view'] +sleep_calls: 200 +sleep_us: 298 +profile_select_rows: 400 +profile_select_bytes: 5200 +profile_insert_rows: 201 
+profile_insert_bytes: 2808 Row 1: ────── -stage: Depending views -view_name: default.matview_a_to_b -view_type: Materialized -status: QueryFinish -view_target: default.table_b -view_query: SELECT toFloat64(a) AS a, b + sleepEachRow(0.000001) AS count FROM default.table_a -read_rows: 100 -written_rows: 100 -sleep_calls: 100 -sleep_us: 99 +stage: Depending views +view_name: default.matview_a_to_b +view_type: Materialized +status: QueryFinish +view_target: default.table_b +view_query: SELECT toFloat64(a) AS a, b + sleepEachRow(0.000001) AS count FROM default.table_a +read_rows: 100 +written_rows: 100 +sleep_calls: 100 +sleep_us: 99 +profile_select_rows: 100 +profile_select_bytes: 2000 +profile_insert_rows: 100 +profile_insert_bytes: 800 Row 2: ────── -stage: Depending views -view_name: default.matview_b_to_c -view_type: Materialized -status: QueryFinish -view_target: default.table_c -view_query: SELECT sum(a + sleepEachRow(0.000002)) AS a FROM default.table_b -read_rows: 100 -written_rows: 1 -sleep_calls: 100 -sleep_us: 199 +stage: Depending views +view_name: default.matview_b_to_c +view_type: Materialized +status: QueryFinish +view_target: default.table_c +view_query: SELECT sum(a + sleepEachRow(0.000002)) AS a FROM default.table_b +read_rows: 100 +written_rows: 1 +sleep_calls: 100 +sleep_us: 199 +profile_select_rows: 100 +profile_select_bytes: 800 +profile_insert_rows: 1 +profile_insert_bytes: 8 Row 3: ────── -stage: Depending views -view_name: default.table_b_live_view -view_type: Live -status: QueryFinish -view_target: default.table_b_live_view -view_query: SELECT sum(a + b) FROM default.table_b -read_rows: 100 -written_rows: 0 -sleep_calls: 0 -sleep_us: 0 +stage: Depending views +view_name: default.table_b_live_view +view_type: Live +status: QueryFinish +view_target: default.table_b_live_view +view_query: SELECT sum(a + b) FROM default.table_b +read_rows: 100 +written_rows: 0 +sleep_calls: 0 +sleep_us: 0 +profile_select_rows: 100 +profile_select_bytes: 1600 +profile_insert_rows: 0 +profile_insert_bytes: 0 Row 1: ────── -stage: Query log rows 2 -read_rows: 50 -written_rows: 100 -databases: ['_table_function','default'] -tables: ['_table_function.numbers','default.table_d','default.table_e','default.table_f'] -views: ['default.matview_join_d_e'] -sleep_calls: 50 -sleep_us: 150 +stage: Query log rows 2 +read_rows: 100 +written_rows: 100 +databases: ['_table_function','default'] +tables: ['_table_function.numbers','default.table_d','default.table_e','default.table_f'] +views: ['default.matview_join_d_e'] +sleep_calls: 50 +sleep_us: 150 +profile_select_rows: 100 +profile_select_bytes: 800 +profile_insert_rows: 100 +profile_insert_bytes: 1600 Row 1: ────── -stage: Depending views 2 -view_name: default.matview_join_d_e -view_type: Materialized -status: QueryFinish -view_target: default.table_f -view_query: SELECT table_d.a AS a, table_e.count + sleepEachRow(0.000003) AS count FROM default.table_d LEFT JOIN default.table_e ON table_d.a = table_e.a -read_rows: 50 -written_rows: 50 -sleep_calls: 50 -sleep_us: 150 +stage: Depending views 2 +view_name: default.matview_join_d_e +view_type: Materialized +status: QueryFinish +view_target: default.table_f +view_query: SELECT table_d.a AS a, table_e.count + sleepEachRow(0.000003) AS count FROM default.table_d LEFT JOIN default.table_e ON table_d.a = table_e.a +read_rows: 50 +written_rows: 50 +sleep_calls: 50 +sleep_us: 150 +profile_select_rows: 50 +profile_select_bytes: 400 +profile_insert_rows: 50 +profile_insert_bytes: 800 diff --git 
a/tests/queries/0_stateless/01927_query_views_log_current_database.sql b/tests/queries/0_stateless/01927_query_views_log_current_database.sql index 40ab8c8e16a..fbfbeab0167 100644 --- a/tests/queries/0_stateless/01927_query_views_log_current_database.sql +++ b/tests/queries/0_stateless/01927_query_views_log_current_database.sql @@ -45,7 +45,11 @@ SELECT arraySort(tables) as tables, arraySort(views) as views, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_log WHERE query like '-- INSERT 1%INSERT INTO table_a%' AND current_database = currentDatabase() @@ -62,7 +66,11 @@ SELECT read_rows, written_rows, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_views_log WHERE initial_query_id = ( @@ -85,7 +93,11 @@ SELECT arraySort(tables) as tables, arraySort(views) as views, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_log WHERE query like '-- INSERT 2%INSERT INTO table_d%' AND current_database = currentDatabase() @@ -102,7 +114,11 @@ SELECT read_rows, written_rows, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_views_log WHERE initial_query_id = ( diff --git a/tests/queries/0_stateless/01933_client_replxx_convert_history.expect b/tests/queries/0_stateless/01933_client_replxx_convert_history.expect index 664c3f06d20..c5645179ab3 100755 --- a/tests/queries/0_stateless/01933_client_replxx_convert_history.expect +++ b/tests/queries/0_stateless/01933_client_replxx_convert_history.expect @@ -1,15 +1,15 @@ #!/usr/bin/expect -f -# Tags: no-parallel, no-fasttest +# Tags: no-parallel # Tag no-parallel: Uses non unique history file log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01945_show_debug_warning.expect b/tests/queries/0_stateless/01945_show_debug_warning.expect index 402ad9a1f35..2f74b6e33ae 100755 --- 
a/tests/queries/0_stateless/01945_show_debug_warning.expect +++ b/tests/queries/0_stateless/01945_show_debug_warning.expect @@ -7,11 +7,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01950_kill_large_group_by_query.sh b/tests/queries/0_stateless/01950_kill_large_group_by_query.sh index 0b369c7257e..aba9d2d2467 100755 --- a/tests/queries/0_stateless/01950_kill_large_group_by_query.sh +++ b/tests/queries/0_stateless/01950_kill_large_group_by_query.sh @@ -12,9 +12,11 @@ function wait_for_query_to_start() } +MAX_TIMEOUT=30 + # TCP CLIENT -$CLICKHOUSE_CLIENT --max_execution_time 10 --query_id "test_01948_tcp_$CLICKHOUSE_DATABASE" -q \ +$CLICKHOUSE_CLIENT --max_execution_time $MAX_TIMEOUT --query_id "test_01948_tcp_$CLICKHOUSE_DATABASE" -q \ "SELECT * FROM ( SELECT a.name as n @@ -30,12 +32,12 @@ $CLICKHOUSE_CLIENT --max_execution_time 10 --query_id "test_01948_tcp_$CLICKHOUS LIMIT 20 FORMAT Null" > /dev/null 2>&1 & wait_for_query_to_start "test_01948_tcp_$CLICKHOUSE_DATABASE" -$CLICKHOUSE_CLIENT --max_execution_time 10 -q "KILL QUERY WHERE query_id = 'test_01948_tcp_$CLICKHOUSE_DATABASE' SYNC" +$CLICKHOUSE_CLIENT --max_execution_time $MAX_TIMEOUT -q "KILL QUERY WHERE query_id = 'test_01948_tcp_$CLICKHOUSE_DATABASE' SYNC" # HTTP CLIENT -${CLICKHOUSE_CURL_COMMAND} -q --max-time 10 -sS "$CLICKHOUSE_URL&query_id=test_01948_http_$CLICKHOUSE_DATABASE" -d \ +${CLICKHOUSE_CURL_COMMAND} -q --max-time $MAX_TIMEOUT -sS "$CLICKHOUSE_URL&query_id=test_01948_http_$CLICKHOUSE_DATABASE" -d \ "SELECT * FROM ( SELECT a.name as n @@ -51,4 +53,4 @@ ${CLICKHOUSE_CURL_COMMAND} -q --max-time 10 -sS "$CLICKHOUSE_URL&query_id=test_0 LIMIT 20 FORMAT Null" > /dev/null 2>&1 & wait_for_query_to_start "test_01948_http_$CLICKHOUSE_DATABASE" -$CLICKHOUSE_CURL --max-time 10 -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = 'test_01948_http_$CLICKHOUSE_DATABASE' SYNC" +$CLICKHOUSE_CURL --max-time $MAX_TIMEOUT -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = 'test_01948_http_$CLICKHOUSE_DATABASE' SYNC" diff --git a/tests/queries/0_stateless/01999_grant_with_replace.reference b/tests/queries/0_stateless/01999_grant_with_replace.reference index 9e089a05e52..740c55d5325 100644 --- a/tests/queries/0_stateless/01999_grant_with_replace.reference +++ b/tests/queries/0_stateless/01999_grant_with_replace.reference @@ -13,7 +13,7 @@ GRANT SELECT(cola) ON db5.table TO test_user_01999 GRANT INSERT(colb) ON db6.tb61 TO test_user_01999 GRANT SHOW ON db7.* TO test_user_01999 F -GRANT SELECT ON all.* TO test_user_01999 +GRANT SELECT ON `all`.* TO test_user_01999 G H GRANT SELECT ON db1.tb1 TO test_user_01999 diff --git a/tests/queries/0_stateless/02003_memory_limit_in_client.expect b/tests/queries/0_stateless/02003_memory_limit_in_client.expect index 47ac4926537..29701f49746 100755 --- a/tests/queries/0_stateless/02003_memory_limit_in_client.expect +++ b/tests/queries/0_stateless/02003_memory_limit_in_client.expect @@ -8,11 +8,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } 
} set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02004_invalid_partition_mutation_stuck.sql b/tests/queries/0_stateless/02004_invalid_partition_mutation_stuck.sql index 481a5565095..71c8b9af652 100644 --- a/tests/queries/0_stateless/02004_invalid_partition_mutation_stuck.sql +++ b/tests/queries/0_stateless/02004_invalid_partition_mutation_stuck.sql @@ -28,6 +28,6 @@ PARTITION BY p ORDER BY t SETTINGS number_of_free_entries_in_pool_to_execute_mutation=0; INSERT INTO data VALUES (1, now()); -ALTER TABLE data MATERIALIZE INDEX idx IN PARTITION ID 'NO_SUCH_PART'; -- { serverError 341 } +ALTER TABLE data MATERIALIZE INDEX idx IN PARTITION ID 'NO_SUCH_PART'; -- { serverError 248 } ALTER TABLE data MATERIALIZE INDEX idx IN PARTITION ID '1'; ALTER TABLE data MATERIALIZE INDEX idx IN PARTITION ID '2'; diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.reference b/tests/queries/0_stateless/02006_test_positional_arguments.reference index 7b75ab43430..5fc070ffd0b 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.reference +++ b/tests/queries/0_stateless/02006_test_positional_arguments.reference @@ -46,22 +46,6 @@ select x1, x2, x3 from test order by 3 limit 1 by 1; 100 100 1 10 1 10 1 10 100 -explain syntax select x3, x2, x1 from test order by 1 + 1; -SELECT - x3, - x2, - x1 -FROM test -ORDER BY x3 + x3 ASC -explain syntax select x3, x2, x1 from test order by (1 + 1) * 3; -SELECT - x3, - x2, - x1 -FROM test -ORDER BY (x3 + x3) * x1 ASC -select x2, x1 from test group by x2 + x1; -- { serverError 215 } -select x2, x1 from test group by 1 + 2; -- { serverError 215 } explain syntax select x3, x2, x1 from test order by 1; SELECT x3, @@ -110,27 +94,6 @@ GROUP BY x2 select max(x1), x2 from test group by 1, 2; -- { serverError 43 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43 } -select x1 + x2, x3 from test group by x1 + x2, x3; -11 100 -200 1 -11 200 -11 10 -select x3, x2, x1 from test order by x3 * 2, x2, x1; -- check x3 * 2 does not become x3 * x2 -1 100 100 -1 100 100 -10 1 10 -100 10 1 -200 1 10 -200 10 1 -explain syntax select x1, x3 from test group by 1 + 2, 1, 2; -SELECT - x1, - x3 -FROM test -GROUP BY - x1 + x3, - x1, - x3 explain syntax select x1 + x3, x3 from test group by 1, 2; SELECT x1 + x3, @@ -152,3 +115,5 @@ SELECT 1 + 1 AS a GROUP BY a select substr('aaaaaaaaaaaaaa', 8) as a group by a; aaaaaaa +select substr('aaaaaaaaaaaaaa', 8) as a group by substr('aaaaaaaaaaaaaa', 8); +aaaaaaa diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.sql b/tests/queries/0_stateless/02006_test_positional_arguments.sql index 3ba01b47efa..3a2cf76f6c4 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.sql +++ b/tests/queries/0_stateless/02006_test_positional_arguments.sql @@ -22,12 +22,6 @@ select x1, x2, x3 from test order by 3 limit 1 by 3; select x1, x2, x3 from test order by x3 limit 1 by x1; select x1, x2, x3 from test order by 3 limit 1 by 1; -explain syntax select x3, x2, x1 from test order by 1 + 1; -explain syntax select x3, x2, x1 from test order by (1 + 1) * 3; - -select x2, x1 from test group by x2 + x1; -- { serverError 215 } -select x2, x1 from test group by 1 + 2; -- { serverError 215 } - explain syntax select x3, x2, x1 from test order by 1; explain syntax select x3 + 1, x2, x1 from test order by 1; explain syntax select x3, x3 - x2, x2, x1 from test order by 2; @@ -37,11 +31,7 @@ explain syntax select 1 + greatest(x1, 1), x2 from test group by 1, 2; select max(x1), x2 
from test group by 1, 2; -- { serverError 43 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43 } -select x1 + x2, x3 from test group by x1 + x2, x3; -select x3, x2, x1 from test order by x3 * 2, x2, x1; -- check x3 * 2 does not become x3 * x2 - -explain syntax select x1, x3 from test group by 1 + 2, 1, 2; explain syntax select x1 + x3, x3 from test group by 1, 2; create table test2(x1 Int, x2 Int, x3 Int) engine=Memory; @@ -52,3 +42,5 @@ select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t grou explain syntax select plus(1, 1) as a group by a; select substr('aaaaaaaaaaaaaa', 8) as a group by a; +select substr('aaaaaaaaaaaaaa', 8) as a group by substr('aaaaaaaaaaaaaa', 8); + diff --git a/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.reference b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.reference new file mode 100644 index 00000000000..8a4df1605fb --- /dev/null +++ b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.reference @@ -0,0 +1,6 @@ +127.0.0.1 IPv4 +127.0.0.1 String +2001:db8:0:85a3::ac1f:8001 IPv6 +2001:db8:0:85a3::ac1f:8001 String +0.0.0.0 IPv4 +:: IPv6 diff --git a/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.sql b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.sql new file mode 100644 index 00000000000..2fcc20b9811 --- /dev/null +++ b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.sql @@ -0,0 +1,13 @@ +SELECT CAST('127.0.0.1' as IPv4) as v, toTypeName(v); +SELECT CAST(toIPv4('127.0.0.1') as String) as v, toTypeName(v); + +SELECT CAST('2001:0db8:0000:85a3:0000:0000:ac1f:8001' as IPv6) as v, toTypeName(v); +SELECT CAST(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001') as String) as v, toTypeName(v); + +SELECT toIPv4('hello') as v, toTypeName(v); +SELECT toIPv6('hello') as v, toTypeName(v); + +SELECT CAST('hello' as IPv4) as v, toTypeName(v); -- { serverError CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING } +SELECT CAST('hello' as IPv6) as v, toTypeName(v); -- { serverError CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING } + +SELECT CAST('1.1.1.1' as IPv6) as v, toTypeName(v); -- { serverError CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING } diff --git a/tests/queries/0_stateless/02008_materialize_column.sql b/tests/queries/0_stateless/02008_materialize_column.sql index 4136a04568e..8a8eb2afe83 100644 --- a/tests/queries/0_stateless/02008_materialize_column.sql +++ b/tests/queries/0_stateless/02008_materialize_column.sql @@ -5,6 +5,8 @@ SET mutations_sync = 2; CREATE TABLE tmp (x Int64) ENGINE = MergeTree() ORDER BY tuple() PARTITION BY tuple(); INSERT INTO tmp SELECT * FROM system.numbers LIMIT 20; +ALTER TABLE tmp MATERIALIZE COLUMN x; -- { serverError 36 } + ALTER TABLE tmp ADD COLUMN s String DEFAULT toString(x); SELECT groupArray(x), groupArray(s) FROM tmp; diff --git a/tests/queries/0_stateless/02010_lc_native.python b/tests/queries/0_stateless/02010_lc_native.python index 56e981555f3..71965512e64 100755 --- a/tests/queries/0_stateless/02010_lc_native.python +++ b/tests/queries/0_stateless/02010_lc_native.python @@ -302,11 +302,44 @@ def insertLowCardinalityRowWithIncorrectDictType(): print(readException(s)) s.close() +def insertLowCardinalityRowWithIncorrectAdditionalKeys(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(30) + s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT)) + sendHello(s) + receiveHello(s) + sendQuery(s, 'insert into {}.tab format TSV'.format(CLICKHOUSE_DATABASE)) + + # external tables + sendEmptyBlock(s) 
+ readHeader(s) + + # Data + ba = bytearray() + writeVarUInt(2, ba) # Data + writeStringBinary('', ba) + serializeBlockInfo(ba) + writeVarUInt(1, ba) # rows + writeVarUInt(1, ba) # columns + writeStringBinary('x', ba) + writeStringBinary('LowCardinality(String)', ba) + ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys + ba.extend([3, 0] + [0] * 6) # indexes type: UInt64 [3], with NO additional keys [0] + ba.extend([1] + [0] * 7) # num_keys in dict + writeStringBinary('hello', ba) # key + ba.extend([1] + [0] * 7) # num_indexes + ba.extend([0] * 8) # UInt64 index (0 for 'hello') + s.sendall(ba) + + assertPacket(readVarUInt(s), 2) + print(readException(s)) + s.close() def main(): insertValidLowCardinalityRow() insertLowCardinalityRowWithIndexOverflow() insertLowCardinalityRowWithIncorrectDictType() + insertLowCardinalityRowWithIncorrectAdditionalKeys() if __name__ == "__main__": main() diff --git a/tests/queries/0_stateless/02010_lc_native.reference b/tests/queries/0_stateless/02010_lc_native.reference index 0167f05c952..bbf0c9c025d 100644 --- a/tests/queries/0_stateless/02010_lc_native.reference +++ b/tests/queries/0_stateless/02010_lc_native.reference @@ -6,3 +6,6 @@ code 117: Index for LowCardinality is out of range. Dictionary size is 1, but f Rows 0 Columns 1 Column x type LowCardinality(String) code 117: LowCardinality indexes serialization type for Native format cannot use global dictionary +Rows 0 Columns 1 +Column x type LowCardinality(String) +code 117: No additional keys found. diff --git a/tests/queries/0_stateless/02015_global_in_threads.sh b/tests/queries/0_stateless/02015_global_in_threads.sh index c112e47fe92..9437187d462 100755 --- a/tests/queries/0_stateless/02015_global_in_threads.sh +++ b/tests/queries/0_stateless/02015_global_in_threads.sh @@ -6,4 +6,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ${CLICKHOUSE_CLIENT} --log_queries=1 --max_threads=32 --query_id "2015_${CLICKHOUSE_DATABASE}_query" -q "select count() from remote('127.0.0.{2,3}', numbers(10)) where number global in (select number % 5 from numbers_mt(1000000))" ${CLICKHOUSE_CLIENT} -q "system flush logs" -${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 32 from system.query_log where event_date = today() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" +${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 32 from system.query_log where event_date >= yesterday() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" diff --git a/tests/queries/0_stateless/02047_client_exception.expect b/tests/queries/0_stateless/02047_client_exception.expect index 0025afa88eb..f7d4bfb555d 100755 --- a/tests/queries/0_stateless/02047_client_exception.expect +++ b/tests/queries/0_stateless/02047_client_exception.expect @@ -1,15 +1,14 @@ #!/usr/bin/expect -f -# Tags: no-fasttest log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02048_views_with_comment.reference b/tests/queries/0_stateless/02048_views_with_comment.reference new file mode 100644 index 00000000000..ad9817ad120 --- /dev/null +++ b/tests/queries/0_stateless/02048_views_with_comment.reference @@ -0,0 +1,3 @@ 
+live_view_comment_test LiveView live view +materialized_view_comment_test MaterializedView materialized view +view_comment_test View simple view diff --git a/tests/queries/0_stateless/02048_views_with_comment.sql b/tests/queries/0_stateless/02048_views_with_comment.sql new file mode 100644 index 00000000000..a7c991d119b --- /dev/null +++ b/tests/queries/0_stateless/02048_views_with_comment.sql @@ -0,0 +1,12 @@ +-- Make sure that any kind of `VIEW` can be created with a `COMMENT` clause +-- and value of that clause is visible as `comment` column of `system.tables` table. + +CREATE VIEW view_comment_test AS (SELECT 1) COMMENT 'simple view'; +CREATE MATERIALIZED VIEW materialized_view_comment_test TO test1 (a UInt64) AS (SELECT 1) COMMENT 'materialized view'; + +SET allow_experimental_live_view=1; +CREATE LIVE VIEW live_view_comment_test AS (SELECT 1) COMMENT 'live view'; + +SYSTEM FLUSH LOGS; + +SELECT name, engine, comment FROM system.tables WHERE database == currentDatabase() ORDER BY name; diff --git a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect index 17b98b077d5..ffa25b964db 100755 --- a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect +++ b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect @@ -1,16 +1,14 @@ #!/usr/bin/expect -f -# Tags: no-fasttest log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } - + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02050_client_profile_events.reference b/tests/queries/0_stateless/02050_client_profile_events.reference index 00fc3b5d06a..2451417ddf0 100644 --- a/tests/queries/0_stateless/02050_client_profile_events.reference +++ b/tests/queries/0_stateless/02050_client_profile_events.reference @@ -1,4 +1,5 @@ 0 -SelectedRows: 131010 (increment) +100000 +[ 0 ] SelectedRows: 131010 (increment) OK OK diff --git a/tests/queries/0_stateless/02050_client_profile_events.sh b/tests/queries/0_stateless/02050_client_profile_events.sh index 5c3887cf5fb..459e8505e22 100755 --- a/tests/queries/0_stateless/02050_client_profile_events.sh +++ b/tests/queries/0_stateless/02050_client_profile_events.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash -# Tags: long CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -7,9 +6,11 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # do not print any ProfileEvents packets $CLICKHOUSE_CLIENT -q 'select * from numbers(1e5) format Null' |& grep -c 'SelectedRows' -# print only last -$CLICKHOUSE_CLIENT --print-profile-events --profile-events-delay-ms=-1 -q 'select * from numbers(1e5) format Null' |& grep -o 'SelectedRows: .*$' +# print only last (and also number of rows to provide more info in case of failures) +$CLICKHOUSE_CLIENT --print-profile-events --profile-events-delay-ms=-1 -q 'select * from numbers(1e5)' 2> >(grep -o -e '\[ 0 \] SelectedRows: .*$' -e Exception) 1> >(wc -l) # print everything -test "$($CLICKHOUSE_CLIENT --print-profile-events -q 'select * from numbers(1e9) format Null' |& grep -c 'SelectedRows')" -gt 1 && echo OK || echo FAIL +profile_events="$($CLICKHOUSE_CLIENT --max_block_size 1 --print-profile-events -q 'select sleep(1) from numbers(2) format Null' |& grep -c 'SelectedRows')" +test "$profile_events" -gt 1 && 
echo OK || echo "FAIL ($profile_events)" # print each 100 ms -test "$($CLICKHOUSE_CLIENT --print-profile-events --profile-events-delay-ms=100 -q 'select * from numbers(1e9) format Null' |& grep -c 'SelectedRows')" -gt 1 && echo OK || echo FAIL +profile_events="$($CLICKHOUSE_CLIENT --max_block_size 1 --print-profile-events --profile-events-delay-ms=100 -q 'select sleep(1) from numbers(2) format Null' |& grep -c 'SelectedRows')" +test "$profile_events" -gt 1 && echo OK || echo "FAIL ($profile_events)" diff --git a/tests/queries/0_stateless/02098_date32_comparison.sql b/tests/queries/0_stateless/02098_date32_comparison.sql index 5fd7172e0bb..b35191e58ed 100644 --- a/tests/queries/0_stateless/02098_date32_comparison.sql +++ b/tests/queries/0_stateless/02098_date32_comparison.sql @@ -1,19 +1,19 @@ -select toDate32('1990-01-01') = toDate('1990-01-01'); -select toDate('1991-01-02') > toDate32('1990-01-01'); -select toDate32('1925-01-01') <= toDate('1990-01-01'); -select toDate('1991-01-01') < toDate32('2283-11-11'); -select toDate32('1990-01-01') = toDateTime('1990-01-01'); -select toDateTime('1991-01-02') > toDate32('1990-01-01'); -select toDate32('1925-01-01') <= toDateTime('1990-01-01'); -select toDateTime('1991-01-01') < toDate32('2283-11-11'); -select toDate32('1990-01-01') = toDateTime64('1990-01-01',2); -select toDateTime64('1991-01-02',2) > toDate32('1990-01-01'); -select toDate32('1925-01-01') = toDateTime64('1925-01-01',2); -select toDateTime64('1925-01-02',2) > toDate32('1925-01-01'); -select toDate32('2283-11-11') = toDateTime64('2283-11-11',2); -select toDateTime64('2283-11-11',2) > toDate32('1925-01-01'); -select toDate32('1990-01-01') = '1990-01-01'; -select '1991-01-02' > toDate32('1990-01-01'); -select toDate32('1925-01-01') = '1925-01-01'; -select '2283-11-11' >= toDate32('2283-11-10'); -select '2283-11-11' > toDate32('1925-01-01'); \ No newline at end of file +select toDate32('1990-02-01') = toDate('1990-02-01'); +select toDate('1991-01-02') > toDate32('1990-02-01'); +select toDate32('1925-02-01') <= toDate('1990-02-01'); +select toDate('1991-02-01') < toDate32('2283-11-11'); +select toDate32('1990-02-01') = toDateTime('1990-02-01'); +select toDateTime('1991-01-02') > toDate32('1990-02-01'); +select toDate32('1925-02-01') <= toDateTime('1990-02-01'); +select toDateTime('1991-02-01') < toDate32('2283-11-11'); +select toDate32('1990-02-01') = toDateTime64('1990-02-01',2); +select toDateTime64('1991-01-02',2) > toDate32('1990-02-01'); +select toDate32('1925-02-01') = toDateTime64('1925-02-01',2); +select toDateTime64('1925-02-02',2) > toDate32('1925-02-01'); +select toDate32('2283-11-11') = toDateTime64('2283-11-11',2); +select toDateTime64('2283-11-11',2) > toDate32('1925-02-01'); +select toDate32('1990-02-01') = '1990-02-01'; +select '1991-01-02' > toDate32('1990-02-01'); +select toDate32('1925-02-01') = '1925-02-01'; +select '2283-11-11' >= toDate32('2283-11-10'); +select '2283-11-11' > toDate32('1925-02-01'); diff --git a/tests/queries/0_stateless/02101_avro_union_index_out_of_boundary.reference b/tests/queries/0_stateless/02101_avro_union_index_out_of_boundary.reference new file mode 100644 index 00000000000..33702ab4186 --- /dev/null +++ b/tests/queries/0_stateless/02101_avro_union_index_out_of_boundary.reference @@ -0,0 +1 @@ +index out of boundary diff --git a/tests/queries/0_stateless/02101_avro_union_index_out_of_boundary.sh b/tests/queries/0_stateless/02101_avro_union_index_out_of_boundary.sh new file mode 100755 index 00000000000..1e9c49b8963 --- /dev/null +++ 
b/tests/queries/0_stateless/02101_avro_union_index_out_of_boundary.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +set -e + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +DATA_DIR=$CUR_DIR/data_avro +cat "$DATA_DIR"/nested_complex_incorrect_data.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "\"b.b2_null_str\" Nullable(String)" -q 'select * from table;' 2>&1 | grep -i 'index out of boundary' -o diff --git a/tests/queries/0_stateless/02105_backslash_letter_commands.expect b/tests/queries/0_stateless/02105_backslash_letter_commands.expect index 9c6f3e10227..e67d60912fa 100755 --- a/tests/queries/0_stateless/02105_backslash_letter_commands.expect +++ b/tests/queries/0_stateless/02105_backslash_letter_commands.expect @@ -1,14 +1,13 @@ #!/usr/bin/expect -f -# Tags: no-fasttest log_user 0 set timeout 02 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect index b676c221c65..0abe25e60f4 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect @@ -1,27 +1,28 @@ #!/usr/bin/expect -f -# Tags: no-parallel, no-fasttest +# Tags: no-parallel log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } - -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_prepare.sh" - set basedir [file dirname $argv0] -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --interactive --queries-file \$CURDIR/file_02112" + +system "$basedir/helpers/02112_prepare.sh" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --interactive --queries-file $basedir/file_02112" expect ":) " send -- "select * from t format TSV\r" expect "1" expect ":) " -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_clean.sh" +send "" +expect eof +system "$basedir/helpers/02112_clean.sh" diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect index cd42388c099..c846464b011 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect @@ -1,15 +1,14 @@ #!/usr/bin/expect -f -# Tags: no-fasttest log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect index f0aef1550c3..c64f149a93c 100755 --- 
a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect @@ -1,27 +1,28 @@ #!/usr/bin/expect -f -# Tags: no-parallel, no-fasttest +# Tags: no-parallel log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } - -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_prepare.sh" - set basedir [file dirname $argv0] -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --queries-file \$CURDIR/file_02112" + +system "$basedir/helpers/02112_prepare.sh" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --queries-file $basedir/file_02112" expect ":) " send -- "select * from t format TSV\r" expect "1" expect ":) " -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_clean.sh" +send "" +expect eof +system "$basedir/helpers/02112_clean.sh" diff --git a/tests/queries/0_stateless/02113_base64encode_trailing_bytes.reference b/tests/queries/0_stateless/02113_base64encode_trailing_bytes.reference new file mode 100644 index 00000000000..61911f9540a --- /dev/null +++ b/tests/queries/0_stateless/02113_base64encode_trailing_bytes.reference @@ -0,0 +1,2 @@ +SELECT * FROM tabl_1 SETTINGS log_comment = ?; +SELECT * FROM tabl_2 SETTINGS log_comment = ?; diff --git a/tests/queries/0_stateless/02113_base64encode_trailing_bytes.sql b/tests/queries/0_stateless/02113_base64encode_trailing_bytes.sql new file mode 100644 index 00000000000..120055b12e2 --- /dev/null +++ b/tests/queries/0_stateless/02113_base64encode_trailing_bytes.sql @@ -0,0 +1,20 @@ +-- Tags: no-fasttest +SET log_queries=1; + +DROP TABLE IF EXISTS tabl_1; +DROP TABLE IF EXISTS tabl_2; + +CREATE TABLE tabl_1 (key String) ENGINE MergeTree ORDER BY key; +CREATE TABLE tabl_2 (key String) ENGINE MergeTree ORDER BY key; +SELECT * FROM tabl_1 SETTINGS log_comment = 'ad15a651'; +SELECT * FROM tabl_2 SETTINGS log_comment = 'ad15a651'; +SYSTEM FLUSH LOGS; + +SELECT base64Decode(base64Encode(normalizeQuery(query))) + FROM system.query_log + WHERE type = 'QueryFinish' AND log_comment = 'ad15a651' AND current_database = currentDatabase() + GROUP BY normalizeQuery(query) + ORDER BY normalizeQuery(query); + +DROP TABLE tabl_1; +DROP TABLE tabl_2; diff --git a/tests/queries/0_stateless/02113_base64encode_trailing_bytes_1.reference b/tests/queries/0_stateless/02113_base64encode_trailing_bytes_1.reference new file mode 100644 index 00000000000..8d9df2da010 --- /dev/null +++ b/tests/queries/0_stateless/02113_base64encode_trailing_bytes_1.reference @@ -0,0 +1,100 @@ +0 +1 61 +2 6161 +3 616161 +4 61616161 +5 6161616161 +6 616161616161 +7 61616161616161 +8 6161616161616161 +9 616161616161616161 +10 61616161616161616161 +11 6161616161616161616161 +12 616161616161616161616161 +13 61616161616161616161616161 +14 6161616161616161616161616161 +15 616161616161616161616161616161 +16 61616161616161616161616161616161 +17 6161616161616161616161616161616161 +18 616161616161616161616161616161616161 +19 61616161616161616161616161616161616161 +20 6161616161616161616161616161616161616161 +21 616161616161616161616161616161616161616161 +22 61616161616161616161616161616161616161616161 +23 6161616161616161616161616161616161616161616161 +24 
616161616161616161616161616161616161616161616161 +25 61616161616161616161616161616161616161616161616161 +26 6161616161616161616161616161616161616161616161616161 +27 616161616161616161616161616161616161616161616161616161 +28 61616161616161616161616161616161616161616161616161616161 +29 6161616161616161616161616161616161616161616161616161616161 +30 616161616161616161616161616161616161616161616161616161616161 +31 61616161616161616161616161616161616161616161616161616161616161 +32 6161616161616161616161616161616161616161616161616161616161616161 +33 616161616161616161616161616161616161616161616161616161616161616161 +34 61616161616161616161616161616161616161616161616161616161616161616161 +35 6161616161616161616161616161616161616161616161616161616161616161616161 +36 616161616161616161616161616161616161616161616161616161616161616161616161 +37 61616161616161616161616161616161616161616161616161616161616161616161616161 +38 6161616161616161616161616161616161616161616161616161616161616161616161616161 +39 616161616161616161616161616161616161616161616161616161616161616161616161616161 +40 61616161616161616161616161616161616161616161616161616161616161616161616161616161 +41 6161616161616161616161616161616161616161616161616161616161616161616161616161616161 +42 616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +43 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +44 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +45 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +46 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +47 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +48 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +49 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +50 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +51 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +52 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +53 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +54 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +55 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +56 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +57 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +58 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +59 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +60 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +61 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +62 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +63 
616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +64 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +65 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +66 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +67 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +68 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +69 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +70 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +71 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +72 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +73 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +74 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +75 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +76 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +77 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +78 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +79 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +80 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +81 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +82 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +83 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +84 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +85 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +86 
6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +87 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +88 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +89 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +90 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +91 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +92 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +93 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +94 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +95 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +96 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +97 61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +98 6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 +99 616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161 diff --git a/tests/queries/0_stateless/02113_base64encode_trailing_bytes_1.sql b/tests/queries/0_stateless/02113_base64encode_trailing_bytes_1.sql new file mode 100644 index 00000000000..56edf5dbf6f --- /dev/null +++ b/tests/queries/0_stateless/02113_base64encode_trailing_bytes_1.sql @@ -0,0 +1,6 @@ +-- Tags: no-fasttest + +SELECT + number, + hex(base64Decode(base64Encode(repeat('a', number)))) r +FROM numbers(100); diff --git a/tests/queries/0_stateless/02114_bool_type.sql b/tests/queries/0_stateless/02114_bool_type.sql index 4542cc68a3a..d4ea4e54028 100644 --- a/tests/queries/0_stateless/02114_bool_type.sql +++ b/tests/queries/0_stateless/02114_bool_type.sql @@ -5,7 +5,7 @@ CREATE TABLE bool_test (value Bool,f String) ENGINE = Memory; -- value column shoud have type 'Bool' SHOW CREATE TABLE bool_test; -INSERT INTO bool_test (value,f) VALUES ('false', 'test'), ('true' , 'test'), (0, 
'test'), (1, 'test'), ('FALSE', 'test'), ('TRUE', 'test'); +INSERT INTO bool_test (value,f) VALUES (false, 'test'), (true , 'test'), (0, 'test'), (1, 'test'), (FALSE, 'test'), (TRUE, 'test'); INSERT INTO bool_test (value,f) FORMAT JSONEachRow {"value":false,"f":"test"}{"value":true,"f":"test"}{"value":0,"f":"test"}{"value":1,"f":"test"} SELECT value,f FROM bool_test; diff --git a/tests/queries/0_stateless/02114_hdfs_bad_url.sh b/tests/queries/0_stateless/02114_hdfs_bad_url.sh index 5117568b67f..a05baf19e6f 100755 --- a/tests/queries/0_stateless/02114_hdfs_bad_url.sh +++ b/tests/queries/0_stateless/02114_hdfs_bad_url.sh @@ -20,7 +20,7 @@ $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('://abcd:9000/data', 'CSV', 'x UInt32' $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('abcd/', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://abcd', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs1:9000/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "NETWORK_ERROR" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('http://hdfs1:9000/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1/abcd:9000/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1@nameservice/abcd/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "NETWORK_ERROR" && echo 'OK' || echo 'FAIL'; diff --git a/tests/queries/0_stateless/02116_interactive_hello.expect b/tests/queries/0_stateless/02116_interactive_hello.expect index 1642ac91e42..e659cf8703c 100755 --- a/tests/queries/0_stateless/02116_interactive_hello.expect +++ b/tests/queries/0_stateless/02116_interactive_hello.expect @@ -1,15 +1,15 @@ #!/usr/bin/expect -f -# Tags: no-fasttest +# Tags: long log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 2b391cd292e..35de7f8e82c 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -34,7 +34,7 @@ CREATE TABLE system.numbers_mt\n(\n `number` UInt64\n)\nENGINE = SystemNumber CREATE TABLE system.one\n(\n `dummy` UInt8\n)\nENGINE = SystemOne()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.part_moves_between_shards\n(\n `database` String,\n `table` String,\n `task_name` String,\n `task_uuid` UUID,\n `create_time` DateTime,\n `part_name` String,\n `part_uuid` UUID,\n `to_shard` String,\n `dst_part_name` String,\n `update_time` DateTime,\n `state` String,\n `rollback` UInt8,\n `num_tries` UInt32,\n `last_exception` String\n)\nENGINE = SystemShardMoves()\nCOMMENT \'SYSTEM 
TABLE is built on the fly.\' CREATE TABLE system.parts\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `secondary_indices_compressed_bytes` UInt64,\n `secondary_indices_uncompressed_bytes` UInt64,\n `secondary_indices_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `projections` Array(String),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE TABLE system.parts_columns\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.parts_columns\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` 
UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `serialization_kind` String,\n `subcolumns.names` Array(String),\n `subcolumns.types` Array(String),\n `subcolumns.serializations` Array(String),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.privileges\n(\n `privilege` Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD 
FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, \'dictGet\' = 117, \'addressToLine\' = 118, \'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 125, \'MYSQL\' = 126, \'POSTGRES\' = 127),\n `aliases` Array(String),\n `level` Nullable(Enum8(\'GLOBAL\' = 0, \'DATABASE\' = 1, \'TABLE\' = 2, \'DICTIONARY\' = 3, \'VIEW\' = 4, \'COLUMN\' = 5)),\n `parent_group` Nullable(Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' 
= 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, \'dictGet\' = 117, \'addressToLine\' = 118, \'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 125, \'MYSQL\' = 126, \'POSTGRES\' = 127))\n)\nENGINE = SystemPrivileges()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.processes\n(\n `is_initial_query` UInt8,\n `user` String,\n `query_id` String,\n `address` IPv6,\n `port` UInt16,\n `initial_user` String,\n `initial_query_id` String,\n `initial_address` IPv6,\n `initial_port` UInt16,\n `interface` UInt8,\n `os_user` String,\n `client_hostname` String,\n `client_name` String,\n `client_revision` UInt64,\n `client_version_major` UInt64,\n `client_version_minor` UInt64,\n `client_version_patch` UInt64,\n `http_method` UInt8,\n `http_user_agent` String,\n `http_referer` String,\n `forwarded_for` String,\n `quota_key` String,\n `elapsed` Float64,\n `is_cancelled` UInt8,\n `read_rows` UInt64,\n `read_bytes` UInt64,\n `total_rows_approx` UInt64,\n `written_rows` UInt64,\n `written_bytes` UInt64,\n `memory_usage` Int64,\n `peak_memory_usage` Int64,\n `query` String,\n `thread_ids` Array(UInt64),\n `ProfileEvents` Map(String, UInt64),\n `Settings` Map(String, String),\n `current_database` String,\n `ProfileEvents.Names` Array(String),\n `ProfileEvents.Values` Array(UInt64),\n `Settings.Names` Array(String),\n `Settings.Values` Array(String)\n)\nENGINE = SystemProcesses()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.projection_parts\n(\n `partition` String,\n `name` String,\n `part_type` String,\n `parent_name` String,\n `parent_uuid` UUID,\n `parent_part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `parent_marks` UInt64,\n `parent_rows` UInt64,\n `parent_bytes_on_disk` UInt64,\n `parent_data_compressed_bytes` UInt64,\n `parent_data_uncompressed_bytes` UInt64,\n `parent_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` 
Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemProjectionParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' diff --git a/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh b/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh index 4a94beddbba..2deaf788ecf 100755 --- a/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh +++ b/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh @@ -18,7 +18,7 @@ function four_letter_thread() function create_drop_thread() { while true; do - num=$RANDOM + num=$(($RANDOM % 10 + 1)) $CLICKHOUSE_CLIENT --query "CREATE TABLE test_table$num (key UInt64, value1 UInt8, value2 UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_table$num', '0') ORDER BY key" sleep 0.$RANDOM $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table$num" @@ -43,5 +43,12 @@ timeout $TIMEOUT bash -c create_drop_thread 2> /dev/null & wait +for num in $(seq 1 10); do + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table$num" 2>/dev/null + while [ $? -ne 0 ]; do + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table$num" 2>/dev/null + done +done + # still alive $CLICKHOUSE_CLIENT --query "SELECT 1" diff --git a/tests/queries/0_stateless/02122_parallel_formatting.sh b/tests/queries/0_stateless/02122_parallel_formatting.sh index 8061cbe58b2..f0c24344329 100755 --- a/tests/queries/0_stateless/02122_parallel_formatting.sh +++ b/tests/queries/0_stateless/02122_parallel_formatting.sh @@ -11,14 +11,14 @@ formats="RowBinary RowBinaryWithNames RowBinaryWithNamesAndTypes XML Markdown Ve for format in ${formats}; do echo $format-1 - $CLICKHOUSE_CLIENT -q "select number, number + 1, concat('string: ', toString(number)) from numbers(200000) format $format" --output_format_parallel_formatting=0 --output_format_pretty_max_rows=1000000 | grep -v "elapsed" > $non_parallel_file - $CLICKHOUSE_CLIENT -q "select number, number + 1, concat('string: ', toString(number)) from numbers(200000) format $format" --output_format_parallel_formatting=1 --output_format_pretty_max_rows=1000000 | grep -v "elapsed" > $parallel_file + $CLICKHOUSE_CLIENT -q "select number, number + 1, concat('string: ', toString(number)) from numbers(200000) format $format" --output_format_parallel_formatting=0 --output_format_pretty_max_rows=1000000 | grep -a -v "elapsed" > $non_parallel_file + $CLICKHOUSE_CLIENT -q "select number, number + 1, concat('string: ', toString(number)) from numbers(200000) format $format" --output_format_parallel_formatting=1 --output_format_pretty_max_rows=1000000 | grep -a -v "elapsed" > $parallel_file diff $non_parallel_file $parallel_file echo $format-2 - $CLICKHOUSE_CLIENT -q "select number, number + 1, concat('string: ', toString(number)) from numbers(200000) group by number with totals limit 190000 format $format" --extremes=1 --output_format_parallel_formatting=0 --output_format_pretty_max_rows=1000000 | grep -v "elapsed" > $non_parallel_file - $CLICKHOUSE_CLIENT -q 
"select number, number + 1, concat('string: ', toString(number)) from numbers(200000) group by number with totals limit 190000 format $format" --extremes=1 --output_format_parallel_formatting=1 --output_format_pretty_max_rows=1000000 | grep -v "elapsed" > $parallel_file + $CLICKHOUSE_CLIENT -q "select number, number + 1, concat('string: ', toString(number)) from numbers(200000) group by number with totals limit 190000 format $format" --extremes=1 --output_format_parallel_formatting=0 --output_format_pretty_max_rows=1000000 | grep -a -v "elapsed" > $non_parallel_file + $CLICKHOUSE_CLIENT -q "select number, number + 1, concat('string: ', toString(number)) from numbers(200000) group by number with totals limit 190000 format $format" --extremes=1 --output_format_parallel_formatting=1 --output_format_pretty_max_rows=1000000 | grep -a -v "elapsed" > $parallel_file diff $non_parallel_file $parallel_file done diff --git a/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.reference b/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.reference index 67f2590a0c6..a7903610a42 100644 --- a/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.reference +++ b/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.reference @@ -6,3 +6,7 @@ 42 42 42 +SELECT + x, + concat(x, \'_\') +FROM test diff --git a/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.sql b/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.sql index ad3d417bc26..4aad7ae3694 100644 --- a/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.sql +++ b/tests/queries/0_stateless/02125_constant_if_condition_and_not_existing_column.sql @@ -11,4 +11,5 @@ select if(toUInt8(1), 42, y) from test; select if(toInt8(1), 42, y) from test; select if(toUInt8(toUInt8(0)), y, 42) from test; select if(cast(cast(0, 'UInt8'), 'UInt8'), y, 42) from test; +explain syntax select x, if((select hasColumnInTable(currentDatabase(), 'test', 'y')), y, x || '_') from test; drop table if exists t; diff --git a/tests/queries/0_stateless/02125_query_views_log.reference b/tests/queries/0_stateless/02125_query_views_log.reference index 3ae4af9b4d0..fac70027113 100644 --- a/tests/queries/0_stateless/02125_query_views_log.reference +++ b/tests/queries/0_stateless/02125_query_views_log.reference @@ -18,7 +18,7 @@ written_bytes: 4000000 select read_rows, read_bytes, written_rows, written_bytes from system.query_log where type = 'QueryFinish' and query_kind = 'Insert' and current_database = currentDatabase() format Vertical; Row 1: ────── -read_rows: 1000000 -read_bytes: 8000000 +read_rows: 3000000 +read_bytes: 16000000 written_rows: 3000000 written_bytes: 12000000 diff --git a/tests/queries/0_stateless/02126_lc_window_functions.reference b/tests/queries/0_stateless/02126_lc_window_functions.reference new file mode 100644 index 00000000000..bb2c453139e --- /dev/null +++ b/tests/queries/0_stateless/02126_lc_window_functions.reference @@ -0,0 +1,13 @@ +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +a\0aa 1 +a\0aa 1 +a\0aa 1 diff --git a/tests/queries/0_stateless/02126_lc_window_functions.sql b/tests/queries/0_stateless/02126_lc_window_functions.sql new file mode 100644 index 00000000000..6a1fb691a37 --- /dev/null +++ b/tests/queries/0_stateless/02126_lc_window_functions.sql @@ -0,0 +1,38 @@ +SELECT max(id) OVER () AS aid +FROM +( + SELECT materialize(toLowCardinality('aaaa')) AS id + FROM numbers_mt(1000000) +) 
+FORMAT `Null`; + +SELECT max(id) OVER (PARTITION BY id) AS id +FROM +( + SELECT materialize('aaaa') AS id + FROM numbers_mt(1000000) +) +FORMAT `Null`; + +SELECT countIf(sym = 'Red') OVER () AS res +FROM +( + SELECT CAST(CAST(number % 5, 'Enum8(\'Red\' = 0, \'Blue\' = 1, \'Yellow\' = 2, \'Black\' = 3, \'White\' = 4)'), 'LowCardinality(String)') AS sym + FROM numbers(10) +); + +SELECT materialize(toLowCardinality('a\0aa')), countIf(toLowCardinality('aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0aaaaaaa\0'), sym = 'Red') OVER (Range BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS res FROM (SELECT CAST(CAST(number % 5, 'Enum8(\'Red\' = 0, \'Blue\' = 1, \'Yellow\' = 2, \'Black\' = 3, \'White\' = 4)'), 'LowCardinality(String)') AS sym FROM numbers(3)); + +SELECT + NULL, + id, + max(id) OVER (Rows BETWEEN 10 PRECEDING AND UNBOUNDED FOLLOWING) AS aid +FROM +( + SELECT + NULL, + max(id) OVER (), + materialize(toLowCardinality('')) AS id + FROM numbers_mt(0, 1) +) +FORMAT `Null`; diff --git a/tests/queries/0_stateless/02127_connection_drain.reference b/tests/queries/0_stateless/02127_connection_drain.reference new file mode 100644 index 00000000000..c31f2f40f6d --- /dev/null +++ b/tests/queries/0_stateless/02127_connection_drain.reference @@ -0,0 +1,2 @@ +OK: sync drain +OK: async drain diff --git a/tests/queries/0_stateless/02127_connection_drain.sh b/tests/queries/0_stateless/02127_connection_drain.sh new file mode 100755 index 00000000000..523b02d9bd5 --- /dev/null +++ b/tests/queries/0_stateless/02127_connection_drain.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# sync drain +for _ in {1..100}; do + prev=$(curl -d@- -sS "${CLICKHOUSE_URL}" <<<"select value from system.metrics where metric = 'SyncDrainedConnections'") + curl -d@- -sS "${CLICKHOUSE_URL}" <<<"select * from remote('127.{2,3}', view(select * from numbers(1e6))) limit 100 settings drain_timeout=-1 format Null" + now=$(curl -d@- -sS "${CLICKHOUSE_URL}" <<<"select value from system.metrics where metric = 'SyncDrainedConnections'") + if [[ "$prev" != $(( now-2 )) ]]; then + continue + fi + echo "OK: sync drain" + break +done + +# async drain +for _ in {1..100}; do + prev=$(curl -d@- -sS "${CLICKHOUSE_URL}" <<<"select value from system.metrics where metric = 'AsyncDrainedConnections'") + curl -d@- -sS "${CLICKHOUSE_URL}" <<<"select * from remote('127.{2,3}', view(select * from numbers(1e6))) limit 100 settings drain_timeout=10 format Null" + now=$(curl -d@- -sS "${CLICKHOUSE_URL}" <<<"select value from system.metrics where metric = 'AsyncDrainedConnections'") + if [[ "$prev" != $(( now-2 )) ]]; then + continue + fi + echo "OK: async drain" + break +done diff --git a/tests/queries/0_stateless/02128_apply_lambda_parsing.reference b/tests/queries/0_stateless/02128_apply_lambda_parsing.reference new file mode 100644 index 00000000000..120eec989de --- /dev/null +++ b/tests/queries/0_stateless/02128_apply_lambda_parsing.reference @@ -0,0 +1,10 @@ +1 +1 +1 +1 +1 +1 +2 +3 +4 +5 diff --git a/tests/queries/0_stateless/02128_apply_lambda_parsing.sql b/tests/queries/0_stateless/02128_apply_lambda_parsing.sql new file mode 100644 index 00000000000..5fc809ca75d --- /dev/null +++ b/tests/queries/0_stateless/02128_apply_lambda_parsing.sql @@ -0,0 +1,13 @@ +WITH * APPLY lambda(e); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(1); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(x); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(range(1)); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(range(x)); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(1, 2); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(x, y); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda((x, y), 2); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda((x, y), x + y); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(tuple(1), 1); -- { clientError SYNTAX_ERROR } +SELECT * APPLY lambda(tuple(x), 1) FROM numbers(5); +SELECT * APPLY lambda(tuple(x), x + 1) FROM numbers(5); diff --git a/tests/queries/0_stateless/02129_add_column_add_ttl.reference b/tests/queries/0_stateless/02129_add_column_add_ttl.reference new file mode 100644 index 00000000000..8b3280ef095 --- /dev/null +++ b/tests/queries/0_stateless/02129_add_column_add_ttl.reference @@ -0,0 +1,41 @@ +0 2021-01-01 0 +0 2021-01-01 0 +1 2021-01-01 0 +1 2021-01-01 0 +2 2021-01-01 0 +2 2021-01-01 0 +3 2021-01-01 0 +3 2021-01-01 0 +4 2021-01-01 0 +4 2021-01-01 0 +5 2021-01-01 0 +5 2021-01-01 0 +6 2021-01-01 0 +6 2021-01-01 0 +7 2021-01-01 0 +7 2021-01-01 0 +8 2021-01-01 0 +8 2021-01-01 0 +9 2021-01-01 0 +9 2021-01-01 0 +========== +0 2021-01-01 0 +0 2021-01-01 0 +1 2021-01-01 0 +1 2021-01-01 0 +2 2021-01-01 0 +2 2021-01-01 0 +3 2021-01-01 0 +3 2021-01-01 0 +4 2021-01-01 0 +4 2021-01-01 0 +5 2021-01-01 0 +5 2021-01-01 0 +6 2021-01-01 0 +6 2021-01-01 0 +7 2021-01-01 0 +7 2021-01-01 0 +8 2021-01-01 0 +8 2021-01-01 0 +9 2021-01-01 0 +9 2021-01-01 0 diff --git a/tests/queries/0_stateless/02129_add_column_add_ttl.sql 
b/tests/queries/0_stateless/02129_add_column_add_ttl.sql new file mode 100644 index 00000000000..7a6dd928a3f --- /dev/null +++ b/tests/queries/0_stateless/02129_add_column_add_ttl.sql @@ -0,0 +1,31 @@ +drop table if exists ttl_test_02129; + +create table ttl_test_02129(a Int64, b String, d Date) +Engine=MergeTree partition by d order by a +settings min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0, materialize_ttl_recalculate_only = 0; + +insert into ttl_test_02129 select number, '', '2021-01-01' from numbers(10); +alter table ttl_test_02129 add column c Int64 settings mutations_sync=2; + +insert into ttl_test_02129 select number, '', '2021-01-01', 0 from numbers(10); +alter table ttl_test_02129 modify TTL (d + INTERVAL 1 MONTH) DELETE WHERE c=1 settings mutations_sync=2; + +select * from ttl_test_02129 order by a, b, d, c; +drop table ttl_test_02129; + +drop table if exists ttl_test_02129; + +select '=========='; + +create table ttl_test_02129(a Int64, b String, d Date) +Engine=MergeTree partition by d order by a +settings min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0, materialize_ttl_recalculate_only = 1; + +insert into ttl_test_02129 select number, '', '2021-01-01' from numbers(10); +alter table ttl_test_02129 add column c Int64 settings mutations_sync=2; + +insert into ttl_test_02129 select number, '', '2021-01-01', 0 from numbers(10); +alter table ttl_test_02129 modify TTL (d + INTERVAL 1 MONTH) DELETE WHERE c=1 settings mutations_sync=2; + +select * from ttl_test_02129 order by a, b, d, c; +drop table ttl_test_02129; diff --git a/tests/queries/0_stateless/02129_window_functions_disable_optimizations.reference b/tests/queries/0_stateless/02129_window_functions_disable_optimizations.reference new file mode 100644 index 00000000000..f66c81021c9 --- /dev/null +++ b/tests/queries/0_stateless/02129_window_functions_disable_optimizations.reference @@ -0,0 +1,36 @@ +1 1 +0 1 +0 1 +0 1 +0 1 +0 1 +0 1 +0 1 +0 1 +0 1 +0 0.5 30 15 +1 1 30 45 +2 0.5 30 60 +3 1 30 90 +4 0.5 30 105 +5 1 30 135 +6 0.5 30 150 +7 1 30 180 +8 0.5 30 195 +9 1 30 225 +0 0 0 +1 1 1 +2 0 1 +3 0 1 +4 0 1 +5 0 1 +6 0 1 +7 0 1 +8 0 1 +9 0 1 +5772761.230862 +5773916.014064 +5775070.797267 +5776226.273617 +5777381.749967 +5778537.226317 diff --git a/tests/queries/0_stateless/02129_window_functions_disable_optimizations.sql b/tests/queries/0_stateless/02129_window_functions_disable_optimizations.sql new file mode 100644 index 00000000000..cfe9f20d378 --- /dev/null +++ b/tests/queries/0_stateless/02129_window_functions_disable_optimizations.sql @@ -0,0 +1,27 @@ +SET optimize_rewrite_sum_if_to_count_if = 1; + +SELECT if(number % 10 = 0, 1, 0) AS dummy, +sum(dummy) OVER w +FROM numbers(10) +WINDOW w AS (ORDER BY number ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW); + +SET optimize_arithmetic_operations_in_aggregate_functions=1; +SELECT + *, + if((number % 2) = 0, 0.5, 1) AS a, + 30 AS b, + sum(a * b) OVER (ORDER BY number ASC) AS s +FROM numbers(10); + +SET optimize_aggregators_of_group_by_keys=1; + +SELECT + *, + if(number = 1, 1, 0) as a, + max(a) OVER (ORDER BY number ASC) AS s +FROM numbers(10); + +SET optimize_group_by_function_keys = 1; +SELECT round(sum(log(2) * number), 6) AS k FROM numbers(10000) +GROUP BY (number % 2) * (number % 3), number % 3, number % 2 +HAVING sum(log(2) * number) > 346.57353 ORDER BY k; diff --git a/tests/queries/0_stateless/02131_multiply_row_policies_on_same_column.reference b/tests/queries/0_stateless/02131_multiply_row_policies_on_same_column.reference new file mode 100644 
index 00000000000..3f71510f3a5 --- /dev/null +++ b/tests/queries/0_stateless/02131_multiply_row_policies_on_same_column.reference @@ -0,0 +1,8 @@ +4 +1 +2 +3 +3 +3 +3 +4 diff --git a/tests/queries/0_stateless/02131_multiply_row_policies_on_same_column.sql b/tests/queries/0_stateless/02131_multiply_row_policies_on_same_column.sql new file mode 100644 index 00000000000..75f7f737e85 --- /dev/null +++ b/tests/queries/0_stateless/02131_multiply_row_policies_on_same_column.sql @@ -0,0 +1,30 @@ +DROP TABLE IF EXISTS 02131_multiply_row_policies_on_same_column; +CREATE TABLE 02131_multiply_row_policies_on_same_column (x UInt8) ENGINE = MergeTree ORDER BY x; +INSERT INTO 02131_multiply_row_policies_on_same_column VALUES (1), (2), (3), (4); + + +DROP ROW POLICY IF EXISTS 02131_filter_1 ON 02131_multiply_row_policies_on_same_column; +DROP ROW POLICY IF EXISTS 02131_filter_2 ON 02131_multiply_row_policies_on_same_column; +DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_multiply_row_policies_on_same_column; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; + + +CREATE ROW POLICY 02131_filter_1 ON 02131_multiply_row_policies_on_same_column USING x=1 TO ALL; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; +CREATE ROW POLICY 02131_filter_2 ON 02131_multiply_row_policies_on_same_column USING x=2 TO ALL; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; +CREATE ROW POLICY 02131_filter_3 ON 02131_multiply_row_policies_on_same_column USING x=3 TO ALL; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; + + +CREATE ROW POLICY 02131_filter_4 ON 02131_multiply_row_policies_on_same_column USING x<4 AS RESTRICTIVE TO ALL; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; + +DROP ROW POLICY 02131_filter_1 ON 02131_multiply_row_policies_on_same_column; +DROP ROW POLICY 02131_filter_2 ON 02131_multiply_row_policies_on_same_column; +DROP ROW POLICY 02131_filter_3 ON 02131_multiply_row_policies_on_same_column; +DROP ROW POLICY 02131_filter_4 ON 02131_multiply_row_policies_on_same_column; +SELECT count() FROM 02131_multiply_row_policies_on_same_column; +DROP TABLE 02131_multiply_row_policies_on_same_column; diff --git a/tests/queries/0_stateless/02131_mv_many_chunks_bug.reference b/tests/queries/0_stateless/02131_mv_many_chunks_bug.reference new file mode 100644 index 00000000000..9183bf03fcc --- /dev/null +++ b/tests/queries/0_stateless/02131_mv_many_chunks_bug.reference @@ -0,0 +1 @@ +256 diff --git a/tests/queries/0_stateless/02131_mv_many_chunks_bug.sql b/tests/queries/0_stateless/02131_mv_many_chunks_bug.sql new file mode 100644 index 00000000000..736fd9242b0 --- /dev/null +++ b/tests/queries/0_stateless/02131_mv_many_chunks_bug.sql @@ -0,0 +1,15 @@ +drop table if exists t; +drop table if exists t_mv; + +create table t (x UInt64) engine = MergeTree order by x; +create materialized view t_mv engine = MergeTree order by tuple() as select uniq(x), bitAnd(x, 255) as y from t group by y; + +set max_bytes_before_external_group_by = 1000000000; +set group_by_two_level_threshold = 100; +set min_insert_block_size_rows = 100; + +insert into t select number from numbers(300); +select count() from (select y from t_mv group by y); + +drop table if exists t; +drop table if exists t_mv; diff --git a/tests/queries/0_stateless/02131_skip_index_not_materialized.reference 
b/tests/queries/0_stateless/02131_skip_index_not_materialized.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02131_skip_index_not_materialized.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02131_skip_index_not_materialized.sql b/tests/queries/0_stateless/02131_skip_index_not_materialized.sql new file mode 100644 index 00000000000..cae0b1d9fb3 --- /dev/null +++ b/tests/queries/0_stateless/02131_skip_index_not_materialized.sql @@ -0,0 +1,12 @@ +DROP TABLE IF EXISTS t_index_non_materialized; + +CREATE TABLE t_index_non_materialized (a UInt32) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO t_index_non_materialized VALUES (1); + +ALTER TABLE t_index_non_materialized ADD INDEX ind_set (a) TYPE set(1) GRANULARITY 1; +ALTER TABLE t_index_non_materialized ADD INDEX ind_minmax (a) TYPE minmax() GRANULARITY 1; + +SELECT count() FROM t_index_non_materialized WHERE a = 1; + +DROP TABLE t_index_non_materialized; diff --git a/tests/queries/0_stateless/02132_client_history_navigation.expect b/tests/queries/0_stateless/02132_client_history_navigation.expect new file mode 100755 index 00000000000..b722a0af04c --- /dev/null +++ b/tests/queries/0_stateless/02132_client_history_navigation.expect @@ -0,0 +1,33 @@ +#!/usr/bin/expect -f + +log_user 0 +set timeout 3 +match_max 100000 + +expect_after { + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } +} + +# useful debugging configuration +# exp_internal 1 + +set basedir [file dirname $argv0] +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --highlight 0" +expect ":) " + +# Make a query +send -- "SELECT 1\r" +expect "1" +expect ":) " +send -- "SELECT 2" +send -- "\033\[A" +expect "SELECT 1" +send -- "\033\[B" +expect "SELECT 2" +send -- "\r" +expect "2" +send -- "exit\r" +expect eof diff --git a/tests/queries/0_stateless/02132_client_history_navigation.reference b/tests/queries/0_stateless/02132_client_history_navigation.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02133_distributed_queries_formatting.reference b/tests/queries/0_stateless/02133_distributed_queries_formatting.reference new file mode 100644 index 00000000000..191936fe00b --- /dev/null +++ b/tests/queries/0_stateless/02133_distributed_queries_formatting.reference @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/tests/queries/0_stateless/02133_distributed_queries_formatting.sql b/tests/queries/0_stateless/02133_distributed_queries_formatting.sql new file mode 100644 index 00000000000..3015ddf18e3 --- /dev/null +++ b/tests/queries/0_stateless/02133_distributed_queries_formatting.sql @@ -0,0 +1 @@ +SELECT * FROM cluster(test_cluster_two_shards, view(SELECT 'Hello' AS all, 'World' AS distinct)); diff --git a/tests/queries/0_stateless/02133_final_prewhere_where_lowcardinality_replacing.reference b/tests/queries/0_stateless/02133_final_prewhere_where_lowcardinality_replacing.reference new file mode 100644 index 00000000000..0b7680a594f --- /dev/null +++ b/tests/queries/0_stateless/02133_final_prewhere_where_lowcardinality_replacing.reference @@ -0,0 +1,2 @@ +LowCardinality(String) +LowCardinality(String) diff --git a/tests/queries/0_stateless/02133_final_prewhere_where_lowcardinality_replacing.sql b/tests/queries/0_stateless/02133_final_prewhere_where_lowcardinality_replacing.sql new file mode 
100644 index 00000000000..a801fe08614 --- /dev/null +++ b/tests/queries/0_stateless/02133_final_prewhere_where_lowcardinality_replacing.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS errors_local; + +CREATE TABLE errors_local (level LowCardinality(String)) ENGINE=ReplacingMergeTree ORDER BY level settings min_bytes_for_wide_part = '10000000'; +insert into errors_local select toString(number) from numbers(10000); + +SELECT toTypeName(level) FROM errors_local FINAL PREWHERE isNotNull(level) WHERE isNotNull(level) LIMIT 1; + +DROP TABLE errors_local; + +CREATE TABLE errors_local(level LowCardinality(String)) ENGINE=ReplacingMergeTree ORDER BY level; +insert into errors_local select toString(number) from numbers(10000); + +SELECT toTypeName(level) FROM errors_local FINAL PREWHERE isNotNull(level) WHERE isNotNull(level) LIMIT 1; + +DROP TABLE errors_local; diff --git a/tests/queries/0_stateless/02133_issue_32458.reference b/tests/queries/0_stateless/02133_issue_32458.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02133_issue_32458.sql b/tests/queries/0_stateless/02133_issue_32458.sql new file mode 100644 index 00000000000..16af361db7a --- /dev/null +++ b/tests/queries/0_stateless/02133_issue_32458.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t1 (`id` Int32, `key` String) ENGINE = Memory; +CREATE TABLE t2 (`id` Int32, `key` String) ENGINE = Memory; + +INSERT INTO t1 VALUES (0, ''); +INSERT INTO t2 VALUES (0, ''); + +SELECT * FROM t1 ANY INNER JOIN t2 ON ((NULL = t1.key) = t2.id) AND (('' = t1.key) = t2.id); + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; diff --git a/tests/queries/0_stateless/02134_async_inserts_formats.reference b/tests/queries/0_stateless/02134_async_inserts_formats.reference new file mode 100644 index 00000000000..15b025b708c --- /dev/null +++ b/tests/queries/0_stateless/02134_async_inserts_formats.reference @@ -0,0 +1,10 @@ +1 a +2 b +3 a +4 b +5 a +6 b +7 a +8 b +all_1_1_0 4 0 +all_2_2_0 4 0 diff --git a/tests/queries/0_stateless/02134_async_inserts_formats.sh b/tests/queries/0_stateless/02134_async_inserts_formats.sh new file mode 100755 index 00000000000..bd102fefe9f --- /dev/null +++ b/tests/queries/0_stateless/02134_async_inserts_formats.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS async_inserts" +${CLICKHOUSE_CLIENT} -q "CREATE TABLE async_inserts (id UInt32, s String) ENGINE = MergeTree ORDER BY id" + +${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO async_inserts FORMAT CustomSeparated settings format_custom_escaping_rule='CSV', format_custom_field_delimiter=',' +1,\"a\" +2,\"b\" +" & + +${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO async_inserts FORMAT CustomSeparated settings format_custom_escaping_rule='CSV', format_custom_field_delimiter=',' +3,\"a\" +4,\"b\" +" & + +${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO async_inserts FORMAT CustomSeparatedWithNames settings format_custom_escaping_rule='CSV', format_custom_field_delimiter=',' +\"id\",\"s\" +5,\"a\" +6,\"b\" +" & + +${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO async_inserts FORMAT CustomSeparatedWithNames settings format_custom_escaping_rule='CSV', format_custom_field_delimiter=',' +\"id\",\"s\" +7,\"a\" +8,\"b\" +" & + +wait + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM async_inserts ORDER BY id" +${CLICKHOUSE_CLIENT} -q "SELECT name, rows, level FROM system.parts WHERE table = 'async_inserts' AND database = '$CLICKHOUSE_DATABASE' ORDER BY name" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE async_inserts" diff --git a/tests/queries/0_stateless/02135_local_create_db.reference b/tests/queries/0_stateless/02135_local_create_db.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02135_local_create_db.sh b/tests/queries/0_stateless/02135_local_create_db.sh new file mode 100755 index 00000000000..2a0105e554e --- /dev/null +++ b/tests/queries/0_stateless/02135_local_create_db.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +for Engine in Atomic Ordinary; do + $CLICKHOUSE_LOCAL --query """ + CREATE DATABASE foo_$Engine Engine=$Engine; + DROP DATABASE foo_$Engine; + """ +done diff --git a/tests/queries/0_stateless/02136_kill_scalar_queries.reference b/tests/queries/0_stateless/02136_kill_scalar_queries.reference new file mode 100644 index 00000000000..a598447cff5 --- /dev/null +++ b/tests/queries/0_stateless/02136_kill_scalar_queries.reference @@ -0,0 +1,2 @@ +finished default_TEST02132KILL_QUERY1 default select (SELECT max(number) from system.numbers) + 1; +finished default_TEST02132KILL_QUERY2 default SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000); diff --git a/tests/queries/0_stateless/02136_kill_scalar_queries.sh b/tests/queries/0_stateless/02136_kill_scalar_queries.sh new file mode 100755 index 00000000000..382f6555c66 --- /dev/null +++ b/tests/queries/0_stateless/02136_kill_scalar_queries.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +function wait_for_query_to_start() +{ + while [[ $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE query_id = '$1'") == 0 ]]; do sleep 0.1; done +} + +QUERY_1_ID="${CLICKHOUSE_DATABASE}_TEST02132KILL_QUERY1" +(${CLICKHOUSE_CLIENT} --query_id="${QUERY_1_ID}" --query='select (SELECT max(number) from system.numbers) + 1;' 2>&1 | grep -q "Code: 394." 
|| echo 'FAIL') & +wait_for_query_to_start "${QUERY_1_ID}" +${CLICKHOUSE_CLIENT} --query="KILL QUERY WHERE query_id='${QUERY_1_ID}' SYNC" + +QUERY_2_ID="${CLICKHOUSE_DATABASE}_TEST02132KILL_QUERY2" +(${CLICKHOUSE_CLIENT} --query_id="${QUERY_2_ID}" --query='SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000);' 2>&1 | grep -q "Code: 394." || echo 'FAIL') & +wait_for_query_to_start "${QUERY_2_ID}" +${CLICKHOUSE_CLIENT} --query="KILL QUERY WHERE query_id='${QUERY_2_ID}' SYNC" + +wait diff --git a/tests/queries/0_stateless/02136_scalar_progress.reference b/tests/queries/0_stateless/02136_scalar_progress.reference new file mode 100644 index 00000000000..21f6d3e0043 --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_progress.reference @@ -0,0 +1,6 @@ +< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"65505","read_bytes":"524040","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131010","read_bytes":"1048080","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Summary: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} diff --git a/tests/queries/0_stateless/02136_scalar_progress.sh b/tests/queries/0_stateless/02136_scalar_progress.sh new file mode 100755 index 00000000000..4608031f83d --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_progress.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CURL -sS "${CLICKHOUSE_URL}&wait_end_of_query=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0" -d "SELECT (SELECT max(number), count(number) FROM numbers(100000));" -v 2>&1 | grep -E "X-ClickHouse-Summary|X-ClickHouse-Progress" diff --git a/tests/queries/0_stateless/02136_scalar_read_rows_json.reference b/tests/queries/0_stateless/02136_scalar_read_rows_json.reference new file mode 100644 index 00000000000..49020a4432f --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_read_rows_json.reference @@ -0,0 +1,50 @@ +#1 +{ + "meta": + [ + { + "name": "count()", + "type": "UInt64" + } + ], + + "data": + [ + { + "count()": "100" + } + ], + + "rows": 1, + + "rows_before_limit_at_least": 100, + + "statistics": + { + "rows_read": 100, + "bytes_read": 800 + } +} +#2 +{ + "meta": + [ + { + "type": "Tuple(UInt64, UInt64)" + } + ], + + "data": + [ + { + } + ], + + "rows": 1, + + "statistics": + { + "rows_read": 131011, + "bytes_read": 1048081 + } +} diff --git a/tests/queries/0_stateless/02136_scalar_read_rows_json.sh b/tests/queries/0_stateless/02136_scalar_read_rows_json.sh new file mode 100755 index 00000000000..d589cb60086 --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_read_rows_json.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo "#1" +${CLICKHOUSE_CLIENT} --query='SELECT count() FROM numbers(100) FORMAT JSON;' | grep -a -v "elapsed" +echo "#2" +${CLICKHOUSE_CLIENT} --query='SELECT (SELECT max(number), count(number) FROM numbers(100000) as n) FORMAT JSON;' | grep -a -v "elapsed" | grep -v "_subquery" diff --git a/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference b/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference new file mode 100644 index 00000000000..7bef11d008f --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference @@ -0,0 +1,9 @@ +#02136_scalar_subquery_1 999 +#02136_scalar_subquery_2 999 0 +#02136_scalar_subquery_3 999 999 +#02136_scalar_subquery_4 999 +#02136_scalar_subquery_4 999 +1001 SELECT \'#02136_scalar_subquery_1\', (SELECT max(number) FROM numbers(1000)) as n; +2001 SELECT \'#02136_scalar_subquery_2\', (SELECT max(number) FROM numbers(1000)) as n, (SELECT min(number) FROM numbers(1000)) as n2; +1001 SELECT \'#02136_scalar_subquery_3\', (SELECT max(number) FROM numbers(1000)) as n, (SELECT max(number) FROM numbers(1000)) as n2; +1002 SELECT \'#02136_scalar_subquery_4\', (SELECT max(number) FROM numbers(1000)) as n FROM system.numbers LIMIT 2; diff --git a/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql b/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql new file mode 100644 index 00000000000..180610288aa --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql @@ -0,0 +1,13 @@ +SELECT '#02136_scalar_subquery_1', (SELECT max(number) FROM numbers(1000)) as n; +SELECT '#02136_scalar_subquery_2', (SELECT max(number) FROM numbers(1000)) as n, (SELECT min(number) FROM numbers(1000)) as n2; +SELECT '#02136_scalar_subquery_3', (SELECT max(number) FROM numbers(1000)) as n, (SELECT max(number) FROM numbers(1000)) as n2; -- Cached +SELECT '#02136_scalar_subquery_4', (SELECT max(number) FROM numbers(1000)) as n FROM system.numbers LIMIT 2; -- Cached + +SYSTEM FLUSH LOGS; +SELECT read_rows, query FROM system.query_log +WHERE + event_date > yesterday() + AND type = 'QueryFinish' + AND current_database == currentDatabase() + AND query LIKE 'SELECT ''#02136_scalar_subquery_%' +ORDER BY query ASC; diff --git a/tests/queries/0_stateless/02137_mv_into_join.reference b/tests/queries/0_stateless/02137_mv_into_join.reference new file mode 100644 index 00000000000..1228a2322e6 --- /dev/null +++ b/tests/queries/0_stateless/02137_mv_into_join.reference @@ -0,0 +1,3 @@ +sku_0001 black women nice shirt +sku_0001_black sku_0001 black women nice shirt +sku_0001_black sku_0001 black women nice shirt diff --git a/tests/queries/0_stateless/02137_mv_into_join.sql b/tests/queries/0_stateless/02137_mv_into_join.sql new file mode 100644 index 00000000000..cca896ac622 --- /dev/null +++ b/tests/queries/0_stateless/02137_mv_into_join.sql @@ -0,0 +1,17 @@ +CREATE TABLE main ( `id` String, `color` String, `section` String, `description` String) ENGINE = MergeTree ORDER BY tuple(); +CREATE TABLE destination_join ( `key` String, `id` String, `color` String, `section` String, `description` String) ENGINE = Join(ANY, LEFT, key); +CREATE TABLE destination_set (`key` String) ENGINE = Set; + +CREATE MATERIALIZED VIEW mv_to_join TO `destination_join` AS SELECT concat(id, '_', color) AS key, * FROM main; +CREATE MATERIALIZED VIEW mv_to_set TO `destination_set` AS SELECT key FROM destination_join; + +INSERT INTO main VALUES ('sku_0001','black','women','nice shirt'); +SELECT * FROM main; +SELECT * FROM 
destination_join; +SELECT * FROM destination_join WHERE key in destination_set; + +DROP TABLE mv_to_set; +DROP TABLE destination_set; +DROP TABLE mv_to_join; +DROP TABLE destination_join; +DROP TABLE main; diff --git a/tests/queries/0_stateless/02139_MV_with_scalar_subquery.reference b/tests/queries/0_stateless/02139_MV_with_scalar_subquery.reference new file mode 100644 index 00000000000..51cfca81ddb --- /dev/null +++ b/tests/queries/0_stateless/02139_MV_with_scalar_subquery.reference @@ -0,0 +1,4 @@ +2000 +2 +1500 0 1499 1500 0 1499 +500 1500 1999 500 1500 1999 diff --git a/tests/queries/0_stateless/02139_MV_with_scalar_subquery.sql b/tests/queries/0_stateless/02139_MV_with_scalar_subquery.sql new file mode 100644 index 00000000000..f0285bbec3d --- /dev/null +++ b/tests/queries/0_stateless/02139_MV_with_scalar_subquery.sql @@ -0,0 +1,24 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/9587#issuecomment-944431385 + +CREATE TABLE source (a Int32) ENGINE=MergeTree() ORDER BY tuple(); +CREATE TABLE source_null AS source ENGINE=Null; +CREATE TABLE dest_a (count UInt32, min Int32, max Int32, count_subquery Int32, min_subquery Int32, max_subquery Int32) ENGINE=MergeTree() ORDER BY tuple(); + +CREATE MATERIALIZED VIEW mv_null TO source_null AS SELECT * FROM source; +CREATE MATERIALIZED VIEW mv_a to dest_a AS +SELECT + count() AS count, + min(a) AS min, + max(a) AS max, + (SELECT count() FROM source_null) AS count_subquery, + (SELECT min(a) FROM source_null) AS min_subquery, + (SELECT max(a) FROM source_null) AS max_subquery +FROM source_null +GROUP BY count_subquery, min_subquery, max_subquery; + + +INSERT INTO source SELECT number FROM numbers(2000) SETTINGS min_insert_block_size_rows=1500, max_insert_block_size=1500; + +SELECT count() FROM source; +SELECT count() FROM dest_a; +SELECT * from dest_a ORDER BY count DESC; diff --git a/tests/queries/0_stateless/02140_clickhouse_local_queries_file_table.reference b/tests/queries/0_stateless/02140_clickhouse_local_queries_file_table.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02140_clickhouse_local_queries_file_table.sh b/tests/queries/0_stateless/02140_clickhouse_local_queries_file_table.sh new file mode 100755 index 00000000000..377cbb13688 --- /dev/null +++ b/tests/queries/0_stateless/02140_clickhouse_local_queries_file_table.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL --file /dev/null --structure "key String" --input-format TSVWithNamesAndTypes --queries-file <(echo 'select 1') --queries-file <(echo 'select 2') --format Null diff --git a/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.reference b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.reference new file mode 100644 index 00000000000..e4c93e9e1c5 --- /dev/null +++ b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.reference @@ -0,0 +1 @@ +CREATE TABLE _local.table\n(\n `key` String\n)\nENGINE = File(\'TSVWithNamesAndTypes\', \'/dev/null\') diff --git a/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.sh b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.sh new file mode 100755 index 00000000000..fc71f779fa1 --- /dev/null +++ b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL --file /dev/null --structure "key String" --input-format TSVWithNamesAndTypes --interactive --send_logs_level=trace <<<'show create table table' diff --git a/tests/queries/0_stateless/02142_http_with_query_parameters.reference b/tests/queries/0_stateless/02142_http_with_query_parameters.reference new file mode 100644 index 00000000000..5d0d05b9ed8 --- /dev/null +++ b/tests/queries/0_stateless/02142_http_with_query_parameters.reference @@ -0,0 +1 @@ +22 [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 22 diff --git a/tests/queries/0_stateless/02142_http_with_query_parameters.sh b/tests/queries/0_stateless/02142_http_with_query_parameters.sh new file mode 100755 index 00000000000..cb7b11d5cdd --- /dev/null +++ b/tests/queries/0_stateless/02142_http_with_query_parameters.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +echo ' +SELECT + sum(toUInt8(1) ? toUInt8(1) : toUInt8(1)) AS metric, + groupArray(toUInt8(1) ? toUInt8(1) : toUInt8(1)), + groupArray(toUInt8(1) ? toUInt8(1) : 1), + sum(toUInt8(1) ? toUInt8(1) : 1) +FROM (SELECT materialize(toUInt64(1)) as key FROM numbers(22)) +WHERE key = {b1:Int64}' | ${CLICKHOUSE_CURL} -sSg "${CLICKHOUSE_URL}¶m_b1=1" -d @- diff --git a/tests/queries/0_stateless/02144_avg_ubsan.reference b/tests/queries/0_stateless/02144_avg_ubsan.reference new file mode 100644 index 00000000000..09f03e40e59 --- /dev/null +++ b/tests/queries/0_stateless/02144_avg_ubsan.reference @@ -0,0 +1,14 @@ +-- { echo } + +-- Aggregate function 'avg' allows overflow with two's complement arithmetics. +-- This contradicts the standard SQL semantic and we are totally fine with it. 
+ +-- AggregateFunctionAvg::add +SELECT avg(-8000000000000000000) FROM (SELECT *, 1 AS k FROM numbers(65535*2)) GROUP BY k; +63121857572613.94 +-- AggregateFunctionAvg::addBatchSinglePlace +SELECT avg(-8000000000000000000) FROM numbers(65535 * 2); +63121857572613.94 +-- AggregateFunctionAvg::addBatchSinglePlaceNotNull +SELECT avg(toNullable(-8000000000000000000)) FROM numbers(65535 * 2); +63121857572613.94 diff --git a/tests/queries/0_stateless/02144_avg_ubsan.sql b/tests/queries/0_stateless/02144_avg_ubsan.sql new file mode 100644 index 00000000000..7c51963333e --- /dev/null +++ b/tests/queries/0_stateless/02144_avg_ubsan.sql @@ -0,0 +1,11 @@ +-- { echo } + +-- Aggregate function 'avg' allows overflow with two's complement arithmetics. +-- This contradicts the standard SQL semantic and we are totally fine with it. + +-- AggregateFunctionAvg::add +SELECT avg(-8000000000000000000) FROM (SELECT *, 1 AS k FROM numbers(65535*2)) GROUP BY k; +-- AggregateFunctionAvg::addBatchSinglePlace +SELECT avg(-8000000000000000000) FROM numbers(65535 * 2); +-- AggregateFunctionAvg::addBatchSinglePlaceNotNull +SELECT avg(toNullable(-8000000000000000000)) FROM numbers(65535 * 2); diff --git a/tests/queries/0_stateless/02146_mv_non_phys.reference b/tests/queries/0_stateless/02146_mv_non_phys.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02146_mv_non_phys.sql b/tests/queries/0_stateless/02146_mv_non_phys.sql new file mode 100644 index 00000000000..4b15900fe76 --- /dev/null +++ b/tests/queries/0_stateless/02146_mv_non_phys.sql @@ -0,0 +1,2 @@ +drop table if exists mv_02146; +create materialized view mv_02146 engine=MergeTree() order by number as select * from numbers(10); -- { serverError QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW } diff --git a/tests/queries/0_stateless/02147_arrow_duplicate_columns.reference b/tests/queries/0_stateless/02147_arrow_duplicate_columns.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02147_arrow_duplicate_columns.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02147_arrow_duplicate_columns.sh b/tests/queries/0_stateless/02147_arrow_duplicate_columns.sh new file mode 100755 index 00000000000..11c1522d10b --- /dev/null +++ b/tests/queries/0_stateless/02147_arrow_duplicate_columns.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +set -e + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# Reproduce GZDATA: +# +# ```python +# import pyarrow as pa +# data = [pa.array([1]), pa.array([2]), pa.array([3])] +# batch = pa.record_batch(data, names=['x', 'y', 'x']) +# with pa.ipc.new_file('data.arrow', batch.schema) as writer: +# writer.write_batch(batch) +# ``` +# +# ```bash +# cat data.arrow | gzip | base64 +# ``` + +GZDATA="H4sIAHTzuWEAA9VTuw3CMBB9+RCsyIULhFIwAC0SJQWZACkNi1CAxCCMwCCMQMEIKdkgPJ8PJbIIEiVPujuf73yfp6Rumt1+BXTEA4CDRwmLAhMYnogkpw96hjpXDWSUA2Wt/pU1mJz6GjO9k+eUI+UicSRbqvuX3BPlNsh1zDCcZypTOJ0xvF186GOYZ5ht9NrX8Pu12svDYq4bWqmKLEdFU+GNkmcr23oOzspNgh4FxmEiO3bvoriL4jJa1Bc/+OmghkcXeJU+lmwUwoALHHDbDfUSgVNfo9V3T7U9Pz3++bswDNbyD7wAxr434AoDAAA=" + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS t1" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE t1 ( x Int64, y Int64, z Int64 ) ENGINE = Memory" + +echo ${GZDATA} | base64 --decode | gunzip | ${CLICKHOUSE_CLIENT} -q "INSERT INTO t1 FORMAT Arrow" 2>&1 | grep -qF "DUPLICATE_COLUMN" && echo 'OK' || echo 'FAIL' ||: + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t1" diff --git a/tests/queries/0_stateless/02147_order_by_optimizations.reference b/tests/queries/0_stateless/02147_order_by_optimizations.reference new file mode 100644 index 00000000000..8708826ea2b --- /dev/null +++ b/tests/queries/0_stateless/02147_order_by_optimizations.reference @@ -0,0 +1,21 @@ +SELECT + date, + v +FROM t_02147 +ORDER BY + toStartOfHour(date) ASC, + v ASC +SELECT + date, + v +FROM t_02147_dist +ORDER BY + toStartOfHour(date) ASC, + v ASC +SELECT + date, + v +FROM t_02147_merge +ORDER BY + toStartOfHour(date) ASC, + v ASC diff --git a/tests/queries/0_stateless/02147_order_by_optimizations.sql b/tests/queries/0_stateless/02147_order_by_optimizations.sql new file mode 100644 index 00000000000..7aa631ff432 --- /dev/null +++ b/tests/queries/0_stateless/02147_order_by_optimizations.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS t_02147; +DROP TABLE IF EXISTS t_02147_dist; +DROP TABLE IF EXISTS t_02147_merge; + +CREATE TABLE t_02147 (date DateTime, v UInt32) +ENGINE = MergeTree ORDER BY toStartOfHour(date); + +CREATE TABLE t_02147_dist AS t_02147 ENGINE = Distributed(test_shard_localhost, currentDatabase(), t_02147); +CREATE TABLE t_02147_merge AS t_02147 ENGINE = Merge(currentDatabase(), 't_02147'); + +SET optimize_monotonous_functions_in_order_by = 1; + +EXPLAIN SYNTAX SELECT * FROM t_02147 ORDER BY toStartOfHour(date), v; +EXPLAIN SYNTAX SELECT * FROM t_02147_dist ORDER BY toStartOfHour(date), v; +EXPLAIN SYNTAX SELECT * FROM t_02147_merge ORDER BY toStartOfHour(date), v; diff --git a/tests/queries/0_stateless/02148_cast_type_parsing.reference b/tests/queries/0_stateless/02148_cast_type_parsing.reference new file mode 100644 index 00000000000..a078e3b9d66 --- /dev/null +++ b/tests/queries/0_stateless/02148_cast_type_parsing.reference @@ -0,0 +1 @@ +[(1,'Hello'),(2,'World')] diff --git a/tests/queries/0_stateless/02148_cast_type_parsing.sql b/tests/queries/0_stateless/02148_cast_type_parsing.sql new file mode 100644 index 00000000000..f5d9023604c --- /dev/null +++ b/tests/queries/0_stateless/02148_cast_type_parsing.sql @@ -0,0 +1 @@ +SELECT CAST([(1, 'Hello'), (2, 'World')] AS Array(Tuple(a UInt64, b String))); diff --git a/tests/queries/0_stateless/02148_in_memory_part_flush.reference b/tests/queries/0_stateless/02148_in_memory_part_flush.reference new file mode 100644 index 00000000000..219c5f4b22f --- /dev/null +++ b/tests/queries/0_stateless/02148_in_memory_part_flush.reference @@ -0,0 +1,4 @@ +before DETACH TABLE +500 +after 
DETACH TABLE +500 diff --git a/tests/queries/0_stateless/02148_in_memory_part_flush.sql b/tests/queries/0_stateless/02148_in_memory_part_flush.sql new file mode 100644 index 00000000000..ec20721186e --- /dev/null +++ b/tests/queries/0_stateless/02148_in_memory_part_flush.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS mem_part_flush; + +CREATE TABLE mem_part_flush +( +`key` UInt32, +`ts` DateTime, +`db_time` DateTime DEFAULT now() +) +ENGINE = MergeTree +ORDER BY (key, ts) +SETTINGS min_rows_for_compact_part = 1000000, min_bytes_for_compact_part = 200000000, in_memory_parts_enable_wal = 0; + +INSERT INTO mem_part_flush(key, ts) SELECT number % 1000, now() + intDiv(number,1000) FROM numbers(500); + +SELECT 'before DETACH TABLE'; +SELECT count(*) FROM mem_part_flush; + +DETACH TABLE mem_part_flush; + +ATTACH TABLE mem_part_flush; + +SELECT 'after DETACH TABLE'; +SELECT count(*) FROM mem_part_flush; + + +DROP TABLE mem_part_flush; diff --git a/tests/queries/0_stateless/02148_issue_32737.reference b/tests/queries/0_stateless/02148_issue_32737.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02148_issue_32737.sql b/tests/queries/0_stateless/02148_issue_32737.sql new file mode 100644 index 00000000000..c8fbac457e7 --- /dev/null +++ b/tests/queries/0_stateless/02148_issue_32737.sql @@ -0,0 +1,3 @@ +SELECT fuzzBits(toFixedString('', 200), 0.99) from numbers(1) FORMAT Null; +SELECT fuzzBits(toFixedString('', 200), 0.99) from numbers(128) FORMAT Null; +SELECT fuzzBits(toFixedString('', 200), 0.99) from numbers(60000) FORMAT Null; diff --git a/tests/queries/0_stateless/02148_sql_user_defined_function_subquery.reference b/tests/queries/0_stateless/02148_sql_user_defined_function_subquery.reference new file mode 100644 index 00000000000..8851ce8322c --- /dev/null +++ b/tests/queries/0_stateless/02148_sql_user_defined_function_subquery.reference @@ -0,0 +1,6 @@ +1 +2 +2 +4 +(0,'Value') +Value diff --git a/tests/queries/0_stateless/02148_sql_user_defined_function_subquery.sql b/tests/queries/0_stateless/02148_sql_user_defined_function_subquery.sql new file mode 100644 index 00000000000..cc62d1ac495 --- /dev/null +++ b/tests/queries/0_stateless/02148_sql_user_defined_function_subquery.sql @@ -0,0 +1,35 @@ +-- Tags: no-parallel + +DROP FUNCTION IF EXISTS 02148_test_function; +CREATE FUNCTION 02148_test_function AS () -> (SELECT 1); + +SELECT 02148_test_function(); + +CREATE OR REPLACE FUNCTION 02148_test_function AS () -> (SELECT 2); + +SELECT 02148_test_function(); + +DROP FUNCTION 02148_test_function; + +CREATE FUNCTION 02148_test_function AS (x) -> (SELECT x + 1); +SELECT 02148_test_function(1); + +DROP FUNCTION IF EXISTS 02148_test_function_nested; +CREATE FUNCTION 02148_test_function_nested AS (x) -> 02148_test_function(x + 2); +SELECT 02148_test_function_nested(1); + +DROP FUNCTION 02148_test_function; +DROP FUNCTION 02148_test_function_nested; + +DROP TABLE IF EXISTS 02148_test_table; +CREATE TABLE 02148_test_table (id UInt64, value String) ENGINE=TinyLog; +INSERT INTO 02148_test_table VALUES (0, 'Value'); + +CREATE FUNCTION 02148_test_function AS () -> (SELECT * FROM 02148_test_table LIMIT 1); +SELECT 02148_test_function(); + +CREATE OR REPLACE FUNCTION 02148_test_function AS () -> (SELECT value FROM 02148_test_table LIMIT 1); +SELECT 02148_test_function(); + +DROP FUNCTION 02148_test_function; +DROP TABLE 02148_test_table; diff --git a/tests/queries/0_stateless/02149_external_schema_inference.reference 
b/tests/queries/0_stateless/02149_external_schema_inference.reference new file mode 100644 index 00000000000..875659c7fb6 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.reference @@ -0,0 +1,168 @@ +Protobuf + +a_b_c Array(Array(Array(Int32))) + +a String +b_c Array(Array(Float64)) + +x Enum8(\'FIRST\' = 0, \'SECOND\' = 1, \'TEN\' = 10, \'HUNDRED\' = 100) + +a Map(String, UInt32) + +x_y_z Array(Array(Int32)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) +pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureUnits Array(Tuple(unit String, coef Float32)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) + +location Array(Int32) +pi Float32 +uuid String +newFieldBool UInt8 +name String +gender Enum8(\'male\' = 0, \'female\' = 1) +zodiacSign Int32 +birthDate Int64 +age String +isOnline Enum8(\'offline\' = 0, \'online\' = 1) +someRatio Float64 +visitTime UInt64 +newMessage Tuple(empty Array(Tuple()), z Float32) +randomBigNumber Int64 +newFieldInt Array(Int32) +color Array(Float32) +lotteryWin UInt64 +surname String +phoneNumber UInt64 +temperature Int32 +newFieldStr String +measureUnits_unit Array(String) +measureUnits_coef Array(Float32) +nestiness_a_b_c_d UInt32 +nestiness_a_b_c_e Array(UInt32) + +uuid String +name String +surname String +gender String +birthDate String +phoneNumber String +isOnline String +visitTime String +age String +zodiacSign String +songs Array(String) +color Array(String) +hometown String +location Array(String) +pi String +lotteryWin String +someRatio String +temperature String +randomBigNumber String +measureUnits Tuple(unit Array(String), coef Array(String)) +nestiness_a_b_c Tuple(d String, e Array(String)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) +pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureunits Tuple(coef Array(Float32), unit Array(String)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) +newFieldStr String +newFieldInt Int32 +newBool UInt8 + +identifier String +modules Array(Tuple(module_id UInt32, supply UInt32, temp UInt32, nodes Array(Tuple(node_id UInt32, opening_time UInt32, closing_time UInt32, current UInt32, coords_y Float32)))) + +Capnproto + +value Enum8(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2) + +value UInt64 +list1 Array(UInt64) +list2 Array(Array(Array(UInt64))) + +lc1 String +lc2 Nullable(String) +lc3 Array(Nullable(String)) + +value UInt64 +nested Tuple(a Tuple(b UInt64, c Array(Array(UInt64))), d Array(Tuple(e Array(Array(Tuple(f UInt64, g UInt64))), h Array(Tuple(k Array(UInt64)))))) + +nested Tuple(value Array(UInt64), array 
Array(Array(UInt64)), tuple Array(Tuple(one UInt64, two UInt64))) + +a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(f UInt64))) + +nullable Nullable(UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(nullable Nullable(UInt64)) + +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +float32 Float32 +float64 Float64 +string String +fixed String +data String +date UInt16 +datetime UInt32 +datetime64 Int64 + +value UInt64 +tuple1 Tuple(one UInt64, two Tuple(three UInt64, four UInt64)) +tuple2 Tuple(nested1 Tuple(nested2 Tuple(x UInt64))) + +RawBLOB + +raw_blob String + +LineAsString + +line String + +JSONAsString + +json String diff --git a/tests/queries/0_stateless/02149_external_schema_inference.sh b/tests/queries/0_stateless/02149_external_schema_inference.sh new file mode 100755 index 00000000000..df2b9a43565 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'CapnProto', 'val1 char') settings format_schema='nonexist:Message'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist.capnp)") +CLIENT_SCHEMADIR=$CURDIR/format_schemas +SERVER_SCHEMADIR=test_02149 +mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR +cp -r $CLIENT_SCHEMADIR/* $SCHEMADIR/$SERVER_SCHEMADIR/ + +echo -e "Protobuf\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_3dim:ABC'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_of_arrays:AA'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_enum_mapping.proto:EnumMessage'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_map:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_nested_in_nested:MessageType'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:AltPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:StrPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons_syntax2:Syntax2Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_skipped_column_in_nested:UpdateMessage'" + + +echo -e "\nCapnproto\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message'" + +echo 
+$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_lists:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_low_cardinality:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_lists_and_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_table:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nullable:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_simple_types:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_tuples:Message'" + +echo -e "\nRawBLOB\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'RawBLOB')" + +echo -e "\nLineAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'LineAsString')" + +echo -e "\nJSONAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONAsString')" + + + +rm -rf ${SCHEMADIR:?}/$SERVER_SCHEMADIR +rm $DATA_FILE diff --git a/tests/queries/0_stateless/02149_issue_32487.reference b/tests/queries/0_stateless/02149_issue_32487.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02149_issue_32487.sql b/tests/queries/0_stateless/02149_issue_32487.sql new file mode 100644 index 00000000000..4e75c981774 --- /dev/null +++ b/tests/queries/0_stateless/02149_issue_32487.sql @@ -0,0 +1 @@ +SELECT topKWeightedState(2)(now(), 1) FORMAT Null; diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference new file mode 100644 index 00000000000..f46e3bee101 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -0,0 +1,170 @@ +TSV +c1 Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +c4 Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +TSVWithNames +number Nullable(String) +string Nullable(String) +array Nullable(String) +tuple Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +CSV +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c4 Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +c1 Nullable(String) +c2 Nullable(String) +42 String +String 42 +c1 Nullable(String) +c2 Nullable(String) +\N [NULL, NULL] +\N [] +CSVWithNames +a Nullable(Float64) +b Nullable(String) +c Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +d Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +JSONCompactEachRow +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +\N 
[(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 1 +JSONCompactEachRowWithNames +a Nullable(Float64) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +d Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +JSONEachRow +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +1 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42 +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +1 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32 +b Nullable(String) +c Array(Nullable(Float64)) +a Nullable(Float64) +s1 [] 1 +\N [2] 2 +\N [] \N +\N [] \N +\N [3] \N +TSKV +b Nullable(String) +c Nullable(String) +a Nullable(String) +s1 \N 1 +} [2] 2 +\N \N \N +\N \N \N +\N [3] \N +Values +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(String)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 Some string [1,2,3] (1,'2') ([1,2],[(3,'4'),(5,'6')]) +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')]) +\N Some string [10] (1,2) ([],[]) +Regexp +c1 Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +42 Some string 1 [([1, 2, 3], String 1), ([], String 1)] +2 Some string 2 [([4, 5, 6], String 2), ([], String 2)] +312 Some string 3 [([1, 2, 3], String 2), ([], String 2)] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +3 Some string 2 [([3,5,1],'String 2'),([],'String 2')] +244 Some string 3 [([],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +2 Some string 2 [([],'String 2'),([],'String 2')] +43 Some string 3 [([1,5,3],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +52 Some string 2 [([],'String 2'),([1],'String 2')] +24 Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +CustomSeparated +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +Template +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 
Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +MsgPack +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Float32) +c4 Nullable(String) +c5 Array(Array(Nullable(Int64))) +c6 Map(Int64, Array(Nullable(Int64))) +\N 0 0 Str: 0 [[0,1],[0]] {0:[0,1]} +1 \N 1 Str: 1 [[1,2],[1]] {1:[1,2]} +\N 2 2 Str: 2 [[2,3],[2]] {2:[2,3]} diff --git a/tests/queries/0_stateless/02149_schema_inference.sh b/tests/queries/0_stateless/02149_schema_inference.sh new file mode 100755 index 00000000000..1ccec240627 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.sh @@ -0,0 +1,251 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'Template', 'val1 char') settings format_template_row='nonexist'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist)") + +echo "TSV" + +echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo "TSVWithNames" + +echo -e "number\tstring\tarray\ttuple +42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSVWithNames')" + +echo "CSV" + +echo -e "\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "42,\"String\" +\"String\",42" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\N,\"[NULL, NULL]\" +\N,[]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo "CSVWithNames" + +echo -e "a,b,c,d +\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSVWithNames')" + +echo "JSONCompactEachRow" + 
+echo -e "[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo -e "[null, [[1, \"String\"], [2, null]], {\"key\" : null, \"key2\" : 24}, null] +[32, [[2, \"String 2\"], [3, \"hello\"]], {\"key3\" : 4242, \"key4\" : 2424}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo "JSONCompactEachRowWithNames" + +echo -e "[\"a\", \"b\", \"c\", \"d\"] +[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRowWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRowWithNames')" + + +echo "JSONEachRow" +echo -e '{"a" : 42.42, "b" : [[1, "String"], [2, "abcd"]], "c" : {"key" : 42, "key2" : 24}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : null, "b" : [[1, "String"], [2, null]], "c" : {"key" : null, "key2" : 24}, "d" : null} +{"a" : 32, "b" : [[2, "String 2"], [3, "hello"]], "c" : {"key3" : 4242, "key4" : 2424}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : 1, "b" : "s1", "c" : null} +{"c" : [2], "a" : 2, "b" : null} +{} +{"a" : null} +{"c" : [3], "a" : null}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + + +echo "TSKV" + +echo -e 'a=1\tb=s1\tc=\N +c=[2]\ta=2\tb=\N} + +a=\N +c=[3]\ta=\N' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')" + + +echo "Values" + +echo -e "(42.42, 'Some string', [1, 2, 3], (1, '2'), ([1, 2], [(3, '4'), (5, '6')]))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + +echo -e "(42.42, NULL, [1, NULL, 3], (1, NULL), ([1, 2], [(3, '4'), (5, '6')])), (NULL, 'Some string', [10], (1, 2), ([], []))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + + +echo "Regexp" + +REGEXP="^Line: value_1=(.+?), value_2=(.+?), value_3=(.+?)" + +echo "Line: value_1=42, value_2=Some string 1, value_3=[([1, 2, 3], String 1), ([], String 1)] +Line: value_1=2, value_2=Some string 2, value_3=[([4, 5, 6], String 2), ([], String 2)] +Line: value_1=312, value_2=Some string 3, value_3=[([1, 2, 3], String 2), ([], String 2)]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=\"[([1, 2, 3], 'String 1'), ([], 'String 1')]\" +Line: value_1=3, value_2=\"Some string 2\", value_3=\"[([3, 5, 1], 'String 2'), ([], 'String 2')]\" +Line: value_1=244, value_2=\"Some string 3\", value_3=\"[([], 'String 3'), ([], 'String 3')]\"" > $DATA_FILE 
+ +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='CSV'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='CSV'" + + +echo "Line: value_1=42, value_2='Some string 1', value_3=[([1, 2, 3], 'String 1'), ([], 'String 1')] +Line: value_1=2, value_2='Some string 2', value_3=[([], 'String 2'), ([], 'String 2')] +Line: value_1=43, value_2='Some string 3', value_3=[([1, 5, 3], 'String 3'), ([], 'String 3')]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] +Line: value_1=52, value_2=\"Some string 2\", value_3=[[[], \"String 2\"], [[1], \"String 2\"]] +Line: value_1=24, value_2=\"Some string 3\", value_3=[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" + + +echo "CustomSeparated" + +CUSTOM_SETTINGS="SETTINGS format_custom_row_before_delimiter='', format_custom_row_after_delimiter='\n', format_custom_row_between_delimiter='\n', format_custom_result_before_delimiter='\n', format_custom_result_after_delimiter='\n', format_custom_field_delimiter=''" + +echo -e " +42.42\"Some string 1\"\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\" + +42\"Some string 2\"\"[([], 'String 2'), ([], 'String 2')]\" + +\N\"Some string 3\"\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\" +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'" + +echo -e " +42.42'Some string 1'[([1, 2, 3], 'String 1'), ([1], 'String 1')] + +42'Some string 2'[([], 'String 2'), ([], 'String 2')] + +NULL'Some string 3'[([1, 2, 3], 'String 3'), ([1], 'String 3')] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'" + +echo -e " +42.42\"Some string 1\"[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] + +42\"Some string 2\"[[[], \"String 2\"], [[], \"String 2\"]] + +null\"Some string 3\"[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'" + + +echo "Template" + +echo -e " +\${data}" > $SCHEMADIR/resultset_format_02149 + +echo -e "\${column_1:CSV}\${column_2:CSV}\${column_3:CSV}" > $SCHEMADIR/row_format_02149 + +TEMPLATE_SETTINGS="SETTINGS format_template_rows_between_delimiter='\n', format_template_row='row_format_02149', format_template_resultset='resultset_format_02149'" + +echo -e " +42.42\"Some string 
1\"\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\" + +42\"Some string 2\"\"[([], 'String 2'), ([], 'String 2')]\" + +\N\"Some string 3\"\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\" +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + +echo -e "\${column_1:Quoted}\${column_2:Quoted}\${column_3:Quoted}" > $SCHEMADIR/row_format_02149 + +echo -e " +42.42'Some string 1'[([1, 2, 3], 'String 1'), ([1], 'String 1')] + +42'Some string 2'[([], 'String 2'), ([], 'String 2')] + +NULL'Some string 3'[([1, 2, 3], 'String 3'), ([1], 'String 3')] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + +echo -e "\${column_1:JSON}\${column_2:JSON}\${column_3:JSON}" > $SCHEMADIR/row_format_02149 + +echo -e " +42.42\"Some string 1\"[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] + +42\"Some string 2\"[[[], \"String 2\"], [[], \"String 2\"]] + +null\"Some string 3\"[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + + +echo "MsgPack" + +$CLICKHOUSE_CLIENT -q "select toInt32(number % 2 ? number : NULL) as int, toUInt64(number % 2 ? NULL : number) as uint, toFloat32(number) as float, concat('Str: ', toString(number)) as str, [[number, number + 1], [number]] as arr, map(number, [number, number + 1]) as map from numbers(3) format MsgPack" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" + + +rm $SCHEMADIR/resultset_format_02149 $SCHEMADIR/row_format_02149 +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference new file mode 100644 index 00000000000..dae12318ce0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference @@ -0,0 +1,40 @@ +0 Str: 0 [0,1] +1 Str: 1 [1,2] +2 Str: 2 [2,3] +3 Str: 3 [3,4] +4 Str: 4 [4,5] +5 Str: 5 [5,6] +6 Str: 6 [6,7] +7 Str: 7 [7,8] +8 Str: 8 [8,9] +9 Str: 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh new file mode 100755 index 00000000000..f00f2531dd0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir $USER_FILES_PATH/test_02149 +FILE_NAME=test_02149/data.Parquet +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +$CLICKHOUSE_CLIENT -q "select number as num, concat('Str: ', toString(number)) as str, [number, number + 1] as arr from numbers(10) format Parquet" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02149" +$CLICKHOUSE_CLIENT -q "create table test_02149 engine=File('Parquet', '$FILE_NAME')" +$CLICKHOUSE_CLIENT -q "select * from test_02149" +$CLICKHOUSE_CLIENT -q "drop table test_02149" + +$CLICKHOUSE_CLIENT -q "create table test_02149 (x UInt32, s String, a Array(UInt32)) engine=Memory" +$CLICKHOUSE_CLIENT -q "insert into test_02149 select number, toString(number), [number, number + 1] from numbers(10)" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_merge" +$CLICKHOUSE_CLIENT -q "create table test_merge engine=Merge(currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_merge" +$CLICKHOUSE_CLIENT -q "drop table test_merge" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_distributed" +$CLICKHOUSE_CLIENT -q "create table test_distributed engine=Distributed(test_shard_localhost, currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_distributed" +$CLICKHOUSE_CLIENT -q "drop table test_distributed" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_buffer" +$CLICKHOUSE_CLIENT -q "create table test_buffer engine=Buffer(currentDatabase(), 'test_02149', 16, 10, 100, 10000, 1000000, 10000000, 100000000)" +$CLICKHOUSE_CLIENT -q "select * from test_buffer" +$CLICKHOUSE_CLIENT -q "drop table test_buffer" + +rm -rf ${USER_FILES_PATH:?}/test_02149 + diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference new file mode 100644 index 00000000000..d3d2d86d696 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference @@ -0,0 +1,435 @@ +Arrow +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ArrowStream +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String 
+fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Parquet +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 Int64 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ORC +int8 Int8 +uint8 Int8 +int16 Int16 +uint16 Int16 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date32 +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(Int64) +tuple Tuple(`tuple.0` Int64, `tuple.1` String) +map Map(String, Int64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Native +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 
+uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVRawWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactEachRowWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactStringsEachRowWithNamesAndTypes +int8 Int8 
+uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +RowBinaryWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CustomSeparatedWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Avro +CustomSeparatedWithNamesAndTypes +int8 Int32 +uint8 Int32 +int16 Int32 +uint16 Int32 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +0 0 +1.2 0.7692307692307692 +date Int32 +0 +1 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(Int64) +nested Array(Array(Array(Int64))) +[0,1] [[[0],[1]]] +[1,2] [[[1],[2]]] diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh new file mode 100755 index 00000000000..d263ef63681 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh 
+. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +for format in Arrow ArrowStream Parquet ORC Native TSVWithNamesAndTypes TSVRawWithNamesAndTypes CSVWithNamesAndTypes JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRowWithNamesAndTypes RowBinaryWithNamesAndTypes CustomSeparatedWithNamesAndTypes +do + echo $format + $CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64, toDecimal32(number / 0.3, 5) as decimal32, toDecimal64(number / 0.003, 5) as decimal64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toDate(number) as date, toDate32(number) as date32 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, (number, toString(number)) as tuple, map(toString(number), number) as map from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [([number, number + 1], map('42', number)), ([], map()), ([42], map('42', 42))] as nested1, (([[number], [number + 1], []], map(number, [(number, '42'), (number + 1, '42')])), 42) as nested2 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" +done + +echo "Avro" + +echo $format +$CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toDate(number) as date from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q 
"select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, [[[number], [number + 1]]] as nested from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02150_index_hypothesis_race_long.reference b/tests/queries/0_stateless/02150_index_hypothesis_race_long.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02150_index_hypothesis_race_long.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02150_index_hypothesis_race_long.sh b/tests/queries/0_stateless/02150_index_hypothesis_race_long.sh new file mode 100755 index 00000000000..da2dcd055ea --- /dev/null +++ b/tests/queries/0_stateless/02150_index_hypothesis_race_long.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_index_hypothesis" + +$CLICKHOUSE_CLIENT -q "CREATE TABLE t_index_hypothesis (a UInt32, b UInt32, INDEX t a != b TYPE hypothesis GRANULARITY 1) ENGINE = MergeTree ORDER BY a" + +$CLICKHOUSE_CLIENT -q "INSERT INTO t_index_hypothesis SELECT number, number + 1 FROM numbers(10000000)" + +for _ in {0..30}; do + output=`$CLICKHOUSE_CLIENT -q "SELECT count() FROM t_index_hypothesis WHERE a = b"` + if [[ $output != "0" ]]; then + echo "output: $output, expected: 0" + exit 1 + fi +done + +echo OK + +$CLICKHOUSE_CLIENT -q "DROP TABLE t_index_hypothesis" diff --git a/tests/queries/0_stateless/02150_replace_regexp_all_empty_match.reference b/tests/queries/0_stateless/02150_replace_regexp_all_empty_match.reference new file mode 100644 index 00000000000..6e269c2a690 --- /dev/null +++ b/tests/queries/0_stateless/02150_replace_regexp_all_empty_match.reference @@ -0,0 +1 @@ +here: Hello, World! diff --git a/tests/queries/0_stateless/02150_replace_regexp_all_empty_match.sql b/tests/queries/0_stateless/02150_replace_regexp_all_empty_match.sql new file mode 100644 index 00000000000..a7b52a1c8b6 --- /dev/null +++ b/tests/queries/0_stateless/02150_replace_regexp_all_empty_match.sql @@ -0,0 +1 @@ +select replaceRegexpAll('Hello, World!', '^', 'here: '); diff --git a/tests/queries/0_stateless/02151_clickhouse_client_hints.reference b/tests/queries/0_stateless/02151_clickhouse_client_hints.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02151_clickhouse_client_hints.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02151_clickhouse_client_hints.sh b/tests/queries/0_stateless/02151_clickhouse_client_hints.sh new file mode 100755 index 00000000000..3e6c6cb16a5 --- /dev/null +++ b/tests/queries/0_stateless/02151_clickhouse_client_hints.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT --hardware_utilization 2>&1 | grep -q "Code: 552. DB::Exception: Unrecognized option '--hardware_utilization'. 
Maybe you meant \['--hardware-utilization'\]. (UNRECOGNIZED_ARGUMENTS)" && echo 'OK' || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02151_client_option_echo.reference b/tests/queries/0_stateless/02151_client_option_echo.reference new file mode 100644 index 00000000000..4dba04c5d41 --- /dev/null +++ b/tests/queries/0_stateless/02151_client_option_echo.reference @@ -0,0 +1,3 @@ +DROP TABLE IF EXISTS echo_test_0 +DROP TABLE IF EXISTS echo_test_2; +DROP TABLE IF EXISTS echo_test_3 diff --git a/tests/queries/0_stateless/02151_client_option_echo.sh b/tests/queries/0_stateless/02151_client_option_echo.sh new file mode 100755 index 00000000000..8056b3b5ed1 --- /dev/null +++ b/tests/queries/0_stateless/02151_client_option_echo.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +# shellcheck source=../shell_config.sh + +. "$CURDIR"/../shell_config.sh + +# single query echo on +${CLICKHOUSE_CLIENT} --echo --query="DROP TABLE IF EXISTS echo_test_0" +# single query echo off +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS echo_test_1" +# multi query echo on +${CLICKHOUSE_CLIENT} --echo --multiquery --query="DROP TABLE IF EXISTS echo_test_2;DROP TABLE IF EXISTS echo_test_3" +# multi query echo off +${CLICKHOUSE_CLIENT} --multiquery --query="DROP TABLE IF EXISTS echo_test_4;DROP TABLE IF EXISTS echo_test_5" diff --git a/tests/queries/0_stateless/02151_http_s_structure_set_eof.reference b/tests/queries/0_stateless/02151_http_s_structure_set_eof.reference new file mode 100644 index 00000000000..51de8112089 --- /dev/null +++ b/tests/queries/0_stateless/02151_http_s_structure_set_eof.reference @@ -0,0 +1,2 @@ +124 +124 diff --git a/tests/queries/0_stateless/02151_http_s_structure_set_eof.sh b/tests/queries/0_stateless/02151_http_s_structure_set_eof.sh new file mode 100755 index 00000000000..448fa9bfede --- /dev/null +++ b/tests/queries/0_stateless/02151_http_s_structure_set_eof.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +tmp_file=$(mktemp "$CURDIR/clickhouse.XXXXXX.csv") +trap 'rm $tmp_file' EXIT + +# NOTE: this file should be huge enough, so that it is impossible to upload it +# in 0.15s, see timeout command below, this will ensure, that EOF will be +# received during creating a set from externally uploaded table. +# +# Previously code there wasn't ready for EOF, and you will get one of the +# following assertions: +# +# - ./src/IO/ReadBuffer.h:58: bool DB::ReadBuffer::next(): Assertion `!hasPendingData()' failed. +# - ./src/Server/HTTP/HTMLForm.cpp:245: bool DB::HTMLForm::MultipartReadBuffer::skipToNextBoundary(): Assertion `boundary_hit' failed. +# - ./src/IO/LimitReadBuffer.cpp:17: virtual bool DB::LimitReadBuffer::nextImpl(): Assertion `position() >= in->position()' failed. +# +$CLICKHOUSE_CLIENT -q "SELECT toString(number) FROM numbers(10e6) FORMAT TSV" > "$tmp_file" + +# NOTE: Just in case check w/ input_format_parallel_parsing and w/o +timeout 0.15s ${CLICKHOUSE_CURL} -sS -F "s=@$tmp_file;" "${CLICKHOUSE_URL}&s_structure=key+Int&query=SELECT+dummy+IN+s&input_format_parallel_parsing=true" -o /dev/null +echo $? +timeout 0.15s ${CLICKHOUSE_CURL} -sS -F "s=@$tmp_file;" "${CLICKHOUSE_URL}&s_structure=key+Int&query=SELECT+dummy+IN+s&input_format_parallel_parsing=false" -o /dev/null +echo $? 
diff --git a/tests/queries/0_stateless/02151_invalid_setting_with_hints_in_query.reference b/tests/queries/0_stateless/02151_invalid_setting_with_hints_in_query.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02151_invalid_setting_with_hints_in_query.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02151_invalid_setting_with_hints_in_query.sh b/tests/queries/0_stateless/02151_invalid_setting_with_hints_in_query.sh new file mode 100755 index 00000000000..2faaa3bb1b6 --- /dev/null +++ b/tests/queries/0_stateless/02151_invalid_setting_with_hints_in_query.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL --query="SET input_format_with_names_use_headers = 1" 2>&1 | grep -qF "Code: 115. DB::Exception: Unknown setting input_format_with_names_use_headers: Maybe you meant ['input_format_with_names_use_header','input_format_with_types_use_header']. (UNKNOWN_SETTING)" && echo 'OK' || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02151_lc_prefetch.reference b/tests/queries/0_stateless/02151_lc_prefetch.reference new file mode 100644 index 00000000000..deebb18c2f7 --- /dev/null +++ b/tests/queries/0_stateless/02151_lc_prefetch.reference @@ -0,0 +1 @@ +2000000 diff --git a/tests/queries/0_stateless/02151_lc_prefetch.sql b/tests/queries/0_stateless/02151_lc_prefetch.sql new file mode 100644 index 00000000000..83d8d23264e --- /dev/null +++ b/tests/queries/0_stateless/02151_lc_prefetch.sql @@ -0,0 +1,7 @@ +-- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug +drop table if exists tab_lc; +CREATE TABLE tab_lc (x UInt64, y LowCardinality(String)) engine = MergeTree order by x; +insert into tab_lc select number, toString(number % 10) from numbers(20000000); +optimize table tab_lc; +select count() from tab_lc where y == '0' settings local_filesystem_read_prefetch=1; +drop table if exists tab_lc; diff --git a/tests/queries/0_stateless/02151_replace_regexp_all_empty_match_alternative.reference b/tests/queries/0_stateless/02151_replace_regexp_all_empty_match_alternative.reference new file mode 100644 index 00000000000..e8183f05f5d --- /dev/null +++ b/tests/queries/0_stateless/02151_replace_regexp_all_empty_match_alternative.reference @@ -0,0 +1,3 @@ +1 +1 +1 diff --git a/tests/queries/0_stateless/02151_replace_regexp_all_empty_match_alternative.sql b/tests/queries/0_stateless/02151_replace_regexp_all_empty_match_alternative.sql new file mode 100644 index 00000000000..6725fa04114 --- /dev/null +++ b/tests/queries/0_stateless/02151_replace_regexp_all_empty_match_alternative.sql @@ -0,0 +1,3 @@ +select replaceRegexpAll(',,1,,', '^[,]*|[,]*$', '') x; +select replaceRegexpAll(',,1', '^[,]*|[,]*$', '') x; +select replaceRegexpAll('1,,', '^[,]*|[,]*$', '') x; diff --git a/tests/queries/0_stateless/02152_bool_type.reference b/tests/queries/0_stateless/02152_bool_type.reference new file mode 100644 index 00000000000..a8c04f651e9 --- /dev/null +++ b/tests/queries/0_stateless/02152_bool_type.reference @@ -0,0 +1,46 @@ +true +true +true +true +true +true +true +true +true +true +true +true +true +true +false +false +false +false +false +false +false +false +false +false +false +false +false +false +Custom true +Custom true +(true) +Row 1: +────── +CAST('true', 'Bool'): Custom true +┏━━━━━━━━━━━━━━━━━━━━━━┓ +┃ CAST('true', 'Bool') ┃ +┡━━━━━━━━━━━━━━━━━━━━━━┩ +│ Custom true │ +└──────────────────────┘ 
+{"CAST('true', 'Bool')":true} +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02152_bool_type.sql b/tests/queries/0_stateless/02152_bool_type.sql new file mode 100644 index 00000000000..e9efde0795f --- /dev/null +++ b/tests/queries/0_stateless/02152_bool_type.sql @@ -0,0 +1,48 @@ +SELECT CAST('True', 'Bool'); +SELECT CAST('TrUe', 'Bool'); +SELECT CAST('true', 'Bool'); +SELECT CAST('On', 'Bool'); +SELECT CAST('on', 'Bool'); +SELECT CAST('Yes', 'Bool'); +SELECT CAST('yes', 'Bool'); +SELECT CAST('T', 'Bool'); +SELECT CAST('t', 'Bool'); +SELECT CAST('Y', 'Bool'); +SELECT CAST('y', 'Bool'); +SELECT CAST('1', 'Bool'); +SELECT CAST('enabled', 'Bool'); +SELECT CAST('enable', 'Bool'); + +SELECT CAST('False', 'Bool'); +SELECT CAST('FaLse', 'Bool'); +SELECT CAST('false', 'Bool'); +SELECT CAST('Off', 'Bool'); +SELECT CAST('off', 'Bool'); +SELECT CAST('No', 'Bool'); +SELECT CAST('no', 'Bool'); +SELECT CAST('N', 'Bool'); +SELECT CAST('n', 'Bool'); +SELECT CAST('F', 'Bool'); +SELECT CAST('f', 'Bool'); +SELECT CAST('0', 'Bool'); +SELECT CAST('disabled', 'Bool'); +SELECT CAST('disable', 'Bool'); + +SET bool_true_representation = 'Custom true'; +SET bool_false_representation = 'Custom false'; + +SELECT CAST('true', 'Bool') format CSV; +SELECT CAST('true', 'Bool') format TSV; +SELECT CAST('true', 'Bool') format Values; +SELECT ''; +SELECT CAST('true', 'Bool') format Vertical; +SELECT CAST('true', 'Bool') format Pretty; +SELECT CAST('true', 'Bool') format JSONEachRow; + +SELECT CAST(CAST(2, 'Bool'), 'UInt8'); +SELECT CAST(CAST(toUInt32(2), 'Bool'), 'UInt8'); +SELECT CAST(CAST(toInt8(2), 'Bool'), 'UInt8'); +SELECT CAST(CAST(toFloat32(2), 'Bool'), 'UInt8'); +SELECT CAST(CAST(toDecimal32(2, 2), 'Bool'), 'UInt8'); +SELECT CAST(CAST(materialize(2), 'Bool'), 'UInt8'); + diff --git a/tests/queries/0_stateless/02152_bool_type_parsing.reference b/tests/queries/0_stateless/02152_bool_type_parsing.reference new file mode 100644 index 00000000000..f9fcd324dbc --- /dev/null +++ b/tests/queries/0_stateless/02152_bool_type_parsing.reference @@ -0,0 +1,146 @@ +true +false +true +false +true +false +true +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false +true +false diff --git a/tests/queries/0_stateless/02152_bool_type_parsing.sh b/tests/queries/0_stateless/02152_bool_type_parsing.sh new file mode 100755 index 00000000000..9e9db499cf5 --- /dev/null +++ b/tests/queries/0_stateless/02152_bool_type_parsing.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02152.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +echo -e "Custom true\nCustom false\nYes\nNo\nyes\nno\ny\nY\nN\nTrue\nFalse\ntrue\nfalse\nt\nf\nT\nF\nOn\nOff\non\noff\nenable\ndisable\nenabled\ndisabled" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('$FILE_NAME', 'TSV', 'bool Bool') settings bool_true_representation='Custom true', bool_false_representation='Custom false'" +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('$FILE_NAME', 'TSV', 'bool Bool') settings bool_true_representation='Custom true', bool_false_representation='Custom false', input_format_parallel_parsing=0, max_read_buffer_size=2" + +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('$FILE_NAME', 'CSV', 'bool Bool') settings bool_true_representation='Custom true', bool_false_representation='Custom false'" +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('$FILE_NAME', 'CSV', 'bool Bool') settings bool_true_representation='Custom true', bool_false_representation='Custom false', input_format_parallel_parsing=0, max_read_buffer_size=2" + +echo -e "'Yes'\n'No'\n'yes'\n'no'\n'y'\n'Y'\n'N'\nTrue\nFalse\ntrue\nfalse\n't'\n'f'\n'T'\n'F'\n'On'\n'Off'\n'on'\n'off'\n'enable'\n'disable'\n'enabled'\n'disabled'" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('$FILE_NAME', 'CustomSeparated', 'bool Bool') settings format_custom_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('$FILE_NAME', 'CustomSeparated', 'bool Bool') settings format_custom_escaping_rule='Quoted', input_format_parallel_parsing=0, max_read_buffer_size=2" + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02152_csv_tuple.reference b/tests/queries/0_stateless/02152_csv_tuple.reference new file mode 100644 index 00000000000..9d77cd9e0a6 --- /dev/null +++ b/tests/queries/0_stateless/02152_csv_tuple.reference @@ -0,0 +1,2 @@ +1 Hello [1,2,3] (2,'World',[4,5,6]) +1 Hello [1,2,3] (2,'World',[4,5,6]) diff --git a/tests/queries/0_stateless/02152_csv_tuple.sql b/tests/queries/0_stateless/02152_csv_tuple.sql new file mode 100644 index 00000000000..6a6c029e524 --- /dev/null +++ b/tests/queries/0_stateless/02152_csv_tuple.sql @@ -0,0 +1,11 @@ +drop table if exists test_02152; +create table test_02152 (x UInt32, y String, z Array(UInt32), t Tuple(UInt32, String, Array(UInt32))) engine=File('CSV') settings format_csv_delimiter=';'; +insert into test_02152 select 1, 'Hello', [1,2,3], tuple(2, 'World', [4,5,6]); +select * from test_02152; +drop table test_02152; + +create table test_02152 (x UInt32, y String, z Array(UInt32), t Tuple(UInt32, String, Array(UInt32))) engine=File('CustomSeparated') settings format_custom_field_delimiter='', format_custom_row_before_delimiter='', format_custom_row_after_delimiter='', format_custom_escaping_rule='CSV'; +insert into test_02152 select 1, 'Hello', [1,2,3], tuple(2, 'World', [4,5,6]); +select * from test_02152; +drop table test_02152; + diff --git a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.reference b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.reference new file mode 100644 index 00000000000..1fc09c8d154 --- /dev/null +++ b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.reference @@ -0,0 +1,16 @@ +Checking input_format_parallel_parsing=false& +1 +Checking 
input_format_parallel_parsing=false&cancel_http_readonly_queries_on_client_close=1&readonly=1 +1 +Checking input_format_parallel_parsing=false&send_progress_in_http_headers=true +1 +Checking input_format_parallel_parsing=false&cancel_http_readonly_queries_on_client_close=1&readonly=1&send_progress_in_http_headers=true +1 +Checking input_format_parallel_parsing=true& +1 +Checking input_format_parallel_parsing=true&cancel_http_readonly_queries_on_client_close=1&readonly=1 +1 +Checking input_format_parallel_parsing=true&send_progress_in_http_headers=true +1 +Checking input_format_parallel_parsing=true&cancel_http_readonly_queries_on_client_close=1&readonly=1&send_progress_in_http_headers=true +1 diff --git a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh new file mode 100755 index 00000000000..2801ec16a43 --- /dev/null +++ b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Tags: no-tsan +# ^^^^^^^ +# TSan does not supports tracing. + +# Regression for proper release of Context, +# via tracking memory of external tables. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +tmp_file=$(mktemp "$CURDIR/clickhouse.XXXXXX.csv") +trap 'rm $tmp_file' EXIT + +$CLICKHOUSE_CLIENT -q "SELECT toString(number) FROM numbers(1e6) FORMAT TSV" > "$tmp_file" + +function run_and_check() +{ + local query_id + query_id="$(${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- <<<'SELECT generateUUIDv4()')" + + echo "Checking $*" + + # Run query with external table (implicit StorageMemory user) + $CLICKHOUSE_CURL -sS -F "s=@$tmp_file;" "$CLICKHOUSE_URL&s_structure=key+Int&query=SELECT+count()+FROM+s&memory_profiler_sample_probability=1&query_id=$query_id&$*" -o /dev/null + + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- <<<'SYSTEM FLUSH LOGS' + + # Check that temporary table had been destroyed. 
+ ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&allow_introspection_functions=1" --data-binary @- <<<" + WITH arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS sym + SELECT count()>0 FROM system.trace_log + WHERE + sym LIKE '%DB::StorageMemory::drop%\n%TemporaryTableHolder::~TemporaryTableHolder%' AND + query_id = '$query_id' + " +} + +for input_format_parallel_parsing in false true; do + query_args_variants=( + "" + "cancel_http_readonly_queries_on_client_close=1&readonly=1" + "send_progress_in_http_headers=true" + # nested progress callback + "cancel_http_readonly_queries_on_client_close=1&readonly=1&send_progress_in_http_headers=true" + ) + for query_args in "${query_args_variants[@]}"; do + run_and_check "input_format_parallel_parsing=$input_format_parallel_parsing&$query_args" + done +done diff --git a/tests/queries/0_stateless/02152_invalid_setting_with_hints_in_http_request.reference b/tests/queries/0_stateless/02152_invalid_setting_with_hints_in_http_request.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02152_invalid_setting_with_hints_in_http_request.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02152_invalid_setting_with_hints_in_http_request.sh b/tests/queries/0_stateless/02152_invalid_setting_with_hints_in_http_request.sh new file mode 100755 index 00000000000..1fbf747da4f --- /dev/null +++ b/tests/queries/0_stateless/02152_invalid_setting_with_hints_in_http_request.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&input_format_with_names_use_headers=1" -d 'SELECT 1' 2>&1 | grep -q "Code: 115.*Maybe you meant \['input_format_with_names_use_header','input_format_with_types_use_header'\]. 
(UNKNOWN_SETTING)" && echo 'OK' || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02152_short_circuit_throw_if.reference b/tests/queries/0_stateless/02152_short_circuit_throw_if.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/02152_short_circuit_throw_if.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/02152_short_circuit_throw_if.sql b/tests/queries/0_stateless/02152_short_circuit_throw_if.sql new file mode 100644 index 00000000000..3fdc3cc48c8 --- /dev/null +++ b/tests/queries/0_stateless/02152_short_circuit_throw_if.sql @@ -0,0 +1,2 @@ +SELECT if(1, 0, throwIf(1, 'Executing FALSE branch')); +SELECT if(empty(''), 0, throwIf(1, 'Executing FALSE branch')); diff --git a/tests/queries/0_stateless/02153_clickhouse_local_profile_info.reference b/tests/queries/0_stateless/02153_clickhouse_local_profile_info.reference new file mode 100644 index 00000000000..2e1b607ac04 --- /dev/null +++ b/tests/queries/0_stateless/02153_clickhouse_local_profile_info.reference @@ -0,0 +1,32 @@ +{ + "meta": + [ + { + "name": "count()", + "type": "UInt64" + }, + { + "name": "n", + "type": "UInt8" + } + ], + + "data": + [ + { + "count()": "1", + "n": 1 + } + ], + + "totals": + { + "count()": "3", + "n": 0 + }, + + "rows": 1, + + "rows_before_limit_at_least": 3, + + "statistics": diff --git a/tests/queries/0_stateless/02153_clickhouse_local_profile_info.sh b/tests/queries/0_stateless/02153_clickhouse_local_profile_info.sh new file mode 100755 index 00000000000..65754d390fa --- /dev/null +++ b/tests/queries/0_stateless/02153_clickhouse_local_profile_info.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_LOCAL} --query "SELECT count(), arrayJoin([1, 2, 3]) AS n GROUP BY n WITH TOTALS ORDER BY n LIMIT 1 FORMAT JSON;" 2>&1 | head -32 + diff --git a/tests/queries/0_stateless/02153_native_bounds_check.reference b/tests/queries/0_stateless/02153_native_bounds_check.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02153_native_bounds_check.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02153_native_bounds_check.sh b/tests/queries/0_stateless/02153_native_bounds_check.sh new file mode 100755 index 00000000000..a3475ddacae --- /dev/null +++ b/tests/queries/0_stateless/02153_native_bounds_check.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Should correctly handle error. 
+ +${CLICKHOUSE_LOCAL} --query "SELECT toString(number) AS a, toString(number) AS a FROM numbers(10)" --output-format Native | + ${CLICKHOUSE_LOCAL} --query "SELECT * FROM table" --input-format Native --structure 'a LowCardinality(String)' 2>&1 | + grep -c -F Exception diff --git a/tests/queries/0_stateless/02154_bitmap_contains.reference b/tests/queries/0_stateless/02154_bitmap_contains.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/02154_bitmap_contains.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/02154_bitmap_contains.sql b/tests/queries/0_stateless/02154_bitmap_contains.sql new file mode 100644 index 00000000000..3235e81e2bb --- /dev/null +++ b/tests/queries/0_stateless/02154_bitmap_contains.sql @@ -0,0 +1 @@ +select bitmapContains(bitmapBuild([9]), 964291337) diff --git a/tests/queries/0_stateless/02155_create_table_w_timezone.reference b/tests/queries/0_stateless/02155_create_table_w_timezone.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02155_create_table_w_timezone.sql b/tests/queries/0_stateless/02155_create_table_w_timezone.sql new file mode 100644 index 00000000000..0b72122ce39 --- /dev/null +++ b/tests/queries/0_stateless/02155_create_table_w_timezone.sql @@ -0,0 +1,8 @@ +create table t02155_t64_tz ( a DateTime64(9, America/Chicago)) Engine = Memory; -- { clientError 62 } +create table t02155_t_tz ( a DateTime(America/Chicago)) Engine = Memory; -- { clientError 62 } + +create table t02155_t64_tz ( a DateTime64(9, 'America/Chicago')) Engine = Memory; +create table t02155_t_tz ( a DateTime('America/Chicago')) Engine = Memory; + +drop table t02155_t64_tz; +drop table t02155_t_tz; diff --git a/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference new file mode 100644 index 00000000000..db750f36364 --- /dev/null +++ b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference @@ -0,0 +1,62 @@ +input_format_null_as_default = 1 +0 \\asdf 2000-01-01 +1 x\\x\\ 2000-01-01 +2 x\\x 2000-01-01 +3 x\\ 2000-01-01 +4 x\\ 2000-01-01 +5 \\x 2000-01-01 +6 2000-01-01 +7 \\r\\n 2000-01-01 +8 \\\\r\\\\n 2000-01-01 +9 x\\\\ 2000-01-01 +10 \\asdf 2000-01-01 +11 x\\x\\ 2000-01-01 +12 x\\x 2000-01-01 +13 x\\ 2000-01-01 +14 x\\ 2000-01-01 +15 \\x 2000-01-01 +16 \\N 2000-01-01 +17 \\r\\n 2000-01-01 +18 \\\\r\\\\n 2000-01-01 +19 x\\\\ 2000-01-01 +20 \\asdf 2000-01-01 +21 x\\x\\ 2000-01-01 +22 x\\x 2000-01-01 +23 x\\ 2000-01-01 +24 x\\ 2000-01-01 +25 \\x 2000-01-01 +26 \\N 2000-01-01 +27 \\r\\n 2000-01-01 +28 \\\\r\\\\n 2000-01-01 +29 x\\\\ 2000-01-01 +input_format_null_as_default = 0 +0 \\asdf 2000-01-01 +1 x\\x\\ 2000-01-01 +2 x\\x 2000-01-01 +3 x\\ 2000-01-01 +4 x\\ 2000-01-01 +5 \\x 2000-01-01 +6 \\N 2000-01-01 +7 \\r\\n 2000-01-01 +8 \\\\r\\\\n 2000-01-01 +9 x\\\\ 2000-01-01 +10 \\asdf 2000-01-01 +11 x\\x\\ 2000-01-01 +12 x\\x 2000-01-01 +13 x\\ 2000-01-01 +14 x\\ 2000-01-01 +15 \\x 2000-01-01 +16 \\N 2000-01-01 +17 \\r\\n 2000-01-01 +18 \\\\r\\\\n 2000-01-01 +19 x\\\\ 2000-01-01 +20 \\asdf 2000-01-01 +21 x\\x\\ 2000-01-01 +22 x\\x 2000-01-01 +23 x\\ 2000-01-01 +24 x\\ 2000-01-01 +25 \\x 2000-01-01 +26 \\N 2000-01-01 +27 \\r\\n 2000-01-01 +28 \\\\r\\\\n 2000-01-01 +29 x\\\\ 2000-01-01 diff --git a/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh new file mode 100755 index 
00000000000..ab2577e6138 --- /dev/null +++ b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test_02155_csv" + +${CLICKHOUSE_CLIENT} --query="create table test_02155_csv (A Int64, S String, D Date) Engine=Memory;" + + +echo "input_format_null_as_default = 1" +cat $CUR_DIR/data_csv/csv_with_slash.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_02155_csv FORMAT CSV SETTINGS input_format_null_as_default = 1" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM test_02155_csv" + +${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test_02155_csv" + +echo "input_format_null_as_default = 0" +cat $CUR_DIR/data_csv/csv_with_slash.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_02155_csv FORMAT CSV SETTINGS input_format_null_as_default = 0" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM test_02155_csv" + + +${CLICKHOUSE_CLIENT} --query="DROP TABLE test_02155_csv" + diff --git a/tests/queries/0_stateless/02155_dictionary_comment.reference b/tests/queries/0_stateless/02155_dictionary_comment.reference new file mode 100644 index 00000000000..69b871a6925 --- /dev/null +++ b/tests/queries/0_stateless/02155_dictionary_comment.reference @@ -0,0 +1,11 @@ +02155_test_dictionary +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_0 +0 Value +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_1 +02155_test_dictionary 02155_test_dictionary_comment_1 +0 Value +02155_test_dictionary_view 02155_test_dictionary_view_comment_0 +02155_test_dictionary_view 02155_test_dictionary_view_comment_0 diff --git a/tests/queries/0_stateless/02155_dictionary_comment.sql b/tests/queries/0_stateless/02155_dictionary_comment.sql new file mode 100644 index 00000000000..e31d9d28366 --- /dev/null +++ b/tests/queries/0_stateless/02155_dictionary_comment.sql @@ -0,0 +1,53 @@ +DROP TABLE IF EXISTS 02155_test_table; +CREATE TABLE 02155_test_table +( + id UInt64, + value String +) ENGINE=TinyLog; + +INSERT INTO 02155_test_table VALUES (0, 'Value'); + +DROP DICTIONARY IF EXISTS 02155_test_dictionary; +CREATE DICTIONARY 02155_test_dictionary +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02155_test_table')) +LAYOUT(DIRECT()); + +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +ALTER TABLE 02155_test_dictionary COMMENT COLUMN value 'value_column'; --{serverError 48} + +ALTER TABLE 02155_test_dictionary MODIFY COMMENT '02155_test_dictionary_comment_0'; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +SELECT * FROM 02155_test_dictionary; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +ALTER TABLE 02155_test_dictionary MODIFY COMMENT '02155_test_dictionary_comment_1'; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT 
name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +DROP TABLE IF EXISTS 02155_test_dictionary_view; +CREATE TABLE 02155_test_dictionary_view +( + id UInt64, + value String +) ENGINE=Dictionary(concat(currentDatabase(), '.02155_test_dictionary')); + +SELECT * FROM 02155_test_dictionary_view; + +ALTER TABLE 02155_test_dictionary_view COMMENT COLUMN value 'value_column'; --{serverError 48} + +ALTER TABLE 02155_test_dictionary_view MODIFY COMMENT '02155_test_dictionary_view_comment_0'; +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary_view' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary_view' AND database == currentDatabase(); + +DROP TABLE 02155_test_dictionary_view; +DROP TABLE 02155_test_table; +DROP DICTIONARY 02155_test_dictionary; diff --git a/tests/queries/0_stateless/02156_async_insert_query_log.reference b/tests/queries/0_stateless/02156_async_insert_query_log.reference new file mode 100644 index 00000000000..404dbfe753d --- /dev/null +++ b/tests/queries/0_stateless/02156_async_insert_query_log.reference @@ -0,0 +1,4 @@ +1 a +2 b +INSERT INTO async_inserts_2156 VALUES 1 Insert 1 0 +INSERT INTO async_inserts_2156 VALUES 1 Insert 1 diff --git a/tests/queries/0_stateless/02156_async_insert_query_log.sh b/tests/queries/0_stateless/02156_async_insert_query_log.sh new file mode 100755 index 00000000000..d7177fbe70c --- /dev/null +++ b/tests/queries/0_stateless/02156_async_insert_query_log.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS async_inserts_2156" +${CLICKHOUSE_CLIENT} -q "CREATE TABLE async_inserts_2156 (id UInt32, s String) ENGINE = Memory" + +${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=0" -d "INSERT INTO async_inserts_2156 VALUES (1, 'a')" +${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1" -d "INSERT INTO async_inserts_2156 VALUES (2, 'b')" + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM async_inserts_2156 ORDER BY id" + +${CLICKHOUSE_CLIENT} -q "SYSTEM FLUSH LOGS" + +${CLICKHOUSE_CLIENT} -q "SELECT query, arrayExists(x -> x LIKE '%async_inserts_2156', tables), \ + query_kind, Settings['async_insert'], Settings['wait_for_async_insert'] FROM system.query_log \ + WHERE event_date >= yesterday() AND current_database = '$CLICKHOUSE_DATABASE' \ + AND query ILIKE 'INSERT INTO async_inserts_2156 VALUES%' AND type = 'QueryFinish' \ + ORDER BY query_start_time_microseconds" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE async_inserts_2156" diff --git a/tests/queries/0_stateless/data_avro/nested_complex_incorrect_data.avro b/tests/queries/0_stateless/data_avro/nested_complex_incorrect_data.avro new file mode 100755 index 00000000000..9feb3c10486 Binary files /dev/null and b/tests/queries/0_stateless/data_avro/nested_complex_incorrect_data.avro differ diff --git a/tests/queries/0_stateless/data_csv/csv_with_slash.csv b/tests/queries/0_stateless/data_csv/csv_with_slash.csv new file mode 100644 index 00000000000..0f2c166faa8 --- /dev/null +++ b/tests/queries/0_stateless/data_csv/csv_with_slash.csv @@ -0,0 +1,30 @@ +0,\asdf,2000-01-01 +1,x\x\,2000-01-01 +2,x\x,2000-01-01 +3,x\,2000-01-01 +4,x\,2000-01-01 +5,\x,2000-01-01 +6,\N,2000-01-01 +7,\r\n,2000-01-01 +8,\\r\\n,2000-01-01 +9,x\\,2000-01-01 +10,'\asdf',2000-01-01 
+11,'x\x\',2000-01-01 +12,'x\x',2000-01-01 +13,'x\',2000-01-01 +14,'x\',2000-01-01 +15,'\x',2000-01-01 +16,'\N',2000-01-01 +17,'\r\n',2000-01-01 +18,"\\r\\n",2000-01-01 +19,"x\\",2000-01-01 +20,"\asdf",2000-01-01 +21,"x\x\",2000-01-01 +22,"x\x",2000-01-01 +23,"x\",2000-01-01 +24,"x\",2000-01-01 +25,"\x",2000-01-01 +26,"\N",2000-01-01 +27,"\r\n",2000-01-01 +28,"\\r\\n",2000-01-01 +29,"x\\",2000-01-01 diff --git a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto index ba558dbbadb..048a689d021 100644 --- a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto +++ b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -message Message +message EnumMessage { enum Enum { @@ -10,4 +10,4 @@ message Message HUNDRED = 100; }; Enum x = 1; -}; \ No newline at end of file +}; diff --git a/tests/queries/0_stateless/helpers/02112_clean.sh b/tests/queries/0_stateless/helpers/02112_clean.sh index 910c0709955..95af0cede9c 100755 --- a/tests/queries/0_stateless/helpers/02112_clean.sh +++ b/tests/queries/0_stateless/helpers/02112_clean.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash -FILE=${CURDIR}/file_02112 -if [ -f $FILE ]; then - rm $FILE -fi +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +FILE=${CURDIR}/../file_02112 +rm "$FILE" diff --git a/tests/queries/0_stateless/helpers/02112_prepare.sh b/tests/queries/0_stateless/helpers/02112_prepare.sh index 1f371789f86..c2791b01140 100755 --- a/tests/queries/0_stateless/helpers/02112_prepare.sh +++ b/tests/queries/0_stateless/helpers/02112_prepare.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -FILE=${CURDIR}/file_02112 -if [ -f $FILE ]; then - rm $FILE -fi -echo "drop table if exists t;create table t(i Int32) engine=Memory; insert into t select 1" >> $FILE +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +FILE=${CURDIR}/../file_02112 +echo "drop table if exists t;create table t(i Int32) engine=Memory; insert into t select 1" > "$FILE" diff --git a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh index ba1245d9679..d025dae5b2e 100755 --- a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh +++ b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh @@ -72,7 +72,7 @@ do if [[ "$expected" != "$actual" ]]; then FAILED+=("$TESTNAME") - echo "Failed! ❌ " + echo "Failed! ❌" echo "Plain:" cat $TESTNAME_RESULT echo "Distributed:" diff --git a/utils/check-style/check-style b/utils/check-style/check-style index c65099f2582..22b5faa0fcb 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -184,7 +184,6 @@ tables_with_database_column=( tests_with_database_column=( $( find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | grep -vP $EXCLUDE_DIRS | - grep -v -x -e $ROOT_PATH/tests/queries/query_test.py | xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") | cut -d: -f1 | sort -u ) ) for test_case in "${tests_with_database_column[@]}"; do @@ -299,6 +298,20 @@ for src in "${sources_with_std_cerr_cout[@]}"; do fi done +# Queries with event_date should have yesterday() not today() +# +# NOTE: it is not that accuate, but at least something. 
+tests_with_event_time_date=( $( + find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | + grep -vP $EXCLUDE_DIRS | + xargs grep --with-filename -e event_time -e event_date | cut -d: -f1 | sort -u +) ) +for test_case in "${tests_with_event_time_date[@]}"; do + cat "$test_case" | tr '\n' ' ' | grep -q -i -e 'WHERE.*event_date[ ]*=[ ]*today()' -e 'WHERE.*event_date[ ]*=[ ]*today()' && { + echo "event_time/event_date should be filtered using >=yesterday() in $test_case (to avoid flakiness)" + } +done + # Conflict markers find $ROOT_PATH/{src,base,programs,utils,tests,docs,website,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -P '^(<<<<<<<|=======|>>>>>>>)$' | grep -P '.' && echo "Conflict markers are found in files" diff --git a/utils/check-style/codespell-ignore-words.list b/utils/check-style/codespell-ignore-words.list index 200b55d112d..d3a7586647c 100644 --- a/utils/check-style/codespell-ignore-words.list +++ b/utils/check-style/codespell-ignore-words.list @@ -6,7 +6,7 @@ nd ect pullrequest pullrequests -thenn ths offsett numer +ue diff --git a/utils/ci/install-libraries.sh b/utils/ci/install-libraries.sh index 7615375fbc1..3c26e3b09b1 100755 --- a/utils/ci/install-libraries.sh +++ b/utils/ci/install-libraries.sh @@ -4,4 +4,3 @@ set -e -x source default-config ./install-os-packages.sh libicu-dev -./install-os-packages.sh libreadline-dev diff --git a/utils/ci/install-os-packages.sh b/utils/ci/install-os-packages.sh index 38fa6dbba15..b4b0c74f30c 100755 --- a/utils/ci/install-os-packages.sh +++ b/utils/ci/install-os-packages.sh @@ -46,9 +46,6 @@ case $PACKAGE_MANAGER in libicu-dev) $SUDO apt-get install -y libicu-dev ;; - libreadline-dev) - $SUDO apt-get install -y libreadline-dev - ;; llvm-libs*) $SUDO apt-get install -y ${WHAT/llvm-libs/liblld}-dev ${WHAT/llvm-libs/libclang}-dev ;; @@ -91,9 +88,6 @@ case $PACKAGE_MANAGER in libicu-dev) $SUDO yum install -y libicu-devel ;; - libreadline-dev) - $SUDO yum install -y readline-devel - ;; *) echo "Unknown package"; exit 1; ;; @@ -130,9 +124,6 @@ case $PACKAGE_MANAGER in libicu-dev) $SUDO pkg install -y icu ;; - libreadline-dev) - $SUDO pkg install -y readline - ;; *) echo "Unknown package"; exit 1; ;; diff --git a/utils/ci/jobs/quick-build/README.md b/utils/ci/jobs/quick-build/README.md deleted file mode 100644 index 803acae0f93..00000000000 --- a/utils/ci/jobs/quick-build/README.md +++ /dev/null @@ -1,5 +0,0 @@ -## Build with debug mode and without many libraries - -This job is intended as first check that build is not broken on wide variety of platforms. - -Results of this build are not intended for production usage. diff --git a/utils/ci/jobs/quick-build/run.sh b/utils/ci/jobs/quick-build/run.sh deleted file mode 100755 index af977d14465..00000000000 --- a/utils/ci/jobs/quick-build/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -set -e -x - -# How to run: -# From "ci" directory: -# jobs/quick-build/run.sh -# or: -# ./run-with-docker.sh ubuntu:bionic jobs/quick-build/run.sh - -cd "$(dirname $0)"/../.. - -. 
default-config - -SOURCES_METHOD=local -COMPILER=clang -COMPILER_INSTALL_METHOD=packages -COMPILER_PACKAGE_VERSION=6.0 -BUILD_METHOD=normal -BUILD_TARGETS=clickhouse -BUILD_TYPE=Debug -ENABLE_EMBEDDED_COMPILER=0 - -CMAKE_FLAGS="-D CMAKE_C_FLAGS_ADD=-g0 -D CMAKE_CXX_FLAGS_ADD=-g0 -D ENABLE_JEMALLOC=0 -D ENABLE_CAPNP=0 -D ENABLE_RDKAFKA=0 -D ENABLE_UNWIND=0 -D ENABLE_ICU=0 -D ENABLE_POCO_MONGODB=0 -D ENABLE_POCO_REDIS=0 -D ENABLE_POCO_NETSSL=0 -D ENABLE_ODBC=0 -D ENABLE_MYSQL=0 -D ENABLE_SSL=0 -D ENABLE_POCO_NETSSL=0 -D ENABLE_CASSANDRA=0 -D ENABLE_LDAP=0" - -[[ $(uname) == "FreeBSD" ]] && COMPILER_PACKAGE_VERSION=devel && export COMPILER_PATH=/usr/local/bin - -. get-sources.sh -. prepare-toolchain.sh -. install-libraries.sh -. build-normal.sh diff --git a/utils/ci/vagrant-freebsd/.gitignore b/utils/ci/vagrant-freebsd/.gitignore deleted file mode 100644 index 8000dd9db47..00000000000 --- a/utils/ci/vagrant-freebsd/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.vagrant diff --git a/utils/ci/vagrant-freebsd/Vagrantfile b/utils/ci/vagrant-freebsd/Vagrantfile deleted file mode 100644 index c01ae5fa6e2..00000000000 --- a/utils/ci/vagrant-freebsd/Vagrantfile +++ /dev/null @@ -1,3 +0,0 @@ -Vagrant.configure("2") do |config| - config.vm.box = "generic/freebsd11" -end diff --git a/utils/clickhouse-diagnostics/README.md b/utils/clickhouse-diagnostics/README.md new file mode 100644 index 00000000000..991efefdf5a --- /dev/null +++ b/utils/clickhouse-diagnostics/README.md @@ -0,0 +1,2657 @@ +## Installation + +``` +python3 -m pip install -r requirements.txt +``` + +## Usage + +``` +./clickhouse-diagnostics +``` + +Example output: + +### Diagnostics data for host clickhouse01.test_net_3697 +Version: **21.11.8.4** +Timestamp: **2021-12-25 15:34:02** +Uptime: **13 minutes and 51 seconds** +#### ClickHouse configuration +**result** +```XML + + + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + 1 + + 8123 + 9000 + 9004 + 9005 + 9009 + 4096 + 3 + + false + /path/to/ssl_cert_file + /path/to/ssl_key_file + false + /path/to/ssl_ca_cert_file + deflate + medium + -1 + -1 + false + + + + /etc/clickhouse-server/server.crt + /etc/clickhouse-server/server.key + + none + true + true + sslv2,sslv3 + true + + + true + true + sslv2,sslv3,tlsv1,tlsv1_1 + true + + RejectCertificateHandler + + /etc/clickhouse-server/server.crt + /etc/clickhouse-server/server.key + /etc/clickhouse-server/allCAs.pem + + + 100 + 0 + 10000 + 0.9 + 4194304 + 0 + 8589934592 + 5368709120 + 1000 + 134217728 + 10000 + /var/lib/clickhouse/ + /var/lib/clickhouse/tmp/ + /var/lib/clickhouse/user_files/ + + + + users.xml + + + /var/lib/clickhouse/access/ + + + default + + default + true + false + + + + + localhost + 9000 + + + + + + + localhost + 9000 + + + + + localhost + 9000 + + + + + + + 127.0.0.1 + 9000 + + + + + 127.0.0.2 + 9000 + + + + + + true + + 127.0.0.1 + 9000 + + + + true + + 127.0.0.2 + 9000 + + + + + + + localhost + 9440 + 1 + + + + + + + localhost + 9000 + + + + + localhost + 1 + + + + + + + clickhouse01.test_net_3697 + 9000 + + + + + 3600 + 3600 + 60 + + system + query_log
+ toYYYYMM(event_date) + 7500 +
+ + system + trace_log
+ toYYYYMM(event_date) + 7500 +
+ + system + query_thread_log
+ toYYYYMM(event_date) + 7500 +
+ + system + query_views_log
+ toYYYYMM(event_date) + 7500 +
+ + system + part_log
+ toYYYYMM(event_date) + 7500 +
+ + system + metric_log
+ 7500 + 1000 +
+ + system + asynchronous_metric_log
+ 7000 +
+ + engine MergeTree + partition by toYYYYMM(finish_date) + order by (finish_date, finish_time_us, trace_id) + system + opentelemetry_span_log
+ 7500 +
+ + system + crash_log
+ + 1000 +
+ + system + session_log
+ toYYYYMM(event_date) + 7500 +
+ + *_dictionary.xml + *_function.xml + + + /clickhouse/task_queue/ddl + + + + click_cost + any + + 0 + 3600 + + + 86400 + 60 + + + + max + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + + /var/lib/clickhouse/format_schemas/ + + + hide encrypt/decrypt arguments + ((?:aes_)?(?:encrypt|decrypt)(?:_mysql)?)\s*\(\s*(?:'(?:\\'|.)+'|.*?)\s*\) + \1(???) + + + + false + false + https://6f33034cfe684dd7a3ab9875e57b1c8d@o388870.ingest.sentry.io/5226277 + + 0.0.0.0 + 8443 + 9440 + + + zookeeper01.test_net_3697 + 2281 + 1 + + 3000 + /clickhouse01 + ***** + + + clickhouse01 + shard1 + + 0 + + + + /hdd1/ + + + /hdd2/ + + + s3 + http://minio01:9000/cloud-storage-01/data/ + bB5vT2M8yaRv9J14SnAP + ***** + true + + + + + +
+ default +
+ + hdd1 + + + hdd2 + +
+ 0.0 +
+ + +
+ s3 +
+ + default + +
+ 0.0 +
+ + +
+ default +
+ + s3 + +
+ 0.0 +
+
+
+``` +#### Access configuration +**query** +```sql +SHOW ACCESS +``` +**result** +``` +CREATE USER default IDENTIFIED WITH plaintext_password SETTINGS PROFILE default +CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = 'random' +CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1 +CREATE QUOTA default KEYED BY user_name FOR INTERVAL 1 hour TRACKING ONLY TO default +GRANT ALL ON *.* TO default WITH GRANT OPTION +``` +#### Quotas +**query** +```sql +SHOW QUOTA +``` +**result** +``` +Row 1: +────── +quota_name: default +quota_key: default +start_time: 2021-12-25 15:00:00 +end_time: 2021-12-25 16:00:00 +duration: 3600 +queries: 49 +max_queries: ᴺᵁᴸᴸ +query_selects: 49 +max_query_selects: ᴺᵁᴸᴸ +query_inserts: 0 +max_query_inserts: ᴺᵁᴸᴸ +errors: 6 +max_errors: ᴺᵁᴸᴸ +result_rows: 607 +max_result_rows: ᴺᵁᴸᴸ +result_bytes: 237632 +max_result_bytes: ᴺᵁᴸᴸ +read_rows: 1256 +max_read_rows: ᴺᵁᴸᴸ +read_bytes: 778936 +max_read_bytes: ᴺᵁᴸᴸ +execution_time: 0 +max_execution_time: ᴺᵁᴸᴸ +``` +#### Schema +##### Database engines +**query** +```sql +SELECT + engine, + count() "count" +FROM system.databases +GROUP BY engine +``` +**result** +``` +┌─engine─┬─count─┐ +│ Memory │ 2 │ +│ Atomic │ 2 │ +└────────┴───────┘ +``` +##### Databases (top 10 by size) +**query** +```sql +SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 +``` +**result** +``` +┌─name───────────────┬─engine─┬─tables─┬─partitions─┬─parts─┬─disk_size──┐ +│ system │ Atomic │ 6 │ 6 │ 22 │ 716.29 KiB │ +│ INFORMATION_SCHEMA │ Memory │ 0 │ 0 │ 0 │ 0.00 B │ +│ default │ Atomic │ 0 │ 0 │ 0 │ 0.00 B │ +│ information_schema │ Memory │ 0 │ 0 │ 0 │ 0.00 B │ +└────────────────────┴────────┴────────┴────────────┴───────┴────────────┘ +``` +##### Table engines +**query** +```sql +SELECT + engine, + count() "count" +FROM system.tables +WHERE database != 'system' +GROUP BY engine +``` +**result** +``` +┌─engine─┬─count─┐ +│ View │ 8 │ +└────────┴───────┘ +``` +##### Dictionaries +**query** +```sql +SELECT + source, + type, + status, + count() "count" +FROM system.dictionaries +GROUP BY source, type, status +ORDER BY status DESC, source +``` +**result** +``` + +``` +#### Replication +##### Replicated tables (top 10 by absolute delay) +**query** +```sql +SELECT + database, + table, + is_leader, + is_readonly, + absolute_delay, + queue_size, + inserts_in_queue, + merges_in_queue +FROM system.replicas +ORDER BY absolute_delay DESC +LIMIT 10 +``` +**result** +``` + +``` +##### Replication queue (top 20 oldest tasks) +**query** +```sql +SELECT + database, + table, + replica_name, + position, + node_name, + type, + source_replica, + parts_to_merge, + new_part_name, + create_time, + required_quorum, + is_detach, + is_currently_executing, + num_tries, + last_attempt_time, + last_exception, + concat('time: ', toString(last_postpone_time), ', number: ', toString(num_postponed), ', reason: ', postpone_reason) postpone +FROM system.replication_queue +ORDER BY create_time ASC +LIMIT 20 +``` +**result** +``` + +``` +##### Replicated fetches +**query** +```sql +SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + 
partition_id, + result_part_name, + result_part_path, + total_size_bytes_compressed, + bytes_read_compressed, + source_replica_path, + source_replica_hostname, + source_replica_port, + interserver_scheme, + to_detached, + thread_id +FROM system.replicated_fetches +``` +**result** +``` + +``` +#### Top 10 tables by max parts per partition +**query** +```sql +SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 +``` +**result** +``` +┌─database─┬─table───────────────────┬─partitions─┬─parts─┬─max_parts_per_partition─┐ +│ system │ metric_log │ 1 │ 5 │ 5 │ +│ system │ trace_log │ 1 │ 5 │ 5 │ +│ system │ query_thread_log │ 1 │ 3 │ 3 │ +│ system │ query_log │ 1 │ 3 │ 3 │ +│ system │ asynchronous_metric_log │ 1 │ 3 │ 3 │ +│ system │ session_log │ 1 │ 3 │ 3 │ +└──────────┴─────────────────────────┴────────────┴───────┴─────────────────────────┘ +``` +#### Merges in progress +**query** +```sql +SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + is_mutation, + partition_id, +result_part_path, + source_part_paths, +num_parts, + formatReadableSize(total_size_bytes_compressed) "total_size_compressed", + formatReadableSize(bytes_read_uncompressed) "read_uncompressed", + formatReadableSize(bytes_written_uncompressed) "written_uncompressed", + columns_written, +formatReadableSize(memory_usage) "memory_usage", + thread_id +FROM system.merges +``` +**result** +``` + +``` +#### Mutations in progress +**query** +```sql +SELECT + database, + table, + mutation_id, + command, + create_time, +parts_to_do_names, +parts_to_do, + is_done, + latest_failed_part, + latest_fail_time, + latest_fail_reason +FROM system.mutations +WHERE NOT is_done +ORDER BY create_time DESC +``` +**result** +``` + +``` +#### Recent data parts (modification time within last 3 minutes) +**query** +```sql +SELECT + database, + table, + engine, + partition_id, + name, +part_type, +active, + level, +disk_name, +path, + marks, + rows, + bytes_on_disk, + data_compressed_bytes, + data_uncompressed_bytes, + marks_bytes, + modification_time, + remove_time, + refcount, + is_frozen, + min_date, + max_date, + min_time, + max_time, + min_block_number, + max_block_number +FROM system.parts +WHERE modification_time > now() - INTERVAL 3 MINUTE +ORDER BY modification_time DESC +``` +**result** +``` +Row 1: +────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_110_110_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_110_110_0/ +marks: 2 +rows: 8 +bytes_on_disk: 21752 +data_compressed_bytes: 11699 +data_uncompressed_bytes: 19952 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:59 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 110 +max_block_number: 110 + +Row 2: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_118_118_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: 
/var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_118_118_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 10856 +data_compressed_bytes: 10656 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:58 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 118 +max_block_number: 118 + +Row 3: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_117_117_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_117_117_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11028 +data_compressed_bytes: 10828 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:51 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 117 +max_block_number: 117 + +Row 4: +────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_109_109_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_109_109_0/ +marks: 2 +rows: 7 +bytes_on_disk: 21802 +data_compressed_bytes: 11749 +data_uncompressed_bytes: 17458 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:51 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 109 +max_block_number: 109 + +Row 5: +────── +database: system +table: trace_log +engine: MergeTree +partition_id: 202112 +name: 202112_53_53_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/c0b/c0bc3be3-22d7-45a3-80bc-3be322d7b5a3/202112_53_53_0/ +marks: 2 +rows: 6 +bytes_on_disk: 1057 +data_compressed_bytes: 700 +data_uncompressed_bytes: 1894 +marks_bytes: 336 +modification_time: 2021-12-25 15:33:49 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 53 +max_block_number: 53 + +Row 6: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_116_116_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_116_116_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 10911 +data_compressed_bytes: 10711 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:44 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 116 +max_block_number: 116 + +Row 7: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_1_116_23 +part_type: Wide +active: 1 +level: 23 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_1_116_23/ +marks: 69 +rows: 553071 +bytes_on_disk: 435279 +data_compressed_bytes: 424915 +data_uncompressed_bytes: 13289123 +marks_bytes: 9936 +modification_time: 2021-12-25 15:33:44 +remove_time: 
1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 1 +max_block_number: 116 + +Row 8: +────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_108_108_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_108_108_0/ +marks: 2 +rows: 8 +bytes_on_disk: 21833 +data_compressed_bytes: 11780 +data_uncompressed_bytes: 19952 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:44 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 108 +max_block_number: 108 + +Row 9: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_115_115_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_115_115_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11146 +data_compressed_bytes: 10946 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:37 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 115 +max_block_number: 115 + +Row 10: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_107_107_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_107_107_0/ +marks: 2 +rows: 7 +bytes_on_disk: 21996 +data_compressed_bytes: 11943 +data_uncompressed_bytes: 17458 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:36 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 107 +max_block_number: 107 + +Row 11: +─────── +database: system +table: session_log +engine: MergeTree +partition_id: 202112 +name: 202112_3_3_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9f3/9f3dd592-781c-48d8-9f3d-d592781c48d8/202112_3_3_0/ +marks: 2 +rows: 44 +bytes_on_disk: 2208 +data_compressed_bytes: 1498 +data_uncompressed_bytes: 5130 +marks_bytes: 688 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 3 +max_block_number: 3 + +Row 12: +─────── +database: system +table: query_log +engine: MergeTree +partition_id: 202112 +name: 202112_3_3_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/1a3/1a3ec308-d42e-4f3c-9a3e-c308d42e2f3c/202112_3_3_0/ +marks: 2 +rows: 43 +bytes_on_disk: 17843 +data_compressed_bytes: 15725 +data_uncompressed_bytes: 61869 +marks_bytes: 2096 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 3 +max_block_number: 3 + +Row 13: +─────── +database: system +table: query_thread_log +engine: MergeTree +partition_id: 202112 +name: 
202112_3_3_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/afa/afa652ef-f91d-4a48-afa6-52eff91daa48/202112_3_3_0/ +marks: 2 +rows: 43 +bytes_on_disk: 11878 +data_compressed_bytes: 10432 +data_uncompressed_bytes: 52339 +marks_bytes: 1424 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 3 +max_block_number: 3 + +Row 14: +─────── +database: system +table: trace_log +engine: MergeTree +partition_id: 202112 +name: 202112_52_52_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/c0b/c0bc3be3-22d7-45a3-80bc-3be322d7b5a3/202112_52_52_0/ +marks: 2 +rows: 4 +bytes_on_disk: 1078 +data_compressed_bytes: 721 +data_uncompressed_bytes: 1252 +marks_bytes: 336 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 52 +max_block_number: 52 + +Row 15: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_114_114_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_114_114_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11447 +data_compressed_bytes: 11247 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:30 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 114 +max_block_number: 114 + +Row 16: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_1_106_21 +part_type: Compact +active: 1 +level: 21 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_1_106_21/ +marks: 2 +rows: 798 +bytes_on_disk: 84853 +data_compressed_bytes: 74798 +data_uncompressed_bytes: 1990212 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:29 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 1 +max_block_number: 106 + +Row 17: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_106_106_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_106_106_0/ +marks: 2 +rows: 8 +bytes_on_disk: 21863 +data_compressed_bytes: 11810 +data_uncompressed_bytes: 19952 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:28 +remove_time: 2021-12-25 15:33:29 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 106 +max_block_number: 106 + +Row 18: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_113_113_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_113_113_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11191 +data_compressed_bytes: 10991 +data_uncompressed_bytes: 128675 +marks_bytes: 176 
+modification_time: 2021-12-25 15:33:23 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 113 +max_block_number: 113 + +Row 19: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_105_105_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_105_105_0/ +marks: 2 +rows: 7 +bytes_on_disk: 21786 +data_compressed_bytes: 11733 +data_uncompressed_bytes: 17458 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:21 +remove_time: 2021-12-25 15:33:29 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 105 +max_block_number: 105 + +Row 20: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_112_112_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_112_112_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11281 +data_compressed_bytes: 11081 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:16 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 112 +max_block_number: 112 +``` +#### Detached data +##### system.detached_parts +**query** +```sql +SELECT + database, + table, + partition_id, + name, + disk, + reason, + min_block_number, + max_block_number, + level +FROM system.detached_parts +``` +**result** +``` +┌─database─┬─table─┬─partition_id─┬─name─┬─disk─┬─reason─┬─min_block_number─┬─max_block_number─┬─level─┐ +└──────────┴───────┴──────────────┴──────┴──────┴────────┴──────────────────┴──────────────────┴───────┘ +``` +##### Disk space usage +**command** +``` +du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh +``` +**result** +``` +0 total + +``` +#### Queries +##### Queries in progress (process list) +**query** +```sql +SELECT + elapsed, + query_id, + query, + is_cancelled, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + formatReadableSize(memory_usage) AS "memory usage", + user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + thread_ids, + ProfileEvents, + Settings + FROM system.processes +ORDER BY elapsed DESC +``` +**result** +``` +Row 1: +────── +elapsed: 0.000785246 +query_id: b51cbc7a-2260-4c9b-b26c-6307b10ad948 +query: SELECT + elapsed, + query_id, + query, + is_cancelled, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + formatReadableSize(memory_usage) AS "memory usage", + user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + thread_ids, + ProfileEvents, + Settings + FROM system.processes +ORDER BY elapsed DESC FORMAT Vertical + +is_cancelled: 0 +read: 0 
rows / 0.00 B +written: 0 rows / 0.00 B +memory usage: 0.00 B +user: default +client: python-requests/2.26.0 +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ContextLock':38,'RWLockAcquiredReadLocks':1} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} +``` +##### Top 10 queries by duration +**query** +```sql +SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 +``` +**result** +``` +Row 1: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 60 +query_id: f72e1120-cc66-434c-9809-3a99077ed842 +query_kind: Select +is_initial_query: 1 +query: SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 5 rows / 262.00 B +written: 0 rows / 0.00 B +result: 3 rows / 488.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.parts'] +columns: ['system.parts.active','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','max','sum'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':2,'ArenaAllocBytes':8192,'CompileFunction':1,'CompileExpressionsMicroseconds':52574,'CompileExpressionsBytes':8192,'SelectedRows':5,'SelectedBytes':262,'ContextLock':58,'RWLockAcquiredReadLocks':6,'RealTimeMicroseconds':61493,'UserTimeMicroseconds':34154,'SystemTimeMicroseconds':9874,'SoftPageFaults':170,'HardPageFaults':33,'OSIOWaitMicroseconds':10000,'OSCPUWaitMicroseconds':2433,'OSCPUVirtualTimeMicroseconds':43706,'OSReadBytes':3080192,'OSWriteBytes':4096,'OSReadChars':863,'OSWriteChars':5334} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 2: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 12 +query_id: 
eabd7483-70df-4d60-a668-d8961416e3fb +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 FORMAT Vertical + +read: 40 rows / 67.42 KiB +written: 0 rows / 0.00 B +result: 10 rows / 41.23 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: ['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: 
{'Query':1,'SelectQuery':1,'FileOpen':2,'Seek':3,'ReadBufferFromFileDescriptorRead':10,'ReadBufferFromFileDescriptorReadBytes':16873,'ReadCompressedBytes':12855,'CompressedReadBufferBlocks':41,'CompressedReadBufferBytes':61376,'IOBufferAllocs':5,'IOBufferAllocBytes':26594,'FunctionExecute':28,'MarkCacheHits':1,'MarkCacheMisses':1,'CreatedReadBufferOrdinary':3,'DiskReadElapsedMicroseconds':30,'SelectedParts':1,'SelectedRanges':1,'SelectedMarks':1,'SelectedRows':40,'SelectedBytes':69039,'ContextLock':342,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':14451,'UserTimeMicroseconds':10009,'SystemTimeMicroseconds':1515,'SoftPageFaults':44,'OSCPUWaitMicroseconds':3050,'OSCPUVirtualTimeMicroseconds':11523,'OSReadChars':17311,'OSWriteChars':7288} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 3: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 12 +query_id: d9557845-5b5e-44ef-befa-55f837065d00 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 FORMAT Vertical + +read: 83 rows / 130.00 KiB +written: 0 rows / 0.00 B +result: 10 rows / 183.10 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,225,281,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':3,'Seek':6,'ReadBufferFromFileDescriptorRead':18,'ReadBufferFromFileDescriptorReadBytes':32140,'ReadCompressedBytes':25892,'CompressedReadBufferBlocks':82,'CompressedReadBufferBytes':116215,'IOBufferAllocs':9,'IOBufferAllocBytes':47368,'FunctionExecute':51,'MarkCacheHits':3,'MarkCacheMisses':1,'CreatedReadBufferOrdinary':5,'DiskReadElapsedMicroseconds':13,'SelectedParts':2,'SelectedRanges':2,'SelectedMarks':2,'SelectedRows':83,'SelectedBytes':133125,'ContextLock':351,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':19368,'UserTimeMicroseconds':12036,'SystemTimeMicroseconds':2047,'SoftPageFaults':42,'OSCPUWaitMicroseconds':710,'OSCPUVirtualTimeMicroseconds':13623,'OSWriteBytes':4096,'OSReadChars':34225,'OSWriteChars':8142} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 4: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 11 +query_id: bae8a338-eee9-406b-80d2-4596af2ba31f +query_kind: Select +is_initial_query: 1 +query: SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 17 rows / 1.31 KiB +written: 0 rows / 0.00 B +result: 4 rows / 640.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: 
['system.databases','system.parts'] +columns: ['system.databases.engine','system.databases.name','system.parts.active','system.parts.bytes_on_disk','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','sum','uniq'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: ['formatReadableSize'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':5,'ArenaAllocBytes':20480,'FunctionExecute':1,'SelectedRows':17,'SelectedBytes':1345,'ContextLock':69,'RWLockAcquiredReadLocks':9,'RealTimeMicroseconds':12225,'UserTimeMicroseconds':10731,'SystemTimeMicroseconds':1146,'SoftPageFaults':2,'OSCPUWaitMicroseconds':720,'OSCPUVirtualTimeMicroseconds':11876,'OSWriteBytes':4096,'OSReadChars':438,'OSWriteChars':8938} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 5: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 9 +query_id: f0c62bc7-36da-4542-a3d5-68a40c1c4b48 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 FORMAT Vertical + +read: 40 rows / 67.42 KiB +written: 0 rows / 0.00 B +result: 4 rows / 43.13 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':1,'Seek':3,'ReadBufferFromFileDescriptorRead':8,'ReadBufferFromFileDescriptorReadBytes':15561,'ReadCompressedBytes':12855,'CompressedReadBufferBlocks':41,'CompressedReadBufferBytes':61376,'IOBufferAllocs':4,'IOBufferAllocBytes':25506,'FunctionExecute':31,'MarkCacheHits':2,'CreatedReadBufferOrdinary':2,'DiskReadElapsedMicroseconds':16,'SelectedParts':1,'SelectedRanges':1,'SelectedMarks':1,'SelectedRows':40,'SelectedBytes':69039,'ContextLock':361,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':11353,'UserTimeMicroseconds':8910,'SystemTimeMicroseconds':533,'SoftPageFaults':7,'HardPageFaults':2,'OSCPUWaitMicroseconds':1117,'OSCPUVirtualTimeMicroseconds':9443,'OSReadBytes':16384,'OSWriteBytes':4096,'OSReadChars':15999,'OSWriteChars':7714,'QueryProfilerRuns':1} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 6: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 8 +query_id: 72f3f9de-d17c-456b-8316-d494bea2096a +query_kind: Select +is_initial_query: 1 +query: SELECT name FROM system.tables WHERE database = 'system' FORMAT JSONCompact + +read: 74 rows / 2.61 KiB +written: 0 rows / 0.00 B +result: 74 rows / 2.00 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.tables'] +columns: ['system.tables.database','system.tables.name'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['JSONCompact'] +used_functions: ['equals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] 
+ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':2,'IOBufferAllocBytes':8192,'FunctionExecute':4,'SelectedRows':74,'SelectedBytes':2675,'ContextLock':23,'RWLockAcquiredReadLocks':75,'RealTimeMicroseconds':9190,'UserTimeMicroseconds':6468,'SystemTimeMicroseconds':517,'OSCPUWaitMicroseconds':2237,'OSCPUVirtualTimeMicroseconds':6984,'OSReadChars':438,'OSWriteChars':1270} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 7: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 8 +query_id: d55da87f-b030-4b5d-95fc-f9103ce58601 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 FORMAT Vertical + +read: 83 rows / 130.00 KiB +written: 0 rows / 0.00 B +result: 10 rows / 178.41 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: ['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: 
[] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,283,225,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':2,'Seek':6,'ReadBufferFromFileDescriptorRead':16,'ReadBufferFromFileDescriptorReadBytes':30044,'ReadCompressedBytes':25892,'CompressedReadBufferBlocks':82,'CompressedReadBufferBytes':116215,'IOBufferAllocs':8,'IOBufferAllocBytes':45272,'FunctionExecute':51,'MarkCacheHits':4,'CreatedReadBufferOrdinary':4,'SelectedParts':2,'SelectedRanges':2,'SelectedMarks':2,'SelectedRows':83,'SelectedBytes':133125,'ContextLock':351,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':12416,'UserTimeMicroseconds':7727,'SystemTimeMicroseconds':1247,'SoftPageFaults':41,'OSCPUWaitMicroseconds':1058,'OSCPUVirtualTimeMicroseconds':9018,'OSWriteBytes':4096,'OSReadChars':32137,'OSWriteChars':8108} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 8: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 8 +query_id: cc2a0e7a-3b9b-47d2-9255-009c62584bc4 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 FORMAT Vertical + +read: 83 rows / 130.00 KiB +written: 0 rows / 0.00 B +result: 5 rows / 57.80 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,281,283,282,225] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':2,'Seek':6,'ReadBufferFromFileDescriptorRead':16,'ReadBufferFromFileDescriptorReadBytes':31464,'ReadCompressedBytes':25892,'CompressedReadBufferBlocks':82,'CompressedReadBufferBytes':116215,'IOBufferAllocs':8,'IOBufferAllocBytes':46860,'FunctionExecute':56,'MarkCacheHits':4,'CreatedReadBufferOrdinary':4,'SelectedParts':2,'SelectedRanges':2,'SelectedMarks':2,'SelectedRows':83,'SelectedBytes':133125,'ContextLock':370,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':13096,'UserTimeMicroseconds':9503,'SystemTimeMicroseconds':195,'SoftPageFaults':23,'OSCPUWaitMicroseconds':1380,'OSCPUVirtualTimeMicroseconds':9661,'OSWriteBytes':4096,'OSReadChars':33567,'OSWriteChars':8310} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 9: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 8 +query_id: a3d717fd-c43f-4723-a18d-557c733299f6 +query_kind: Select +is_initial_query: 1 +query: SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 9 rows / 845.00 B +written: 0 rows / 0.00 B +result: 4 rows / 640.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.databases','system.parts'] +columns: 
['system.databases.engine','system.databases.name','system.parts.active','system.parts.bytes_on_disk','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','sum','uniq'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: ['formatReadableSize'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':5,'ArenaAllocBytes':20480,'FunctionExecute':1,'SelectedRows':9,'SelectedBytes':845,'ContextLock':69,'RWLockAcquiredReadLocks':6,'RealTimeMicroseconds':9090,'UserTimeMicroseconds':4654,'SystemTimeMicroseconds':1171,'SoftPageFaults':8,'HardPageFaults':2,'OSCPUWaitMicroseconds':2126,'OSCPUVirtualTimeMicroseconds':5824,'OSReadBytes':212992,'OSWriteBytes':4096,'OSReadChars':427,'OSWriteChars':8936} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 10: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 7 +query_id: 49305759-0f08-4d5a-81d8-c1a11cfc0eb4 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 FORMAT Vertical + +read: 40 rows / 67.42 KiB +written: 0 rows / 0.00 B +result: 10 rows / 57.95 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':1,'Seek':3,'ReadBufferFromFileDescriptorRead':8,'ReadBufferFromFileDescriptorReadBytes':14777,'ReadCompressedBytes':12855,'CompressedReadBufferBlocks':41,'CompressedReadBufferBytes':61376,'IOBufferAllocs':4,'IOBufferAllocBytes':24498,'FunctionExecute':28,'MarkCacheHits':2,'CreatedReadBufferOrdinary':2,'DiskReadElapsedMicroseconds':16,'SelectedParts':1,'SelectedRanges':1,'SelectedMarks':1,'SelectedRows':40,'SelectedBytes':69039,'ContextLock':342,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':9159,'UserTimeMicroseconds':4713,'SystemTimeMicroseconds':1942,'SoftPageFaults':19,'OSCPUWaitMicroseconds':2421,'OSCPUVirtualTimeMicroseconds':6655,'OSWriteBytes':4096,'OSReadChars':15215,'OSWriteChars':7278} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} +``` +##### Top 10 queries by memory usage +**query** +```sql +SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + 
used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 +``` +**result** +``` +Row 1: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:25 +query_duration_ms: 0 +query_id: c6b6a96c-d5c5-4406-98cd-80857a8412d4 +query_kind: +is_initial_query: 1 +query: SHOW ACCESS FORMAT TSVRaw + +read: 5 rows / 405.00 B +written: 0 rows / 0.00 B +result: 5 rows / 4.50 KiB +memory usage: 1.82 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TSVRaw'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,225,281] +ProfileEvents: {'Query':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':5,'SelectedBytes':405,'ContextLock':8,'RealTimeMicroseconds':959,'UserTimeMicroseconds':452,'SystemTimeMicroseconds':238,'OSCPUWaitMicroseconds':90,'OSCPUVirtualTimeMicroseconds':690,'OSWriteBytes':4096,'OSReadChars':846,'OSWriteChars':880} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 2: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 2 +query_id: 253362ba-40a1-4593-a4cc-30d3dfdfe0ab +query_kind: +is_initial_query: 1 +query: SHOW ACCESS FORMAT TSVRaw + +read: 5 rows / 405.00 B +written: 0 rows / 0.00 B +result: 5 rows / 4.50 KiB +memory usage: 1.82 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TSVRaw'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,283,282] +ProfileEvents: {'Query':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':5,'SelectedBytes':405,'ContextLock':8,'RealTimeMicroseconds':4687,'UserTimeMicroseconds':2171,'SystemTimeMicroseconds':1264,'OSCPUWaitMicroseconds':513,'OSCPUVirtualTimeMicroseconds':3335,'OSReadChars':848,'OSWriteChars':880} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 3: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 1 +query_id: 61b20c8c-ca63-4384-adb4-ce7765d77389 +query_kind: +is_initial_query: 1 +query: SHOW ACCESS FORMAT TSVRaw + +read: 5 rows / 405.00 B +written: 0 rows / 0.00 B +result: 5 rows / 4.50 KiB +memory usage: 1.82 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TSVRaw'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,281,283] +ProfileEvents: 
{'Query':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':5,'SelectedBytes':405,'ContextLock':8,'RealTimeMicroseconds':3442,'UserTimeMicroseconds':715,'SystemTimeMicroseconds':485,'SoftPageFaults':1,'OSCPUWaitMicroseconds':443,'OSCPUVirtualTimeMicroseconds':1170,'OSReadChars':833,'OSWriteChars':880} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 4: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:25 +query_duration_ms: 1 +query_id: 13ebdab7-e368-4f9f-b47e-023dbd9e91ce +query_kind: Select +is_initial_query: 1 +query: +SELECT formatReadableTimeDelta(uptime()) + + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.49 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['uptime','formatReadableTimeDelta'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,282,225,281] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':17,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':1613,'UserTimeMicroseconds':708,'SystemTimeMicroseconds':274,'SoftPageFaults':3,'OSCPUWaitMicroseconds':2,'OSCPUVirtualTimeMicroseconds':980,'OSReadChars':846,'OSWriteChars':1190} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 5: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 2 +query_id: ff330183-854b-46bc-a548-30e12a7bee9c +query_kind: Select +is_initial_query: 1 +query: +SELECT formatReadableTimeDelta(uptime()) + + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.49 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['formatReadableTimeDelta','uptime'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,283,281,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':17,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':4372,'UserTimeMicroseconds':1022,'SystemTimeMicroseconds':177,'OSCPUWaitMicroseconds':2070,'OSCPUVirtualTimeMicroseconds':1198,'OSWriteBytes':4096,'OSReadChars':848,'OSWriteChars':1190} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 6: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 3 +query_id: b763c2f9-6234-47f7-8b30-43d619909289 +query_kind: Select +is_initial_query: 1 +query: +SELECT formatReadableTimeDelta(uptime()) + + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.49 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] 
+used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['uptime','formatReadableTimeDelta'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,281,283,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':17,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':6367,'UserTimeMicroseconds':3329,'SystemTimeMicroseconds':531,'SoftPageFaults':6,'HardPageFaults':1,'OSCPUWaitMicroseconds':1090,'OSCPUVirtualTimeMicroseconds':3859,'OSReadBytes':102400,'OSReadChars':830,'OSWriteChars':1190} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 7: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:25 +query_duration_ms: 1 +query_id: e9c25bd1-00d3-4239-9611-1c3d391178da +query_kind: Select +is_initial_query: 1 +query: SELECT version() + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.45 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['version'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,225,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':15,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':2720,'UserTimeMicroseconds':648,'SystemTimeMicroseconds':1144,'OSCPUWaitMicroseconds':110,'OSCPUVirtualTimeMicroseconds':1790,'OSReadChars':845,'OSWriteChars':1140} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 8: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 4 +query_id: 69762642-8a75-4149-aaf5-bc1969558747 +query_kind: Select +is_initial_query: 1 +query: SELECT version() + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.45 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['version'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,282,283] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':15,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':10137,'UserTimeMicroseconds':6289,'SystemTimeMicroseconds':47,'SoftPageFaults':2,'HardPageFaults':1,'OSCPUWaitMicroseconds':859,'OSCPUVirtualTimeMicroseconds':6336,'OSReadBytes':12288,'OSReadChars':845,'OSWriteChars':1140} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 9: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 4 +query_id: 9e31242c-62c5-4bb1-9a3e-f96e99f3bddf +query_kind: Select +is_initial_query: 1 +query: SELECT version() + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 
rows / 128.00 B +memory usage: 1.45 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['version'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,282,281,283] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':15,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':8688,'UserTimeMicroseconds':3598,'SystemTimeMicroseconds':1288,'SoftPageFaults':42,'HardPageFaults':1,'OSCPUWaitMicroseconds':214,'OSCPUVirtualTimeMicroseconds':4885,'OSReadBytes':98304,'OSReadChars':818,'OSWriteChars':1140} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 10: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 2 +query_id: de1fc64c-09c3-420a-8801-a2f9f04407cd +query_kind: Select +is_initial_query: 1 +query: SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 12 rows / 643.00 B +written: 0 rows / 0.00 B +result: 6 rows / 752.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.parts'] +columns: ['system.parts.active','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','max','sum'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':2,'ArenaAllocBytes':8192,'SelectedRows':12,'SelectedBytes':643,'ContextLock':58,'RWLockAcquiredReadLocks':9,'RWLockReadersWaitMilliseconds':1,'RealTimeMicroseconds':2924,'UserTimeMicroseconds':1583,'SystemTimeMicroseconds':892,'SoftPageFaults':6,'OSCPUVirtualTimeMicroseconds':3423,'OSReadChars':438,'OSWriteChars':5086} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} +``` +##### Last 10 failed queries +**query** +```sql +SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + 
used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 +``` +**result** +``` +Row 1: +────── +type: ExceptionBeforeStart +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 0 +query_id: 323743ef-4dff-4ed3-9559-f405c64fbd4a +query_kind: Select +is_initial_query: 1 +query: SELECT + '\n' || arrayStringConcat( + arrayMap( + x, + y -> concat(x, ': ', y), + arrayMap(x -> addressToLine(x), trace), + arrayMap(x -> demangle(addressToSymbol(x)), trace)), + '\n') AS trace +FROM system.stack_trace FORMAT Vertical + +read: 0 rows / 0.00 B +written: 0 rows / 0.00 B +result: 0 rows / 0.00 B +memory usage: 0.00 B +exception: Code: 446. DB::Exception: default: Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0: While processing concat('\n', arrayStringConcat(arrayMap((x, y) -> concat(x, ': ', y), arrayMap(x -> addressToLine(x), trace), arrayMap(x -> demangle(addressToSymbol(x)), trace)), '\n')) AS trace. (FUNCTION_NOT_ALLOWED) (version 21.11.8.4 (official build)) +stack_trace: +0. DB::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int, bool) @ 0x9b682d4 in /usr/bin/clickhouse +1. bool DB::ContextAccess::checkAccessImplHelper(DB::AccessFlags const&) const::'lambda'(std::__1::basic_string, std::__1::allocator > const&, int)::operator()(std::__1::basic_string, std::__1::allocator > const&, int) const @ 0x119786bc in /usr/bin/clickhouse +2. bool DB::ContextAccess::checkAccessImplHelper(DB::AccessFlags const&) const @ 0x11977416 in /usr/bin/clickhouse +3. DB::Context::checkAccess(DB::AccessFlags const&) const @ 0x11eb2f08 in /usr/bin/clickhouse +4. ? @ 0xf96aefb in /usr/bin/clickhouse +5. DB::FunctionFactory::tryGetImpl(std::__1::basic_string, std::__1::allocator > const&, std::__1::shared_ptr) const @ 0x118f74b4 in /usr/bin/clickhouse +6. DB::FunctionFactory::getImpl(std::__1::basic_string, std::__1::allocator > const&, std::__1::shared_ptr) const @ 0x118f71fc in /usr/bin/clickhouse +7. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c3abf in /usr/bin/clickhouse +8. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c6b9f in /usr/bin/clickhouse +9. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c41ed in /usr/bin/clickhouse +10. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c41ed in /usr/bin/clickhouse +11. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c41ed in /usr/bin/clickhouse +12. DB::ActionsMatcher::visit(DB::ASTExpressionList&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120ca818 in /usr/bin/clickhouse +13. DB::InDepthNodeVisitor const>::visit(std::__1::shared_ptr const&) @ 0x12099bb7 in /usr/bin/clickhouse +14. DB::ExpressionAnalyzer::getRootActions(std::__1::shared_ptr const&, bool, std::__1::shared_ptr&, bool) @ 0x120999cb in /usr/bin/clickhouse +15. 
DB::SelectQueryExpressionAnalyzer::appendSelect(DB::ExpressionActionsChain&, bool) @ 0x120a4409 in /usr/bin/clickhouse +16. DB::ExpressionAnalysisResult::ExpressionAnalysisResult(DB::SelectQueryExpressionAnalyzer&, std::__1::shared_ptr const&, bool, bool, bool, std::__1::shared_ptr const&, DB::Block const&) @ 0x120a9070 in /usr/bin/clickhouse +17. DB::InterpreterSelectQuery::getSampleBlockImpl() @ 0x1232fd0d in /usr/bin/clickhouse +18. ? @ 0x12328864 in /usr/bin/clickhouse +19. DB::InterpreterSelectQuery::InterpreterSelectQuery(std::__1::shared_ptr const&, std::__1::shared_ptr, std::__1::optional, std::__1::shared_ptr const&, DB::SelectQueryOptions const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&, std::__1::shared_ptr const&, std::__1::unordered_map, DB::PreparedSetKey::Hash, std::__1::equal_to, std::__1::allocator > > >) @ 0x123232c7 in /usr/bin/clickhouse +20. DB::InterpreterSelectQuery::InterpreterSelectQuery(std::__1::shared_ptr const&, std::__1::shared_ptr, DB::SelectQueryOptions const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x12321c54 in /usr/bin/clickhouse +21. DB::InterpreterSelectWithUnionQuery::buildCurrentChildInterpreter(std::__1::shared_ptr const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x12547fa2 in /usr/bin/clickhouse +22. DB::InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(std::__1::shared_ptr const&, std::__1::shared_ptr, DB::SelectQueryOptions const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x12546680 in /usr/bin/clickhouse +23. DB::InterpreterFactory::get(std::__1::shared_ptr&, std::__1::shared_ptr, DB::SelectQueryOptions const&) @ 0x122c6216 in /usr/bin/clickhouse +24. ? @ 0x1277dd26 in /usr/bin/clickhouse +25. DB::executeQuery(DB::ReadBuffer&, DB::WriteBuffer&, bool, std::__1::shared_ptr, std::__1::function, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&)>, std::__1::optional const&) @ 0x12781319 in /usr/bin/clickhouse +26. DB::HTTPHandler::processQuery(DB::HTTPServerRequest&, DB::HTMLForm&, DB::HTTPServerResponse&, DB::HTTPHandler::Output&, std::__1::optional&) @ 0x130c20fa in /usr/bin/clickhouse +27. DB::HTTPHandler::handleRequest(DB::HTTPServerRequest&, DB::HTTPServerResponse&) @ 0x130c6760 in /usr/bin/clickhouse +28. DB::HTTPServerConnection::run() @ 0x1312b5e8 in /usr/bin/clickhouse +29. Poco::Net::TCPServerConnection::start() @ 0x15d682cf in /usr/bin/clickhouse +30. Poco::Net::TCPServerDispatcher::run() @ 0x15d6a6c1 in /usr/bin/clickhouse +31. 
Poco::PooledThread::run() @ 0x15e7f069 in /usr/bin/clickhouse + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: [] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [] +ProfileEvents: {} +Settings: {} + +``` +#### Stack traces +**query** +```sql +SELECT + '\n' || arrayStringConcat( + arrayMap( + x, + y -> concat(x, ': ', y), + arrayMap(x -> addressToLine(x), trace), + arrayMap(x -> demangle(addressToSymbol(x)), trace)), + '\n') AS trace +FROM system.stack_trace +``` +**result** +``` +ClickhouseError("Code: 446. DB::Exception: default: Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0: While processing concat('\\n', arrayStringConcat(arrayMap((x, y) -> concat(x, ': ', y), arrayMap(x -> addressToLine(x), trace), arrayMap(x -> demangle(addressToSymbol(x)), trace)), '\\n')) AS trace. (FUNCTION_NOT_ALLOWED) (version 21.11.8.4 (official build))",) +``` +#### uname +**command** +``` +uname -a +``` +**result** +``` +Linux clickhouse01 5.10.76-linuxkit #1 SMP Mon Nov 8 10:21:19 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux +``` diff --git a/utils/clickhouse-diagnostics/clickhouse-diagnostics b/utils/clickhouse-diagnostics/clickhouse-diagnostics new file mode 100644 index 00000000000..ffddee0bdc4 --- /dev/null +++ b/utils/clickhouse-diagnostics/clickhouse-diagnostics @@ -0,0 +1,960 @@ +#!/usr/bin/env python3 + +import argparse +import gzip +import io +import json +import socket +import subprocess +import sys +from copy import deepcopy +from datetime import datetime +from typing import MutableMapping + +import jinja2 +import requests +import sqlparse +import tenacity +import xmltodict +import yaml + +SELECT_VERSION = r'SELECT version()' + +SELECT_UPTIME = r''' +{% if version_ge('21.3') -%} +SELECT formatReadableTimeDelta(uptime()) +{% else -%} +SELECT + toString(floor(uptime() / 3600 / 24)) || ' days ' || + toString(floor(uptime() % (24 * 3600) / 3600, 1)) || ' hours' +{% endif -%} +''' + +SELECT_SYSTEM_TABLES = "SELECT name FROM system.tables WHERE database = 'system'" + +SELECT_DATABASE_ENGINES = r'''SELECT + engine, + count() "count" +FROM system.databases +GROUP BY engine +''' + +SELECT_DATABASES = r'''SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 +''' + +SELECT_TABLE_ENGINES = r'''SELECT + engine, + count() "count" +FROM system.tables +WHERE database != 'system' +GROUP BY engine +''' + +SELECT_DICTIONARIES = r'''SELECT + source, + type, + status, + count() "count" +FROM system.dictionaries +GROUP BY source, type, status +ORDER BY status DESC, source +''' + +SELECT_ACCESS = "SHOW ACCESS" + +SELECT_QUOTA_USAGE = "SHOW QUOTA" + +SELECT_REPLICAS = r'''SELECT + database, + table, + is_leader, + is_readonly, + absolute_delay, + queue_size, + inserts_in_queue, + merges_in_queue +FROM system.replicas +ORDER BY absolute_delay DESC +LIMIT 10 +''' + +SELECT_REPLICATION_QUEUE = r'''SELECT + database, + table, + replica_name, 
+ position, + node_name, + type, + source_replica, + parts_to_merge, + new_part_name, + create_time, + required_quorum, + is_detach, + is_currently_executing, + num_tries, + last_attempt_time, + last_exception, + concat('time: ', toString(last_postpone_time), ', number: ', toString(num_postponed), ', reason: ', postpone_reason) postpone +FROM system.replication_queue +ORDER BY create_time ASC +LIMIT 20 +''' + +SELECT_REPLICATED_FETCHES = r'''SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + partition_id, + result_part_name, + result_part_path, + total_size_bytes_compressed, + bytes_read_compressed, + source_replica_path, + source_replica_hostname, + source_replica_port, + interserver_scheme, + to_detached, + thread_id +FROM system.replicated_fetches +''' + +SELECT_PARTS_PER_TABLE = r'''SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 +''' + +SELECT_MERGES = r'''SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + is_mutation, + partition_id, +{% if version_ge('20.3') -%} + result_part_path, + source_part_paths, +{% endif -%} + num_parts, + formatReadableSize(total_size_bytes_compressed) "total_size_compressed", + formatReadableSize(bytes_read_uncompressed) "read_uncompressed", + formatReadableSize(bytes_written_uncompressed) "written_uncompressed", + columns_written, +{% if version_ge('20.3') -%} + formatReadableSize(memory_usage) "memory_usage", + thread_id +{% else -%} + formatReadableSize(memory_usage) "memory_usage" +{% endif -%} +FROM system.merges +''' + +SELECT_MUTATIONS = r'''SELECT + database, + table, + mutation_id, + command, + create_time, +{% if version_ge('20.3') -%} + parts_to_do_names, +{% endif -%} + parts_to_do, + is_done, + latest_failed_part, + latest_fail_time, + latest_fail_reason +FROM system.mutations +WHERE NOT is_done +ORDER BY create_time DESC +''' + +SELECT_RECENT_DATA_PARTS = r'''SELECT + database, + table, + engine, + partition_id, + name, +{% if version_ge('20.3') -%} + part_type, +{% endif -%} + active, + level, +{% if version_ge('20.3') -%} + disk_name, +{% endif -%} + path, + marks, + rows, + bytes_on_disk, + data_compressed_bytes, + data_uncompressed_bytes, + marks_bytes, + modification_time, + remove_time, + refcount, + is_frozen, + min_date, + max_date, + min_time, + max_time, + min_block_number, + max_block_number +FROM system.parts +WHERE modification_time > now() - INTERVAL 3 MINUTE +ORDER BY modification_time DESC +''' + +SELECT_DETACHED_DATA_PARTS = r'''SELECT + database, + table, + partition_id, + name, + disk, + reason, + min_block_number, + max_block_number, + level +FROM system.detached_parts +''' + +SELECT_PROCESSES = r'''SELECT + elapsed, + query_id, + {% if normalize_queries -%} + normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + is_cancelled, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + formatReadableSize(memory_usage) AS "memory usage", + user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', 
toString(client_version_minor), '.', toString(client_version_patch))) AS client, + {% if version_ge('21.3') -%} + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.processes +ORDER BY elapsed DESC +''' + +SELECT_TOP_QUERIES_BY_DURATION = r'''SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + {% if normalize_queries -%} + normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + {% if version_ge('21.3') -%} + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 +''' + +SELECT_TOP_QUERIES_BY_MEMORY_USAGE = r'''SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + {% if normalize_queries -%} + normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + {% if version_ge('21.3') -%} + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 +''' + +SELECT_FAILED_QUERIES = r'''SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + {% if normalize_queries -%} + 
normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + {% if version_ge('21.3') -%} + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 +''' + +SELECT_STACK_TRACES = r'''SELECT + '\n' || arrayStringConcat( + arrayMap( + x, + y -> concat(x, ': ', y), + arrayMap(x -> addressToLine(x), trace), + arrayMap(x -> demangle(addressToSymbol(x)), trace)), + '\n') AS trace +FROM system.stack_trace +''' + +SELECT_CRASH_LOG = r'''SELECT + event_time, + signal, + thread_id, + query_id, + '\n' || arrayStringConcat(trace_full, '\n') AS trace, + version +FROM system.crash_log +ORDER BY event_time DESC +''' + + +def retry(exception_types, max_attempts=5, max_interval=5): + """ + Function decorator that retries wrapped function on failures. + """ + return tenacity.retry( + retry=tenacity.retry_if_exception_type(exception_types), + wait=tenacity.wait_random_exponential(multiplier=0.5, max=max_interval), + stop=tenacity.stop_after_attempt(max_attempts), + reraise=True) + + +class ClickhouseError(Exception): + """ + ClickHouse interaction error. + """ + + def __init__(self, response): + self.response = response + super().__init__(self.response.text.strip()) + + +class ClickhouseClient: + """ + ClickHouse client. + """ + + def __init__(self, *, host, port=8123, user=None): + self._session = requests.Session() + if user: + self._session.headers['X-ClickHouse-User'] = user + self._url = f'http://{host}:{port}' + self._timeout = 60 + self._ch_version = None + + @property + def clickhouse_version(self): + if self._ch_version is None: + self._ch_version = self.query(SELECT_VERSION) + + return self._ch_version + + @retry(requests.exceptions.ConnectionError) + def query(self, query, query_args=None, format=None, post_data=None, timeout=None, echo=False, dry_run=False): + """ + Execute query. 
+ """ + if query_args: + query = self.render_query(query, **query_args) + + if format: + query += f' FORMAT {format}' + + if timeout is None: + timeout = self._timeout + + if echo: + print(sqlparse.format(query, reindent=True), '\n') + + if dry_run: + return None + + try: + response = self._session.post(self._url, + params={ + 'query': query, + }, + json=post_data, + timeout=timeout) + + response.raise_for_status() + + if format in ('JSON', 'JSONCompact'): + return response.json() + + return response.text.strip() + except requests.exceptions.HTTPError as e: + raise ClickhouseError(e.response) from None + + def render_query(self, query, **kwargs): + env = jinja2.Environment() + + env.globals['version_ge'] = lambda version: version_ge(self.clickhouse_version, version) + + template = env.from_string(query) + return template.render(kwargs) + + +class ClickhouseConfig: + """ + ClickHouse server configuration. + """ + + def __init__(self, config): + self._config = config + + def dump(self, mask_secrets=True): + config = deepcopy(self._config) + if mask_secrets: + self._mask_secrets(config) + + return xmltodict.unparse(config, pretty=True) + + @classmethod + def load(cls): + return ClickhouseConfig(cls._load_config('/var/lib/clickhouse/preprocessed_configs/config.xml')) + + @staticmethod + def _load_config(config_path): + with open(config_path, 'r') as file: + return xmltodict.parse(file.read()) + + @classmethod + def _mask_secrets(cls, config): + if isinstance(config, MutableMapping): + for key, value in list(config.items()): + if isinstance(value, MutableMapping): + cls._mask_secrets(config[key]) + elif key in ('password', 'secret_access_key', 'header', 'identity'): + config[key] = '*****' + + +class DiagnosticsData: + """ + Diagnostics data. + """ + + def __init__(self, args, host): + self.args = args + self.host = host + self._sections = [{'section': None, 'data': {}}] + + def add_string(self, name, value, section=None): + self._section(section)[name] = { + 'type': 'string', + 'value': value, + } + + def add_xml_document(self, name, document, section=None): + self._section(section)[name] = { + 'type': 'xml', + 'value': document, + } + + def add_query(self, name, query, result, section=None): + self._section(section)[name] = { + 'type': 'query', + 'query': query, + 'result': result, + } + + def add_command(self, name, command, result, section=None): + self._section(section)[name] = { + 'type': 'command', + 'command': command, + 'result': result, + } + + def dump(self, format): + if format.startswith('json'): + result = self._dump_json() + elif format.startswith('yaml'): + result = self._dump_yaml() + else: + result = self._dump_wiki() + + if format.endswith('.gz'): + compressor = gzip.GzipFile(mode='wb', fileobj=sys.stdout.buffer) + compressor.write(result.encode()) + else: + print(result) + + def _section(self, name=None): + if self._sections[-1]['section'] != name: + self._sections.append({'section': name, 'data': {}}) + + return self._sections[-1]['data'] + + def _dump_json(self): + """ + Dump diagnostic data in JSON format. + """ + return json.dumps(self._sections, indent=2, ensure_ascii=False) + + def _dump_yaml(self): + """ + Dump diagnostic data in YAML format. + """ + return yaml.dump(self._sections, default_flow_style=False, allow_unicode=True) + + def _dump_wiki(self): + """ + Dump diagnostic data in Yandex wiki format. 
+ """ + + def _write_title(buffer, value): + buffer.write(f'### {value}\n') + + def _write_subtitle(buffer, value): + buffer.write(f'#### {value}\n') + + def _write_string_item(buffer, name, item): + value = item['value'] + if value != '': + value = f'**{value}**' + buffer.write(f'{name}: {value}\n') + + def _write_xml_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'##### {name}\n') + else: + _write_subtitle(buffer, name) + + _write_result(buffer, item['value'], format='XML') + + def _write_query_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'##### {name}\n') + else: + _write_subtitle(buffer, name) + + _write_query(buffer, item['query']) + _write_result(buffer, item['result']) + + def _write_command_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'##### {name}\n') + else: + _write_subtitle(buffer, name) + + _write_command(buffer, item['command']) + _write_result(buffer, item['result']) + + def _write_unknown_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'**{name}**\n') + else: + _write_subtitle(buffer, name) + + json.dump(item, buffer, indent=2) + + def _write_query(buffer, query): + buffer.write('**query**\n') + buffer.write('```sql\n') + buffer.write(query) + buffer.write('\n```\n') + + def _write_command(buffer, command): + buffer.write('**command**\n') + buffer.write('```\n') + buffer.write(command) + buffer.write('\n```\n') + + def _write_result(buffer, result, format=None): + buffer.write('**result**\n') + buffer.write(f'```{format}\n' if format else '```\n') + buffer.write(result) + buffer.write('\n```\n') + + buffer = io.StringIO() + + _write_title(buffer, f'Diagnostics data for host {self.host}') + for section in self._sections: + section_name = section['section'] + if section_name: + _write_subtitle(buffer, section_name) + + for name, item in section['data'].items(): + if item['type'] == 'string': + _write_string_item(buffer, name, item) + elif item['type'] == 'query': + _write_query_item(buffer, section_name, name, item) + elif item['type'] == 'command': + _write_command_item(buffer, section_name, name, item) + elif item['type'] == 'xml': + _write_xml_item(buffer, section_name, name, item) + else: + _write_unknown_item(buffer, section_name, name, item) + + return buffer.getvalue() + + +def main(): + """ + Program entry point. 
+ """ + args = parse_args() + + host = socket.getfqdn() + timestamp = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S') + client = ClickhouseClient(host=host) + ch_config = ClickhouseConfig.load() + version = client.clickhouse_version + system_tables = [row[0] for row in execute_query(client, SELECT_SYSTEM_TABLES, format='JSONCompact')['data']] + + diagnostics = DiagnosticsData(args, host) + diagnostics.add_string('Version', version) + diagnostics.add_string('Timestamp', timestamp) + diagnostics.add_string('Uptime', execute_query(client, SELECT_UPTIME)) + + diagnostics.add_xml_document('ClickHouse configuration', ch_config.dump()) + + if version_ge(version, '20.8'): + add_query(diagnostics, 'Access configuration', + client=client, + query=SELECT_ACCESS, + format='TSVRaw') + add_query(diagnostics, 'Quotas', + client=client, + query=SELECT_QUOTA_USAGE, + format='Vertical') + + add_query(diagnostics, 'Database engines', + client=client, + query=SELECT_DATABASE_ENGINES, + format='PrettyCompactNoEscapes', + section='Schema') + add_query(diagnostics, 'Databases (top 10 by size)', + client=client, + query=SELECT_DATABASES, + format='PrettyCompactNoEscapes', + section='Schema') + add_query(diagnostics, 'Table engines', + client=client, + query=SELECT_TABLE_ENGINES, + format='PrettyCompactNoEscapes', + section='Schema') + add_query(diagnostics, 'Dictionaries', + client=client, + query=SELECT_DICTIONARIES, + format='PrettyCompactNoEscapes', + section='Schema') + + add_query(diagnostics, 'Replicated tables (top 10 by absolute delay)', + client=client, + query=SELECT_REPLICAS, + format='PrettyCompactNoEscapes', + section='Replication') + add_query(diagnostics, 'Replication queue (top 20 oldest tasks)', + client=client, + query=SELECT_REPLICATION_QUEUE, + format='Vertical', + section='Replication') + if version_ge(version, '21.3'): + add_query(diagnostics, 'Replicated fetches', + client=client, + query=SELECT_REPLICATED_FETCHES, + format='Vertical', + section='Replication') + + add_query(diagnostics, 'Top 10 tables by max parts per partition', + client=client, + query=SELECT_PARTS_PER_TABLE, + format='PrettyCompactNoEscapes') + add_query(diagnostics, 'Merges in progress', + client=client, + query=SELECT_MERGES, + format='Vertical') + add_query(diagnostics, 'Mutations in progress', + client=client, + query=SELECT_MUTATIONS, + format='Vertical') + add_query(diagnostics, 'Recent data parts (modification time within last 3 minutes)', + client=client, + query=SELECT_RECENT_DATA_PARTS, + format='Vertical') + + add_query(diagnostics, 'system.detached_parts', + client=client, + query=SELECT_DETACHED_DATA_PARTS, + format='PrettyCompactNoEscapes', + section='Detached data') + add_command(diagnostics, 'Disk space usage', + command='du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh', + section='Detached data') + + add_query(diagnostics, 'Queries in progress (process list)', + client=client, + query=SELECT_PROCESSES, + format='Vertical', + section='Queries') + add_query(diagnostics, 'Top 10 queries by duration', + client=client, + query=SELECT_TOP_QUERIES_BY_DURATION, + format='Vertical', + section='Queries') + add_query(diagnostics, 'Top 10 queries by memory usage', + client=client, + query=SELECT_TOP_QUERIES_BY_MEMORY_USAGE, + format='Vertical', + section='Queries') + add_query(diagnostics, 'Last 10 failed queries', + client=client, + query=SELECT_FAILED_QUERIES, + format='Vertical', + section='Queries') + + add_query(diagnostics, 'Stack traces', + client=client, + query=SELECT_STACK_TRACES, + 
format='Vertical') + + if 'crash_log' in system_tables: + add_query(diagnostics, 'Crash log', + client=client, + query=SELECT_CRASH_LOG, + format='Vertical') + + add_command(diagnostics, 'uname', 'uname -a') + + diagnostics.dump(args.format) + + +def parse_args(): + """ + Parse command-line arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument('--format', + choices=['json', 'yaml', 'json.gz', 'yaml.gz', 'wiki', 'wiki.gz'], + default='wiki') + parser.add_argument('--normalize-queries', + action='store_true', + default=False) + return parser.parse_args() + + +def add_query(diagnostics, name, client, query, format, section=None): + query_args = { + 'normalize_queries': diagnostics.args.normalize_queries, + } + query = client.render_query(query, **query_args) + diagnostics.add_query( + name=name, + query=query, + result=execute_query(client, query, render_query=False, format=format), + section=section) + + +def execute_query(client, query, render_query=True, format=None): + if render_query: + query = client.render_query(query) + + try: + return client.query(query, format=format) + except Exception as e: + return repr(e) + + +def add_command(diagnostics, name, command, section=None): + diagnostics.add_command( + name=name, + command=command, + result=execute_command(command), + section=section) + + +def execute_command(command, input=None): + proc = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if isinstance(input, str): + input = input.encode() + + stdout, stderr = proc.communicate(input=input) + + if proc.returncode: + return f'failed with exit code {proc.returncode}\n{stderr.decode()}' + + return stdout.decode() + + +def version_ge(version1, version2): + """ + Return True if version1 is greater than or equal to version2. + """ + return parse_version(version1) >= parse_version(version2) + + +def parse_version(version): + """ + Parse version string. 
+ """ + return [int(x) for x in version.strip().split('.')] + + +if __name__ == '__main__': + main() diff --git a/utils/clickhouse-diagnostics/requirements.txt b/utils/clickhouse-diagnostics/requirements.txt new file mode 100644 index 00000000000..1d2b6ef3916 --- /dev/null +++ b/utils/clickhouse-diagnostics/requirements.txt @@ -0,0 +1,6 @@ +Jinja2 +PyYAML +requests +sqlparse +tenacity +xmltodict diff --git a/utils/grammar-fuzzer/ClickHouseUnlexer.py b/utils/grammar-fuzzer/ClickHouseUnlexer.py deleted file mode 100644 index c91522bd7be..00000000000 --- a/utils/grammar-fuzzer/ClickHouseUnlexer.py +++ /dev/null @@ -1,1771 +0,0 @@ -# Generated by Grammarinator 19.3 - -from itertools import chain -from grammarinator.runtime import * - -charset_0 = list(chain(*multirange_diff(printable_unicode_ranges, [(39, 40),(92, 93)]))) -charset_1 = list(chain(range(97, 98), range(65, 66))) -charset_2 = list(chain(range(98, 99), range(66, 67))) -charset_3 = list(chain(range(99, 100), range(67, 68))) -charset_4 = list(chain(range(100, 101), range(68, 69))) -charset_5 = list(chain(range(101, 102), range(69, 70))) -charset_6 = list(chain(range(102, 103), range(70, 71))) -charset_7 = list(chain(range(103, 104), range(71, 72))) -charset_8 = list(chain(range(104, 105), range(72, 73))) -charset_9 = list(chain(range(105, 106), range(73, 74))) -charset_10 = list(chain(range(106, 107), range(74, 75))) -charset_11 = list(chain(range(107, 108), range(75, 76))) -charset_12 = list(chain(range(108, 109), range(76, 77))) -charset_13 = list(chain(range(109, 110), range(77, 78))) -charset_14 = list(chain(range(110, 111), range(78, 79))) -charset_15 = list(chain(range(111, 112), range(79, 80))) -charset_16 = list(chain(range(112, 113), range(80, 81))) -charset_17 = list(chain(range(113, 114), range(81, 82))) -charset_18 = list(chain(range(114, 115), range(82, 83))) -charset_19 = list(chain(range(115, 116), range(83, 84))) -charset_20 = list(chain(range(116, 117), range(84, 85))) -charset_21 = list(chain(range(117, 118), range(85, 86))) -charset_22 = list(chain(range(118, 119), range(86, 87))) -charset_23 = list(chain(range(119, 120), range(87, 88))) -charset_24 = list(chain(range(120, 121), range(88, 89))) -charset_25 = list(chain(range(121, 122), range(89, 90))) -charset_26 = list(chain(range(122, 123), range(90, 91))) -charset_27 = list(chain(range(97, 123), range(65, 91))) -charset_28 = list(chain(range(48, 58))) -charset_29 = list(chain(range(48, 58), range(97, 103), range(65, 71))) -charset_30 = list(chain(*multirange_diff(printable_unicode_ranges, [(92, 93),(92, 93)]))) -charset_31 = list(chain(range(32, 33), range(11, 12), range(12, 13), range(9, 10), range(13, 14), range(10, 11))) - - -class ClickHouseUnlexer(Grammarinator): - - def __init__(self, *, max_depth=float('inf'), weights=None, cooldown=1.0): - super(ClickHouseUnlexer, self).__init__() - self.unlexer = self - self.max_depth = max_depth - self.weights = weights or dict() - self.cooldown = cooldown - - def EOF(self, *args, **kwargs): - pass - - @depthcontrol - def INTERVAL_TYPE(self): - current = self.create_node(UnlexerRule(name='INTERVAL_TYPE')) - choice = self.choice([0 if [2, 2, 2, 2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_0', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_0', choice)] = self.unlexer.weights.get(('alt_0', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.SECOND() - elif choice == 1: - current += self.unlexer.MINUTE() - elif choice 
== 2: - current += self.unlexer.HOUR() - elif choice == 3: - current += self.unlexer.DAY() - elif choice == 4: - current += self.unlexer.WEEK() - elif choice == 5: - current += self.unlexer.MONTH() - elif choice == 6: - current += self.unlexer.QUARTER() - elif choice == 7: - current += self.unlexer.YEAR() - return current - INTERVAL_TYPE.min_depth = 2 - - @depthcontrol - def ALIAS(self): - current = self.create_node(UnlexerRule(name='ALIAS')) - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.A() - current += self.unlexer.S() - return current - ALIAS.min_depth = 1 - - @depthcontrol - def ALL(self): - current = self.create_node(UnlexerRule(name='ALL')) - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.L() - return current - ALL.min_depth = 1 - - @depthcontrol - def AND(self): - current = self.create_node(UnlexerRule(name='AND')) - current += self.unlexer.A() - current += self.unlexer.N() - current += self.unlexer.D() - return current - AND.min_depth = 1 - - @depthcontrol - def ANTI(self): - current = self.create_node(UnlexerRule(name='ANTI')) - current += self.unlexer.A() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.I() - return current - ANTI.min_depth = 1 - - @depthcontrol - def ANY(self): - current = self.create_node(UnlexerRule(name='ANY')) - current += self.unlexer.A() - current += self.unlexer.N() - current += self.unlexer.Y() - return current - ANY.min_depth = 1 - - @depthcontrol - def ARRAY(self): - current = self.create_node(UnlexerRule(name='ARRAY')) - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.Y() - return current - ARRAY.min_depth = 1 - - @depthcontrol - def AS(self): - current = self.create_node(UnlexerRule(name='AS')) - current += self.unlexer.A() - current += self.unlexer.S() - return current - AS.min_depth = 1 - - @depthcontrol - def ASCENDING(self): - current = self.create_node(UnlexerRule(name='ASCENDING')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_9', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_9', choice)] = self.unlexer.weights.get(('alt_9', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.C() - elif choice == 1: - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.C() - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - ASCENDING.min_depth = 1 - - @depthcontrol - def ASOF(self): - current = self.create_node(UnlexerRule(name='ASOF')) - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.O() - current += self.unlexer.F() - return current - ASOF.min_depth = 1 - - @depthcontrol - def BETWEEN(self): - current = self.create_node(UnlexerRule(name='BETWEEN')) - current += self.unlexer.B() - current += self.unlexer.E() - current += self.unlexer.T() - current += self.unlexer.W() - current += self.unlexer.E() - current += self.unlexer.E() - current += self.unlexer.N() - return current - BETWEEN.min_depth = 1 - - @depthcontrol - def BOTH(self): - current = self.create_node(UnlexerRule(name='BOTH')) - current += self.unlexer.B() - current += self.unlexer.O() - 
current += self.unlexer.T() - current += self.unlexer.H() - return current - BOTH.min_depth = 1 - - @depthcontrol - def BY(self): - current = self.create_node(UnlexerRule(name='BY')) - current += self.unlexer.B() - current += self.unlexer.Y() - return current - BY.min_depth = 1 - - @depthcontrol - def CASE(self): - current = self.create_node(UnlexerRule(name='CASE')) - current += self.unlexer.C() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.E() - return current - CASE.min_depth = 1 - - @depthcontrol - def CAST(self): - current = self.create_node(UnlexerRule(name='CAST')) - current += self.unlexer.C() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.T() - return current - CAST.min_depth = 1 - - @depthcontrol - def CLUSTER(self): - current = self.create_node(UnlexerRule(name='CLUSTER')) - current += self.unlexer.C() - current += self.unlexer.L() - current += self.unlexer.U() - current += self.unlexer.S() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - return current - CLUSTER.min_depth = 1 - - @depthcontrol - def COLLATE(self): - current = self.create_node(UnlexerRule(name='COLLATE')) - current += self.unlexer.C() - current += self.unlexer.O() - current += self.unlexer.L() - current += self.unlexer.L() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.E() - return current - COLLATE.min_depth = 1 - - @depthcontrol - def CREATE(self): - current = self.create_node(UnlexerRule(name='CREATE')) - current += self.unlexer.C() - current += self.unlexer.R() - current += self.unlexer.E() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.E() - return current - CREATE.min_depth = 1 - - @depthcontrol - def CROSS(self): - current = self.create_node(UnlexerRule(name='CROSS')) - current += self.unlexer.C() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.S() - current += self.unlexer.S() - return current - CROSS.min_depth = 1 - - @depthcontrol - def DATABASE(self): - current = self.create_node(UnlexerRule(name='DATABASE')) - current += self.unlexer.D() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.A() - current += self.unlexer.B() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.E() - return current - DATABASE.min_depth = 1 - - @depthcontrol - def DAY(self): - current = self.create_node(UnlexerRule(name='DAY')) - current += self.unlexer.D() - current += self.unlexer.A() - current += self.unlexer.Y() - return current - DAY.min_depth = 1 - - @depthcontrol - def DEFAULT(self): - current = self.create_node(UnlexerRule(name='DEFAULT')) - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.F() - current += self.unlexer.A() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.T() - return current - DEFAULT.min_depth = 1 - - @depthcontrol - def DELETE(self): - current = self.create_node(UnlexerRule(name='DELETE')) - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.T() - current += self.unlexer.E() - return current - DELETE.min_depth = 1 - - @depthcontrol - def DESCENDING(self): - current = self.create_node(UnlexerRule(name='DESCENDING')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_12', i), 1) for 
i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_12', choice)] = self.unlexer.weights.get(('alt_12', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.S() - current += self.unlexer.C() - elif choice == 1: - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.S() - current += self.unlexer.C() - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - DESCENDING.min_depth = 1 - - @depthcontrol - def DISK(self): - current = self.create_node(UnlexerRule(name='DISK')) - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.S() - current += self.unlexer.K() - return current - DISK.min_depth = 1 - - @depthcontrol - def DISTINCT(self): - current = self.create_node(UnlexerRule(name='DISTINCT')) - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.S() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.C() - current += self.unlexer.T() - return current - DISTINCT.min_depth = 1 - - @depthcontrol - def DROP(self): - current = self.create_node(UnlexerRule(name='DROP')) - current += self.unlexer.D() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.P() - return current - DROP.min_depth = 1 - - @depthcontrol - def ELSE(self): - current = self.create_node(UnlexerRule(name='ELSE')) - current += self.unlexer.E() - current += self.unlexer.L() - current += self.unlexer.S() - current += self.unlexer.E() - return current - ELSE.min_depth = 1 - - @depthcontrol - def END(self): - current = self.create_node(UnlexerRule(name='END')) - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.D() - return current - END.min_depth = 1 - - @depthcontrol - def ENGINE(self): - current = self.create_node(UnlexerRule(name='ENGINE')) - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.G() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.E() - return current - ENGINE.min_depth = 1 - - @depthcontrol - def EXISTS(self): - current = self.create_node(UnlexerRule(name='EXISTS')) - current += self.unlexer.E() - current += self.unlexer.X() - current += self.unlexer.I() - current += self.unlexer.S() - current += self.unlexer.T() - current += self.unlexer.S() - return current - EXISTS.min_depth = 1 - - @depthcontrol - def EXTRACT(self): - current = self.create_node(UnlexerRule(name='EXTRACT')) - current += self.unlexer.E() - current += self.unlexer.X() - current += self.unlexer.T() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.C() - current += self.unlexer.T() - return current - EXTRACT.min_depth = 1 - - @depthcontrol - def FINAL(self): - current = self.create_node(UnlexerRule(name='FINAL')) - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.A() - current += self.unlexer.L() - return current - FINAL.min_depth = 1 - - @depthcontrol - def FIRST(self): - current = self.create_node(UnlexerRule(name='FIRST')) - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.R() - current += self.unlexer.S() - current += self.unlexer.T() - return current - FIRST.min_depth = 1 - - @depthcontrol - 
def FORMAT(self): - current = self.create_node(UnlexerRule(name='FORMAT')) - current += self.unlexer.F() - current += self.unlexer.O() - current += self.unlexer.R() - current += self.unlexer.M() - current += self.unlexer.A() - current += self.unlexer.T() - return current - FORMAT.min_depth = 1 - - @depthcontrol - def FROM(self): - current = self.create_node(UnlexerRule(name='FROM')) - current += self.unlexer.F() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.M() - return current - FROM.min_depth = 1 - - @depthcontrol - def FULL(self): - current = self.create_node(UnlexerRule(name='FULL')) - current += self.unlexer.F() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.L() - return current - FULL.min_depth = 1 - - @depthcontrol - def GLOBAL(self): - current = self.create_node(UnlexerRule(name='GLOBAL')) - current += self.unlexer.G() - current += self.unlexer.L() - current += self.unlexer.O() - current += self.unlexer.B() - current += self.unlexer.A() - current += self.unlexer.L() - return current - GLOBAL.min_depth = 1 - - @depthcontrol - def GROUP(self): - current = self.create_node(UnlexerRule(name='GROUP')) - current += self.unlexer.G() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.P() - return current - GROUP.min_depth = 1 - - @depthcontrol - def HAVING(self): - current = self.create_node(UnlexerRule(name='HAVING')) - current += self.unlexer.H() - current += self.unlexer.A() - current += self.unlexer.V() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - HAVING.min_depth = 1 - - @depthcontrol - def HOUR(self): - current = self.create_node(UnlexerRule(name='HOUR')) - current += self.unlexer.H() - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.R() - return current - HOUR.min_depth = 1 - - @depthcontrol - def IF(self): - current = self.create_node(UnlexerRule(name='IF')) - current += self.unlexer.I() - current += self.unlexer.F() - return current - IF.min_depth = 1 - - @depthcontrol - def IN(self): - current = self.create_node(UnlexerRule(name='IN')) - current += self.unlexer.I() - current += self.unlexer.N() - return current - IN.min_depth = 1 - - @depthcontrol - def INF(self): - current = self.create_node(UnlexerRule(name='INF')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_15', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_15', choice)] = self.unlexer.weights.get(('alt_15', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.F() - elif choice == 1: - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.I() - current += self.unlexer.T() - current += self.unlexer.Y() - return current - INF.min_depth = 1 - - @depthcontrol - def INNER(self): - current = self.create_node(UnlexerRule(name='INNER')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.N() - current += self.unlexer.E() - current += self.unlexer.R() - return current - INNER.min_depth = 1 - - @depthcontrol - def INSERT(self): - current = self.create_node(UnlexerRule(name='INSERT')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.S() - 
current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.T() - return current - INSERT.min_depth = 1 - - @depthcontrol - def INTERVAL(self): - current = self.create_node(UnlexerRule(name='INTERVAL')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.V() - current += self.unlexer.A() - current += self.unlexer.L() - return current - INTERVAL.min_depth = 1 - - @depthcontrol - def INTO(self): - current = self.create_node(UnlexerRule(name='INTO')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.O() - return current - INTO.min_depth = 1 - - @depthcontrol - def IS(self): - current = self.create_node(UnlexerRule(name='IS')) - current += self.unlexer.I() - current += self.unlexer.S() - return current - IS.min_depth = 1 - - @depthcontrol - def JOIN(self): - current = self.create_node(UnlexerRule(name='JOIN')) - current += self.unlexer.J() - current += self.unlexer.O() - current += self.unlexer.I() - current += self.unlexer.N() - return current - JOIN.min_depth = 1 - - @depthcontrol - def KEY(self): - current = self.create_node(UnlexerRule(name='KEY')) - current += self.unlexer.K() - current += self.unlexer.E() - current += self.unlexer.Y() - return current - KEY.min_depth = 1 - - @depthcontrol - def LAST(self): - current = self.create_node(UnlexerRule(name='LAST')) - current += self.unlexer.L() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.T() - return current - LAST.min_depth = 1 - - @depthcontrol - def LEADING(self): - current = self.create_node(UnlexerRule(name='LEADING')) - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.A() - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - LEADING.min_depth = 1 - - @depthcontrol - def LEFT(self): - current = self.create_node(UnlexerRule(name='LEFT')) - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.F() - current += self.unlexer.T() - return current - LEFT.min_depth = 1 - - @depthcontrol - def LIKE(self): - current = self.create_node(UnlexerRule(name='LIKE')) - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.K() - current += self.unlexer.E() - return current - LIKE.min_depth = 1 - - @depthcontrol - def LIMIT(self): - current = self.create_node(UnlexerRule(name='LIMIT')) - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.M() - current += self.unlexer.I() - current += self.unlexer.T() - return current - LIMIT.min_depth = 1 - - @depthcontrol - def LOCAL(self): - current = self.create_node(UnlexerRule(name='LOCAL')) - current += self.unlexer.L() - current += self.unlexer.O() - current += self.unlexer.C() - current += self.unlexer.A() - current += self.unlexer.L() - return current - LOCAL.min_depth = 1 - - @depthcontrol - def MATERIALIZED(self): - current = self.create_node(UnlexerRule(name='MATERIALIZED')) - current += self.unlexer.M() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.Z() - current += self.unlexer.E() - current += self.unlexer.D() - return current - 
MATERIALIZED.min_depth = 1 - - @depthcontrol - def MINUTE(self): - current = self.create_node(UnlexerRule(name='MINUTE')) - current += self.unlexer.M() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.U() - current += self.unlexer.T() - current += self.unlexer.E() - return current - MINUTE.min_depth = 1 - - @depthcontrol - def MONTH(self): - current = self.create_node(UnlexerRule(name='MONTH')) - current += self.unlexer.M() - current += self.unlexer.O() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.H() - return current - MONTH.min_depth = 1 - - @depthcontrol - def NAN_SQL(self): - current = self.create_node(UnlexerRule(name='NAN_SQL')) - current += self.unlexer.N() - current += self.unlexer.A() - current += self.unlexer.N() - return current - NAN_SQL.min_depth = 1 - - @depthcontrol - def NOT(self): - current = self.create_node(UnlexerRule(name='NOT')) - current += self.unlexer.N() - current += self.unlexer.O() - current += self.unlexer.T() - return current - NOT.min_depth = 1 - - @depthcontrol - def NULL_SQL(self): - current = self.create_node(UnlexerRule(name='NULL_SQL')) - current += self.unlexer.N() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.L() - return current - NULL_SQL.min_depth = 1 - - @depthcontrol - def NULLS(self): - current = self.create_node(UnlexerRule(name='NULLS')) - current += self.unlexer.N() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.L() - current += self.unlexer.S() - return current - NULLS.min_depth = 1 - - @depthcontrol - def OFFSET(self): - current = self.create_node(UnlexerRule(name='OFFSET')) - current += self.unlexer.O() - current += self.unlexer.F() - current += self.unlexer.F() - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.T() - return current - OFFSET.min_depth = 1 - - @depthcontrol - def ON(self): - current = self.create_node(UnlexerRule(name='ON')) - current += self.unlexer.O() - current += self.unlexer.N() - return current - ON.min_depth = 1 - - @depthcontrol - def OR(self): - current = self.create_node(UnlexerRule(name='OR')) - current += self.unlexer.O() - current += self.unlexer.R() - return current - OR.min_depth = 1 - - @depthcontrol - def ORDER(self): - current = self.create_node(UnlexerRule(name='ORDER')) - current += self.unlexer.O() - current += self.unlexer.R() - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.R() - return current - ORDER.min_depth = 1 - - @depthcontrol - def OUTER(self): - current = self.create_node(UnlexerRule(name='OUTER')) - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - return current - OUTER.min_depth = 1 - - @depthcontrol - def OUTFILE(self): - current = self.create_node(UnlexerRule(name='OUTFILE')) - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.T() - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.L() - current += self.unlexer.E() - return current - OUTFILE.min_depth = 1 - - @depthcontrol - def PARTITION(self): - current = self.create_node(UnlexerRule(name='PARTITION')) - current += self.unlexer.P() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.O() - current 
+= self.unlexer.N() - return current - PARTITION.min_depth = 1 - - @depthcontrol - def PREWHERE(self): - current = self.create_node(UnlexerRule(name='PREWHERE')) - current += self.unlexer.P() - current += self.unlexer.R() - current += self.unlexer.E() - current += self.unlexer.W() - current += self.unlexer.H() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.E() - return current - PREWHERE.min_depth = 1 - - @depthcontrol - def PRIMARY(self): - current = self.create_node(UnlexerRule(name='PRIMARY')) - current += self.unlexer.P() - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.M() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.Y() - return current - PRIMARY.min_depth = 1 - - @depthcontrol - def QUARTER(self): - current = self.create_node(UnlexerRule(name='QUARTER')) - current += self.unlexer.Q() - current += self.unlexer.U() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - return current - QUARTER.min_depth = 1 - - @depthcontrol - def RIGHT(self): - current = self.create_node(UnlexerRule(name='RIGHT')) - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.G() - current += self.unlexer.H() - current += self.unlexer.T() - return current - RIGHT.min_depth = 1 - - @depthcontrol - def SAMPLE(self): - current = self.create_node(UnlexerRule(name='SAMPLE')) - current += self.unlexer.S() - current += self.unlexer.A() - current += self.unlexer.M() - current += self.unlexer.P() - current += self.unlexer.L() - current += self.unlexer.E() - return current - SAMPLE.min_depth = 1 - - @depthcontrol - def SECOND(self): - current = self.create_node(UnlexerRule(name='SECOND')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.C() - current += self.unlexer.O() - current += self.unlexer.N() - current += self.unlexer.D() - return current - SECOND.min_depth = 1 - - @depthcontrol - def SELECT(self): - current = self.create_node(UnlexerRule(name='SELECT')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.C() - current += self.unlexer.T() - return current - SELECT.min_depth = 1 - - @depthcontrol - def SEMI(self): - current = self.create_node(UnlexerRule(name='SEMI')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.M() - current += self.unlexer.I() - return current - SEMI.min_depth = 1 - - @depthcontrol - def SET(self): - current = self.create_node(UnlexerRule(name='SET')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.T() - return current - SET.min_depth = 1 - - @depthcontrol - def SETTINGS(self): - current = self.create_node(UnlexerRule(name='SETTINGS')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.T() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - current += self.unlexer.S() - return current - SETTINGS.min_depth = 1 - - @depthcontrol - def TABLE(self): - current = self.create_node(UnlexerRule(name='TABLE')) - current += self.unlexer.T() - current += self.unlexer.A() - current += self.unlexer.B() - current += self.unlexer.L() - current += self.unlexer.E() - return current - TABLE.min_depth = 1 - - @depthcontrol - def TEMPORARY(self): - current = 
self.create_node(UnlexerRule(name='TEMPORARY')) - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.M() - current += self.unlexer.P() - current += self.unlexer.O() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.Y() - return current - TEMPORARY.min_depth = 1 - - @depthcontrol - def THEN(self): - current = self.create_node(UnlexerRule(name='THEN')) - current += self.unlexer.T() - current += self.unlexer.H() - current += self.unlexer.E() - current += self.unlexer.N() - return current - THEN.min_depth = 1 - - @depthcontrol - def TO(self): - current = self.create_node(UnlexerRule(name='TO')) - current += self.unlexer.T() - current += self.unlexer.O() - return current - TO.min_depth = 1 - - @depthcontrol - def TOTALS(self): - current = self.create_node(UnlexerRule(name='TOTALS')) - current += self.unlexer.T() - current += self.unlexer.O() - current += self.unlexer.T() - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.S() - return current - TOTALS.min_depth = 1 - - @depthcontrol - def TRAILING(self): - current = self.create_node(UnlexerRule(name='TRAILING')) - current += self.unlexer.T() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.I() - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - TRAILING.min_depth = 1 - - @depthcontrol - def TRIM(self): - current = self.create_node(UnlexerRule(name='TRIM')) - current += self.unlexer.T() - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.M() - return current - TRIM.min_depth = 1 - - @depthcontrol - def TTL(self): - current = self.create_node(UnlexerRule(name='TTL')) - current += self.unlexer.T() - current += self.unlexer.T() - current += self.unlexer.L() - return current - TTL.min_depth = 1 - - @depthcontrol - def UNION(self): - current = self.create_node(UnlexerRule(name='UNION')) - current += self.unlexer.U() - current += self.unlexer.N() - current += self.unlexer.I() - current += self.unlexer.O() - current += self.unlexer.N() - return current - UNION.min_depth = 1 - - @depthcontrol - def USING(self): - current = self.create_node(UnlexerRule(name='USING')) - current += self.unlexer.U() - current += self.unlexer.S() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - USING.min_depth = 1 - - @depthcontrol - def VALUES(self): - current = self.create_node(UnlexerRule(name='VALUES')) - current += self.unlexer.V() - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.U() - current += self.unlexer.E() - current += self.unlexer.S() - return current - VALUES.min_depth = 1 - - @depthcontrol - def VOLUME(self): - current = self.create_node(UnlexerRule(name='VOLUME')) - current += self.unlexer.V() - current += self.unlexer.O() - current += self.unlexer.L() - current += self.unlexer.U() - current += self.unlexer.M() - current += self.unlexer.E() - return current - VOLUME.min_depth = 1 - - @depthcontrol - def WEEK(self): - current = self.create_node(UnlexerRule(name='WEEK')) - current += self.unlexer.W() - current += self.unlexer.E() - current += self.unlexer.E() - current += self.unlexer.K() - return current - WEEK.min_depth = 1 - - @depthcontrol - def WHEN(self): - current = self.create_node(UnlexerRule(name='WHEN')) - current += self.unlexer.W() - current += self.unlexer.H() - 
current += self.unlexer.E() - current += self.unlexer.N() - return current - WHEN.min_depth = 1 - - @depthcontrol - def WHERE(self): - current = self.create_node(UnlexerRule(name='WHERE')) - current += self.unlexer.W() - current += self.unlexer.H() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.E() - return current - WHERE.min_depth = 1 - - @depthcontrol - def WITH(self): - current = self.create_node(UnlexerRule(name='WITH')) - current += self.unlexer.W() - current += self.unlexer.I() - current += self.unlexer.T() - current += self.unlexer.H() - return current - WITH.min_depth = 1 - - @depthcontrol - def YEAR(self): - current = self.create_node(UnlexerRule(name='YEAR')) - current += self.unlexer.Y() - current += self.unlexer.E() - current += self.unlexer.A() - current += self.unlexer.R() - return current - YEAR.min_depth = 1 - - @depthcontrol - def IDENTIFIER(self): - current = self.create_node(UnlexerRule(name='IDENTIFIER')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_18', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_18', choice)] = self.unlexer.weights.get(('alt_18', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LETTER() - elif choice == 1: - current += self.unlexer.UNDERSCORE() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_more(): - choice = self.choice([0 if [1, 1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_22', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_22', choice)] = self.unlexer.weights.get(('alt_22', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LETTER() - elif choice == 1: - current += self.unlexer.UNDERSCORE() - elif choice == 2: - current += self.unlexer.DEC_DIGIT() - - return current - IDENTIFIER.min_depth = 1 - - @depthcontrol - def FLOATING_LITERAL(self): - current = self.create_node(UnlexerRule(name='FLOATING_LITERAL')) - choice = self.choice([0 if [2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_26', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_26', choice)] = self.unlexer.weights.get(('alt_26', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.HEXADECIMAL_LITERAL() - current += self.unlexer.DOT() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_more(): - current += self.unlexer.HEX_DIGIT() - - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_33', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_33', choice)] = self.unlexer.weights.get(('alt_33', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.P() - elif choice == 1: - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_37', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_37', choice)] = self.unlexer.weights.get(('alt_37', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - - elif choice == 1: - current += self.unlexer.HEXADECIMAL_LITERAL() - choice = 
self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_40', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_40', choice)] = self.unlexer.weights.get(('alt_40', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.P() - elif choice == 1: - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_44', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_44', choice)] = self.unlexer.weights.get(('alt_44', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - elif choice == 2: - current += self.unlexer.INTEGER_LITERAL() - current += self.unlexer.DOT() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_more(): - current += self.unlexer.DEC_DIGIT() - - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_50', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_50', choice)] = self.unlexer.weights.get(('alt_50', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - - elif choice == 3: - current += self.unlexer.INTEGER_LITERAL() - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_54', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_54', choice)] = self.unlexer.weights.get(('alt_54', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - return current - FLOATING_LITERAL.min_depth = 2 - - @depthcontrol - def HEXADECIMAL_LITERAL(self): - current = self.create_node(UnlexerRule(name='HEXADECIMAL_LITERAL')) - current += self.create_node(UnlexerRule(src='0')) - current += self.unlexer.X() - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.HEX_DIGIT() - - return current - HEXADECIMAL_LITERAL.min_depth = 1 - - @depthcontrol - def INTEGER_LITERAL(self): - current = self.create_node(UnlexerRule(name='INTEGER_LITERAL')) - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - return current - INTEGER_LITERAL.min_depth = 1 - - @depthcontrol - def STRING_LITERAL(self): - current = self.create_node(UnlexerRule(name='STRING_LITERAL')) - current += self.unlexer.QUOTE_SINGLE() - if self.unlexer.max_depth >= 0: - for _ in self.zero_or_more(): - choice = self.choice([0 if [0, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_59', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_59', choice)] = self.unlexer.weights.get(('alt_59', choice), 1) * self.unlexer.cooldown - if choice == 0: - 
current += UnlexerRule(src=self.char_from_list(charset_0)) - elif choice == 1: - current += self.unlexer.BACKSLASH() - current += UnlexerRule(src=self.any_char()) - - current += self.unlexer.QUOTE_SINGLE() - return current - STRING_LITERAL.min_depth = 1 - - @depthcontrol - def A(self): - current = self.create_node(UnlexerRule(name='A')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_1))) - return current - A.min_depth = 0 - - @depthcontrol - def B(self): - current = self.create_node(UnlexerRule(name='B')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_2))) - return current - B.min_depth = 0 - - @depthcontrol - def C(self): - current = self.create_node(UnlexerRule(name='C')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_3))) - return current - C.min_depth = 0 - - @depthcontrol - def D(self): - current = self.create_node(UnlexerRule(name='D')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_4))) - return current - D.min_depth = 0 - - @depthcontrol - def E(self): - current = self.create_node(UnlexerRule(name='E')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_5))) - return current - E.min_depth = 0 - - @depthcontrol - def F(self): - current = self.create_node(UnlexerRule(name='F')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_6))) - return current - F.min_depth = 0 - - @depthcontrol - def G(self): - current = self.create_node(UnlexerRule(name='G')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_7))) - return current - G.min_depth = 0 - - @depthcontrol - def H(self): - current = self.create_node(UnlexerRule(name='H')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_8))) - return current - H.min_depth = 0 - - @depthcontrol - def I(self): - current = self.create_node(UnlexerRule(name='I')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_9))) - return current - I.min_depth = 0 - - @depthcontrol - def J(self): - current = self.create_node(UnlexerRule(name='J')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_10))) - return current - J.min_depth = 0 - - @depthcontrol - def K(self): - current = self.create_node(UnlexerRule(name='K')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_11))) - return current - K.min_depth = 0 - - @depthcontrol - def L(self): - current = self.create_node(UnlexerRule(name='L')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_12))) - return current - L.min_depth = 0 - - @depthcontrol - def M(self): - current = self.create_node(UnlexerRule(name='M')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_13))) - return current - M.min_depth = 0 - - @depthcontrol - def N(self): - current = self.create_node(UnlexerRule(name='N')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_14))) - return current - N.min_depth = 0 - - @depthcontrol - def O(self): - current = self.create_node(UnlexerRule(name='O')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_15))) - return current - O.min_depth = 0 - - @depthcontrol - def P(self): - current = self.create_node(UnlexerRule(name='P')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_16))) - return current - P.min_depth = 0 - - @depthcontrol - def Q(self): - current = self.create_node(UnlexerRule(name='Q')) - current += 
self.create_node(UnlexerRule(src=self.char_from_list(charset_17))) - return current - Q.min_depth = 0 - - @depthcontrol - def R(self): - current = self.create_node(UnlexerRule(name='R')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_18))) - return current - R.min_depth = 0 - - @depthcontrol - def S(self): - current = self.create_node(UnlexerRule(name='S')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_19))) - return current - S.min_depth = 0 - - @depthcontrol - def T(self): - current = self.create_node(UnlexerRule(name='T')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_20))) - return current - T.min_depth = 0 - - @depthcontrol - def U(self): - current = self.create_node(UnlexerRule(name='U')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_21))) - return current - U.min_depth = 0 - - @depthcontrol - def V(self): - current = self.create_node(UnlexerRule(name='V')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_22))) - return current - V.min_depth = 0 - - @depthcontrol - def W(self): - current = self.create_node(UnlexerRule(name='W')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_23))) - return current - W.min_depth = 0 - - @depthcontrol - def X(self): - current = self.create_node(UnlexerRule(name='X')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_24))) - return current - X.min_depth = 0 - - @depthcontrol - def Y(self): - current = self.create_node(UnlexerRule(name='Y')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_25))) - return current - Y.min_depth = 0 - - @depthcontrol - def Z(self): - current = self.create_node(UnlexerRule(name='Z')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_26))) - return current - Z.min_depth = 0 - - @depthcontrol - def LETTER(self): - current = self.create_node(UnlexerRule(name='LETTER')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_27))) - return current - LETTER.min_depth = 0 - - @depthcontrol - def DEC_DIGIT(self): - current = self.create_node(UnlexerRule(name='DEC_DIGIT')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_28))) - return current - DEC_DIGIT.min_depth = 0 - - @depthcontrol - def HEX_DIGIT(self): - current = self.create_node(UnlexerRule(name='HEX_DIGIT')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_29))) - return current - HEX_DIGIT.min_depth = 0 - - @depthcontrol - def ARROW(self): - current = self.create_node(UnlexerRule(name='ARROW')) - current += self.create_node(UnlexerRule(src='->')) - return current - ARROW.min_depth = 0 - - @depthcontrol - def ASTERISK(self): - current = self.create_node(UnlexerRule(name='ASTERISK')) - current += self.create_node(UnlexerRule(src='*')) - return current - ASTERISK.min_depth = 0 - - @depthcontrol - def BACKQUOTE(self): - current = self.create_node(UnlexerRule(name='BACKQUOTE')) - current += self.create_node(UnlexerRule(src='`')) - return current - BACKQUOTE.min_depth = 0 - - @depthcontrol - def BACKSLASH(self): - current = self.create_node(UnlexerRule(name='BACKSLASH')) - current += self.create_node(UnlexerRule(src='\\')) - return current - BACKSLASH.min_depth = 0 - - @depthcontrol - def COLON(self): - current = self.create_node(UnlexerRule(name='COLON')) - current += self.create_node(UnlexerRule(src=':')) - return current - COLON.min_depth = 0 - - @depthcontrol - def 
COMMA(self): - current = self.create_node(UnlexerRule(name='COMMA')) - current += self.create_node(UnlexerRule(src=',')) - return current - COMMA.min_depth = 0 - - @depthcontrol - def CONCAT(self): - current = self.create_node(UnlexerRule(name='CONCAT')) - current += self.create_node(UnlexerRule(src='||')) - return current - CONCAT.min_depth = 0 - - @depthcontrol - def DASH(self): - current = self.create_node(UnlexerRule(name='DASH')) - current += self.create_node(UnlexerRule(src='-')) - return current - DASH.min_depth = 0 - - @depthcontrol - def DOT(self): - current = self.create_node(UnlexerRule(name='DOT')) - current += self.create_node(UnlexerRule(src='.')) - return current - DOT.min_depth = 0 - - @depthcontrol - def EQ_DOUBLE(self): - current = self.create_node(UnlexerRule(name='EQ_DOUBLE')) - current += self.create_node(UnlexerRule(src='==')) - return current - EQ_DOUBLE.min_depth = 0 - - @depthcontrol - def EQ_SINGLE(self): - current = self.create_node(UnlexerRule(name='EQ_SINGLE')) - current += self.create_node(UnlexerRule(src='=')) - return current - EQ_SINGLE.min_depth = 0 - - @depthcontrol - def GE(self): - current = self.create_node(UnlexerRule(name='GE')) - current += self.create_node(UnlexerRule(src='>=')) - return current - GE.min_depth = 0 - - @depthcontrol - def GT(self): - current = self.create_node(UnlexerRule(name='GT')) - current += self.create_node(UnlexerRule(src='>')) - return current - GT.min_depth = 0 - - @depthcontrol - def LBRACKET(self): - current = self.create_node(UnlexerRule(name='LBRACKET')) - current += self.create_node(UnlexerRule(src='[')) - return current - LBRACKET.min_depth = 0 - - @depthcontrol - def LE(self): - current = self.create_node(UnlexerRule(name='LE')) - current += self.create_node(UnlexerRule(src='<=')) - return current - LE.min_depth = 0 - - @depthcontrol - def LPAREN(self): - current = self.create_node(UnlexerRule(name='LPAREN')) - current += self.create_node(UnlexerRule(src='(')) - return current - LPAREN.min_depth = 0 - - @depthcontrol - def LT(self): - current = self.create_node(UnlexerRule(name='LT')) - current += self.create_node(UnlexerRule(src='<')) - return current - LT.min_depth = 0 - - @depthcontrol - def NOT_EQ(self): - current = self.create_node(UnlexerRule(name='NOT_EQ')) - choice = self.choice([0 if [0, 0][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_79', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_79', choice)] = self.unlexer.weights.get(('alt_79', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.create_node(UnlexerRule(src='!=')) - elif choice == 1: - current += self.create_node(UnlexerRule(src='<>')) - return current - NOT_EQ.min_depth = 0 - - @depthcontrol - def PERCENT(self): - current = self.create_node(UnlexerRule(name='PERCENT')) - current += self.create_node(UnlexerRule(src='%')) - return current - PERCENT.min_depth = 0 - - @depthcontrol - def PLUS(self): - current = self.create_node(UnlexerRule(name='PLUS')) - current += self.create_node(UnlexerRule(src='+')) - return current - PLUS.min_depth = 0 - - @depthcontrol - def QUERY(self): - current = self.create_node(UnlexerRule(name='QUERY')) - current += self.create_node(UnlexerRule(src='?')) - return current - QUERY.min_depth = 0 - - @depthcontrol - def QUOTE_SINGLE(self): - current = self.create_node(UnlexerRule(name='QUOTE_SINGLE')) - current += self.create_node(UnlexerRule(src='\'')) - return current - QUOTE_SINGLE.min_depth = 0 - - @depthcontrol - def RBRACKET(self): - current = 
self.create_node(UnlexerRule(name='RBRACKET')) - current += self.create_node(UnlexerRule(src=']')) - return current - RBRACKET.min_depth = 0 - - @depthcontrol - def RPAREN(self): - current = self.create_node(UnlexerRule(name='RPAREN')) - current += self.create_node(UnlexerRule(src=')')) - return current - RPAREN.min_depth = 0 - - @depthcontrol - def SEMICOLON(self): - current = self.create_node(UnlexerRule(name='SEMICOLON')) - current += self.create_node(UnlexerRule(src=';')) - return current - SEMICOLON.min_depth = 0 - - @depthcontrol - def SLASH(self): - current = self.create_node(UnlexerRule(name='SLASH')) - current += self.create_node(UnlexerRule(src='/')) - return current - SLASH.min_depth = 0 - - @depthcontrol - def UNDERSCORE(self): - current = self.create_node(UnlexerRule(name='UNDERSCORE')) - current += self.create_node(UnlexerRule(src='_')) - return current - UNDERSCORE.min_depth = 0 - - @depthcontrol - def SINGLE_LINE_COMMENT(self): - current = self.create_node(UnlexerRule(name='SINGLE_LINE_COMMENT')) - current += self.create_node(UnlexerRule(src='--')) - if self.unlexer.max_depth >= 0: - for _ in self.zero_or_more(): - current += UnlexerRule(src=self.char_from_list(charset_30)) - - choice = self.choice([0 if [0, 0, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_95', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_95', choice)] = self.unlexer.weights.get(('alt_95', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.create_node(UnlexerRule(src='\n')) - elif choice == 1: - current += self.create_node(UnlexerRule(src='\r')) - elif choice == 2: - current += self.unlexer.EOF() - return current - SINGLE_LINE_COMMENT.min_depth = 0 - - @depthcontrol - def MULTI_LINE_COMMENT(self): - current = self.create_node(UnlexerRule(name='MULTI_LINE_COMMENT')) - current += self.create_node(UnlexerRule(src='/*')) - if self.unlexer.max_depth >= 0: - for _ in self.zero_or_more(): - current += UnlexerRule(src=self.any_char()) - - current += self.create_node(UnlexerRule(src='*/')) - return current - MULTI_LINE_COMMENT.min_depth = 0 - - @depthcontrol - def WHITESPACE(self): - current = self.create_node(UnlexerRule(name='WHITESPACE')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_31))) - return current - WHITESPACE.min_depth = 0 - diff --git a/utils/grammar-fuzzer/ClickHouseUnparser.py b/utils/grammar-fuzzer/ClickHouseUnparser.py deleted file mode 100644 index 7fa5eb96d31..00000000000 --- a/utils/grammar-fuzzer/ClickHouseUnparser.py +++ /dev/null @@ -1,1815 +0,0 @@ -# Generated by Grammarinator 19.3 - -from itertools import chain -from grammarinator.runtime import * - -import ClickHouseUnlexer - - -class ClickHouseUnparser(Grammarinator): - - def __init__(self, unlexer): - super(ClickHouseUnparser, self).__init__() - self.unlexer = unlexer - @depthcontrol - def queryList(self): - current = self.create_node(UnparserRule(name='queryList')) - current += self.queryStmt() - if self.unlexer.max_depth >= 8: - for _ in self.zero_or_more(): - current += self.unlexer.SEMICOLON() - current += self.queryStmt() - - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.SEMICOLON() - - current += self.unlexer.EOF() - return current - queryList.min_depth = 8 - - @depthcontrol - def queryStmt(self): - current = self.create_node(UnparserRule(name='queryStmt')) - current += self.query() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.INTO() - current += 
self.unlexer.OUTFILE() - current += self.unlexer.STRING_LITERAL() - - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.FORMAT() - current += self.identifier() - - return current - queryStmt.min_depth = 7 - - @depthcontrol - def query(self): - current = self.create_node(UnparserRule(name='query')) - choice = self.choice([0 if [6, 7, 6, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_108', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_108', choice)] = self.unlexer.weights.get(('alt_108', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.distributedStmt() - elif choice == 1: - current += self.insertStmt() - elif choice == 2: - current += self.selectUnionStmt() - elif choice == 3: - current += self.setStmt() - return current - query.min_depth = 6 - - @depthcontrol - def distributedStmt(self): - current = self.create_node(UnparserRule(name='distributedStmt')) - choice = self.choice([0 if [5, 6, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_113', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_113', choice)] = self.unlexer.weights.get(('alt_113', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.createDatabaseStmt() - elif choice == 1: - current += self.createTableStmt() - elif choice == 2: - current += self.dropStmt() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.ON() - current += self.unlexer.CLUSTER() - current += self.identifier() - - return current - distributedStmt.min_depth = 5 - - @depthcontrol - def createDatabaseStmt(self): - current = self.create_node(UnparserRule(name='createDatabaseStmt')) - current += self.unlexer.CREATE() - current += self.unlexer.DATABASE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.IF() - current += self.unlexer.NOT() - current += self.unlexer.EXISTS() - - current += self.databaseIdentifier() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.engineExpr() - - return current - createDatabaseStmt.min_depth = 4 - - @depthcontrol - def createTableStmt(self): - current = self.create_node(UnparserRule(name='createTableStmt')) - current += self.unlexer.CREATE() - current += self.unlexer.TABLE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.IF() - current += self.unlexer.NOT() - current += self.unlexer.EXISTS() - - current += self.tableIdentifier() - current += self.schemaClause() - return current - createTableStmt.min_depth = 5 - - @depthcontrol - def schemaClause(self): - current = self.create_node(UnparserRule(name='schemaClause')) - choice = self.choice([0 if [8, 7, 5, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_121', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_121', choice)] = self.unlexer.weights.get(('alt_121', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.schemaClause_SchemaDescriptionClause() - elif choice == 1: - current = self.schemaClause_SchemaAsSubqueryClause() - elif choice == 2: - current = self.schemaClause_SchemaAsTableClause() - elif choice == 3: - current = self.schemaClause_SchemaAsFunctionClause() - return current - schemaClause.min_depth = 4 - - @depthcontrol - def schemaClause_SchemaDescriptionClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaDescriptionClause')) - current += 
self.unlexer.LPAREN() - current += self.tableElementExpr() - if self.unlexer.max_depth >= 7: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.tableElementExpr() - - current += self.unlexer.RPAREN() - current += self.engineClause() - return current - schemaClause_SchemaDescriptionClause.min_depth = 7 - - @depthcontrol - def schemaClause_SchemaAsSubqueryClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaAsSubqueryClause')) - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.engineClause() - - current += self.unlexer.AS() - current += self.selectUnionStmt() - return current - schemaClause_SchemaAsSubqueryClause.min_depth = 6 - - @depthcontrol - def schemaClause_SchemaAsTableClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaAsTableClause')) - current += self.unlexer.AS() - current += self.tableIdentifier() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.engineClause() - - return current - schemaClause_SchemaAsTableClause.min_depth = 4 - - @depthcontrol - def schemaClause_SchemaAsFunctionClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaAsFunctionClause')) - current += self.unlexer.AS() - current += self.identifier() - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.tableArgList() - - current += self.unlexer.RPAREN() - return current - schemaClause_SchemaAsFunctionClause.min_depth = 3 - - @depthcontrol - def engineClause(self): - current = self.create_node(UnparserRule(name='engineClause')) - current += self.engineExpr() - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.orderByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.partitionByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.primaryKeyClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.sampleByClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.ttlClause() - - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.settingsClause() - - return current - engineClause.min_depth = 4 - - @depthcontrol - def partitionByClause(self): - current = self.create_node(UnparserRule(name='partitionByClause')) - current += self.unlexer.PARTITION() - current += self.unlexer.BY() - current += self.columnExpr() - return current - partitionByClause.min_depth = 3 - - @depthcontrol - def primaryKeyClause(self): - current = self.create_node(UnparserRule(name='primaryKeyClause')) - current += self.unlexer.PRIMARY() - current += self.unlexer.KEY() - current += self.columnExpr() - return current - primaryKeyClause.min_depth = 3 - - @depthcontrol - def sampleByClause(self): - current = self.create_node(UnparserRule(name='sampleByClause')) - current += self.unlexer.SAMPLE() - current += self.unlexer.BY() - current += self.columnExpr() - return current - sampleByClause.min_depth = 3 - - @depthcontrol - def ttlClause(self): - current = self.create_node(UnparserRule(name='ttlClause')) - current += self.unlexer.TTL() - current += self.ttlExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.ttlExpr() - - return current - ttlClause.min_depth = 4 - - @depthcontrol - def engineExpr(self): - current = 
self.create_node(UnparserRule(name='engineExpr')) - current += self.unlexer.ENGINE() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.EQ_SINGLE() - - current += self.identifier() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.tableArgList() - - current += self.unlexer.RPAREN() - - return current - engineExpr.min_depth = 3 - - @depthcontrol - def tableElementExpr(self): - current = self.create_node(UnparserRule(name='tableElementExpr')) - current = self.tableElementExpr_TableElementColumn() - return current - tableElementExpr.min_depth = 6 - - @depthcontrol - def tableElementExpr_TableElementColumn(self): - current = self.create_node(UnparserRule(name='tableElementExpr_TableElementColumn')) - current += self.identifier() - current += self.columnTypeExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.tableColumnPropertyExpr() - - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.TTL() - current += self.columnExpr() - - return current - tableElementExpr_TableElementColumn.min_depth = 5 - - @depthcontrol - def tableColumnPropertyExpr(self): - current = self.create_node(UnparserRule(name='tableColumnPropertyExpr')) - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_142', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_142', choice)] = self.unlexer.weights.get(('alt_142', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.DEFAULT() - elif choice == 1: - current += self.unlexer.MATERIALIZED() - elif choice == 2: - current += self.unlexer.ALIAS() - current += self.columnExpr() - return current - tableColumnPropertyExpr.min_depth = 3 - - @depthcontrol - def ttlExpr(self): - current = self.create_node(UnparserRule(name='ttlExpr')) - current += self.columnExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_147', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_147', choice)] = self.unlexer.weights.get(('alt_147', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.DELETE() - elif choice == 1: - current += self.unlexer.TO() - current += self.unlexer.DISK() - current += self.unlexer.STRING_LITERAL() - elif choice == 2: - current += self.unlexer.TO() - current += self.unlexer.VOLUME() - current += self.unlexer.STRING_LITERAL() - - return current - ttlExpr.min_depth = 3 - - @depthcontrol - def dropStmt(self): - current = self.create_node(UnparserRule(name='dropStmt')) - choice = self.choice([0 if [5, 5][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_151', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_151', choice)] = self.unlexer.weights.get(('alt_151', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.dropStmt_DropDatabaseStmt() - elif choice == 1: - current = self.dropStmt_DropTableStmt() - return current - dropStmt.min_depth = 5 - - @depthcontrol - def dropStmt_DropDatabaseStmt(self): - current = self.create_node(UnparserRule(name='dropStmt_DropDatabaseStmt')) - current += self.unlexer.DROP() - current += self.unlexer.DATABASE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += 
self.unlexer.IF() - current += self.unlexer.EXISTS() - - current += self.databaseIdentifier() - return current - dropStmt_DropDatabaseStmt.min_depth = 4 - - @depthcontrol - def dropStmt_DropTableStmt(self): - current = self.create_node(UnparserRule(name='dropStmt_DropTableStmt')) - current += self.unlexer.DROP() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.TEMPORARY() - - current += self.unlexer.TABLE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.IF() - current += self.unlexer.EXISTS() - - current += self.tableIdentifier() - return current - dropStmt_DropTableStmt.min_depth = 4 - - @depthcontrol - def insertStmt(self): - current = self.create_node(UnparserRule(name='insertStmt')) - current += self.unlexer.INSERT() - current += self.unlexer.INTO() - current += self.tableIdentifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.LPAREN() - current += self.identifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.identifier() - - current += self.unlexer.RPAREN() - - current += self.valuesClause() - return current - insertStmt.min_depth = 6 - - @depthcontrol - def valuesClause(self): - current = self.create_node(UnparserRule(name='valuesClause')) - choice = self.choice([0 if [5, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_159', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_159', choice)] = self.unlexer.weights.get(('alt_159', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.VALUES() - current += self.valueTupleExpr() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.valueTupleExpr() - - elif choice == 1: - current += self.selectUnionStmt() - return current - valuesClause.min_depth = 5 - - @depthcontrol - def valueTupleExpr(self): - current = self.create_node(UnparserRule(name='valueTupleExpr')) - current += self.unlexer.LPAREN() - current += self.valueExprList() - current += self.unlexer.RPAREN() - return current - valueTupleExpr.min_depth = 4 - - @depthcontrol - def selectUnionStmt(self): - current = self.create_node(UnparserRule(name='selectUnionStmt')) - current += self.selectStmt() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_more(): - current += self.unlexer.UNION() - current += self.unlexer.ALL() - current += self.selectStmt() - - return current - selectUnionStmt.min_depth = 5 - - @depthcontrol - def selectStmt(self): - current = self.create_node(UnparserRule(name='selectStmt')) - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.withClause() - - current += self.unlexer.SELECT() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.DISTINCT() - - current += self.columnExprList() - if self.unlexer.max_depth >= 8: - for _ in self.zero_or_one(): - current += self.fromClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.sampleClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.arrayJoinClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.prewhereClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.whereClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += 
self.groupByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.havingClause() - - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.orderByClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.limitByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.limitClause() - - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.settingsClause() - - return current - selectStmt.min_depth = 4 - - @depthcontrol - def withClause(self): - current = self.create_node(UnparserRule(name='withClause')) - current += self.unlexer.WITH() - current += self.columnExprList() - return current - withClause.min_depth = 4 - - @depthcontrol - def fromClause(self): - current = self.create_node(UnparserRule(name='fromClause')) - current += self.unlexer.FROM() - current += self.joinExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.FINAL() - - return current - fromClause.min_depth = 7 - - @depthcontrol - def sampleClause(self): - current = self.create_node(UnparserRule(name='sampleClause')) - current += self.unlexer.SAMPLE() - current += self.ratioExpr() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.OFFSET() - current += self.ratioExpr() - - return current - sampleClause.min_depth = 3 - - @depthcontrol - def arrayJoinClause(self): - current = self.create_node(UnparserRule(name='arrayJoinClause')) - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.LEFT() - - current += self.unlexer.ARRAY() - current += self.unlexer.JOIN() - current += self.columnExprList() - return current - arrayJoinClause.min_depth = 4 - - @depthcontrol - def prewhereClause(self): - current = self.create_node(UnparserRule(name='prewhereClause')) - current += self.unlexer.PREWHERE() - current += self.columnExpr() - return current - prewhereClause.min_depth = 3 - - @depthcontrol - def whereClause(self): - current = self.create_node(UnparserRule(name='whereClause')) - current += self.unlexer.WHERE() - current += self.columnExpr() - return current - whereClause.min_depth = 3 - - @depthcontrol - def groupByClause(self): - current = self.create_node(UnparserRule(name='groupByClause')) - current += self.unlexer.GROUP() - current += self.unlexer.BY() - current += self.columnExprList() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.WITH() - current += self.unlexer.TOTALS() - - return current - groupByClause.min_depth = 4 - - @depthcontrol - def havingClause(self): - current = self.create_node(UnparserRule(name='havingClause')) - current += self.unlexer.HAVING() - current += self.columnExpr() - return current - havingClause.min_depth = 3 - - @depthcontrol - def orderByClause(self): - current = self.create_node(UnparserRule(name='orderByClause')) - current += self.unlexer.ORDER() - current += self.unlexer.BY() - current += self.orderExprList() - return current - orderByClause.min_depth = 5 - - @depthcontrol - def limitByClause(self): - current = self.create_node(UnparserRule(name='limitByClause')) - current += self.unlexer.LIMIT() - current += self.limitExpr() - current += self.unlexer.BY() - current += self.columnExprList() - return current - limitByClause.min_depth = 4 - - @depthcontrol - def limitClause(self): - current = self.create_node(UnparserRule(name='limitClause')) - current += self.unlexer.LIMIT() - 
current += self.limitExpr() - return current - limitClause.min_depth = 3 - - @depthcontrol - def settingsClause(self): - current = self.create_node(UnparserRule(name='settingsClause')) - current += self.unlexer.SETTINGS() - current += self.settingExprList() - return current - settingsClause.min_depth = 5 - - @depthcontrol - def joinExpr(self): - current = self.create_node(UnparserRule(name='joinExpr')) - choice = self.choice([0 if [6, 8, 8, 8][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_181', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_181', choice)] = self.unlexer.weights.get(('alt_181', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.joinExpr_JoinExprTable() - elif choice == 1: - current = self.joinExpr_JoinExprParens() - elif choice == 2: - current = self.joinExpr_JoinExprOp() - elif choice == 3: - current = self.joinExpr_JoinExprCrossOp() - return current - joinExpr.min_depth = 6 - - @depthcontrol - def joinExpr_JoinExprTable(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprTable')) - current += self.tableExpr() - return current - joinExpr_JoinExprTable.min_depth = 5 - - @depthcontrol - def joinExpr_JoinExprParens(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprParens')) - current += self.unlexer.LPAREN() - current += self.joinExpr() - current += self.unlexer.RPAREN() - return current - joinExpr_JoinExprParens.min_depth = 7 - - @depthcontrol - def joinExpr_JoinExprOp(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprOp')) - current += self.joinExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_187', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_187', choice)] = self.unlexer.weights.get(('alt_187', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.GLOBAL() - elif choice == 1: - current += self.unlexer.LOCAL() - - current += self.joinOp() - current += self.unlexer.JOIN() - current += self.joinExpr() - current += self.joinConstraintClause() - return current - joinExpr_JoinExprOp.min_depth = 7 - - @depthcontrol - def joinExpr_JoinExprCrossOp(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprCrossOp')) - current += self.joinExpr() - current += self.joinOpCross() - current += self.joinExpr() - return current - joinExpr_JoinExprCrossOp.min_depth = 7 - - @depthcontrol - def joinOp(self): - current = self.create_node(UnparserRule(name='joinOp')) - choice = self.choice([0 if [3, 3, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_190', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_190', choice)] = self.unlexer.weights.get(('alt_190', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.joinOp_JoinOpInner() - elif choice == 1: - current = self.joinOp_JoinOpLeftRight() - elif choice == 2: - current = self.joinOp_JoinOpFull() - return current - joinOp.min_depth = 3 - - @depthcontrol - def joinOp_JoinOpInner(self): - current = self.create_node(UnparserRule(name='joinOp_JoinOpInner')) - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_194', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_194', choice)] = self.unlexer.weights.get(('alt_194', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: 
- for _ in self.zero_or_one(): - current += self.unlexer.ANY() - - current += self.unlexer.INNER() - elif choice == 1: - current += self.unlexer.INNER() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.ANY() - - return current - joinOp_JoinOpInner.min_depth = 2 - - @depthcontrol - def joinOp_JoinOpLeftRight(self): - current = self.create_node(UnparserRule(name='joinOp_JoinOpLeftRight')) - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_199', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_199', choice)] = self.unlexer.weights.get(('alt_199', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_203', i), 1) for i, w in enumerate([1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_203', choice)] = self.unlexer.weights.get(('alt_203', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.SEMI() - elif choice == 2: - current += self.unlexer.ANTI() - elif choice == 3: - current += self.unlexer.ANY() - elif choice == 4: - current += self.unlexer.ASOF() - - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_209', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_209', choice)] = self.unlexer.weights.get(('alt_209', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LEFT() - elif choice == 1: - current += self.unlexer.RIGHT() - elif choice == 1: - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_212', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_212', choice)] = self.unlexer.weights.get(('alt_212', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LEFT() - elif choice == 1: - current += self.unlexer.RIGHT() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_216', i), 1) for i, w in enumerate([1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_216', choice)] = self.unlexer.weights.get(('alt_216', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.SEMI() - elif choice == 2: - current += self.unlexer.ANTI() - elif choice == 3: - current += self.unlexer.ANY() - elif choice == 4: - current += self.unlexer.ASOF() - - return current - joinOp_JoinOpLeftRight.min_depth = 2 - - @depthcontrol - def joinOp_JoinOpFull(self): - current = self.create_node(UnparserRule(name='joinOp_JoinOpFull')) - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_222', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_222', choice)] = self.unlexer.weights.get(('alt_222', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_226', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_226', choice)] = self.unlexer.weights.get(('alt_226', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += 
self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.ANY() - - current += self.unlexer.FULL() - elif choice == 1: - current += self.unlexer.FULL() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_230', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_230', choice)] = self.unlexer.weights.get(('alt_230', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.ANY() - - return current - joinOp_JoinOpFull.min_depth = 2 - - @depthcontrol - def joinOpCross(self): - current = self.create_node(UnparserRule(name='joinOpCross')) - choice = self.choice([0 if [2, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_233', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_233', choice)] = self.unlexer.weights.get(('alt_233', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_237', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_237', choice)] = self.unlexer.weights.get(('alt_237', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.GLOBAL() - elif choice == 1: - current += self.unlexer.LOCAL() - - current += self.unlexer.CROSS() - current += self.unlexer.JOIN() - elif choice == 1: - current += self.unlexer.COMMA() - return current - joinOpCross.min_depth = 1 - - @depthcontrol - def joinConstraintClause(self): - current = self.create_node(UnparserRule(name='joinConstraintClause')) - choice = self.choice([0 if [4, 4, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_240', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_240', choice)] = self.unlexer.weights.get(('alt_240', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.ON() - current += self.columnExprList() - elif choice == 1: - current += self.unlexer.USING() - current += self.unlexer.LPAREN() - current += self.columnExprList() - current += self.unlexer.RPAREN() - elif choice == 2: - current += self.unlexer.USING() - current += self.columnExprList() - return current - joinConstraintClause.min_depth = 4 - - @depthcontrol - def limitExpr(self): - current = self.create_node(UnparserRule(name='limitExpr')) - current += self.unlexer.INTEGER_LITERAL() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_245', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_245', choice)] = self.unlexer.weights.get(('alt_245', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.COMMA() - elif choice == 1: - current += self.unlexer.OFFSET() - current += self.unlexer.INTEGER_LITERAL() - - return current - limitExpr.min_depth = 2 - - @depthcontrol - def orderExprList(self): - current = self.create_node(UnparserRule(name='orderExprList')) - current += self.orderExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.orderExpr() - - return current - orderExprList.min_depth = 4 - - @depthcontrol - def orderExpr(self): - current = self.create_node(UnparserRule(name='orderExpr')) - current += 
self.columnExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_250', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_250', choice)] = self.unlexer.weights.get(('alt_250', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.ASCENDING() - elif choice == 1: - current += self.unlexer.DESCENDING() - - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NULLS() - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_254', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_254', choice)] = self.unlexer.weights.get(('alt_254', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.FIRST() - elif choice == 1: - current += self.unlexer.LAST() - - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.COLLATE() - current += self.unlexer.STRING_LITERAL() - - return current - orderExpr.min_depth = 3 - - @depthcontrol - def ratioExpr(self): - current = self.create_node(UnparserRule(name='ratioExpr')) - current += self.unlexer.INTEGER_LITERAL() - current += self.unlexer.SLASH() - current += self.unlexer.INTEGER_LITERAL() - return current - ratioExpr.min_depth = 2 - - @depthcontrol - def settingExprList(self): - current = self.create_node(UnparserRule(name='settingExprList')) - current += self.settingExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.settingExpr() - - return current - settingExprList.min_depth = 4 - - @depthcontrol - def settingExpr(self): - current = self.create_node(UnparserRule(name='settingExpr')) - current += self.identifier() - current += self.unlexer.EQ_SINGLE() - current += self.literal() - return current - settingExpr.min_depth = 3 - - @depthcontrol - def setStmt(self): - current = self.create_node(UnparserRule(name='setStmt')) - current += self.unlexer.SET() - current += self.settingExprList() - return current - setStmt.min_depth = 5 - - @depthcontrol - def valueExprList(self): - current = self.create_node(UnparserRule(name='valueExprList')) - current += self.valueExpr() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.valueExpr() - - return current - valueExprList.min_depth = 3 - - @depthcontrol - def valueExpr(self): - current = self.create_node(UnparserRule(name='valueExpr')) - choice = self.choice([0 if [4, 6, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_260', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_260', choice)] = self.unlexer.weights.get(('alt_260', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.valueExpr_ValueExprLiteral() - elif choice == 1: - current = self.valueExpr_ValueExprTuple() - elif choice == 2: - current = self.valueExpr_ValueExprArray() - return current - valueExpr.min_depth = 2 - - @depthcontrol - def valueExpr_ValueExprLiteral(self): - current = self.create_node(UnparserRule(name='valueExpr_ValueExprLiteral')) - current += self.literal() - return current - valueExpr_ValueExprLiteral.min_depth = 3 - - @depthcontrol - def valueExpr_ValueExprTuple(self): - current = self.create_node(UnparserRule(name='valueExpr_ValueExprTuple')) - current += self.valueTupleExpr() - return current - 
valueExpr_ValueExprTuple.min_depth = 5 - - @depthcontrol - def valueExpr_ValueExprArray(self): - current = self.create_node(UnparserRule(name='valueExpr_ValueExprArray')) - current += self.unlexer.LBRACKET() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.valueExprList() - - current += self.unlexer.RBRACKET() - return current - valueExpr_ValueExprArray.min_depth = 1 - - @depthcontrol - def columnTypeExpr(self): - current = self.create_node(UnparserRule(name='columnTypeExpr')) - choice = self.choice([0 if [4, 5, 4, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_265', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_265', choice)] = self.unlexer.weights.get(('alt_265', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.columnTypeExpr_ColumnTypeExprSimple() - elif choice == 1: - current = self.columnTypeExpr_ColumnTypeExprParam() - elif choice == 2: - current = self.columnTypeExpr_ColumnTypeExprEnum() - elif choice == 3: - current = self.columnTypeExpr_ColumnTypeExprComplex() - return current - columnTypeExpr.min_depth = 4 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprSimple(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprSimple')) - current += self.identifier() - return current - columnTypeExpr_ColumnTypeExprSimple.min_depth = 3 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprParam(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprParam')) - current += self.identifier() - current += self.unlexer.LPAREN() - current += self.columnParamList() - current += self.unlexer.RPAREN() - return current - columnTypeExpr_ColumnTypeExprParam.min_depth = 4 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprEnum(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprEnum')) - current += self.identifier() - current += self.unlexer.LPAREN() - current += self.enumValue() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.enumValue() - - current += self.unlexer.RPAREN() - return current - columnTypeExpr_ColumnTypeExprEnum.min_depth = 3 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprComplex(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprComplex')) - current += self.identifier() - current += self.unlexer.LPAREN() - current += self.columnTypeExpr() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.columnTypeExpr() - - current += self.unlexer.RPAREN() - return current - columnTypeExpr_ColumnTypeExprComplex.min_depth = 5 - - @depthcontrol - def columnExprList(self): - current = self.create_node(UnparserRule(name='columnExprList')) - current += self.columnExpr() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.columnExpr() - - return current - columnExprList.min_depth = 3 - - @depthcontrol - def columnExpr(self): - current = self.create_node(UnparserRule(name='columnExpr')) - choice = self.choice([0 if [4, 2, 5, 2, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_273', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_273', choice)] = self.unlexer.weights.get(('alt_273', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = 
self.columnExpr_ColumnExprLiteral() - elif choice == 1: - current = self.columnExpr_ColumnExprAsterisk() - elif choice == 2: - current = self.columnExpr_ColumnExprTuple() - elif choice == 3: - current = self.columnExpr_ColumnExprArray() - elif choice == 4: - current = self.columnExpr_ColumnExprCase() - elif choice == 5: - current = self.columnExpr_ColumnExprExtract() - elif choice == 6: - current = self.columnExpr_ColumnExprTrim() - elif choice == 7: - current = self.columnExpr_ColumnExprInterval() - elif choice == 8: - current = self.columnExpr_ColumnExprIdentifier() - elif choice == 9: - current = self.columnExpr_ColumnExprFunction() - elif choice == 10: - current = self.columnExpr_ColumnExprArrayAccess() - elif choice == 11: - current = self.columnExpr_ColumnExprTupleAccess() - elif choice == 12: - current = self.columnExpr_ColumnExprUnaryOp() - elif choice == 13: - current = self.columnExpr_ColumnExprIsNull() - elif choice == 14: - current = self.columnExpr_ColumnExprBinaryOp() - elif choice == 15: - current = self.columnExpr_ColumnExprTernaryOp() - elif choice == 16: - current = self.columnExpr_ColumnExprBetween() - elif choice == 17: - current = self.columnExpr_ColumnExprAlias() - return current - columnExpr.min_depth = 2 - - @depthcontrol - def columnExpr_ColumnExprLiteral(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprLiteral')) - current += self.literal() - return current - columnExpr_ColumnExprLiteral.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprAsterisk(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprAsterisk')) - current += self.unlexer.ASTERISK() - return current - columnExpr_ColumnExprAsterisk.min_depth = 1 - - @depthcontrol - def columnExpr_ColumnExprTuple(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTuple')) - current += self.unlexer.LPAREN() - current += self.columnExprList() - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprTuple.min_depth = 4 - - @depthcontrol - def columnExpr_ColumnExprArray(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprArray')) - current += self.unlexer.LBRACKET() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.columnExprList() - - current += self.unlexer.RBRACKET() - return current - columnExpr_ColumnExprArray.min_depth = 1 - - @depthcontrol - def columnExpr_ColumnExprCase(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprCase')) - current += self.unlexer.CASE() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.columnExpr() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.WHEN() - current += self.columnExpr() - current += self.unlexer.THEN() - current += self.columnExpr() - - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.ELSE() - current += self.columnExpr() - - current += self.unlexer.END() - return current - columnExpr_ColumnExprCase.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprExtract(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprExtract')) - current += self.unlexer.EXTRACT() - current += self.unlexer.LPAREN() - current += self.unlexer.INTERVAL_TYPE() - current += self.unlexer.FROM() - current += self.columnExpr() - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprExtract.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprTrim(self): - 
current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTrim')) - current += self.unlexer.TRIM() - current += self.unlexer.LPAREN() - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_295', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_295', choice)] = self.unlexer.weights.get(('alt_295', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.BOTH() - elif choice == 1: - current += self.unlexer.LEADING() - elif choice == 2: - current += self.unlexer.TRAILING() - current += self.unlexer.STRING_LITERAL() - current += self.unlexer.FROM() - current += self.columnExpr() - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprTrim.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprInterval(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprInterval')) - current += self.unlexer.INTERVAL() - current += self.columnExpr() - current += self.unlexer.INTERVAL_TYPE() - return current - columnExpr_ColumnExprInterval.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprIdentifier(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprIdentifier')) - current += self.columnIdentifier() - return current - columnExpr_ColumnExprIdentifier.min_depth = 4 - - @depthcontrol - def columnExpr_ColumnExprFunction(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprFunction')) - current += self.identifier() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.columnParamList() - - current += self.unlexer.RPAREN() - - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.columnArgList() - - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprFunction.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprArrayAccess(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprArrayAccess')) - current += self.columnExpr() - current += self.unlexer.LBRACKET() - current += self.columnExpr() - current += self.unlexer.RBRACKET() - return current - columnExpr_ColumnExprArrayAccess.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprTupleAccess(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTupleAccess')) - current += self.columnExpr() - current += self.unlexer.DOT() - current += self.unlexer.INTEGER_LITERAL() - return current - columnExpr_ColumnExprTupleAccess.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprUnaryOp(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprUnaryOp')) - current += self.unaryOp() - current += self.columnExpr() - return current - columnExpr_ColumnExprUnaryOp.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprIsNull(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprIsNull')) - current += self.columnExpr() - current += self.unlexer.IS() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.NULL_SQL() - return current - columnExpr_ColumnExprIsNull.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprBinaryOp(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprBinaryOp')) - current += self.columnExpr() - current += self.binaryOp() - current += self.columnExpr() 
- return current - columnExpr_ColumnExprBinaryOp.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprTernaryOp(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTernaryOp')) - current += self.columnExpr() - current += self.unlexer.QUERY() - current += self.columnExpr() - current += self.unlexer.COLON() - current += self.columnExpr() - return current - columnExpr_ColumnExprTernaryOp.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprBetween(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprBetween')) - current += self.columnExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.BETWEEN() - current += self.columnExpr() - current += self.unlexer.AND() - current += self.columnExpr() - return current - columnExpr_ColumnExprBetween.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprAlias(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprAlias')) - current += self.columnExpr() - current += self.unlexer.AS() - current += self.identifier() - return current - columnExpr_ColumnExprAlias.min_depth = 3 - - @depthcontrol - def columnParamList(self): - current = self.create_node(UnparserRule(name='columnParamList')) - current += self.literal() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.literal() - - return current - columnParamList.min_depth = 3 - - @depthcontrol - def columnArgList(self): - current = self.create_node(UnparserRule(name='columnArgList')) - current += self.columnArgExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.columnArgExpr() - - return current - columnArgList.min_depth = 4 - - @depthcontrol - def columnArgExpr(self): - current = self.create_node(UnparserRule(name='columnArgExpr')) - choice = self.choice([0 if [4, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_306', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_306', choice)] = self.unlexer.weights.get(('alt_306', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.columnLambdaExpr() - elif choice == 1: - current += self.columnExpr() - return current - columnArgExpr.min_depth = 3 - - @depthcontrol - def columnLambdaExpr(self): - current = self.create_node(UnparserRule(name='columnLambdaExpr')) - choice = self.choice([0 if [3, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_309', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_309', choice)] = self.unlexer.weights.get(('alt_309', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LPAREN() - current += self.identifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.identifier() - - current += self.unlexer.RPAREN() - elif choice == 1: - current += self.identifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.identifier() - - current += self.unlexer.ARROW() - current += self.columnExpr() - return current - columnLambdaExpr.min_depth = 3 - - @depthcontrol - def columnIdentifier(self): - current = self.create_node(UnparserRule(name='columnIdentifier')) - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.tableIdentifier() - current += self.unlexer.DOT() - - 
current += self.identifier() - return current - columnIdentifier.min_depth = 3 - - @depthcontrol - def tableExpr(self): - current = self.create_node(UnparserRule(name='tableExpr')) - choice = self.choice([0 if [5, 4, 7, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_315', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_315', choice)] = self.unlexer.weights.get(('alt_315', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.tableExpr_TableExprIdentifier() - elif choice == 1: - current = self.tableExpr_TableExprFunction() - elif choice == 2: - current = self.tableExpr_TableExprSubquery() - elif choice == 3: - current = self.tableExpr_TableExprAlias() - return current - tableExpr.min_depth = 4 - - @depthcontrol - def tableExpr_TableExprIdentifier(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprIdentifier')) - current += self.tableIdentifier() - return current - tableExpr_TableExprIdentifier.min_depth = 4 - - @depthcontrol - def tableExpr_TableExprFunction(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprFunction')) - current += self.identifier() - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.tableArgList() - - current += self.unlexer.RPAREN() - return current - tableExpr_TableExprFunction.min_depth = 3 - - @depthcontrol - def tableExpr_TableExprSubquery(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprSubquery')) - current += self.unlexer.LPAREN() - current += self.selectUnionStmt() - current += self.unlexer.RPAREN() - return current - tableExpr_TableExprSubquery.min_depth = 6 - - @depthcontrol - def tableExpr_TableExprAlias(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprAlias')) - current += self.tableExpr() - current += self.unlexer.AS() - current += self.identifier() - return current - tableExpr_TableExprAlias.min_depth = 5 - - @depthcontrol - def tableIdentifier(self): - current = self.create_node(UnparserRule(name='tableIdentifier')) - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.databaseIdentifier() - current += self.unlexer.DOT() - - current += self.identifier() - return current - tableIdentifier.min_depth = 3 - - @depthcontrol - def tableArgList(self): - current = self.create_node(UnparserRule(name='tableArgList')) - current += self.tableArgExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.tableArgExpr() - - return current - tableArgList.min_depth = 4 - - @depthcontrol - def tableArgExpr(self): - current = self.create_node(UnparserRule(name='tableArgExpr')) - choice = self.choice([0 if [3, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_323', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_323', choice)] = self.unlexer.weights.get(('alt_323', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.literal() - elif choice == 1: - current += self.tableIdentifier() - return current - tableArgExpr.min_depth = 3 - - @depthcontrol - def databaseIdentifier(self): - current = self.create_node(UnparserRule(name='databaseIdentifier')) - current += self.identifier() - return current - databaseIdentifier.min_depth = 3 - - @depthcontrol - def literal(self): - current = self.create_node(UnparserRule(name='literal')) - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * 
self.unlexer.weights.get(('alt_326', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_326', choice)] = self.unlexer.weights.get(('alt_326', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_331', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_331', choice)] = self.unlexer.weights.get(('alt_331', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - choice = self.choice([0 if [3, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_334', i), 1) for i, w in enumerate([1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_334', choice)] = self.unlexer.weights.get(('alt_334', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.FLOATING_LITERAL() - elif choice == 1: - current += self.unlexer.HEXADECIMAL_LITERAL() - elif choice == 2: - current += self.unlexer.INTEGER_LITERAL() - elif choice == 3: - current += self.unlexer.INF() - elif choice == 4: - current += self.unlexer.NAN_SQL() - elif choice == 1: - current += self.unlexer.STRING_LITERAL() - elif choice == 2: - current += self.unlexer.NULL_SQL() - return current - literal.min_depth = 2 - - @depthcontrol - def keyword(self): - current = self.create_node(UnparserRule(name='keyword')) - choice = self.choice([0 if [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_340', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_340', choice)] = self.unlexer.weights.get(('alt_340', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.ALIAS() - elif choice == 1: - current += self.unlexer.ALL() - elif choice == 2: - current += self.unlexer.AND() - elif choice == 3: - current += self.unlexer.ANTI() - elif choice == 4: - current += self.unlexer.ANY() - elif choice == 5: - current += self.unlexer.ARRAY() - elif choice == 6: - current += self.unlexer.AS() - elif choice == 7: - current += self.unlexer.ASCENDING() - elif choice == 8: - current += self.unlexer.ASOF() - elif choice == 9: - current += self.unlexer.BETWEEN() - elif choice == 10: - current += self.unlexer.BOTH() - elif choice == 11: - current += self.unlexer.BY() - elif choice == 12: - current += self.unlexer.CASE() - elif choice == 13: - current += self.unlexer.CAST() - elif choice == 14: - current += self.unlexer.CLUSTER() - elif choice == 15: - current += self.unlexer.COLLATE() - elif choice == 16: - current += self.unlexer.CREATE() - elif choice == 17: - current += self.unlexer.CROSS() - elif choice == 18: - current += self.unlexer.DAY() - elif choice == 19: - current += self.unlexer.DATABASE() - elif choice == 20: - current += self.unlexer.DEFAULT() - elif choice == 21: - current += self.unlexer.DELETE() - elif choice == 22: - current += 
self.unlexer.DESCENDING() - elif choice == 23: - current += self.unlexer.DISK() - elif choice == 24: - current += self.unlexer.DISTINCT() - elif choice == 25: - current += self.unlexer.DROP() - elif choice == 26: - current += self.unlexer.ELSE() - elif choice == 27: - current += self.unlexer.END() - elif choice == 28: - current += self.unlexer.ENGINE() - elif choice == 29: - current += self.unlexer.EXISTS() - elif choice == 30: - current += self.unlexer.EXTRACT() - elif choice == 31: - current += self.unlexer.FINAL() - elif choice == 32: - current += self.unlexer.FIRST() - elif choice == 33: - current += self.unlexer.FORMAT() - elif choice == 34: - current += self.unlexer.FROM() - elif choice == 35: - current += self.unlexer.FULL() - elif choice == 36: - current += self.unlexer.GLOBAL() - elif choice == 37: - current += self.unlexer.GROUP() - elif choice == 38: - current += self.unlexer.HAVING() - elif choice == 39: - current += self.unlexer.HOUR() - elif choice == 40: - current += self.unlexer.IF() - elif choice == 41: - current += self.unlexer.IN() - elif choice == 42: - current += self.unlexer.INNER() - elif choice == 43: - current += self.unlexer.INSERT() - elif choice == 44: - current += self.unlexer.INTERVAL() - elif choice == 45: - current += self.unlexer.INTO() - elif choice == 46: - current += self.unlexer.IS() - elif choice == 47: - current += self.unlexer.JOIN() - elif choice == 48: - current += self.unlexer.KEY() - elif choice == 49: - current += self.unlexer.LAST() - elif choice == 50: - current += self.unlexer.LEADING() - elif choice == 51: - current += self.unlexer.LEFT() - elif choice == 52: - current += self.unlexer.LIKE() - elif choice == 53: - current += self.unlexer.LIMIT() - elif choice == 54: - current += self.unlexer.LOCAL() - elif choice == 55: - current += self.unlexer.MATERIALIZED() - elif choice == 56: - current += self.unlexer.MINUTE() - elif choice == 57: - current += self.unlexer.MONTH() - elif choice == 58: - current += self.unlexer.NOT() - elif choice == 59: - current += self.unlexer.NULLS() - elif choice == 60: - current += self.unlexer.OFFSET() - elif choice == 61: - current += self.unlexer.ON() - elif choice == 62: - current += self.unlexer.OR() - elif choice == 63: - current += self.unlexer.ORDER() - elif choice == 64: - current += self.unlexer.OUTER() - elif choice == 65: - current += self.unlexer.OUTFILE() - elif choice == 66: - current += self.unlexer.PARTITION() - elif choice == 67: - current += self.unlexer.PREWHERE() - elif choice == 68: - current += self.unlexer.PRIMARY() - elif choice == 69: - current += self.unlexer.QUARTER() - elif choice == 70: - current += self.unlexer.RIGHT() - elif choice == 71: - current += self.unlexer.SAMPLE() - elif choice == 72: - current += self.unlexer.SECOND() - elif choice == 73: - current += self.unlexer.SEMI() - elif choice == 74: - current += self.unlexer.SET() - elif choice == 75: - current += self.unlexer.SETTINGS() - elif choice == 76: - current += self.unlexer.TABLE() - elif choice == 77: - current += self.unlexer.TEMPORARY() - elif choice == 78: - current += self.unlexer.THEN() - elif choice == 79: - current += self.unlexer.TOTALS() - elif choice == 80: - current += self.unlexer.TRAILING() - elif choice == 81: - current += self.unlexer.TRIM() - elif choice == 82: - current += self.unlexer.TO() - elif choice == 83: - current += self.unlexer.TTL() - elif choice == 84: - current += self.unlexer.UNION() - elif choice == 85: - current += self.unlexer.USING() - elif choice == 86: - current += 
self.unlexer.VALUES() - elif choice == 87: - current += self.unlexer.VOLUME() - elif choice == 88: - current += self.unlexer.WEEK() - elif choice == 89: - current += self.unlexer.WHEN() - elif choice == 90: - current += self.unlexer.WHERE() - elif choice == 91: - current += self.unlexer.WITH() - elif choice == 92: - current += self.unlexer.YEAR() - return current - keyword.min_depth = 2 - - @depthcontrol - def identifier(self): - current = self.create_node(UnparserRule(name='identifier')) - choice = self.choice([0 if [2, 3, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_434', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_434', choice)] = self.unlexer.weights.get(('alt_434', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.IDENTIFIER() - elif choice == 1: - current += self.unlexer.INTERVAL_TYPE() - elif choice == 2: - current += self.keyword() - return current - identifier.min_depth = 2 - - @depthcontrol - def unaryOp(self): - current = self.create_node(UnparserRule(name='unaryOp')) - choice = self.choice([0 if [1, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_438', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_438', choice)] = self.unlexer.weights.get(('alt_438', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.DASH() - elif choice == 1: - current += self.unlexer.NOT() - return current - unaryOp.min_depth = 1 - - @depthcontrol - def binaryOp(self): - current = self.create_node(UnparserRule(name='binaryOp')) - choice = self.choice([0 if [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_441', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_441', choice)] = self.unlexer.weights.get(('alt_441', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.CONCAT() - elif choice == 1: - current += self.unlexer.ASTERISK() - elif choice == 2: - current += self.unlexer.SLASH() - elif choice == 3: - current += self.unlexer.PLUS() - elif choice == 4: - current += self.unlexer.DASH() - elif choice == 5: - current += self.unlexer.PERCENT() - elif choice == 6: - current += self.unlexer.EQ_DOUBLE() - elif choice == 7: - current += self.unlexer.EQ_SINGLE() - elif choice == 8: - current += self.unlexer.NOT_EQ() - elif choice == 9: - current += self.unlexer.LE() - elif choice == 10: - current += self.unlexer.GE() - elif choice == 11: - current += self.unlexer.LT() - elif choice == 12: - current += self.unlexer.GT() - elif choice == 13: - current += self.unlexer.AND() - elif choice == 14: - current += self.unlexer.OR() - elif choice == 15: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.LIKE() - elif choice == 16: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.GLOBAL() - - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.IN() - return current - binaryOp.min_depth = 1 - - @depthcontrol - def enumValue(self): - current = self.create_node(UnparserRule(name='enumValue')) - current += self.unlexer.STRING_LITERAL() - current += self.unlexer.EQ_SINGLE() - current += self.unlexer.INTEGER_LITERAL() - return current - enumValue.min_depth = 2 - - default_rule = queryList - diff --git a/utils/grammar-fuzzer/README.md 
b/utils/grammar-fuzzer/README.md deleted file mode 100644 index b3f233c8648..00000000000 --- a/utils/grammar-fuzzer/README.md +++ /dev/null @@ -1,41 +0,0 @@ -How to use Fuzzer -=== - -The fuzzer consists of auto-generated files: - - ClickHouseUnlexer.py - ClickHouseUnparser.py - -They are generated from grammar files (.g4) using Grammarinator: - - pip3 install grammarinator - grammarinator-process ClickHouseLexer.g4 ClickHouseParser.g4 -o fuzzer/ - -Then you can generate test input for ClickHouse client: - - cd fuzzer - grammarinator-generate \ - -r query_list \ # top-level rule - -o /tmp/sql_test_%d.sql \ # template for output test names - -n 10 \ # number of tests - -c 0.3 \ - -d 20 \ # depth of recursion - -p ClickHouseUnparser.py -l ClickHouseUnlexer.py \ # auto-generated unparser and unlexer - --test-transformers SpaceTransformer.single_line_whitespace \ # transform function to insert whitespace - -For more details see `grammarinator-generate --help`. As a test-transformer function also can be used `SpaceTransformer.multi_line_transformer` - both functions reside in `fuzzer/SpaceTransformer.py` file. - - -Parsing steps -=== - -1. Replace all operators with corresponding functions. -2. Replace all asterisks with columns - if it's inside function call, then expand it as multiple arguments. Warn about undeterministic invocations when functions have positional arguments. - -Old vs. new parser -=== - -- `a as b [c]` - accessing aliased array expression is not possible. -- `a as b . 1` - accessing aliased tuple expression is not possible. -- `between a is not null and b` - `between` operator should have lower priority than `is null`. -- `*.1` - accessing asterisk tuple expression is not possible. diff --git a/utils/grammar-fuzzer/SpaceTransformer.py b/utils/grammar-fuzzer/SpaceTransformer.py deleted file mode 100644 index ad96845c7e2..00000000000 --- a/utils/grammar-fuzzer/SpaceTransformer.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- - -from grammarinator.runtime.tree import * - -from itertools import tee, islice, zip_longest -import random - - -def single_line_whitespace(node): - return _whitespace(node, ' \t') - - -def multi_line_whitespace(node): - return _whitespace(node, ' \t\r\n') - - -def _whitespace(node, symbols): - for child in node.children: - _whitespace(child, symbols) - - # helper function to look ahead one child - def with_next(iterable): - items, nexts = tee(iterable, 2) - nexts = islice(nexts, 1, None) - return zip_longest(items, nexts) - - if isinstance(node, UnparserRule): - new_children = [] - for child, next_child in with_next(node.children): - if (not next_child or - next_child and isinstance(next_child, UnlexerRule) and next_child.name == 'DOT' or - isinstance(child, UnlexerRule) and child.name == 'DOT'): - new_children.append(child) - else: - new_children.extend([child, UnlexerRule(src=random.choice(symbols))]) - node.children = new_children - - return node diff --git a/utils/grammar-fuzzer/__init__.py b/utils/grammar-fuzzer/__init__.py deleted file mode 100644 index 40a96afc6ff..00000000000 --- a/utils/grammar-fuzzer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/utils/junit_to_html/junit-noframes.xsl b/utils/junit_to_html/junit-noframes.xsl deleted file mode 100644 index ae70e230ef6..00000000000 --- a/utils/junit_to_html/junit-noframes.xsl +++ /dev/null @@ -1,390 +0,0 @@ - - - - - - - - Test Results - - - - - - - - -
- - - - -
- - - - - - - - - - - - -

-
- - - - - - - - - -
-

- Back to top - - -

Summary

- - - - - - - - - - - - - - - - - Failure - Error - - - - - - - - -
TestsFailuresErrorsSuccess rateTime
- - - - - - - -
- - - - -
- Note: failures are anticipated and checked for with assertions while errors are unanticipated. -
-
- - - - -

Test Results

-
-
- - - Name - Tests - Errors - Failures - Time(s) - - - - - - Name - Tests - Errors - Failures - Time(s) - Time Stamp - Host - - - - - - Name - Status - Type - Time(s) - - - - - - - - - Failure - Error - - - - - - - - - - - - - - - - - - - - - Error - Failure - TableRowColor - - - - - - Failure - - - - Error - - - - Success - - - - - - - - - - - - -

- - - - - -
- - - -

- - - - - -
- - - - N/A - - - - - - -

- at line - - - , column - - -
-
-
- - - - - - - - - - 32 - - - - - - - - - - - - -
- - - -
- - -
- - - -
- - - -
-
- - - - - - - - -
diff --git a/utils/junit_to_html/junit_to_html b/utils/junit_to_html/junit_to_html deleted file mode 100755 index 132763c7d4c..00000000000 --- a/utils/junit_to_html/junit_to_html +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import os -import lxml.etree as etree -import json -import argparse - -def export_testcases_json(report, path): - with open(os.path.join(path, "cases.jer"), "w") as testcases_file: - for testsuite in report.getroot(): - for testcase in testsuite: - row = {} - row["hostname"] = testsuite.get("hostname") - row["suite"] = testsuite.get("name") - row["suite_duration"] = testsuite.get("time") - row["timestamp"] = testsuite.get("timestamp") - row["testname"] = testcase.get("name") - row["classname"] = testcase.get("classname") - row["file"] = testcase.get("file") - row["line"] = testcase.get("line") - row["duration"] = testcase.get("time") - for el in testcase: - if el.tag == "system-err": - row["stderr"] = el.text - else: - row["stderr"] = "" - - if el.tag == "system-out": - row["stdout"] = el.text - else: - row["stdout"] = "" - - json.dump(row, testcases_file) - testcases_file.write("\n") - -def export_testsuites_json(report, path): - with open(os.path.join(path, "suites.jer"), "w") as testsuites_file: - for testsuite in report.getroot(): - row = {} - row["suite"] = testsuite.get("name") - row["errors"] = testsuite.get("errors") - row["failures"] = testsuite.get("failures") - row["hostname"] = testsuite.get("hostname") - row["skipped"] = testsuite.get("skipped") - row["duration"] = testsuite.get("time") - row["timestamp"] = testsuite.get("timestamp") - json.dump(row, testsuites_file) - testsuites_file.write("\n") - - -def _convert_junit_to_html(junit_path, result_path, export_cases, export_suites): - with open(os.path.join(os.path.dirname(__file__), "junit-noframes.xsl")) as xslt_file: - junit_to_html_xslt = etree.parse(xslt_file) - if not os.path.exists(result_path): - os.makedirs(result_path) - - with open(junit_path) as junit_file: - junit_xml = etree.parse(junit_file) - - if export_suites: - export_testsuites_json(junit_xml, result_path) - if export_cases: - export_testcases_json(junit_xml, result_path) - transform = etree.XSLT(junit_to_html_xslt) - html = etree.tostring(transform(junit_xml), encoding="utf-8") - - with open(os.path.join(result_path, "result.html"), "w") as html_file: - html_file.write(html) - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Convert JUnit XML.') - parser.add_argument('junit', help='path to junit.xml report') - parser.add_argument('result_dir', nargs='?', help='directory for result files. 
Default to junit.xml directory') - parser.add_argument('--export-cases', help='Export JSONEachRow result for testcases to upload in CI', action='store_true') - parser.add_argument('--export-suites', help='Export JSONEachRow result for testsuites to upload in CI', action='store_true') - - args = parser.parse_args() - - junit_path = args.junit - if args.result_dir: - result_path = args.result_dir - else: - result_path = os.path.dirname(junit_path) - print("junit_path: {}, result_path: {}, export cases:{}, export suites: {}".format(junit_path, result_path, args.export_cases, args.export_suites)) - _convert_junit_to_html(junit_path, result_path, args.export_cases, args.export_suites) diff --git a/utils/link-validate/link-validate.sh b/utils/link-validate/link-validate.sh deleted file mode 100755 index 2d8d57b95fc..00000000000 --- a/utils/link-validate/link-validate.sh +++ /dev/null @@ -1,42 +0,0 @@ -#/bin/sh -# -# This script is used to validate the shared libraries -# -# Authors: FoundationDB team, https://github.com/apple/foundationdb/blame/master/build/link-validate.sh -# License: Apache License 2.0 - -verlte() { - [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] -} - -ALLOWED_SHARED_LIBS=("libdl.so.2" "libpthread.so.0" "librt.so.1" "libm.so.6" "libc.so.6" "ld-linux-x86-64.so.2") - -if [ "$#" -lt 1 ]; then - echo "USAGE: link-validate.sh BINNAME GLIBC_VERSION" - exit 1 -fi - -# Step 1: glibc version - -for i in $(objdump -T "$1" | awk '{print $5}' | grep GLIBC | sed 's/ *$//g' | sed 's/GLIBC_//' | sort | uniq); do - if ! verlte "$i" "${2:-2.10}"; then - echo "Dependency on newer libc detected: $i" - exit 1 - fi -done - -# Step 2: Other dynamic dependencies - -for j in $(objdump -p "$1" | grep NEEDED | awk '{print $2}'); do - PRESENT=0 - for k in ${ALLOWED_SHARED_LIBS[@]}; do - if [[ "$k" == "$j" ]]; then - PRESENT=1 - break - fi - done - if ! [[ $PRESENT == 1 ]]; then - echo "Unexpected shared object dependency detected: $j" - exit 1 - fi -done diff --git a/utils/tests-visualizer/index.html b/utils/tests-visualizer/index.html new file mode 100644 index 00000000000..a15b09ea58e --- /dev/null +++ b/utils/tests-visualizer/index.html @@ -0,0 +1,129 @@ + + + + + +

Loading (10 seconds, 20 MB)...

+ + + + + diff --git a/utils/upload_test_results/README.md b/utils/upload_test_results/README.md deleted file mode 100644 index e6b361081a2..00000000000 --- a/utils/upload_test_results/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## Tool to upload results to CI ClickHouse - -Currently allows to upload results from `junit_to_html` tool to ClickHouse CI - -``` -usage: upload_test_results [-h] --sha SHA --pr PR --file FILE --type - {suites,cases} [--user USER] --password PASSWORD - [--ca-cert CA_CERT] [--host HOST] [--db DB] - -Upload test result to CI ClickHouse. - -optional arguments: - -h, --help show this help message and exit - --sha SHA sha of current commit - --pr PR pr of current commit. 0 for master - --file FILE file to upload - --type {suites,cases} - Export type - --user USER user name - --password PASSWORD password - --ca-cert CA_CERT CA certificate path - --host HOST CI ClickHouse host - --db DB CI ClickHouse database name -``` - -$ ./upload_test_results --sha "cf7eaee3301d4634acdacbfa308ddbe0cc6a061d" --pr "0" --file xyz/cases.jer --type cases --password $PASSWD - -CI checks has single commit sha and pr identifier. -While uploading your local results for testing purposes try to use correct sha and pr. - -CA Certificate for ClickHouse CI can be obtained from Yandex.Cloud where CI database is hosted -``` bash -wget "https://storage.yandexcloud.net/cloud-certs/CA.pem" -O YandexInternalRootCA.crt -``` \ No newline at end of file diff --git a/utils/upload_test_results/upload_test_results b/utils/upload_test_results/upload_test_results deleted file mode 100755 index 5916d0d85e8..00000000000 --- a/utils/upload_test_results/upload_test_results +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 -import requests -import argparse - -# CREATE TABLE test_suites -# ( -# sha String, -# pr UInt16, -# suite String, -# errors UInt16, -# failures UInt16, -# hostname String, -# skipped UInt16, -# duration Double, -# timestamp DateTime -# ) ENGINE = MergeTree ORDER BY tuple(timestamp, suite); - -QUERY_SUITES="INSERT INTO test_suites "\ - "SELECT '{sha}' AS sha, "\ - "{pr} AS pr, "\ - "suite, "\ - "errors, "\ - "failures, "\ - "hostname, "\ - "skipped, "\ - "duration, "\ - "timestamp "\ - "FROM input('"\ - "suite String, "\ - "errors UInt16, "\ - "failures UInt16, "\ - "hostname String, "\ - "skipped UInt16, "\ - "duration Double, "\ - "timestamp DateTime"\ - "') FORMAT JSONEachRow" - -# CREATE TABLE test_cases -# ( -# sha String, -# pr UInt16, -# hostname String, -# suite String, -# timestamp DateTime, -# testname String, -# classname String, -# file String, -# line UInt16, -# duration Double, -# suite_duration Double, -# stderr String, -# stdout String -# ) ENGINE = MergeTree ORDER BY tuple(timestamp, testname); - -QUERY_CASES="INSERT INTO test_cases "\ - "SELECT '{sha}' AS sha, "\ - "{pr} AS pr, "\ - "hostname, "\ - "suite, "\ - "timestamp, "\ - "testname, "\ - "classname, "\ - "file, "\ - "line, "\ - "duration, "\ - "suite_duration, "\ - "stderr,"\ - "stdout "\ - "FROM input('"\ - "hostname String, "\ - "suite String, "\ - "timestamp DateTime, "\ - "testname String, "\ - "classname String, "\ - "file String, "\ - "line UInt16, "\ - "duration Double, "\ - "suite_duration Double, "\ - "stderr String, "\ - "stdout String"\ - "') FORMAT JSONEachRow" - - -def upload_request(sha, pr, file, q_type, user, password, ca_cert, host, db): - with open(file) as upload_f: - query = QUERY_SUITES if q_type=="suites" else QUERY_CASES - query = query.format(sha=sha, pr=pr) - url = 
'https://{host}:8443/?database={db}&query={query}&date_time_input_format=best_effort'.format( - host=host, - db=db, - query=query - ) - data=upload_f - auth = { - 'X-ClickHouse-User': user, - 'X-ClickHouse-Key': password, - } - - print(query); - - res = requests.post( - url, - data=data, - headers=auth, - verify=ca_cert) - res.raise_for_status() - return res.text - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Upload test result to CI ClickHouse.') - parser.add_argument('--sha', help='sha of current commit', type=str, required=True) - parser.add_argument('--pr', help='pr of current commit. 0 for master', type=int, required=True) - parser.add_argument('--file', help='file to upload', required=True) - parser.add_argument('--type', help='Export type', choices=['suites', 'cases'] , required=True) - parser.add_argument('--user', help='user name', type=str, default="clickhouse-ci") - parser.add_argument('--password', help='password', type=str, required=True) - parser.add_argument('--ca-cert', help='CA certificate path', type=str, default="/usr/local/share/ca-certificates/YandexInternalRootCA.crt") - parser.add_argument('--host', help='CI ClickHouse host', type=str, default="c1a-ity5agjmuhyu6nu9.mdb.yandexcloud.net") - parser.add_argument('--db', help='CI ClickHouse database name', type=str, default="clickhouse-ci") - - args = parser.parse_args() - - print((upload_request(args.sha, args.pr, args.file, args.type, args.user, args.password, args.ca_cert, args.host, args.db))) - - - diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index 260a928184d..c6b1e2be275 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -82,6 +82,9 @@ Comparison of EBS and EFS is from Ramazan Polat.
Results for Hetzner and Scaleway are from Anthony Najjar Simon (Panelbear).
Results for GCP are from Vy Nguyen Tan.
Results for ThinkPad P15 are from Mikhail Shiryaev.
+Results for RockPi4 are from Kirill Zholnay.
+Results for Xeon 6266C are from David in Shanghai.
+Results for SSDNodes and Cavium are from Lorenzo QXIP.

diff --git a/website/benchmark/hardware/results/cavium_4core.json b/website/benchmark/hardware/results/cavium_4core.json new file mode 100644 index 00000000000..a7cb96b2cd3 --- /dev/null +++ b/website/benchmark/hardware/results/cavium_4core.json @@ -0,0 +1,54 @@ +[ + { + "system": "Cavium ARM64 CPU (4 Core, 1.5 GHz, NVMe SSD)", + "system_full": "Cavium ARM64 CPU (4 Corem 1.5 GHz, NVMe SSD), 16 GiB", + "time": "2021-12-27 00:00:00", + "kind": "server", + "result": + [ +[0.004, 0.004, 0.004], +[0.196, 0.178, 0.180], +[0.495, 0.437, 0.426], +[0.715, 0.499, 0.499], +[0.992, 0.798, 0.795], +[3.958, 3.750, 3.751], +[0.288, 0.274, 0.273], +[0.236, 0.231, 0.239], +[3.129, 2.936, 2.918], +[4.221, 3.924, 3.934], +[2.395, 2.285, 2.226], +[2.832, 2.703, 2.644], +[6.510, 6.301, 6.262], +[7.933, 7.669, 7.704], +[7.397, 7.122, 7.146], +[4.692, 4.537, 4.540], +[15.194, 14.835, 15.051], +[10.446, 10.036, 10.072], +[26.472, 25.655, 25.809], +[0.879, 0.669, 0.694], +[14.614, 13.755, 13.726], +[16.876, 15.675, 15.703], +[34.715, 33.204, 33.250], +[18.850, 15.387, 15.332], +[4.455, 4.025, 4.016], +[3.667, 3.415, 3.457], +[4.507, 4.057, 4.049], +[14.344, 13.394, 13.390], +[17.519, 17.052, 17.067], +[8.606, 8.611, 8.545], +[6.936, 6.491, 6.496], +[10.020, 9.260, 9.233], +[39.793, 39.631, 39.553], +[30.310, 29.604, 29.572], +[30.485, 29.557, 29.649], +[8.539, 8.337, 8.342], +[0.931, 0.912, 0.912], +[0.523, 0.516, 0.507], +[0.460, 0.448, 0.450], +[1.880, 1.817, 1.884], +[0.141, 0.119, 0.117], +[0.116, 0.095, 0.092], +[0.021, 0.017, 0.014] + ] + } +] diff --git a/website/benchmark/hardware/results/rock_pi.json b/website/benchmark/hardware/results/rock_pi.json new file mode 100644 index 00000000000..210dc213a49 --- /dev/null +++ b/website/benchmark/hardware/results/rock_pi.json @@ -0,0 +1,54 @@ +[ + { + "system": "Rock Pi 4, 4GiB, NVMe", + "system_full": "Rock Pi 4, 4GiB C, NVMe", + "time": "2021-12-23 00:00:00", + "kind": "desktop", + "result": + [ +[0.007, 0.014, 0.005], +[0.229, 0.132, 0.215], +[0.489, 0.351, 0.306], +[0.879, 0.774, 0.768], +[1.034, 0.966, 0.879], +[2.491, 2.249, 2.493], +[0.379, 0.212, 0.213], +[0.227, 0.140, 0.152], +[3.944, 3.823, 3.805], +[5.272, 4.985, 5.069], +[2.356, 2.193, 2.254], +[2.819, 2.595, 2.568], +[9.124, 8.306, 8.529], +[11.857, 11.412, 11.290], +[9.796, 9.477, 9.610], +[8.846, 8.867, 8.909], +[null, null, null], +[null, null, null], +[null, null, null], +[1.293, 0.887, 0.980], +[15.018, 14.928, 14.748], +[19.179, 17.889, 18.021], +[45.524, 46.927, 46.909], +[23.904, 23.197, 23.511], +[5.264, 4.891, 4.936], +[4.211, 3.940, 4.047], +[5.113, 4.615, 4.783], +[17.910, 16.800, 16.410], +[23.537, 22.249, 22.172], +[16.549, 16.388, 16.337], +[9.562, 9.006, 9.260], +[17.097, 17.676, 17.585], +[null, null, null], +[null, null, null], +[null, null, null], +[null, null, null], +[1.668, 1.469, 1.342], +[0.463, 0.442, 0.353], +[0.486, 0.410, 0.346], +[2.190, 2.014, 1.878], +[0.263, 0.097, 0.201], +[0.173, 0.082, 0.139], +[0.188, 0.024, 0.016] + ] + } +] diff --git a/website/benchmark/hardware/results/ssdnodes.json b/website/benchmark/hardware/results/ssdnodes.json new file mode 100644 index 00000000000..623f4b49687 --- /dev/null +++ b/website/benchmark/hardware/results/ssdnodes.json @@ -0,0 +1,54 @@ +[ + { + "system": "SSDNodes G6", + "system_full": "G6 Performance+ 48GB RAM, 720GB NVMe, 12x Intel Silver vCPU, KVM", + "time": "2021-12-27 00:00:00", + "kind": "cloud", + "result": + [ +[0.002, 0.002, 0.002], +[0.021, 0.017, 0.017], +[0.053, 0.034, 0.039], +[0.090, 0.053, 0.047], +[0.146, 0.123, 
0.117], +[0.358, 0.325, 0.323], +[0.025, 0.020, 0.021], +[0.042, 0.015, 0.014], +[0.566, 0.511, 0.524], +[0.704, 0.626, 0.591], +[0.229, 0.174, 0.194], +[0.255, 0.210, 0.206], +[0.849, 0.725, 0.701], +[0.984, 0.907, 0.948], +[0.952, 0.886, 0.899], +[0.772, 0.741, 0.738], +[2.945, 2.667, 2.703], +[1.645, 1.646, 1.576], +[5.342, 5.042, 5.306], +[0.088, 0.052, 0.051], +[1.176, 0.825, 0.839], +[1.261, 1.001, 0.933], +[2.977, 2.190, 2.193], +[1.872, 0.991, 0.956], +[0.368, 0.264, 0.275], +[0.300, 0.247, 0.241], +[0.329, 0.272, 0.277], +[1.124, 0.870, 0.824], +[1.545, 1.270, 1.281], +[1.478, 1.399, 1.463], +[0.809, 0.696, 0.677], +[1.095, 0.875, 0.832], +[5.164, 4.841, 4.613], +[3.859, 3.435, 3.396], +[4.054, 3.479, 3.496], +[1.325, 1.274, 1.294], +[0.261, 0.248, 0.266], +[0.102, 0.096, 0.104], +[0.102, 0.090, 0.094], +[0.600, 0.550, 0.566], +[0.041, 0.031, 0.028], +[0.029, 0.021, 0.025], +[0.007, 0.006, 0.005] + ] + } +] diff --git a/website/benchmark/hardware/results/xeon_gold_6266.json b/website/benchmark/hardware/results/xeon_gold_6266.json new file mode 100644 index 00000000000..0e68466a633 --- /dev/null +++ b/website/benchmark/hardware/results/xeon_gold_6266.json @@ -0,0 +1,56 @@ +[ + { + "system": "Huawei Cloud c6.xlarge.4, 4vCPUs, 16 GiB", + "system_full": "Huawei Cloud c6.xlarge.4, Xeon Gold 6266C, 3GHz, 4vCPU, 16GiB RAM, vda1 40GB", + "cpu_vendor": "Intel", + "cpu_model": "Xeon Gold 6266C", + "time": "2021-12-23 00:00:00", + "kind": "cloud", + "result": + [ +[0.001, 0.001, 0.001], +[0.034, 0.023, 0.023], +[0.168, 0.105, 0.104], +[0.745, 0.162, 0.160], +[1.512, 0.328, 0.327], +[2.408, 1.162, 1.155], +[0.069, 0.052, 0.051], +[0.074, 0.027, 0.026], +[2.314, 1.833, 1.796], +[2.749, 2.014, 2.011], +[1.424, 0.618, 0.579], +[1.494, 0.681, 0.677], +[3.208, 2.457, 2.529], +[5.071, 3.329, 3.411], +[3.968, 3.289, 3.330], +[3.142, 2.925, 2.827], +[9.473, 9.034, 8.850], +[6.768, 6.256, 6.115], +[18.388, 17.790, 17.892], +[1.105, 0.195, 0.194], +[20.310, 3.459, 3.416], +[22.772, 3.811, 3.773], +[42.554, 8.738, 8.640], +[30.747, 4.013, 3.967], +[4.707, 0.973, 0.965], +[2.003, 0.845, 0.839], +[4.978, 0.991, 0.974], +[19.726, 3.293, 3.264], +[17.151, 5.171, 5.134], +[3.620, 3.600, 3.600], +[4.693, 2.172, 2.115], +[10.842, 2.686, 2.750], +[17.857, 17.086, 16.907], +[22.926, 13.070, 12.808], +[22.803, 12.727, 12.867], +[4.189, 3.888, 3.893], +[0.227, 0.176, 0.177], +[0.085, 0.068, 0.067], +[0.101, 0.064, 0.067], +[0.493, 0.438, 0.399], +[0.042, 0.022, 0.021], +[0.029, 0.017, 0.015], +[0.007, 0.005, 0.003] + ] + } +] diff --git a/website/blog/en/2021/clickhouse-v21.12-released.md b/website/blog/en/2021/clickhouse-v21.12-released.md new file mode 100644 index 00000000000..d1e6cddbc35 --- /dev/null +++ b/website/blog/en/2021/clickhouse-v21.12-released.md @@ -0,0 +1,285 @@ +--- +title: 'What''s New in ClickHouse 21.12' +image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-v21-12/featured.jpg' +date: '2021-12-16' +author: '[Alexey Milovidov](https://github.com/alexey-milovidov), [Christoph Wurm](https://github.com/cwurm)' +tags: ['company', 'community'] +--- + +We're continuing our monthly release cadence. 
The 21.12 Christmas release includes 2460 new commits from 125 contributors, including 42 new contributors:
+
+> Alex Cao, Amr Alaa, Andrey Torsunov, Constantine Peresypkin, Dmitriy Dorofeev, Egor O'Sten, Elykov Alexandr, Evgeny, Frank Chen, LB, Natasha Murashkina, Peignon Melvyn, Rich Raposa, Roman Chyrva, Roman, SuperDJY, Thom O'Connor, Timur Magomedov, Tom Risse, Tomáš Hromada, cfcz48, cgp, cms, cmsxbc, congbaoyangrou, dongyifeng, frank chen, freedomDR, jus1096, khamadiev, laurieliyang, leosunli, liyang830, loneylee, michael1589, msaf1980, p0ny, qieqieplus, spume, sunlisheng, yandd, zhanghuajie.
+
+If you are wondering, this list is generated by the following command:
+
+```
+clickhouse-local --query "
+    SELECT arrayStringConcat(groupArray(s), ', ')
+    FROM file('contributors-21.12.txt', LineAsString, 's String')
+    WHERE s NOT IN (
+        SELECT *
+        FROM file('contributors-21.11.txt', LineAsString, 's String'))
+    FORMAT TSVRaw"
+```
+
+And to list the contributors, you can always run the
+```
+SELECT * FROM system.contributors
+```
+query on your production server.
+
+Let's highlight some of the new capabilities in 21.12:
+
+
+## ClickHouse Keeper is Feature Complete
+
+In 21.12, `clickhouse-keeper` started to support ["four letter commands"](https://zookeeper.apache.org/doc/r3.4.8/zookeeperAdmin.html#sc_zkCommands) for status and monitoring. This feature is contributed by **JackyWoo** and reviewed by **Alexander Sapin** (the author of ClickHouse Keeper).
+
+It was the only missing feature to implement. In this release, clickhouse-keeper is still considered to be in a pre-production stage, but many companies have already started to evaluate and use it as a replacement for ZooKeeper. You can also start using clickhouse-keeper in your testing environments, and we will appreciate your feedback.
+
+ClickHouse Keeper development started in Sep 2020, more than a year ago. It was a long road, and most of the effort went into ensuring correctness and stability in unusual and exceptional scenarios. It is covered by [Jepsen](https://jepsen.io/) tests (including ZooKeeper tests and [newly introduced tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/jepsen.clickhouse-keeper)), as well as by continuous randomized stress testing together with ClickHouse functional and integration tests. It is already being tested in Yandex Cloud and by some of our best friends. If you consider yourself one of our best friends, you can do it too.
+
+**How does this help you?**
+
+ClickHouse Keeper is a drop-in replacement for ZooKeeper. It implements the ZooKeeper wire protocol and data model, but does it better.
+
+In contrast to ZooKeeper, there are no issues with zxid overflow or packet sizes. It has better memory usage and it does not require JVM tuning (because it does not use the JVM). Logs and snapshots are compressed (by about 10x typically) and checksummed. It can run as a separate process or directly inside clickhouse-server. You can use it with ClickHouse or with your Kafkas and Hadoops as well.
+
+[More info](http://presentations.clickhouse.tech/meetup54/keeper.pdf).
+
+
+## Partitions For INSERT INTO File, URL And HDFS Storages
+
+The table engines `File`, `URL`, and `HDFS` now support partitions. When creating a table, you can specify the partition key using the `PARTITION BY` clause, e.g. `CREATE TABLE hits_files (...) ENGINE = File(TabSeparated) PARTITION BY toYYYYMM(EventDate)`.
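+
+As a minimal sketch (the column list below is just an example schema; only the engine and the `PARTITION BY` clause come from the statement above), a partitioned `File` table could look like this:
+
+```
+CREATE TABLE hits_files (EventDate Date, UserID UInt64)
+ENGINE = File(TabSeparated)
+PARTITION BY toYYYYMM(EventDate);
+
+-- rows from different months should end up in separate data files
+INSERT INTO hits_files VALUES ('2021-11-15', 1), ('2021-12-01', 2);
+```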
+
+Similarly, when exporting data from ClickHouse using the `file`, `url`, and `hdfs` table functions, you can now specify that the data is to be partitioned into multiple files using a `PARTITION BY` clause. For example, `INSERT INTO TABLE FUNCTION file('path/hits_{_partition_id}', 'TSV', 'columns...') PARTITION BY toYYYYMM(EventDate) VALUES ...` will create as many files as there are unique months in the dataset.
+
+The `s3` table function has supported partitioned writes since ClickHouse 21.10.
+
+**How does this help you?**
+
+If data is split into multiple files, `SELECT` queries will be automatically parallelized. For example:
+
+```
+SELECT user_id, count() FROM s3(
+ 'https://s3.us-east-2.amazonaws.com/.../*.csv.zstd',
+ '...', '...',
+ CSV,
+ 'user_id UInt64, ...')
+```
+
+You can even parallelize data processing across a distributed compute cluster if you use the `s3Cluster` table function:
+
+```
+SELECT user_id, count() FROM s3Cluster(
+ my_cluster,
+ 'https://s3.us-east-2.amazonaws.com/.../*.csv.zstd',
+ '...',
+ '...', CSV,
+ 'user_id UInt64, ...')
+```
+
+It can also be used for integration with external data processing tools that consume data from `s3`.
+
+
+## FROM INFILE in clickhouse-client now supports glob patterns and parallel reading
+
+Just write:
+
+```
+INSERT INTO my_table FROM INFILE '*.csv.gz' FORMAT CSV
+```
+
+Glob patterns support `*`, `?` and `{n..m}`, where ranges can be written as `{1..10}` or in the zero-padded (aligned) form `{01..10}`.
+This query will be automatically parallelized and it will also automatically detect the compression format from the file extension and decompress transparently.
+
+This improvement was contributed by **Arthur Filatenkov**.
+
+**How does this help you?**
+
+Now you don't have to recall how to write a parallel for loop in your command line shell. clickhouse-client will do everything for you; it works intuitively and fast.
+
+
+## Support for INTERVAL operator inside WITH FILL modifier for ORDER BY clause
+
+What's the... `WITH FILL` modifier in the `ORDER BY` clause? Take a look at the example:
+
+```
+:) SELECT EventDate, count() FROM test.hits WHERE CounterID = 2841673 GROUP BY EventDate ORDER BY EventDate
+
+┌──EventDate─┬─count()─┐
+│ 2014-03-17 │ 3 │
+│ 2014-03-19 │ 6 │
+│ 2014-03-21 │ 7 │
+│ 2014-03-22 │ 6 │
+└────────────┴─────────┘
+```
+
+We have a report with Mar 17th, 19th, 21st, and 22nd. But Mar 18th and 20th are missing, because there is no data for these dates.
+And this is how it works in all SQL databases.
+
+But ClickHouse also has a quite unique and neat `WITH FILL` modifier for the `ORDER BY` clause.
+
+You just write:
+```
+SELECT EventDate, count() FROM test.hits WHERE CounterID = 2841673 GROUP BY EventDate
+ORDER BY EventDate WITH FILL STEP 1
+
+┌──EventDate─┬─count()─┐
+│ 2014-03-17 │ 3 │
+│ 2014-03-18 │ 0 │
+│ 2014-03-19 │ 6 │
+│ 2014-03-20 │ 0 │
+│ 2014-03-21 │ 7 │
+│ 2014-03-22 │ 6 │
+└────────────┴─────────┘
+```
+
+And missing data is automatically filled.
+
+You can also add `FROM` and `TO`:
+
+```
+ORDER BY EventDate WITH FILL FROM '2014-03-01'::Date TO '2014-03-31'::Date STEP 1;
+```
+
+And it will automatically fill missing rows in the report.
+
+The `STEP` can be an arbitrary number. But what can you do if you want to fill missing dates for a report by months? You cannot just write `STEP 30` or `STEP 31` because different months contain a different number of days...
+
+Since ClickHouse version 21.12, you can do it like this:
+
+```
+ORDER BY EventDate WITH FILL STEP INTERVAL 1 MONTH
+```
+
+`INTERVAL` is a standard SQL operator; you can use SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER and YEAR.
+
+This was implemented by **Anton Popov**, who is also the author of the "WITH FILL" feature.
+
+**How does this help you?**
+
+It allows you to avoid a postprocessing step for your reports.
+
+
+## Add Support For "Identifier" Table and Database Query Parameters
+
+ClickHouse has support for parameterized queries. For example:
+
+```
+SELECT uniq(user_id) FROM table WHERE website = {name:String}
+```
+
+It allows you to safely substitute parameters without the risk of SQL injection:
+
+```
+curl https://clickhouse-server:8443/?param_name=upyachka -d 'SELECT uniq(user_id) FROM table WHERE website = {name:String}'
+```
+
+You can even create customized API handlers for clickhouse-server based on prepared queries.
+
+In version 21.12, we introduce support for using parameters for tables and databases in your queries. This is implemented with the `Identifier` table parameter:
+
+```
+SELECT uniq(user_id) FROM {tbl:Identifier}
+```
+
+Identifier parameters also work for CREATE, DROP and all DDL queries. This was implemented by **Nikolai Degterinskiy**.
+
+**How does this help you?**
+
+Let ClickHouse do the heavy lifting and keep your scripts safe and secure.
+
+
+## Bool Data Type
+
+This feature is experimental in version 21.12. It was implemented by **Kevin Wan (MaxWk)** on top of initial work by **hczhcz** and reviewed by **Pavel Kruglov**.
+
+ClickHouse now natively supports a `Bool` data type. It allows values to be represented as "true"/"false" during data import and export in text formats. The representation can also be adjusted to anything else using the settings `bool_true_representation` and `bool_false_representation` (for example, "yes" and "no").
+
+**How does this help you?**
+
+Native boolean data types exist today in other databases that are often integrated with ClickHouse, such as PostgreSQL. The `Bool` data type in ClickHouse will make it more compatible with existing code and ease migration from other databases.
+
+It also simplifies data ingestion from various text sources.
+
+
+## Query Optimizations With Table Constraints
+
+This feature was [contributed](https://github.com/ClickHouse/ClickHouse/pull/18787) by **Nikita Vasilev**. Nikita is one of the most notable ClickHouse contributors. He started in 2019 by introducing data skipping indices into ClickHouse, continued in 2020 with SSD-optimized key-value dictionaries, and has now contributed new advancements to the query optimizer. This feature was reviewed by **Anton Popov**.
+
+So, what optimizations? ClickHouse already allows you to specify constraints for tables:
+
+```
+CREATE TABLE
+(
+ URL String,
+ Domain String,
+ CONSTRAINT validate CHECK isValidUTF8(URL) AND length(URL) BETWEEN 10 AND 10000,
+ CONSTRAINT my_constraint CHECK Domain = domainWithoutWWW(URL)
+) ...
+```
+
+Constraints are checked on `INSERT`. In this example, we validate the URL and check that the `Domain` column actually contains the domain of the URL.
+
+Since version 21.12 constraints can also automatically optimize your queries!
For example, if you write:
+
+```
+SELECT count() FROM hits WHERE domainWithoutWWW(URL) = 'ghe.clickhouse.tech'
+```
+
+The query can be automatically rewritten to:
+
+```
+SELECT count() FROM hits WHERE Domain = 'ghe.clickhouse.tech'
+```
+
+Because the `Domain` column is smaller and more compressible, it will be faster to read and does not require calculating the domain from the URL.
+The only thing you need to do is to enable the `optimize_using_constraints` and `optimize_substitute_columns` settings.
+
+As a bonus, we introduced a new type of constraint: `ASSUME`.
+
+```
+CONSTRAINT my_constraint ASSUME Domain = domainWithoutWWW(URL)
+```
+
+This type of constraint will not check anything on `INSERT`, but will still use the assumption to optimize queries.
+
+It can also do logical inference, simplify the conditions and remove the conditions that are proved to be satisfied by constraints.
+It is controlled by the `convert_query_to_cnf` setting. You can also enable `optimize_append_index`. With this setting ClickHouse will derive more conditions on the table primary key.
+
+The idea is so powerful that we cannot resist adding one more feature: *indices for hypothesis*.
+
+```
+INDEX my_index (a < b) TYPE hypothesis GRANULARITY 1
+```
+
+The expression is checked and the result (true/false) is written as an index for query optimization.
+
+**How does this help you?**
+
+Especially in large ClickHouse deployments with many complex tables, it can be hard for users to always be up to date on the best way to query a given dataset. Constraints can help optimize queries without having to change the query structure itself. They can also make it easier to make changes to tables.
+
+For example, let's say you have a table containing web requests and it includes a URL column that contains the full URL of each request. Many times, users will want to know the top level domain (.com, .co.uk, etc.), which ClickHouse provides the `topLevelDomain` function to calculate. If you discover that many people are using this function, you might decide to create a new materialized column that pre-calculates the top level domain for each record.
+
+Rather than tell all your users to change their queries, you can use a table constraint to tell ClickHouse that each time a user tries to call the `topLevelDomain` function the request should be rewritten to use the new materialized column.
+
+
+## Read Large Remote Files In Chunks
+
+ClickHouse combines a fast query engine and efficient data storage. It also allows you to integrate external data sources for data import and export, or even to process external datasets on the fly without the need for data import or preprocessing.
+
+When reading large files in `Parquet`, `ORC`, and `Arrow` format using the `s3`, `url`, and `hdfs` table functions, ClickHouse will now automatically choose whether to read the entire file at once or read parts of it incrementally. This is now enabled by default, and the setting `remote_read_min_bytes_for_seek` controls when to switch from reading it all to reading in chunks. The default is 1MiB.
+
+`Parquet`, `ORC`, and `Arrow` are column-oriented formats (quite similar to the ClickHouse Native format), and now we can read only the requested columns even if they are being read from a remote HTTP server with the `url` table function (range requests will be performed to skip unneeded data).
+
+This feature was implemented by **Kseniia Sumarokova**.
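+
+For illustration, here is a sketch of the kind of query that benefits from this change; the URL, the file, and the column set are hypothetical:
+
+```
+-- only the two requested columns need to be fetched from the remote Parquet file;
+-- files larger than remote_read_min_bytes_for_seek are read in chunks rather than all at once
+SELECT UserID, URL
+FROM url('https://example.com/data/hits.parquet', Parquet, 'UserID UInt64, URL String')
+LIMIT 10
+```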
+
+**How does this help our ClickHouse Users?**
+
+In previous versions, when reading files in Arrow-based formats from remote locations with the `s3`, `url`, and `hdfs` table functions, ClickHouse would always read the entire file into memory. This works well when the files are small but will cause excessive memory usage or not work at all when the files are large. With this change, ClickHouse will read large files in chunks to keep memory usage in check and is now able to read even very large files.
+
+
+## ... And Many More
+
+Read the [full changelog](https://github.com/ClickHouse/ClickHouse/blob/master/CHANGELOG.md) for the 21.12 "Christmas" release for the full list of gifts from the [ClickHouse Team](https://clickhouse.com/company/).
diff --git a/website/blog/en/2021/how-to-enable-predictive-capabilities-in-clickhouse-databases.md b/website/blog/en/2021/how-to-enable-predictive-capabilities-in-clickhouse-databases.md
new file mode 100644
index 00000000000..96c2dccf260
--- /dev/null
+++ b/website/blog/en/2021/how-to-enable-predictive-capabilities-in-clickhouse-databases.md
@@ -0,0 +1,16 @@
+---
+title: 'How to Enable Predictive Capabilities in Clickhouse Databases'
+image: 'https://blog-images.clickhouse.com/en/2021/mindsdb-enables-predictive-capabilities-in-clickHouse/featured.png'
+date: '2021-12-14'
+author: '[Ilya Yatsishin](https://github.com/qoega)'
+tags: ['company', 'how-to', 'MindsDB']
+---
+
+ClickHouse is a fast, open-source, column-oriented SQL database that is very useful for data analysis and real-time analytics, and with MindsDB it can be turned into a powerful machine learning platform for business forecasting.
+
+In this article, we will
+- Guide you through the machine learning workflow and how to use ClickHouse’s powerful tools, like materialized views, to handle data cleaning and preparation more effectively - especially for large datasets with billions of rows of data,
+- Explore the concept of AI Tables from MindsDB and how they can be used within ClickHouse to automatically build predictive models and make forecasts using simple SQL statements, and
+- Share how MindsDB automates really complex machine learning tasks, like multivariate time-series analysis with high cardinality, and show how to detect anomalies and visualize these predictions.
+
+[Read Further](https://mindsdb.com/blog/enabling-predictive-capabilities-in-clickhouse-database/?utm_medium=referral&utm_source=clickhouse&utm_campaign=clickhouse-ml-article-2021-12)
diff --git a/website/blog/en/2021/plausible-uses-clickHouse-to-power-privacy-friendly-google-analytics-alternative.md b/website/blog/en/2021/plausible-uses-clickHouse-to-power-privacy-friendly-google-analytics-alternative.md
new file mode 100644
index 00000000000..f462f9b3990
--- /dev/null
+++ b/website/blog/en/2021/plausible-uses-clickHouse-to-power-privacy-friendly-google-analytics-alternative.md
@@ -0,0 +1,37 @@
+---
+title: 'Plausible Analytics uses ClickHouse to power their privacy-friendly Google Analytics alternative'
+image: 'https://blog-images.clickhouse.com/en/2021/plausible-uses-clickHouse-to-power-privacy-friendly-google-analytics-alternative/featured-cropped.jpg'
+date: '2021-12-08'
+author: 'Elissa Weve'
+tags: ['company']
+---
+
+Plausible Analytics is a lightweight, open source web analytics tool that has quickly gained popularity as the privacy-friendly alternative to Google Analytics.
By using Plausible Analytics, customers keep 100% ownership of their website data and protect the privacy of their visitors since there are no cookies and it is fully compliant with GDPR.
+
+Since its launch in April 2019, the analytics platform has scaled to serve 5000+ paying subscribers. With an annual recurring revenue of half a million dollars, Plausible Analytics currently tracks 28,000 different websites and more than 1 billion page views per month.
+
+Marko Saric, co-founder at Plausible Analytics, said that with this increase in volume, it became clear early on that the original architecture using Postgres to store analytics data could not handle the platform’s future growth.
+
+“We knew that if we’re going to go anywhere in the future we needed something better,” Saric said.
+
+## “Best technical decision we ever made”
+
+Through word of mouth, the Plausible team received the recommendation to try ClickHouse. They quickly noticed significant improvements in the loading speed of their dashboards. With Postgres, their dashboards were taking 5 seconds to load; now with ClickHouse, they took less than a second.
+
+Plausible co-founder Uku Täht said the team also tried a couple of other solutions, but “Clickhouse came on top in terms of both performance and features that we would make use of,” he said.
+
+“Plausible Analytics is a lightweight product, so it is important that everything loads quickly—the dashboard, segmentation of the data, and all the cool stuff in the background. Customers don’t know what we’re doing in the background, but they know that they want a fast experience,” Saric added.
+
+Using ClickHouse, Plausible Analytics is able to serve even its largest customers with ease, including the biggest customer, with 150 million pages per month. “This would not have been possible previously, it would have crashed everything, it would not have been able to load,” Saric said. “There would have been no chance we could have had that kind of customer.”
+
+According to Täht, switching to ClickHouse was the best technical decision their team ever made. “Clickhouse is amazingly efficient, not just in terms of compute power needed but also the time that it saves us. It's very easy to work with Clickhouse. It does exactly what we need and it does it exceptionally well. It's one of those technologies that feels really simple to use but also has a rich feature set.”
+
+“I don’t think we would be able to be where we are today without ClickHouse,” Saric said. “Without switching from Postgres, Plausible would not have all this growth and new customers.”
+
+## About Plausible
+
+Plausible Analytics is an open-source project dedicated to making web analytics more privacy-friendly. Our mission is to reduce corporate surveillance by providing an alternative web analytics tool which doesn’t come from the AdTech world.
+
+Visit [plausible.io](https://plausible.io/) for more information or to start a free trial.
+
+
diff --git a/website/blog/en/2021/tests-visualization.md b/website/blog/en/2021/tests-visualization.md
new file mode 100644
index 00000000000..259cb4d8e34
--- /dev/null
+++ b/website/blog/en/2021/tests-visualization.md
@@ -0,0 +1,45 @@
+---
+title: 'Decorating a Christmas Tree With the Help Of Flaky Tests'
+image: 'https://blog-images.clickhouse.com/en/2021/tests-visualization/tests.png'
+date: '2021-12-27'
+author: '[Alexey Milovidov](https://github.com/alexey-milovidov)'
+tags: ['tests', 'ci', 'flaky', 'christmas', 'visualization']
+---
+
+Test suites and testing infrastructure are one of the main assets of ClickHouse. We have tons of functional, integration, unit, performance, stress and fuzz tests. Tests are run on a per-commit basis and results are publicly available.
+
+We also save the results of all test runs into a database in ClickHouse. We started collecting results in June 2020, and we have 1 777 608 240 records so far. Now we run around 5 to 9 million tests every day.
+
+Tests are good (in general). A good test suite allows for fast development iterations, stable releases, and accepting more contributions from the community. We love tests. If there's something strange in ClickHouse, what are we gonna do? Write more tests.
+
+Some tests can be flaky. The reasons for flakiness are countless - most of them are simple timing issues in the test script itself, but sometimes a test that fails one time in a thousand can uncover subtle logic errors in the code.
+
+The problem is how to deal with flaky tests. Some people suggest automatically muting the "annoying" flaky tests. Or adding automatic retries in case of failure. We believe that this is all wrong. Instead of trying to ignore flaky tests, we do the opposite: we put maximum effort into making the tests even more flaky!
+
+Our recipes for flaky tests:
+— never mute or restart them; if the test failed once, always look into it and investigate the cause;
+— randomize the environment for every test run so the test will have more possible reasons to fail;
+— if new tests are added, run them 100 times and if at least one fails, do not merge the pull request;
+— if new tests are added, use them as a corpus for fuzzing - it will uncover corner cases even if the author did not write tests for them;
+— [randomize thread scheduling](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/ThreadFuzzer.h) and add random sleeps and switching between CPU cores at random places and before and after mutex locks/unlocks;
+— run everything in parallel on slow machines;
+
+Key point: to prevent flaky tests, we make our tests as flaky as possible.
+
+## Nice Way To Visualize Flaky Tests
+
+There is a test suite named "[functional stateless tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/0_stateless)" that has 3772 tests. For every day since 2020-06-13 (561 days) and every test (3772 tests), I drew a picture of size 561x3772 where a pixel is green if all test runs finished successfully in the master branch during this day (for all commits and all combinations: release, debug+assertions, ASan, MSan, TSan, UBSan), and a pixel is red if at least one run failed. The pixel will be transparent if the test did not exist that day.
+
+This visualization is a toy that I've made for fun:
+
+![Visualization](https://blog-images.clickhouse.com/en/2021/tests-visualization/tree_half.png)
+
+It looks like a Christmas Tree (you need a bit of imagination).
If you have a different kind of imagination, you can see it as a green field with flowers. + +The time is from left to right. The tests are numbered with non-unique numbers (new tests usually get larger numbers), and these numbers are on the vertical axis (newer tests on top). + +If you see red dots in a horizontal line - it is a flaky test. If you see red dots in a vertical line - it means that one day we accidentally broke the master branch. If you see black horizontal lines or cuts in the tree - it means that the tests were added with some old numbers, most likely because some long living feature branch was merged. If you see black vertical lines - it means that some days tests were not run. + +The velocity of adding new tests is represented by how tall and narrow the Christmas tree is. When we add a large number of tests, the tree grows with almost vertical slope. + +The image is prepared by [HTML page](https://github.com/ClickHouse/ClickHouse/pull/33185) with some JavaScript that is querying a ClickHouse database directly and writing to a canvas. It took around ten seconds to build this picture. I also prepared an [interactive version](https://blog-images.clickhouse.com/en/2021/tests-visualization/demo.html) with already-saved data where you can play and find your favorite tests. diff --git a/website/css/main.css b/website/css/main.css index 7f388ffeab6..56230529a11 100644 --- a/website/css/main.css +++ b/website/css/main.css @@ -1 +1 @@ -@media screen and (max-width:978.98px){.btn{padding:8px 16px}}@media screen and (max-width:978.98px){.btn-lg{padding:12px 24px}}.btn-primary,.btn-primary:active,.btn-primary:hover{color:#212529}.btn-outline-primary{background:#fffaf0;border-color:#fc0;color:#212529}.btn-outline-primary:active,.btn-outline-primary:hover{background:#fc0;border-color:#fc0;color:#212529}.btn-secondary{border-color:#212529;color:#fff}.btn-outline-secondary,.btn-secondary:active,.btn-secondary:hover{background:#fff;border-color:#212529;color:#212529}.btn-outline-secondary:active,.btn-outline-secondary:hover{background:#212529;border-color:#212529;color:#fff}.btn-tertiary{border-color:#257af4;color:#fff}.btn-tertiary:active,.btn-tertiary:hover{background:#257af4;border-color:#257af4;color:#fff}.btn-outline-tertiary{background:#e3f1fe;color:#257af4}.btn-outline-tertiary:active,.btn-outline-tertiary:hover{background:#257af4;color:#fff}.btns{align-items:center;display:grid;-moz-column-gap:24px;column-gap:24px;row-gap:16px;grid-auto-flow:column;justify-content:center}@media screen and (max-width:767.98px){.btns{grid-auto-flow:row}}.btns.btns-lg{-moz-column-gap:40px;column-gap:40px}.btns.is-2{grid-template-columns:1fr 1fr}@media screen and (max-width:767.98px){.btns.is-2{grid-template-columns:1fr}}.btns.is-3{grid-template-columns:1fr 1fr 1fr}@media screen and (max-width:767.98px){.btns.is-3{grid-template-columns:1fr}}.card{box-shadow:0 8px 20px rgba(108,117,125,.2);overflow:hidden;transition:box-shadow .2s,transform .2s;width:100%}.card,.card-body{position:relative}.card-body{z-index:10}.card.is-large .card-body{padding:40px}.card.bg-primary-light{border-color:#fc0}.card.has-dark-border{border-color:#6c757d}.card.has-pattern:after,.card.has-pattern:before{background-repeat:no-repeat;background-size:auto 100%;bottom:0;content:"";display:block;position:absolute;top:0;width:72px}.card.has-pattern:before{background-image:url(../images/backgrounds/bg-card-pattern-blue-1.png);background-position:0 
0;left:0}.card.has-pattern:after{background-image:url(../images/backgrounds/bg-card-pattern-blue-2.png);background-position:100% 0;right:0}.card.has-hover:active,.card.has-hover:hover,a.card:active,a.card:hover{box-shadow:0 12px 32px rgba(108,117,125,.2);transform:translateY(-8px)}.card.has-highlight:after,.card.has-hover:after,a.card:after{content:"";display:block;height:8px;margin-top:auto;transition:background .2s;width:100%}.card.has-highlight:after,.card.has-hover:active:after,.card.has-hover:hover:after,a.card:active:after,a.card:hover:after{background:#e3f1fe}.case-study-cards{-moz-column-gap:40px;column-gap:40px;display:grid;grid-template-columns:1fr;row-gap:40px;padding-bottom:40px;position:relative}.case-study-cards>div{align-items:stretch;display:flex}.case-study-cards:before{background:#d6dbdf;bottom:0;content:"";display:block;left:20px;position:absolute;top:40px;width:100vw}@media screen and (min-width:980px){.case-study-cards{grid-template-columns:repeat(2,minmax(0,1fr));row-gap:80px;padding-bottom:120px}.case-study-cards:before{left:-40px;top:120px}}.case-study-card{align-items:stretch;flex-direction:row;flex-shrink:0;left:0;transition:box-shadow .2s,left .4s,width .4s,z-index 0s;transition-delay:0s,.6s,.6s,0s;width:100%;z-index:2}@media screen and (max-width:979.98px){.case-study-card .row{min-height:0!important}}@media screen and (min-width:980px){.case-study-card:active,.case-study-card:hover{box-shadow:0 12px 32px rgba(108,117,125,.2)}.case-study-card:not(.is-open){cursor:pointer}.case-study-card.is-open{transform:none!important;transition-delay:0s,0s,0s,0s;width:calc(200% + 40px);z-index:10}.case-study-card.is-closing{z-index:10}.case-study-card.open-left.is-open{left:calc(-100% - 40px)}.case-study-card:before{background:no-repeat url(../images/backgrounds/bg-card-pattern-red.png);background-position:100%;background-size:contain;content:"";display:block;height:calc(100% - 80px);max-height:224px;max-width:234px;position:absolute;right:0;top:40px;transform:translateX(30%);transition:transform .4s;transition-delay:.6s;width:100%;z-index:1}}@media screen and (min-width:980px)and (min-width:1240px){.case-study-card:before{transform:translateX(50%)}}@media screen and (min-width:980px){.case-study-card.is-open:before{transform:translateX(70%);transition-delay:0s}}@media screen and (min-width:980px){.case-study-card-wrap{align-items:stretch;display:flex;flex-shrink:0;min-height:304px;position:relative;transition:width .4s;transition-delay:.6s;width:calc(200% + 42px);z-index:2}}@media screen and (min-width:980px){.case-study-card.is-open .case-study-card-wrap{transition-delay:0s;width:100%}}@media screen and (min-width:980px){.case-study-card-body{display:flex;flex-direction:column;padding-right:80px!important}.case-study-card-body>.row{align-self:stretch}}@media screen and (min-width:980px){.case-study-card-toggle{background:#fff;box-shadow:0 8px 20px rgba(108,117,125,.2);border-radius:100%;cursor:pointer;height:56px;position:relative;width:56px}.case-study-card-toggle:after,.case-study-card-toggle:before{background:#257af4;content:"";display:block;height:4px;left:calc(50% - 15px);position:absolute;top:calc(50% - 2px);transition:opacity .2s,transform .2s;width:30px}.case-study-card-toggle:after{transform:rotate(90deg)}}@media screen and (min-width:980px){.case-study-card.is-open .case-study-card-toggle:before{opacity:0;transform:rotate(-90deg)}}@media screen and (min-width:980px){.case-study-card.is-open .case-study-card-toggle:after{transform:rotate(0)}}@media screen and 
(min-width:980px){.case-study-card .col-lg-3{left:-60%;position:relative;transition:left .4s;transition-delay:.6s}}@media screen and (min-width:980px)and (min-width:980px){.case-study-card .col-lg-3{flex:0 0 250px;max-width:250px;width:250px}}@media screen and (min-width:980px){.case-study-card.is-open .col-lg-3{left:0;transition-delay:0s}}@media screen and (min-width:980px){.case-study-card .col-lg-auto{opacity:0;transform:translateX(24px);transition:opacity .4s,transform .4s;transition-delay:.2s}}@media screen and (min-width:980px)and (min-width:980px){.case-study-card .col-lg-auto{max-width:605px;width:calc(100% - 319px)}}@media screen and (min-width:980px){.case-study-card.is-open .col-lg-auto{opacity:1;transform:none;transition-delay:.2s}}.footer-copy{white-space:nowrap}form .form-group{position:relative}form .form-group.is-select:before{border-left:6px solid transparent;border-right:6px solid transparent;border-top:8px solid #6c757d;content:"";display:block;position:absolute;right:33px;top:calc(50% - 4px);z-index:10}form .form-control{border:1px solid #6c757d;border-radius:6px;height:auto;line-height:20px;min-height:44px;padding:12px 16px;width:100%}form .form-control,form .form-control:focus{box-shadow:0 8px 20px rgba(108,117,125,.2);color:#212529}form .form-control:focus{border-color:#212529}form .form-control::-moz-placeholder{color:#6c757d}form .form-control:-ms-input-placeholder{color:#6c757d}form .form-control::placeholder{color:#6c757d}form select.form-control{-webkit-appearance:none;-moz-appearance:none;appearance:none;padding-right:24px;white-space:pre-wrap}form select.form-control[name=priority]{height:84px}@media screen and (max-width:767.98px){form select.form-control[name=priority]{height:104px}}@media screen and (max-width:499.98px){form select.form-control[name=priority]{height:124px}}form select.form-control:not([data-chosen]){color:#6c757d}form .btn-secondary:active,form .btn-secondary:hover{color:#212529;background:#fc0;border-color:#fc0}.hero{overflow:visible;position:relative}.hero,.hero-bg{background-repeat:no-repeat;background-position:50%;background-size:cover}.hero-bg{display:block;height:100%;left:50%;position:absolute;top:0;transform:translateX(-50%);z-index:1}.hero>.container{position:relative;z-index:2}.hero.has-offset{margin-bottom:-160px;padding-bottom:160px}.base-hero{height:22.5vw;max-height:324px;min-height:280px}.index-hero{background-image:url(/images/backgrounds/bg-hero-home.svg);height:68vw;max-height:980px}.index-hero,.other-hero{max-width:2448px;width:170vw}.other-hero{background-image:url(/images/backgrounds/bg-hero.svg)}.bg-footer-cta{background-image:url(/images/backgrounds/bg-footer-cta.svg);width:2448px}.quickstart-bg{background-image:url(/images/backgrounds/bg-quick-start.svg);height:40vw;top:220px;width:170vw}hr{background:#f1f6f9;border:0;display:block;height:4px;margin:0;width:100%}hr.is-small{height:2px}hr.is-large{height:8px}hr.is-medium{background:#d6dbdf}hr.is-dark{background:#495057}hr.is-yellow{background:linear-gradient(90deg,#ff8c00,#ff8c00 8px,#fc0 16px,rgba(255,204,0,0));-webkit-clip-path:polygon(8px 100%,0 100%,0 0,8px 0,8px 100%,16px 100%,16px 0,100% 0,100% 100%);clip-path:polygon(8px 100%,0 100%,0 0,8px 0,8px 100%,16px 100%,16px 0,100% 0,100% 100%);height:8px}.icon{display:block;height:48px;margin-bottom:24px;-o-object-fit:contain;object-fit:contain;-o-object-position:center;object-position:center}@media screen and (min-width:576px){.icon{height:64px}}@media screen and 
(min-width:980px){.icon{height:80px}}img{max-width:100%}.kicker{color:#6c757d;font-family:Hind Siliguri,sans-serif;font-size:.875rem;font-weight:600;letter-spacing:1px;margin:0}@media screen and (max-width:978.98px){.lead{font-size:1.125rem}}.logo{display:block;height:36px;max-width:220px;-o-object-fit:contain;object-fit:contain;-o-object-position:center;object-position:center;width:100%}.navbar-clickhouse{border-bottom:4px solid #f1f6f9;height:142px}.navbar-clickhouse>.container{flex-wrap:wrap}.navbar-super{flex-shrink:0;width:100%}.navbar-super ul{list-style:none}.navbar-super li:not(:last-child){margin-bottom:0;margin-right:24px}.navbar-super a{align-items:center;color:#212529;display:flex;font-size:.875rem}.navbar-super a:active,.navbar-super a:hover{color:#257af4;text-decoration:none}.navbar-super img{flex-shrink:0;margin-right:4px}.navbar-brand-clickhouse{background:no-repeat url(../images/logo-clickhouse.svg);background-size:contain;flex-shrink:0;height:28px;margin-right:48px;padding:0;width:180px}.navbar-nav{align-items:center;height:46px}.navbar .nav-item:not(:last-child){margin-bottom:0;margin-right:24px}.navbar .nav-link{color:#212529}.navbar .nav-link:active,.navbar .nav-link:hover{color:#257af4}.navbar .navbar-nav{flex-direction:row}@media screen and (max-width:978.98px){.navbar>.container{padding-left:20px;padding-right:20px}.navbar .navbar-toggler{height:24px;padding:0;width:24px}.navbar .navbar-toggler:focus{outline:none}.navbar .navbar-toggler-icon{background:no-repeat url(../images/icons/icon-menu.svg);background-position:50%;background-size:contain;height:24px;width:24px}.navbar .navbar-collapse{background:#fff;border-bottom:4px solid #f1f6f9;height:56px;left:0;padding:0 20px 16px;position:absolute;right:0;top:100%}.navbar .nav-link{font-size:.875rem;white-space:nowrap}}@media screen and (max-width:615.98px){.navbar .navbar-collapse{height:auto}.navbar .navbar-nav{flex-direction:column;height:auto}.navbar .nav-item:not(:last-child){margin-bottom:16px;margin-right:0}}@media screen and (max-width:399.98px){.navbar{height:80px}}.page,.photo-frame{overflow:hidden;width:100%}.photo-frame{background:hsla(0,0%,100%,.6);border-radius:100%;box-shadow:0 8px 20px rgba(108,117,125,.2);display:block;margin-bottom:24px;max-width:160px;position:relative}.photo-frame:before{content:"";display:block;padding-bottom:100%;width:100%}.photo-frame img{display:block;height:100%;left:0;-o-object-fit:contain;object-fit:contain;-o-object-position:center;object-position:center;position:absolute;top:0;width:100%}.pullquote{position:relative;width:70%}.pullquote:before{background:no-repeat url(../images/backgrounds/bg-quotes.svg);background-position:50%;background-size:100%;content:"";mix-blend-mode:multiply;right:56px;width:calc(100% - 16px);z-index:2}.pullquote-bg,.pullquote:before{bottom:0;display:block;position:absolute;top:0}.pullquote-bg{right:0;width:calc(50vw + 28.57143%);z-index:1}.pullquote-body{padding:64px 40px 64px 0;position:relative;z-index:3}.pullquote-quote{font-family:Hind 
Siliguri,sans-serif;font-size:32px;font-weight:700}.pullquote-citation{font-size:1.125rem}.section{overflow:visible;position:relative}.section,.section-bg{background-repeat:no-repeat;background-position:50%;background-size:cover}.section-bg{display:block;height:100%;left:50%;position:absolute;top:0;transform:translateX(-50%);z-index:1}.section>.container{position:relative;z-index:2}.social-icons{align-items:center;display:flex}.social-icons>a{aspect-ratio:24/24;background:#6c757d;display:block;height:24px;width:24px;-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;transition:background .2s}.social-icons>a:active,.social-icons>a:hover{background:#212529}.social-icons>a+a{margin-left:32px}.social-icons-facebook{-webkit-mask-image:url(/images/icons/icon-facebook-gray.svg);mask-image:url(/images/icons/icon-facebook-gray.svg)}.social-icons-twitter{-webkit-mask-image:url(/images/icons/icon-twitter-gray.svg);mask-image:url(/images/icons/icon-twitter-gray.svg);width:31px}.social-icons-linkedin{-webkit-mask-image:url(/images/icons/icon-linkedin-gray.svg);mask-image:url(/images/icons/icon-linkedin-gray.svg)}.social-icons-linkedin-alt{-webkit-mask-image:url(/images/icons/icon-linkedin-alt-gray.svg);mask-image:url(/images/icons/icon-linkedin-alt-gray.svg)}.social-icons.size-small>a{height:20px;width:20px}.social-icons.size-small>a:active,.social-icons.size-small>a:hover{background:#212529}.social-icons.size-small>a+a{margin-left:16px}.tabs{position:relative}.tabs:before{background:#fff;border-radius:7px 7px 0 0;content:"";display:block;height:8px;left:1px;position:absolute;right:1px;top:68px;z-index:10}@media screen and (min-width:1240px){.tabs:before{top:76px}}.tabs-body{background:#fff;border-radius:8px;border:1px solid #6c757d;box-shadow:0 8px 20px rgba(108,117,125,.2);padding:24px}@media screen and (min-width:980px){.tabs-body{padding:32px}}@media screen and (min-width:1240px){.tabs-body{padding:40px}}.tabs .nav-tabs{border-bottom:0;flex-wrap:nowrap;height:76px;margin:-20px -20px -9px;-webkit-mask-image:linear-gradient(90deg,transparent,#000 20px,#000 calc(100% - 20px),transparent);mask-image:linear-gradient(90deg,transparent,#000 20px,#000 calc(100% - 20px),transparent);overflow:scroll;overflow-x:scroll;overflow-y:visible;padding:20px 20px 0;position:relative}@media screen and (min-width:940px){.tabs .nav-tabs{overflow:visible}}@media screen and (min-width:1240px){.tabs .nav-tabs{height:84px}}.tabs .nav-link{align-items:center;border-bottom:0;color:#6c757d;display:flex;font-size:.875rem;flex-shrink:0;height:56px;justify-content:center;padding:0 12px 8px;text-align:center;white-space:nowrap}@media screen and (min-width:1240px){.tabs .nav-link{height:64px;padding:0 16px 8px}}.tabs .nav-link.active{background:#fff;box-shadow:0 -4px 8px rgba(108,117,125,.1);font-weight:700;padding:0 16px 8px}@media screen and (min-width:980px){.tabs .nav-link.active{padding:0 24px 8px}}@media screen and (min-width:1240px){.tabs .nav-link.active{padding:0 32px 8px}}.tab-pane pre{background:#212529;border-radius:16px;color:#fff;padding:24px 16px}@media screen and (min-width:1240px){.tab-pane pre{padding:32px 24px}}.trailing-link{align-items:center;color:#212529;display:flex;font-size:.875rem;font-weight:700}.trailing-link:after{background:no-repeat url(../images/icons/icon-arrow.svg);background-position:100%;background-size:contain;content:"";display:block;height:12px;transition:transform 
.2s;width:20px}.trailing-link:active,.trailing-link:hover{color:#212529;text-decoration:none}.trailing-link:active:after,.trailing-link:hover:after{transform:translateX(8px)}.trailing-link.span-full:after{margin-left:auto}ul{list-style-type:square;padding-left:1.25em}ul li:not(:last-child){margin-bottom:16px}ul li::marker{color:#ff3939}ul.has-separators{list-style:none;padding:0}ul.has-separators li:not(:last-child){border-bottom:4px solid #f1f6f9;margin-bottom:24px;padding-bottom:24px}.bg-gradient-secondary{background-image:linear-gradient(58deg,#ff6443 3%,#fe561d 24%,#e32f0d 93%)}.bg-gradient-light-orange{background-image:linear-gradient(90deg,rgba(255,203,128,0),#ffcb80)}.bg-offset-right{bottom:0;left:-24px;position:absolute;top:0;width:calc(100vw + 24px);z-index:-1}@media screen and (min-width:1240px){.bg-offset-right{left:-96px;width:calc(100vw + 96px)}}.bg-inset-right{bottom:0;left:40px;position:absolute;top:0;width:calc(100vw - 40px);z-index:-1}@media screen and (min-width:980px){.bg-inset-right{left:96px;width:calc(100vw - 96px)}}.has-border-left{border-left:8px solid #f1f6f9;padding-left:16px}.font-xl{font-size:1.25rem}.font-lg{font-size:1.125rem}.font-sm{font-size:.875rem}.font-xs{font-size:.625rem}.font-weight-semibold{font-weight:600}.display-5{color:#212529;font-size:20px;font-weight:500}.display-6{color:#212529;font-size:14px;font-weight:700}.overflow-auto{overflow:auto}.text-decoration-underline{text-decoration:underline}.text-upper{text-transform:uppercase} \ No newline at end of file +@media screen and (max-width:978.98px){.btn{padding:8px 16px}}@media screen and (max-width:978.98px){.btn-lg{padding:12px 24px}}.btn-primary,.btn-primary:active,.btn-primary:hover{color:#212529}.btn-outline-primary{background:#fffaf0;border-color:#fc0;color:#212529}.btn-outline-primary:active,.btn-outline-primary:hover{background:#fc0;border-color:#fc0;color:#212529}.btn-secondary{border-color:#212529;color:#fff}.btn-outline-secondary,.btn-secondary:active,.btn-secondary:hover{background:#fff;border-color:#212529;color:#212529}.btn-outline-secondary:active,.btn-outline-secondary:hover{background:#212529;border-color:#212529;color:#fff}.btn-tertiary{border-color:#257af4;color:#fff}.btn-tertiary:active,.btn-tertiary:hover{background:#257af4;border-color:#257af4;color:#fff}.btn-outline-tertiary{background:#e3f1fe;color:#257af4}.btn-outline-tertiary:active,.btn-outline-tertiary:hover{background:#257af4;color:#fff}.btns{align-items:center;display:grid;-moz-column-gap:24px;column-gap:24px;row-gap:16px;grid-auto-flow:column;justify-content:center}@media screen and (max-width:767.98px){.btns{grid-auto-flow:row}}.btns.btns-lg{-moz-column-gap:40px;column-gap:40px}.btns.is-2{grid-template-columns:1fr 1fr}@media screen and (max-width:767.98px){.btns.is-2{grid-template-columns:1fr}}.btns.is-3{grid-template-columns:1fr 1fr 1fr}@media screen and (max-width:767.98px){.btns.is-3{grid-template-columns:1fr}}.card{box-shadow:0 8px 20px rgba(108,117,125,.2);overflow:hidden;transition:box-shadow .2s,transform .2s;width:100%}.card,.card-body{position:relative}.card-body{z-index:10}.card.is-large .card-body{padding:40px}.card.bg-primary-light{border-color:#fc0}.card.has-dark-border{border-color:#6c757d}.card.has-pattern:after,.card.has-pattern:before{background-repeat:no-repeat;background-size:auto 100%;bottom:0;content:"";display:block;position:absolute;top:0;width:72px}.card.has-pattern:before{background-image:url(../images/backgrounds/bg-card-pattern-blue-1.png);background-position:0 
0;left:0}.card.has-pattern:after{background-image:url(../images/backgrounds/bg-card-pattern-blue-2.png);background-position:100% 0;right:0}.card.has-hover:active,.card.has-hover:hover,a.card:active,a.card:hover{box-shadow:0 12px 32px rgba(108,117,125,.2);transform:translateY(-8px)}.card.has-highlight:after,.card.has-hover:after,a.card:after{content:"";display:block;height:8px;margin-top:auto;transition:background .2s;width:100%}.card.has-highlight:after,.card.has-hover:active:after,.card.has-hover:hover:after,a.card:active:after,a.card:hover:after{background:#e3f1fe}.case-study-cards{-moz-column-gap:40px;column-gap:40px;display:grid;grid-template-columns:1fr;row-gap:40px;padding-bottom:40px;position:relative}.case-study-cards>div{align-items:stretch;display:flex}.case-study-cards:before{background:#d6dbdf;bottom:0;content:"";display:block;left:20px;position:absolute;top:40px;width:100vw}@media screen and (min-width:980px){.case-study-cards{grid-template-columns:repeat(2,minmax(0,1fr));row-gap:80px;padding-bottom:120px}.case-study-cards:before{left:-40px;top:120px}}.case-study-card{align-items:stretch;flex-direction:row;flex-shrink:0;left:0;transition:box-shadow .2s,left .4s,width .4s,z-index 0s;transition-delay:0s,.6s,.6s,0s;width:100%;z-index:2}@media screen and (max-width:979.98px){.case-study-card .row{min-height:0!important}}@media screen and (min-width:980px){.case-study-card:active,.case-study-card:hover{box-shadow:0 12px 32px rgba(108,117,125,.2)}.case-study-card:not(.is-open){cursor:pointer}.case-study-card.is-open{transform:none!important;transition-delay:0s,0s,0s,0s;width:calc(200% + 40px);z-index:10}.case-study-card.is-closing{z-index:10}.case-study-card.open-left.is-open{left:calc(-100% - 40px)}.case-study-card:before{background:no-repeat url(../images/backgrounds/bg-card-pattern-red.png);background-position:100%;background-size:contain;content:"";display:block;height:calc(100% - 80px);max-height:224px;max-width:234px;position:absolute;right:0;top:40px;transform:translateX(30%);transition:transform .4s;transition-delay:.6s;width:100%;z-index:1}}@media screen and (min-width:980px)and (min-width:1240px){.case-study-card:before{transform:translateX(50%)}}@media screen and (min-width:980px){.case-study-card.is-open:before{transform:translateX(70%);transition-delay:0s}}@media screen and (min-width:980px){.case-study-card-wrap{align-items:stretch;display:flex;flex-shrink:0;min-height:304px;position:relative;transition:width .4s;transition-delay:.6s;width:calc(200% + 42px);z-index:2}}@media screen and (min-width:980px){.case-study-card.is-open .case-study-card-wrap{transition-delay:0s;width:100%}}@media screen and (min-width:980px){.case-study-card-body{display:flex;flex-direction:column;padding-right:80px!important}.case-study-card-body>.row{align-self:stretch}}@media screen and (min-width:980px){.case-study-card-toggle{background:#fff;box-shadow:0 8px 20px rgba(108,117,125,.2);border-radius:100%;cursor:pointer;height:56px;position:relative;width:56px}.case-study-card-toggle:after,.case-study-card-toggle:before{background:#257af4;content:"";display:block;height:4px;left:calc(50% - 15px);position:absolute;top:calc(50% - 2px);transition:opacity .2s,transform .2s;width:30px}.case-study-card-toggle:after{transform:rotate(90deg)}}@media screen and (min-width:980px){.case-study-card.is-open .case-study-card-toggle:before{opacity:0;transform:rotate(-90deg)}}@media screen and (min-width:980px){.case-study-card.is-open .case-study-card-toggle:after{transform:rotate(0)}}@media screen and 
(min-width:980px){.case-study-card .col-lg-3{left:-60%;position:relative;transition:left .4s;transition-delay:.6s}}@media screen and (min-width:980px)and (min-width:980px){.case-study-card .col-lg-3{flex:0 0 250px;max-width:250px;width:250px}}@media screen and (min-width:980px){.case-study-card.is-open .col-lg-3{left:0;transition-delay:0s}}@media screen and (min-width:980px){.case-study-card .col-lg-auto{opacity:0;transform:translateX(24px);transition:opacity .4s,transform .4s;transition-delay:.2s}}@media screen and (min-width:980px)and (min-width:980px){.case-study-card .col-lg-auto{max-width:605px;width:calc(100% - 319px)}}@media screen and (min-width:980px){.case-study-card.is-open .col-lg-auto{opacity:1;transform:none;transition-delay:.2s}}.footer-copy{white-space:nowrap}form .form-group{position:relative}form .form-group.is-select:before{border-left:6px solid transparent;border-right:6px solid transparent;border-top:8px solid #6c757d;content:"";display:block;position:absolute;right:33px;top:calc(50% - 4px);z-index:10}form .form-control{border:1px solid #6c757d;border-radius:6px;height:auto;line-height:20px;min-height:44px;padding:12px 16px;width:100%}form .form-control,form .form-control:focus{box-shadow:0 8px 20px rgba(108,117,125,.2);color:#212529}form .form-control:focus{border-color:#212529}form .form-control::-moz-placeholder{color:#6c757d}form .form-control:-ms-input-placeholder{color:#6c757d}form .form-control::placeholder{color:#6c757d}form select.form-control{-webkit-appearance:none;-moz-appearance:none;appearance:none;padding-right:24px;white-space:pre-wrap}form select.form-control:not([data-chosen]){color:#6c757d}form .btn-secondary:active,form .btn-secondary:hover{color:#212529;background:#fc0;border-color:#fc0}.hero{overflow:visible;position:relative}.hero,.hero-bg{background-repeat:no-repeat;background-position:50%;background-size:cover}.hero-bg{display:block;height:100%;left:50%;position:absolute;top:0;transform:translateX(-50%);z-index:1}.hero>.container{position:relative;z-index:2}.hero.has-offset{margin-bottom:-160px;padding-bottom:160px}.base-hero{height:22.5vw;max-height:324px;min-height:280px}.index-hero{background-image:url(/images/backgrounds/bg-hero-home.svg);height:68vw;max-height:980px}.index-hero,.other-hero{max-width:2448px;width:170vw}.other-hero{background-image:url(/images/backgrounds/bg-hero.svg)}.bg-footer-cta{background-image:url(/images/backgrounds/bg-footer-cta.svg);width:2448px}.quickstart-bg{background-image:url(/images/backgrounds/bg-quick-start.svg);height:40vw;top:220px;width:170vw}hr{background:#f1f6f9;border:0;display:block;height:4px;margin:0;width:100%}hr.is-small{height:2px}hr.is-large{height:8px}hr.is-medium{background:#d6dbdf}hr.is-dark{background:#495057}hr.is-yellow{background:linear-gradient(90deg,#ff8c00,#ff8c00 8px,#fc0 16px,rgba(255,204,0,0));-webkit-clip-path:polygon(8px 100%,0 100%,0 0,8px 0,8px 100%,16px 100%,16px 0,100% 0,100% 100%);clip-path:polygon(8px 100%,0 100%,0 0,8px 0,8px 100%,16px 100%,16px 0,100% 0,100% 100%);height:8px}.icon{display:block;height:48px;margin-bottom:24px;-o-object-fit:contain;object-fit:contain;-o-object-position:center;object-position:center}@media screen and (min-width:576px){.icon{height:64px}}@media screen and (min-width:980px){.icon{height:80px}}img{max-width:100%}.kicker{color:#6c757d;font-family:Hind Siliguri,sans-serif;font-size:.875rem;font-weight:600;letter-spacing:1px;margin:0}@media screen and 
(max-width:978.98px){.lead{font-size:1.125rem}}.logo{display:block;height:36px;max-width:220px;-o-object-fit:contain;object-fit:contain;-o-object-position:center;object-position:center;width:100%}.navbar-clickhouse{border-bottom:4px solid #f1f6f9;height:142px}.navbar-clickhouse>.container{flex-wrap:wrap}.navbar-super{flex-shrink:0;width:100%}.navbar-super ul{list-style:none}.navbar-super li:not(:last-child){margin-bottom:0;margin-right:24px}.navbar-super a{align-items:center;color:#212529;display:flex;font-size:.875rem}.navbar-super a:active,.navbar-super a:hover{color:#257af4;text-decoration:none}.navbar-super img{flex-shrink:0;margin-right:4px}.navbar-brand-clickhouse{background:no-repeat url(../images/logo-clickhouse.svg);background-size:contain;flex-shrink:0;height:28px;margin-right:48px;padding:0;width:180px}.navbar-nav{align-items:center;height:46px}.navbar .nav-item:not(:last-child){margin-bottom:0;margin-right:24px}.navbar .nav-link{color:#212529}.navbar .nav-link:active,.navbar .nav-link:hover{color:#257af4}.navbar .navbar-nav{flex-direction:row}@media screen and (max-width:978.98px){.navbar>.container{padding-left:20px;padding-right:20px}.navbar .navbar-toggler{height:24px;padding:0;width:24px}.navbar .navbar-toggler:focus{outline:none}.navbar .navbar-toggler-icon{background:no-repeat url(../images/icons/icon-menu.svg);background-position:50%;background-size:contain;height:24px;width:24px}.navbar .navbar-collapse{background:#fff;border-bottom:4px solid #f1f6f9;height:56px;left:0;padding:0 20px 16px;position:absolute;right:0;top:100%}.navbar .nav-link{font-size:.875rem;white-space:nowrap}}@media screen and (max-width:615.98px){.navbar .navbar-collapse{height:auto}.navbar .navbar-nav{flex-direction:column;height:auto}.navbar .nav-item:not(:last-child){margin-bottom:16px;margin-right:0}}@media screen and (max-width:399.98px){.navbar{height:80px}}@media screen and (min-width:616px){.navbar.py-1+div .anchor-fixer :target{scroll-margin-top:62px}}@media screen and (min-width:616px){.navbar.py-2+div .anchor-fixer :target{scroll-margin-top:78px}}@media screen and (min-width:616px){.navbar.py-3+div .anchor-fixer :target{scroll-margin-top:94px}}@media screen and (min-width:616px){.navbar.py-4+div .anchor-fixer :target{scroll-margin-top:110px}}@media screen and (min-width:616px){.navbar.py-5+div .anchor-fixer :target{scroll-margin-top:126px}}@media screen and (min-width:616px){.navbar.py-6+div .anchor-fixer :target{scroll-margin-top:142px}}@media screen and (min-width:616px){.navbar.py-7+div .anchor-fixer :target{scroll-margin-top:158px}}@media screen and (min-width:616px){.navbar.py-8+div .anchor-fixer :target{scroll-margin-top:174px}}@media screen and (max-width:615.98px){.navbar+div .anchor-fixer :target{scroll-margin-top:73px}}@media screen and (max-width:399.98px){.navbar+div .anchor-fixer :target{scroll-margin-top:80px}}.page,.photo-frame{overflow:hidden;width:100%}.photo-frame{background:hsla(0,0%,100%,.6);border-radius:100%;box-shadow:0 8px 20px rgba(108,117,125,.2);display:block;margin-bottom:24px;max-width:160px;position:relative}.photo-frame:before{content:"";display:block;padding-bottom:100%;width:100%}.photo-frame img{display:block;height:100%;left:0;-o-object-fit:contain;object-fit:contain;-o-object-position:center;object-position:center;position:absolute;top:0;width:100%}.pullquote{position:relative;width:70%}.pullquote:before{background:no-repeat 
url(../images/backgrounds/bg-quotes.svg);background-position:50%;background-size:100%;content:"";mix-blend-mode:multiply;right:56px;width:calc(100% - 16px);z-index:2}.pullquote-bg,.pullquote:before{bottom:0;display:block;position:absolute;top:0}.pullquote-bg{right:0;width:calc(50vw + 28.57143%);z-index:1}.pullquote-body{padding:64px 40px 64px 0;position:relative;z-index:3}.pullquote-quote{font-family:Hind Siliguri,sans-serif;font-size:32px;font-weight:700}.pullquote-citation{font-size:1.125rem}.section{overflow:visible;position:relative}.section,.section-bg{background-repeat:no-repeat;background-position:50%;background-size:cover}.section-bg{display:block;height:100%;left:50%;position:absolute;top:0;transform:translateX(-50%);z-index:1}.section>.container{position:relative;z-index:2}.severity-table th{background:#f1f6f9;font-size:.875rem;padding:8px 16px}.severity-table td{border-top:1px solid #d6dbdf;padding:16px}.social-icons{align-items:center;display:flex}.social-icons>a{aspect-ratio:24/24;background:#6c757d;display:block;height:24px;width:24px;-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;transition:background .2s}.social-icons>a:active,.social-icons>a:hover{background:#212529}.social-icons>a+a{margin-left:32px}.social-icons-facebook{-webkit-mask-image:url(/images/icons/icon-facebook-gray.svg);mask-image:url(/images/icons/icon-facebook-gray.svg)}.social-icons-twitter{-webkit-mask-image:url(/images/icons/icon-twitter-gray.svg);mask-image:url(/images/icons/icon-twitter-gray.svg);width:31px}.social-icons-linkedin{-webkit-mask-image:url(/images/icons/icon-linkedin-gray.svg);mask-image:url(/images/icons/icon-linkedin-gray.svg)}.social-icons-linkedin-alt{-webkit-mask-image:url(/images/icons/icon-linkedin-alt-gray.svg);mask-image:url(/images/icons/icon-linkedin-alt-gray.svg)}.social-icons.size-small>a{height:20px;width:20px}.social-icons.size-small>a:active,.social-icons.size-small>a:hover{background:#212529}.social-icons.size-small>a+a{margin-left:16px}.tabs{position:relative}.tabs:before{background:#fff;border-radius:7px 7px 0 0;content:"";display:block;height:8px;left:1px;position:absolute;right:1px;top:68px;z-index:10}@media screen and (min-width:1240px){.tabs:before{top:76px}}.tabs-body{background:#fff;border-radius:8px;border:1px solid #6c757d;box-shadow:0 8px 20px rgba(108,117,125,.2);padding:24px}@media screen and (min-width:980px){.tabs-body{padding:32px}}@media screen and (min-width:1240px){.tabs-body{padding:40px}}.tabs .nav-tabs{border-bottom:0;flex-wrap:nowrap;height:76px;margin:-20px -20px -9px;-webkit-mask-image:linear-gradient(90deg,transparent,#000 20px,#000 calc(100% - 20px),transparent);mask-image:linear-gradient(90deg,transparent,#000 20px,#000 calc(100% - 20px),transparent);overflow:scroll;overflow-x:scroll;overflow-y:visible;padding:20px 20px 0;position:relative}@media screen and (min-width:940px){.tabs .nav-tabs{overflow:visible}}@media screen and (min-width:1240px){.tabs .nav-tabs{height:84px}}.tabs .nav-link{align-items:center;border-bottom:0;color:#6c757d;display:flex;font-size:.875rem;flex-shrink:0;height:56px;justify-content:center;padding:0 12px 8px;text-align:center;white-space:nowrap}@media screen and (min-width:1240px){.tabs .nav-link{height:64px;padding:0 16px 8px}}.tabs .nav-link.active{background:#fff;box-shadow:0 -4px 8px rgba(108,117,125,.1);font-weight:700;padding:0 16px 8px}@media screen and (min-width:980px){.tabs .nav-link.active{padding:0 24px 
8px}}@media screen and (min-width:1240px){.tabs .nav-link.active{padding:0 32px 8px}}.tab-pane pre{background:#212529;border-radius:16px;color:#fff;padding:24px 16px}@media screen and (min-width:1240px){.tab-pane pre{padding:32px 24px}}.trailing-link{align-items:center;color:#212529;display:flex;font-size:.875rem;font-weight:700}.trailing-link:after{background:no-repeat url(../images/icons/icon-arrow.svg);background-position:100%;background-size:contain;content:"";display:block;height:12px;transition:transform .2s;width:20px}.trailing-link:active,.trailing-link:hover{color:#212529;text-decoration:none}.trailing-link:active:after,.trailing-link:hover:after{transform:translateX(8px)}.trailing-link.span-full:after{margin-left:auto}ul{list-style-type:square;padding-left:1.25em}ul li:not(:last-child){margin-bottom:16px}ul li::marker{color:#ff3939}ul.has-separators{list-style:none;padding:0}ul.has-separators li:not(:last-child){border-bottom:4px solid #f1f6f9;margin-bottom:24px;padding-bottom:24px}.bg-gradient-secondary{background-image:linear-gradient(58deg,#ff6443 3%,#fe561d 24%,#e32f0d 93%)}.bg-gradient-light-orange{background-image:linear-gradient(90deg,rgba(255,203,128,0),#ffcb80)}.bg-offset-right{bottom:0;left:-24px;position:absolute;top:0;width:calc(100vw + 24px);z-index:-1}@media screen and (min-width:1240px){.bg-offset-right{left:-96px;width:calc(100vw + 96px)}}.bg-inset-right{bottom:0;left:40px;position:absolute;top:0;width:calc(100vw - 40px);z-index:-1}@media screen and (min-width:980px){.bg-inset-right{left:96px;width:calc(100vw - 96px)}}.has-border-left{border-left:8px solid #f1f6f9;padding-left:16px}.font-xl{font-size:1.25rem}.font-lg{font-size:1.125rem}.font-sm{font-size:.875rem}.font-xs{font-size:.625rem}.font-weight-semibold{font-weight:600}.display-5{color:#212529;font-size:20px;font-weight:500}.display-6{color:#212529;font-size:14px;font-weight:700}.overflow-auto{overflow:auto}.text-decoration-underline{text-decoration:underline}.text-upper{text-transform:uppercase} \ No newline at end of file diff --git a/website/src/scss/_variables.scss b/website/src/scss/_variables.scss index d511c757055..55b06ac8409 100644 --- a/website/src/scss/_variables.scss +++ b/website/src/scss/_variables.scss @@ -289,6 +289,8 @@ $nav-tabs-link-active-border-color: $gray-700; $navbar-padding-y: 24px; $navbar-padding-x: 0; +$navbar-nav-height: 46px; +$navbar-height-xl: 80px; // Cards diff --git a/website/src/scss/components/_form.scss b/website/src/scss/components/_form.scss index 144b7e7301e..1f4aa09c32b 100644 --- a/website/src/scss/components/_form.scss +++ b/website/src/scss/components/_form.scss @@ -41,18 +41,6 @@ form { appearance: none; padding-right: 24px; white-space: pre-wrap; - - &[name=priority] { - height: 84px; - - @media screen and (max-width: 767.98px) { - height: 104px; - } - - @media screen and (max-width: 499.98px) { - height: 124px; - } - } } select.form-control:not([data-chosen]) { diff --git a/website/src/scss/components/_navbar.scss b/website/src/scss/components/_navbar.scss index 53a834d2ed7..ca6bc52630b 100644 --- a/website/src/scss/components/_navbar.scss +++ b/website/src/scss/components/_navbar.scss @@ -52,7 +52,7 @@ &-nav { align-items: center; - height: 46px; + height: $navbar-nav-height; } .nav-item:not(:last-child) { @@ -131,6 +131,35 @@ } @media screen and (max-width: 399.98px) { - height: 80px; + height: $navbar-height-xl; + } +} + + +.navbar { + @for $i from 1 through 8 { + &.py-#{$i} { + + div { + .anchor-fixer { + :target { + @media screen and (min-width: 616px) { 
+ scroll-margin-top: $navbar-nav-height + $spacer * $i * 2; + } + } + } + } + } + } + + div { + .anchor-fixer { + :target { + @media screen and (max-width: 615.98px) { + scroll-margin-top: 73px; + } + @media screen and (max-width: 399.98px) { + scroll-margin-top: $navbar-height-xl; + } + } + } } } diff --git a/website/src/scss/components/_severity-table.scss b/website/src/scss/components/_severity-table.scss new file mode 100644 index 00000000000..2c748a66083 --- /dev/null +++ b/website/src/scss/components/_severity-table.scss @@ -0,0 +1,12 @@ +.severity-table { + th { + background: $gray-100; + font-size: $font-size-sm; + padding: 8px 16px; + } + + td { + border-top: 1px solid $gray-500; + padding: 16px 16px; + } +} diff --git a/website/templates/docs/content.html b/website/templates/docs/content.html index 3f4db728e99..c2835dd1f39 100644 --- a/website/templates/docs/content.html +++ b/website/templates/docs/content.html @@ -1,4 +1,4 @@ -
+
{% if not single_page %} {% set ancestors = page.ancestors|reverse|list %} diff --git a/website/templates/global/banner.html b/website/templates/global/banner.html index 47763f98082..6a3e38b6e1a 100644 --- a/website/templates/global/banner.html +++ b/website/templates/global/banner.html @@ -1,6 +1,6 @@ diff --git a/website/templates/index/hero.html b/website/templates/index/hero.html index 83853dc1345..cfd21273092 100644 --- a/website/templates/index/hero.html +++ b/website/templates/index/hero.html @@ -1,23 +1,17 @@
- +

- ClickHouse v21.11 Released + ClickHouse v21.12 Released

{{ _('ClickHouse® is an open-source, high performance columnar OLAP database management system for real-time analytics using SQL.') }}

- -

- Read the Blog Post + Read the Blog Post

@@ -28,15 +22,15 @@
- +

ClickHouse Announces $250 Million in Funding

- -

Raising the Company’s Valuation to $2B

+ +

Raising the Company’s Valuation to $2B

diff --git a/website/templates/support/form.html b/website/templates/support/form.html index 14c153c7fde..b4ba3a68cdf 100644 --- a/website/templates/support/form.html +++ b/website/templates/support/form.html @@ -20,9 +20,9 @@
@@ -43,6 +43,35 @@
+ +

Severity Classification Level Definitions

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Severity Classification LevelOrganizational ImpactDefinition
Severity 1Critical Business ImpactClickHouse DBMS LTS or stable software release in production use is not functioning or is stopped or severely impacted so that Customer cannot reasonably continue use of DBMS and no Workaround is available
Severity 2Major Business ImpactClickHouse DBMS LTS or stable software release is functioning inconsistently causing significantly impaired Customer usage and productivity, such as periodic work stoppages and feature crashes
Severity 3Minor Business Impact or General QuestionsClickHouse DBMS LTS or stable software release is functioning inconsistently causing slightly impaired Customer usage and productivity but Customer can work around such inconsistency or impairment, or Customer has a question or enhancement for the DBMS or ClickHouse not determined an Error