Merge branch 'master' into clickhouse-test-unknown

Commit 1daf469799, authored by mergify[bot] on 2021-12-21 10:55:11 +00:00, committed by GitHub.
659 changed files with 16348 additions and 4392 deletions

@@ -8,6 +8,10 @@
 name: Docker Container Scan (clickhouse-server)
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
 "on":
   pull_request:
     paths:
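The `PYTHONUNBUFFERED: 1` being added to every workflow below forces Python's stdout/stderr to flush immediately, so CI logs stream in real time. A quick way to see the difference locally (a sketch, not part of the diff):

```bash
# Piped stdout is block-buffered: all three numbers appear together after ~3 s.
python3 -c 'import time
for i in range(3): print(i); time.sleep(1)' | cat

# With the variable set, each number appears as soon as it is printed.
PYTHONUNBUFFERED=1 python3 -c 'import time
for i in range(3): print(i); time.sleep(1)' | cat
```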

@@ -1,4 +1,9 @@
 name: CherryPick
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
 concurrency:
   group: cherry-pick
 on: # yamllint disable-line rule:truthy
@@ -8,18 +13,24 @@ jobs:
   CherryPick:
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Set envs
+        # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/cherry_pick
+          ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
+          ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
+          RCSK
+          REPO_OWNER=ClickHouse
+          REPO_NAME=ClickHouse
+          REPO_TEAM=core
+          EOF
       - name: Check out repository code
         uses: actions/checkout@v2
         with:
           token: ${{secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN}}
           fetch-depth: 0
       - name: Cherry pick
-        env:
-          TEMP_PATH: ${{runner.temp}}/cherry_pick
-          ROBOT_CLICKHOUSE_SSH_KEY: ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
-          REPO_OWNER: "ClickHouse"
-          REPO_NAME: "ClickHouse"
-          REPO_TEAM: "core"
         run: |
           sudo pip install GitPython
           cd $GITHUB_WORKSPACE/tests/ci
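The new "Set envs" step replaces per-step `env:` maps with a single write to `$GITHUB_ENV`, including a nested heredoc (`ROBOT_CLICKHOUSE_SSH_KEY<<RCSK`) for the multiline secret. A standalone sketch of the mechanism, with illustrative names:

```bash
# Quoting 'EOF' stops the shell from expanding anything here;
# GitHub substitutes ${{ ... }} expressions before the script runs.
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=/tmp/demo
MULTILINE_VALUE<<DELIM
first line
second line
DELIM
EOF
# Every later step in the same job then sees $TEMP_PATH and
# $MULTILINE_VALUE as ordinary environment variables.
```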

@@ -1,4 +1,9 @@
 name: BackportPR
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
 on: # yamllint disable-line rule:truthy
   push:
     branches:
@@ -7,6 +12,9 @@ jobs:
   DockerHubPush:
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Images check
@@ -22,17 +30,23 @@ jobs:
     needs: [BuilderDebRelease]
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/compatibility_check
+          REPO_COPY=${{runner.temp}}/compatibility_check/ClickHouse
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          EOF
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Download json reports
         uses: actions/download-artifact@v2
         with:
-          path: ${{runner.temp}}/reports_dir
+          path: ${{ env.REPORTS_PATH }}
       - name: CompatibilityCheck
-        env:
-          TEMP_PATH: ${{runner.temp}}/compatibility_check
-          REPO_COPY: ${{runner.temp}}/compatibility_check/ClickHouse
-          REPORTS_PATH: ${{runner.temp}}/reports_dir
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -51,24 +65,30 @@ jobs:
     needs: [DockerHubPush]
     runs-on: [self-hosted, builder]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          CHECK_NAME=ClickHouse build check (actions)
+          BUILD_NAME=package_release
+          EOF
       - name: Download changed images
         uses: actions/download-artifact@v2
         with:
           name: changed_images
-          path: ${{ runner.temp }}/images_path
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
         with:
-          submodules: 'recursive'
+          submodules: 'true'
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
-        env:
-          TEMP_PATH: ${{runner.temp}}/build_check
-          IMAGES_PATH: ${{runner.temp}}/images_path
-          REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
-          CACHES_PATH: ${{runner.temp}}/../ccaches
-          CHECK_NAME: 'ClickHouse build check (actions)'
-          BUILD_NAME: 'package_release'
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -78,35 +98,41 @@ jobs:
         uses: actions/upload-artifact@v2
         with:
           name: ${{ env.BUILD_NAME }}
-          path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
       - name: Cleanup
         if: always()
         run: |
           docker kill $(docker ps -q) ||:
           docker rm -f $(docker ps -a -q) ||:
-          sudo rm -fr $TEMP_PATH
+          sudo rm -fr $TEMP_PATH $CACHES_PATH
   BuilderDebAsan:
     needs: [DockerHubPush]
     runs-on: [self-hosted, builder]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          CHECK_NAME=ClickHouse build check (actions)
+          BUILD_NAME=package_asan
+          EOF
       - name: Download changed images
         uses: actions/download-artifact@v2
         with:
           name: changed_images
-          path: ${{ runner.temp }}/images_path
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
         with:
-          submodules: 'recursive'
+          submodules: 'true'
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
-        env:
-          TEMP_PATH: ${{runner.temp}}/build_check
-          IMAGES_PATH: ${{runner.temp}}/images_path
-          REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
-          CACHES_PATH: ${{runner.temp}}/../ccaches
-          CHECK_NAME: 'ClickHouse build check (actions)'
-          BUILD_NAME: 'package_asan'
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -116,35 +142,41 @@ jobs:
         uses: actions/upload-artifact@v2
         with:
           name: ${{ env.BUILD_NAME }}
-          path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
       - name: Cleanup
         if: always()
         run: |
           docker kill $(docker ps -q) ||:
           docker rm -f $(docker ps -a -q) ||:
-          sudo rm -fr $TEMP_PATH
+          sudo rm -fr $TEMP_PATH $CACHES_PATH
   BuilderDebTsan:
     needs: [DockerHubPush]
     runs-on: [self-hosted, builder]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          CHECK_NAME=ClickHouse build check (actions)
+          BUILD_NAME=package_tsan
+          EOF
       - name: Download changed images
         uses: actions/download-artifact@v2
         with:
           name: changed_images
-          path: ${{ runner.temp }}/images_path
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
         with:
-          submodules: 'recursive'
+          submodules: 'true'
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
-        env:
-          TEMP_PATH: ${{runner.temp}}/build_check
-          IMAGES_PATH: ${{runner.temp}}/images_path
-          REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
-          CACHES_PATH: ${{runner.temp}}/../ccaches
-          CHECK_NAME: 'ClickHouse build check (actions)'
-          BUILD_NAME: 'package_tsan'
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -154,35 +186,41 @@ jobs:
         uses: actions/upload-artifact@v2
         with:
           name: ${{ env.BUILD_NAME }}
-          path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
       - name: Cleanup
         if: always()
         run: |
           docker kill $(docker ps -q) ||:
           docker rm -f $(docker ps -a -q) ||:
-          sudo rm -fr $TEMP_PATH
+          sudo rm -fr $TEMP_PATH $CACHES_PATH
   BuilderDebDebug:
     needs: [DockerHubPush]
     runs-on: [self-hosted, builder]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          CHECK_NAME=ClickHouse build check (actions)
+          BUILD_NAME=package_debug
+          EOF
       - name: Download changed images
         uses: actions/download-artifact@v2
         with:
           name: changed_images
-          path: ${{ runner.temp }}/images_path
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
         with:
-          submodules: 'recursive'
+          submodules: 'true'
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
-        env:
-          TEMP_PATH: ${{runner.temp}}/build_check
-          IMAGES_PATH: ${{runner.temp}}/images_path
-          REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
-          CACHES_PATH: ${{runner.temp}}/../ccaches
-          CHECK_NAME: 'ClickHouse build check (actions)'
-          BUILD_NAME: 'package_debug'
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -192,13 +230,13 @@ jobs:
         uses: actions/upload-artifact@v2
         with:
           name: ${{ env.BUILD_NAME }}
-          path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
       - name: Cleanup
         if: always()
         run: |
           docker kill $(docker ps -q) ||:
           docker rm -f $(docker ps -a -q) ||:
-          sudo rm -fr $TEMP_PATH
+          sudo rm -fr $TEMP_PATH $CACHES_PATH
 ############################################################################################
 ##################################### BUILD REPORTER #######################################
 ############################################################################################
@@ -210,17 +248,23 @@ jobs:
     - BuilderDebDebug
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/report_check
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=ClickHouse build check (actions)
+          EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
         with:
-          path: ${{runner.temp}}/reports_dir
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Report Builder
-        env:
-          TEMP_PATH: ${{runner.temp}}/report_check
-          REPORTS_PATH: ${{runner.temp}}/reports_dir
-          CHECK_NAME: 'ClickHouse build check (actions)'
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -239,19 +283,25 @@ jobs:
     needs: [BuilderDebAsan]
     runs-on: [self-hosted, func-tester]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (address, actions)
+          REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
+          KILL_TIMEOUT=10800
+          EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
         with:
-          path: ${{runner.temp}}/reports_dir
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
      - name: Functional test
-        env:
-          TEMP_PATH: ${{runner.temp}}/stateless_debug
-          REPORTS_PATH: ${{runner.temp}}/reports_dir
-          CHECK_NAME: 'Stateless tests (address, actions)'
-          REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse
-          KILL_TIMEOUT: 10800
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -271,19 +321,25 @@ jobs:
     needs: [BuilderDebDebug]
     runs-on: [self-hosted, func-tester]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateful_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateful tests (debug, actions)
+          REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse
+          KILL_TIMEOUT=3600
+          EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
         with:
-          path: ${{runner.temp}}/reports_dir
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Functional test
-        env:
-          TEMP_PATH: ${{runner.temp}}/stateful_debug
-          REPORTS_PATH: ${{runner.temp}}/reports_dir
-          CHECK_NAME: 'Stateful tests (debug, actions)'
-          REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse
-          KILL_TIMEOUT: 3600
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -301,20 +357,30 @@ jobs:
 ##############################################################################################
   StressTestTsan:
     needs: [BuilderDebTsan]
-    runs-on: [self-hosted, stress-tester]
+    # func testers have 16 cores + 128 GB memory
+    # while stress testers have 36 cores + 72 memory
+    # It would be better to have something like 32 + 128,
+    # but such servers almost unavailable as spot instances.
+    runs-on: [self-hosted, func-tester]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stress_thread
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stress test (thread, actions)
+          REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse
+          EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
         with:
-          path: ${{runner.temp}}/reports_dir
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Stress test
-        env:
-          TEMP_PATH: ${{runner.temp}}/stress_thread
-          REPORTS_PATH: ${{runner.temp}}/reports_dir
-          CHECK_NAME: 'Stress test (thread, actions)'
-          REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -334,18 +400,24 @@ jobs:
     needs: [BuilderDebRelease]
     runs-on: [self-hosted, stress-tester]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/integration_tests_release
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Integration tests (release, actions)
+          REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse
+          EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
         with:
-          path: ${{runner.temp}}/reports_dir
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Integration test
-        env:
-          TEMP_PATH: ${{runner.temp}}/integration_tests_release
-          REPORTS_PATH: ${{runner.temp}}/reports_dir
-          CHECK_NAME: 'Integration tests (release, actions)'
-          REPO_COPY: ${{runner.temp}}/integration_tests_release/ClickHouse
         run: |
           sudo rm -fr $TEMP_PATH
           mkdir -p $TEMP_PATH
@@ -369,6 +441,9 @@ jobs:
     - CompatibilityCheck
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Finish label
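A condensed sketch of the Cleanup steps repeated throughout this workflow; the `||:` suffix (shorthand for `|| true`) is what keeps the step green when no containers are running:

```bash
# docker kill/rm exit non-zero when handed an empty ID list,
# so the no-op fallback swallows that expected failure.
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
```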

@@ -1,4 +1,9 @@
 name: Cancel
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
 on: # yamllint disable-line rule:truthy
   workflow_run:
     workflows: ["CIGithubActions", "ReleaseCI", "DocsCheck", "BackportPR"]

@@ -1,4 +1,9 @@
 name: DocsCheck
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
 on: # yamllint disable-line rule:truthy
   pull_request:
     types:
@@ -14,6 +19,9 @@ jobs:
   CheckLabels:
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Clear repository
+        run: |
+          sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Labels check
@@ -24,6 +32,9 @@ jobs:
     needs: CheckLabels
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Clear repository
+        run: |
+          sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Images check
@@ -39,17 +50,23 @@ jobs:
     needs: DockerHubPush
     runs-on: [self-hosted, func-tester]
     steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/docs_check
+          REPO_COPY=${{runner.temp}}/docs_check/ClickHouse
+          EOF
       - name: Download changed images
         uses: actions/download-artifact@v2
         with:
           name: changed_images
-          path: ${{ runner.temp }}/docs_check
+          path: ${{ env.TEMP_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Docs Check
-        env:
-          TEMP_PATH: ${{runner.temp}}/docs_check
-          REPO_COPY: ${{runner.temp}}/docs_check/ClickHouse
         run: |
           cp -r $GITHUB_WORKSPACE $TEMP_PATH
           cd $REPO_COPY/tests/ci

[File diff suppressed because it is too large]

[File diff suppressed because it is too large]

@@ -1,4 +1,9 @@
 name: DocsReleaseChecks
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
 concurrency:
   group: master-release
   cancel-in-progress: true
@@ -11,10 +16,15 @@ on: # yamllint disable-line rule:truthy
     - 'website/**'
     - 'benchmark/**'
     - 'docker/**'
+    - '.github/**'
+  workflow_dispatch:
 jobs:
   DockerHubPush:
     runs-on: [self-hosted, style-checker]
     steps:
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Images check
@@ -30,20 +40,31 @@ jobs:
     needs: DockerHubPush
     runs-on: [self-hosted, func-tester]
     steps:
+      - name: Set envs
+        # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/docs_release
+          REPO_COPY=${{runner.temp}}/docs_release/ClickHouse
+          CLOUDFLARE_TOKEN=${{secrets.CLOUDFLARE}}
+          ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
+          ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
+          RCSK
+          EOF
+      - name: Clear repository
+        run: |
+          sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
       - name: Check out repository code
         uses: actions/checkout@v2
       - name: Download changed images
         uses: actions/download-artifact@v2
         with:
           name: changed_images
-          path: ${{runner.temp}}/docs_release
+          path: ${{ env.TEMP_PATH }}
       - name: Docs Release
-        env:
-          TEMP_PATH: ${{runner.temp}}/docs_release
-          REPO_COPY: ${{runner.temp}}/docs_release/ClickHouse
-          CLOUDFLARE_TOKEN: ${{secrets.CLOUDFLARE}}
-          ROBOT_CLICKHOUSE_SSH_KEY: ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
         run: |
+          sudo rm -fr $TEMP_PATH
+          mkdir -p $TEMP_PATH
           cp -r $GITHUB_WORKSPACE $TEMP_PATH
           cd $REPO_COPY/tests/ci
           python3 docs_release.py

[File diff suppressed because it is too large]

.gitmodules (vendored)

@@ -247,3 +247,6 @@
 [submodule "contrib/sysroot"]
 	path = contrib/sysroot
 	url = https://github.com/ClickHouse-Extras/sysroot.git
+[submodule "contrib/azure"]
+	path = contrib/azure
+	url = https://github.com/ClickHouse-Extras/azure-sdk-for-cpp.git

@@ -1,4 +1,4 @@
-### ClickHouse release v21.12, 2021-12-13
+### ClickHouse release v21.12, 2021-12-15
 #### Backward Incompatible Change
@@ -71,8 +71,8 @@
 * Fix the issue that `LowCardinality` of `Int256` cannot be created. [#31832](https://github.com/ClickHouse/ClickHouse/pull/31832) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Recreate `system.*_log` tables in case of different engine/partition_by. [#31824](https://github.com/ClickHouse/ClickHouse/pull/31824) ([Azat Khuzhin](https://github.com/azat)).
 * `MaterializedMySQL`: Fix issue with table named 'table'. [#31781](https://github.com/ClickHouse/ClickHouse/pull/31781) ([Håvard Kvålen](https://github.com/havardk)).
-* ClickHouse dictionary source: support named collections. Closes [#31705](https://github.com/ClickHouse/ClickHouse/issues/31705). [#31749](https://github.com/ClickHouse/ClickHouse/pull/31749) ([Kseniia Sumarokova](https://github.com/kssenii)).
-* Allow to use named collections configuration for Kafka and RabbitMQ engines (the same way as for other integration table engines). [#31691](https://github.com/ClickHouse/ClickHouse/pull/31691) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* ClickHouse dictionary source: support predefined connections. Closes [#31705](https://github.com/ClickHouse/ClickHouse/issues/31705). [#31749](https://github.com/ClickHouse/ClickHouse/pull/31749) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Allow to use predefined connections configuration for Kafka and RabbitMQ engines (the same way as for other integration table engines). [#31691](https://github.com/ClickHouse/ClickHouse/pull/31691) ([Kseniia Sumarokova](https://github.com/kssenii)).
 * Always re-render prompt while navigating history in clickhouse-client. This will improve usability of manipulating very long queries that don't fit on screen. [#31675](https://github.com/ClickHouse/ClickHouse/pull/31675) ([alexey-milovidov](https://github.com/alexey-milovidov)) (author: Amos Bird).
 * Add key bindings for navigating through history (instead of lines/history). [#31641](https://github.com/ClickHouse/ClickHouse/pull/31641) ([Azat Khuzhin](https://github.com/azat)).
 * Improve the `max_execution_time` checks. Fixed some cases when timeout checks do not happen and query could run too long. [#31636](https://github.com/ClickHouse/ClickHouse/pull/31636) ([Raúl Marín](https://github.com/Algunenano)).

@@ -447,7 +447,7 @@ if (MAKE_STATIC_LIBRARIES)
         # It's disabled for ARM because otherwise ClickHouse cannot run on Android.
         set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-pie")
         set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -fno-pie")
-        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-no-pie")
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -no-pie -Wl,-no-pie")
     endif ()
 else ()
     set (CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -508,6 +508,7 @@ include (cmake/find/hdfs3.cmake) # uses protobuf
 include (cmake/find/poco.cmake)
 include (cmake/find/curl.cmake)
 include (cmake/find/s3.cmake)
+include (cmake/find/blob_storage.cmake)
 include (cmake/find/base64.cmake)
 include (cmake/find/parquet.cmake)
 include (cmake/find/simdjson.cmake)
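For context on the `-no-pie -Wl,-no-pie` change: the first flag is consumed by the compiler driver (so it stops passing `-pie` at link time), while the second is forwarded verbatim to the linker. A sketch of how the result could be verified (file names are illustrative):

```bash
echo 'int main() { return 0; }' > t.c
clang -fno-pie -no-pie -Wl,-no-pie t.c -o t
readelf -h t | grep Type   # expect "EXEC (Executable file)", not "DYN"
```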

@@ -2,7 +2,13 @@
 ClickHouse is an open project, and you can contribute to it in many ways. You can help with ideas, code, or documentation. We appreciate any efforts that help us to make the project better.
-Thank you.
+Thank you!
+
+## Legal Info
+
+When you open your first pull-request to ClickHouse repo, a bot will invite you to accept ClickHouse Individual CLA (Contributor License Agreement). It is a simple few click process. For subsequent pull-requests the bot will check if you have already signed it and won't bother you again.
+
+Optionally, to make contributions even more tight legally, your employer as a legal entity may want to sign a ClickHouse Corporate CLA with ClickHouse, Inc. If you're interested to do so, contact us at [legal@clickhouse.com](mailto:legal@clickhouse.com).
 ## Technical Info

@@ -82,7 +82,9 @@ PoolWithFailover::PoolWithFailover(
         unsigned default_connections_,
         unsigned max_connections_,
         size_t max_tries_,
-        uint64_t wait_timeout_)
+        uint64_t wait_timeout_,
+        size_t connect_timeout_,
+        size_t rw_timeout_)
     : max_tries(max_tries_)
     , shareable(false)
     , wait_timeout(wait_timeout_)
@@ -93,8 +95,8 @@ PoolWithFailover::PoolWithFailover(
         replicas_by_priority[0].emplace_back(std::make_shared<Pool>(database,
             host, user, password, port,
             /* socket_ = */ "",
-            MYSQLXX_DEFAULT_TIMEOUT,
-            MYSQLXX_DEFAULT_RW_TIMEOUT,
+            connect_timeout_,
+            rw_timeout_,
             default_connections_,
             max_connections_));
     }

@@ -6,6 +6,7 @@
 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS 1
 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS 16
 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES 3
+#define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT 5 /// in seconds
 namespace mysqlxx
@@ -121,7 +122,9 @@ namespace mysqlxx
             unsigned default_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS,
             unsigned max_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS,
             size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES,
-            uint64_t wait_timeout_ = UINT64_MAX);
+            uint64_t wait_timeout_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT,
+            size_t connect_timeout = MYSQLXX_DEFAULT_TIMEOUT,
+            size_t rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT);
         PoolWithFailover(const PoolWithFailover & other);

@@ -0,0 +1,30 @@
+option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES})
+
+option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY
+    "Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)"
+    ON)
+
+if (ENABLE_AZURE_BLOB_STORAGE)
+    set(USE_AZURE_BLOB_STORAGE 1)
+    set(AZURE_BLOB_STORAGE_LIBRARY azure_sdk)
+endif()
+
+if ((NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/sdk"
+    OR NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/cmake-modules")
+    AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY)
+    message (WARNING "submodule contrib/azure is missing. to fix try run: \n git submodule update --init")
+    set(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY OFF)
+    set(USE_AZURE_BLOB_STORAGE 0)
+endif ()
+
+if (NOT USE_INTERNAL_SSL_LIBRARY AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY)
+    message (FATAL_ERROR "Currently Blob Storage support can be built only with internal SSL library")
+endif()
+
+if (NOT USE_INTERNAL_CURL AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY)
+    message (FATAL_ERROR "Currently Blob Storage support can be built only with internal curl library")
+endif()
+
+if (USE_AZURE_BLOB_STORAGE)
+    message (STATUS "Using Azure Blob Storage - ${USE_AZURE_BLOB_STORAGE}")
+endif()
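A sketch of how the new option might be toggled at configure time (the build directory is illustrative):

```bash
# Bundled Azure SDK on (the default when ENABLE_LIBRARIES is on):
cmake -S . -B build -DENABLE_AZURE_BLOB_STORAGE=ON

# Or off, skipping the contrib/azure submodule entirely:
cmake -S . -B build -DENABLE_AZURE_BLOB_STORAGE=OFF
```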

@@ -249,6 +249,10 @@ endif()
 # - sentry-native
 add_subdirectory (curl-cmake)
+
+if (USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY)
+    add_subdirectory(azure-cmake)
+endif()
 if (USE_SENTRY)
     add_subdirectory (sentry-native-cmake)
 endif()

contrib/NuRaft (vendored)

@@ -1 +1 @@
-Subproject commit d10351f312c1ae1ca3fdda433693dfbef3acfece
+Subproject commit bb69d48e0ee35c87a0f19e509a09a914f71f0cff

contrib/azure (new vendored submodule)

@@ -0,0 +1 @@
+Subproject commit ac4b763d4ca40122275f1497cbdc5451337461d9

@@ -0,0 +1,74 @@
+set(AZURE_DIR "${ClickHouse_SOURCE_DIR}/contrib/azure")
+set(AZURE_SDK_LIBRARY_DIR "${AZURE_DIR}/sdk")
+
+file(GLOB AZURE_SDK_CORE_SRC
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/cryptography/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/http/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/http/curl/*.hpp"
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/http/curl/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/winhttp/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/io/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/src/private/*.hpp"
+)
+
+file(GLOB AZURE_SDK_IDENTITY_SRC
+    "${AZURE_SDK_LIBRARY_DIR}/identity/azure-identity/src/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/identity/azure-identity/src/private/*.hpp"
+)
+
+file(GLOB AZURE_SDK_STORAGE_COMMON_SRC
+    "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-common/src/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-common/src/private/*.cpp"
+)
+
+file(GLOB AZURE_SDK_STORAGE_BLOBS_SRC
+    "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-blobs/src/*.cpp"
+    "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-blobs/src/private/*.hpp"
+)
+
+file(GLOB AZURE_SDK_UNIFIED_SRC
+    ${AZURE_SDK_CORE_SRC}
+    ${AZURE_SDK_IDENTITY_SRC}
+    ${AZURE_SDK_STORAGE_COMMON_SRC}
+    ${AZURE_SDK_STORAGE_BLOBS_SRC}
+)
+
+set(AZURE_SDK_INCLUDES
+    "${AZURE_SDK_LIBRARY_DIR}/core/azure-core/inc/"
+    "${AZURE_SDK_LIBRARY_DIR}/identity/azure-identity/inc/"
+    "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-common/inc/"
+    "${AZURE_SDK_LIBRARY_DIR}/storage/azure-storage-blobs/inc/"
+)
+
+include("${AZURE_DIR}/cmake-modules/AzureTransportAdapters.cmake")
+
+add_library(azure_sdk ${AZURE_SDK_UNIFIED_SRC})
+
+if (COMPILER_CLANG)
+    target_compile_options(azure_sdk PRIVATE
+        -Wno-deprecated-copy-dtor
+        -Wno-extra-semi
+        -Wno-suggest-destructor-override
+        -Wno-inconsistent-missing-destructor-override
+        -Wno-error=unknown-warning-option
+    )
+
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
+        target_compile_options(azure_sdk PRIVATE -Wno-reserved-identifier)
+    endif()
+endif()
+
+# Originally, on Windows azure-core is built with bcrypt and crypt32 by default
+if (OPENSSL_FOUND)
+    target_link_libraries(azure_sdk PRIVATE ${OPENSSL_LIBRARIES})
+endif()
+
+# Originally, on Windows azure-core is built with winhttp by default
+if (CURL_FOUND)
+    target_link_libraries(azure_sdk PRIVATE ${CURL_LIBRARY})
+endif()
+
+target_link_libraries(azure_sdk PRIVATE ${LIBXML2_LIBRARIES})
+
+target_include_directories(azure_sdk SYSTEM PUBLIC ${AZURE_SDK_INCLUDES})

@@ -639,6 +639,7 @@ add_library(
     "${BORINGSSL_SOURCE_DIR}/decrepit/ssl/ssl_decrepit.c"
     "${BORINGSSL_SOURCE_DIR}/decrepit/cfb/cfb.c"
+    "${BORINGSSL_SOURCE_DIR}/decrepit/bio/base64_bio.c"
 )
 add_executable(

@@ -268,7 +268,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
  *
  * Whether iconv support is available
  */
-#if 1
+#if 0
 #define LIBXML_ICONV_ENABLED
 #endif

debian/rules (vendored)

@@ -45,6 +45,10 @@ ifdef DEB_CXX
 ifeq ($(DEB_BUILD_GNU_TYPE),$(DEB_HOST_GNU_TYPE))
 CC := $(DEB_CC)
 CXX := $(DEB_CXX)
+else ifeq (clang,$(findstring clang,$(DEB_CXX)))
+# If we crosscompile with clang, it knows what to do
+CC := $(DEB_CC)
+CXX := $(DEB_CXX)
 else
 CC := $(DEB_HOST_GNU_TYPE)-$(DEB_CC)
 CXX := $(DEB_HOST_GNU_TYPE)-$(DEB_CXX)
@@ -77,10 +81,6 @@ else
 THREADS_COUNT = 1
 endif
-ifneq ($(THREADS_COUNT),)
-THREADS_COUNT:=-j$(THREADS_COUNT)
-endif
-
 %:
 	dh $@ $(DH_FLAGS) --buildsystem=cmake
@@ -89,11 +89,11 @@ override_dh_auto_configure:
 override_dh_auto_build:
 	# Fix for ninja. Do not add -O.
-	$(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) $(MAKE_TARGET)
+	$(MAKE) -j$(THREADS_COUNT) -C $(BUILDDIR) $(MAKE_TARGET)
 override_dh_auto_test:
 ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
-	cd $(BUILDDIR) && ctest $(THREADS_COUNT) -V
+	cd $(BUILDDIR) && ctest -j$(THREADS_COUNT) -V
 endif
 override_dh_clean:
@@ -120,7 +120,7 @@ override_dh_install:
 	dh_install --list-missing --sourcedir=$(DESTDIR)
 override_dh_auto_install:
-	env DESTDIR=$(DESTDIR) $(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) install
+	env DESTDIR=$(DESTDIR) $(MAKE) -j$(THREADS_COUNT) -C $(BUILDDIR) install
 override_dh_shlibdeps:
 	true # We depend only on libc and dh_shlibdeps gives us wrong (too strict) dependency.
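The rules change stops baking `-j` into `THREADS_COUNT` and instead spells it out at each call site, so the variable stays a plain number. The resulting pattern, sketched outside of make (the path and target are illustrative):

```bash
THREADS_COUNT=$(nproc)                      # just a number, e.g. 8
make -j"$THREADS_COUNT" -C build my_target  # -j supplied where make is invoked
ctest -j"$THREADS_COUNT" -V                 # ctest reuses the same count
```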

@@ -24,40 +24,34 @@ RUN apt-get update \
     && apt-key add /tmp/llvm-snapshot.gpg.key \
     && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \
     && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \
-        /etc/apt/sources.list
+        /etc/apt/sources.list \
+    && apt-get clean
 # initial packages
-RUN apt-get update \
-    && apt-get install \
-    bash \
-    fakeroot \
-    ccache \
-    curl \
-    software-properties-common \
-    --yes --no-install-recommends
 RUN apt-get update \
     && apt-get install \
     bash \
     build-essential \
     ccache \
+    clang-${LLVM_VERSION} \
+    clang-tidy-${LLVM_VERSION} \
     cmake \
     curl \
+    fakeroot \
     gdb \
     git \
     gperf \
-    clang-${LLVM_VERSION} \
-    clang-tidy-${LLVM_VERSION} \
     lld-${LLVM_VERSION} \
     llvm-${LLVM_VERSION} \
     llvm-${LLVM_VERSION}-dev \
+    libicu-dev \
     moreutils \
     ninja-build \
     pigz \
     rename \
+    software-properties-common \
     tzdata \
-    --yes --no-install-recommends
+    --yes --no-install-recommends \
+    && apt-get clean
 # This symlink required by gcc to find lld compiler
 RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
@@ -66,7 +60,7 @@ ENV CC=clang-${LLVM_VERSION}
 ENV CXX=clang++-${LLVM_VERSION}
 # libtapi is required to support .tbh format from recent MacOS SDKs
-RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \
+RUN git clone --depth 1 https://github.com/tpoechtrager/apple-libtapi.git \
     && cd apple-libtapi \
     && INSTALLPREFIX=/cctools ./build.sh \
     && ./install.sh \
@@ -74,7 +68,7 @@ RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \
     && rm -rf apple-libtapi
 # Build and install tools for cross-linking to Darwin (x86-64)
-RUN git clone https://github.com/tpoechtrager/cctools-port.git \
+RUN git clone --depth 1 https://github.com/tpoechtrager/cctools-port.git \
     && cd cctools-port/cctools \
     && ./configure --prefix=/cctools --with-libtapi=/cctools \
         --target=x86_64-apple-darwin \
@@ -83,7 +77,7 @@ RUN git clone https://github.com/tpoechtrager/cctools-port.git \
     && rm -rf cctools-port
 # Build and install tools for cross-linking to Darwin (aarch64)
-RUN git clone https://github.com/tpoechtrager/cctools-port.git \
+RUN git clone --depth 1 https://github.com/tpoechtrager/cctools-port.git \
     && cd cctools-port/cctools \
     && ./configure --prefix=/cctools --with-libtapi=/cctools \
         --target=aarch64-apple-darwin \
@@ -97,7 +91,8 @@ RUN wget -nv https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacO
 # NOTE: Seems like gcc-11 is too new for ubuntu20 repository
 RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
     && apt-get update \
-    && apt-get install gcc-11 g++-11 --yes
+    && apt-get install gcc-11 g++-11 --yes \
+    && apt-get clean
 COPY build.sh /

@@ -64,8 +64,14 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
     && apt-get install gcc-11 g++-11 --yes
-# This symlink required by gcc to find lld compiler
-RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
+# These symlinks are required:
+# /usr/bin/ld.lld: by gcc to find lld compiler
+# /usr/bin/aarch64-linux-gnu-obj*: for debug symbols stripping
+RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld \
+    && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-strip \
+    && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-objcopy \
+    && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objdump /usr/bin/aarch64-linux-gnu-objdump
 COPY build.sh /

@@ -29,7 +29,13 @@ def pull_image(image_name):
         return False
 def build_image(image_name, filepath):
-    subprocess.check_call("docker build --network=host -t {} -f {} .".format(image_name, filepath), shell=True)
+    context = os.path.dirname(filepath)
+    subprocess.check_call(
+        "docker build --network=host -t {} -f {} {}".format(
+            image_name, filepath, context
+        ),
+        shell=True,
+    )
 def run_docker_image_with_env(image_name, output, env_variables, ch_root, ccache_dir, docker_image_version):
     env_part = " -e ".join(env_variables)
@@ -90,6 +96,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ
     elif is_cross_arm:
         cc = compiler[:-len(ARM_SUFFIX)]
         cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-aarch64.cmake")
+        result.append("DEB_ARCH_FLAG=-aarm64")
     elif is_cross_freebsd:
         cc = compiler[:-len(FREEBSD_SUFFIX)]
         cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/freebsd/toolchain-x86_64.cmake")
@@ -98,6 +105,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ
         cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake")
     else:
         cc = compiler
+        result.append("DEB_ARCH_FLAG=-aamd64")
     cxx = cc.replace('gcc', 'g++').replace('clang', 'clang++')
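The `build_image` fix passes the Dockerfile's own directory as the build context instead of `.`, so the build no longer depends on the caller's working directory. The equivalent shell invocation (image name and path are illustrative):

```bash
DOCKERFILE=docker/packager/deb/Dockerfile
docker build --network=host -t clickhouse/deb-builder \
    -f "$DOCKERFILE" "$(dirname "$DOCKERFILE")"
```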

@@ -111,19 +111,6 @@ function start_server
     fi
     echo "ClickHouse server pid '$server_pid' started and responded"
-    echo "
-set follow-fork-mode child
-handle all noprint
-handle SIGSEGV stop print
-handle SIGBUS stop print
-handle SIGABRT stop print
-continue
-thread apply all backtrace
-continue
-" > script.gdb
-    gdb -batch -command script.gdb -p "$server_pid" &
 }
 function clone_root
@@ -186,6 +173,8 @@ function clone_submodules
         contrib/dragonbox
         contrib/fast_float
         contrib/NuRaft
+        contrib/jemalloc
+        contrib/replxx
     )
     git submodule sync
@@ -206,6 +195,8 @@ function run_cmake
         "-DENABLE_THINLTO=0"
         "-DUSE_UNWIND=1"
         "-DENABLE_NURAFT=1"
+        "-DENABLE_JEMALLOC=1"
+        "-DENABLE_REPLXX=1"
     )
     # TODO remove this? we don't use ccache anyway. An option would be to download it

@@ -155,17 +155,43 @@ function fuzz
     kill -0 $server_pid
+    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
+    # and clickhouse-server can do fork-exec, for example, to run some bridge.
+    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
+    # explicitly ignore non-fatal signals that are used by server.
+    # Number of SIGRTMIN can be determined only in runtime.
+    RTMIN=$(kill -l SIGRTMIN)
     echo "
-set follow-fork-mode child
-handle all noprint
-handle SIGSEGV stop print
-handle SIGBUS stop print
-continue
-thread apply all backtrace
+set follow-fork-mode parent
+handle SIGHUP nostop noprint pass
+handle SIGINT nostop noprint pass
+handle SIGQUIT nostop noprint pass
+handle SIGPIPE nostop noprint pass
+handle SIGTERM nostop noprint pass
+handle SIGUSR1 nostop noprint pass
+handle SIGUSR2 nostop noprint pass
+handle SIG$RTMIN nostop noprint pass
+info signals
 continue
+backtrace full
+info locals
+info registers
+disassemble /s
+up
+info locals
+disassemble /s
+up
+info locals
+disassemble /s
+p \"done\"
+detach
+quit
 " > script.gdb
-    sudo gdb -batch -command script.gdb -p $server_pid &
+    gdb -batch -command script.gdb -p $server_pid &
+    sleep 5
+    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
+    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
     # Check connectivity after we attach gdb, because it might cause the server
     # to freeze and the fuzzer will fail.
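The same attach-and-probe pattern, reduced to its essentials (a sketch; the pid variable and probe query are illustrative):

```bash
RTMIN=$(kill -l SIGRTMIN)
cat > script.gdb <<EOF
set follow-fork-mode parent
handle SIGUSR1 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
continue
EOF
gdb -batch -command script.gdb -p "$server_pid" &
# gdb SIGSTOPs the process while loading debug info, so confirm the
# server answers before any timing-sensitive step that follows.
sleep 5
clickhouse-client --query "SELECT 1" ||:
```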

@@ -72,11 +72,13 @@ RUN python3 -m pip install \
     grpcio-tools \
     kafka-python \
     kazoo \
+    lz4 \
     minio \
     protobuf \
     psycopg2-binary==2.8.6 \
     pymongo==3.11.0 \
     pytest \
+    pytest-order==1.0.0 \
     pytest-timeout \
     pytest-xdist \
     pytest-repeat \
@@ -85,7 +87,8 @@ RUN python3 -m pip install \
     tzlocal==2.1 \
     urllib3 \
     requests-kerberos \
-    pyhdfs
+    pyhdfs \
+    azure-storage-blob
 COPY modprobe.sh /usr/local/bin/modprobe
 COPY dockerd-entrypoint.sh /usr/local/bin/

@@ -0,0 +1,13 @@
+version: '2.3'
+
+services:
+  azurite1:
+    image: mcr.microsoft.com/azure-storage/azurite
+    ports:
+      - "10000:10000"
+    volumes:
+      - data1-1:/data1
+    command: azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log
+
+volumes:
+  data1-1:
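A sketch of how the new Azurite service might be brought up and probed locally (the compose file name is illustrative; `devstoreaccount1` is Azurite's well-known development account):

```bash
docker-compose -f docker_compose_azurite.yml up -d
# Wait until the blob endpoint answers on the published port.
for _ in $(seq 1 30); do
    curl -s -o /dev/null "http://127.0.0.1:10000/devstoreaccount1" && break
    sleep 1
done
```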

@@ -8,8 +8,8 @@ echo '{
     "ip-forward": true,
     "log-level": "debug",
     "storage-driver": "overlay2",
-    "insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"],
-    "registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"]
+    "insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
+    "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
 }' | dd of=/etc/docker/daemon.json 2>/dev/null
 dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &

View File

@ -193,7 +193,7 @@ function run_tests
then then
# Run only explicitly specified tests, if any. # Run only explicitly specified tests, if any.
# shellcheck disable=SC2010 # shellcheck disable=SC2010
test_files=$(ls "$test_prefix" | grep "$CHPC_TEST_GREP" | xargs -I{} -n1 readlink -f "$test_prefix/{}") test_files=($(ls "$test_prefix" | grep "$CHPC_TEST_GREP" | xargs -I{} -n1 readlink -f "$test_prefix/{}"))
elif [ "$PR_TO_TEST" -ne 0 ] \ elif [ "$PR_TO_TEST" -ne 0 ] \
&& [ "$(wc -l < changed-test-definitions.txt)" -gt 0 ] \ && [ "$(wc -l < changed-test-definitions.txt)" -gt 0 ] \
&& [ "$(wc -l < other-changed-files.txt)" -eq 0 ] && [ "$(wc -l < other-changed-files.txt)" -eq 0 ]
@ -201,10 +201,26 @@ function run_tests
# If only the perf tests were changed in the PR, we will run only these # If only the perf tests were changed in the PR, we will run only these
# tests. The lists of changed files are prepared in entrypoint.sh because # tests. The lists of changed files are prepared in entrypoint.sh because
# it has the repository. # it has the repository.
test_files=$(sed "s/tests\/performance/${test_prefix//\//\\/}/" changed-test-definitions.txt) test_files=($(sed "s/tests\/performance/${test_prefix//\//\\/}/" changed-test-definitions.txt))
else else
# The default -- run all tests found in the test dir. # The default -- run all tests found in the test dir.
test_files=$(ls "$test_prefix"/*.xml) test_files=($(ls "$test_prefix"/*.xml))
fi
# We split perf tests into multiple checks to make them faster
if [ -v CHPC_TEST_RUN_BY_HASH_TOTAL ]; then
# filter tests array in bash https://stackoverflow.com/a/40375567
for index in "${!test_files[@]}"; do
# sorry for this, just calculating hash(test_name) % total_tests_group == my_test_group_num
test_hash_result=$(echo test_files[$index] | perl -ne 'use Digest::MD5 qw(md5); print unpack('Q', md5($_)) % $ENV{CHPC_TEST_RUN_BY_HASH_TOTAL} == $ENV{CHPC_TEST_RUN_BY_HASH_NUM};')
# BTW, for some reason when hash(test_name) % total_tests_group != my_test_group_num perl outputs nothing, not zero
if [ "$test_hash_result" != "1" ]; then
# deleting element from array
unset -v 'test_files[$index]'
fi
done
# to have sequential indexes...
test_files=("${test_files[@]}")
fi fi
# For PRs w/o changes in test definitons, test only a subset of queries, # For PRs w/o changes in test definitons, test only a subset of queries,
@ -212,21 +228,26 @@ function run_tests
# already set, keep those values. # already set, keep those values.
# #
# NOTE: too high CHPC_RUNS/CHPC_MAX_QUERIES may hit internal CI timeout. # NOTE: too high CHPC_RUNS/CHPC_MAX_QUERIES may hit internal CI timeout.
if [ "$PR_TO_TEST" -ne 0 ] && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ] # NOTE: Currently we disabled complete run even for master branch
then #if [ "$PR_TO_TEST" -ne 0 ] && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ]
#then
# CHPC_RUNS=${CHPC_RUNS:-7}
# CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10}
#else
# CHPC_RUNS=${CHPC_RUNS:-13}
# CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0}
#fi
CHPC_RUNS=${CHPC_RUNS:-7} CHPC_RUNS=${CHPC_RUNS:-7}
CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10} CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10}
else
CHPC_RUNS=${CHPC_RUNS:-13}
CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0}
fi
export CHPC_RUNS export CHPC_RUNS
export CHPC_MAX_QUERIES export CHPC_MAX_QUERIES
# Determine which concurrent benchmarks to run. For now, the only test # Determine which concurrent benchmarks to run. For now, the only test
# we run as a concurrent benchmark is 'website'. Run it as benchmark if we # we run as a concurrent benchmark is 'website'. Run it as benchmark if we
# are also going to run it as a normal test. # are also going to run it as a normal test.
for test in $test_files; do echo "$test"; done | sed -n '/website/p' > benchmarks-to-run.txt for test in ${test_files[@]}; do echo "$test"; done | sed -n '/website/p' > benchmarks-to-run.txt
# Delete old report files. # Delete old report files.
for x in {test-times,wall-clock-times}.tsv for x in {test-times,wall-clock-times}.tsv
@ -235,8 +256,8 @@ function run_tests
touch "$x" touch "$x"
done done
# Randomize test order. # Randomize test order. BTW, it's not an array no more.
test_files=$(for f in $test_files; do echo "$f"; done | sort -R) test_files=$(for f in ${test_files[@]}; do echo "$f"; done | sort -R)
# Limit profiling time to 10 minutes, not to run for too long. # Limit profiling time to 10 minutes, not to run for too long.
profile_seconds_left=600 profile_seconds_left=600
@@ -261,16 +282,24 @@ function run_tests

        # Use awk because bash doesn't support floating point arithmetic.
        profile_seconds=$(awk "BEGIN { print ($profile_seconds_left > 0 ? 10 : 0) }")

        (
            set +x
            argv=(
                --host localhost localhost
                --port "$LEFT_SERVER_PORT" "$RIGHT_SERVER_PORT"
                --runs "$CHPC_RUNS"
                --max-queries "$CHPC_MAX_QUERIES"
                --profile-seconds "$profile_seconds"
                "$test"
            )
            TIMEFORMAT=$(printf "$test_name\t%%3R\t%%3U\t%%3S\n")
            # one more subshell to suppress trace output for "set +x"
            (
                time "$script_dir/perf.py" "${argv[@]}" > "$test_name-raw.tsv" 2> "$test_name-err.log"
            ) 2>>wall-clock-times.tsv >/dev/null \
                || echo "Test $test_name failed with error code $?" >> "$test_name-err.log"
        ) 2>/dev/null

        profile_seconds_left=$(awk -F' ' \
            'BEGIN { s = '$profile_seconds_left'; } /^profile-total/ { s -= $2 } END { print s }' \

@@ -278,8 +307,6 @@ function run_tests

        current_test=$((current_test + 1))
    done

    wait
}
@@ -518,7 +545,9 @@ unset IFS

# all nodes.
numactl --show
numactl --cpunodebind=all --membind=all numactl --show

# Use fewer jobs to avoid OOM. Some queries can consume 8+ GB of memory.
jobs_count=$(($(grep -c ^processor /proc/cpuinfo) / 3))
numactl --cpunodebind=all --membind=all parallel --jobs $jobs_count --joblog analyze/parallel-log.txt --null < analyze/commands.txt 2>> analyze/errors.log

clickhouse-local --query "
-- Join the metric names back to the metric statistics we've calculated, and make
View File
@@ -16,16 +16,28 @@ right_sha=$4

datasets=${CHPC_DATASETS-"hits1 hits10 hits100 values"}

declare -A dataset_paths

if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then
    dataset_paths["hits10"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_10m_single/partitions/hits_10m_single.tar"
    dataset_paths["hits100"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_100m_single/partitions/hits_100m_single.tar"
    dataset_paths["hits1"]="https://clickhouse-datasets.s3.amazonaws.com/hits/partitions/hits_v1.tar"
    dataset_paths["values"]="https://clickhouse-datasets.s3.amazonaws.com/values_with_expressions/partitions/test_values.tar"
else
    dataset_paths["hits10"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar"
    dataset_paths["hits100"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar"
    dataset_paths["hits1"]="https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar"
    dataset_paths["values"]="https://clickhouse-datasets.s3.yandex.net/values_with_expressions/partitions/test_values.tar"
fi

function download
{
    # Historically there were various paths for the performance test package.
    # Test all of them.
    declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/$left_pr/$left_sha/performance/performance.tgz"
        "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/clickhouse_build_check/performance/performance.tgz"
    )
    for path in "${urls_to_try[@]}"
    do
        if curl --fail --head "$path"
        then
View File
@@ -4,6 +4,27 @@ set -ex

CHPC_CHECK_START_TIMESTAMP="$(date +%s)"
export CHPC_CHECK_START_TIMESTAMP

S3_URL=${S3_URL:="https://clickhouse-builds.s3.yandex.net"}
COMMON_BUILD_PREFIX="/clickhouse_build_check"
if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then
    COMMON_BUILD_PREFIX=""
fi

# Sometimes AWS responds with a DNS error, and it's impossible to retry it with
# the current curl version's options.
function curl_with_retry
{
    for _ in 1 2 3 4; do
        if curl --fail --head "$1"; then
            return 0
        else
            sleep 0.5
        fi
    done
    return 1
}

# Use the packaged repository to find the revision we will compare to.
function find_reference_sha
{

@@ -43,9 +64,12 @@ function find_reference_sha

    # Historically there were various paths for the performance test package;
    # test all of them.
    unset found
    declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/0/$REF_SHA/performance/performance.tgz"
        "https://clickhouse-builds.s3.yandex.net/0/$REF_SHA/clickhouse_build_check/performance/performance.tgz"
    )
    for path in "${urls_to_try[@]}"
    do
        if curl_with_retry "$path"
        then
            found="$path"
            break

@@ -65,14 +89,11 @@ chmod 777 workspace output

cd workspace

# Download the package for the version we are going to test.
if curl_with_retry "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
then
    right_path="$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
fi

mkdir right
wget -nv -nd -c "$right_path" -O- | tar -C right --strip-components=1 -zxv
View File
@@ -45,6 +45,7 @@ parser.add_argument('--runs', type=int, default=1, help='Number of query runs pe

parser.add_argument('--max-queries', type=int, default=None, help='Test no more than this number of queries, chosen at random.')
parser.add_argument('--queries-to-run', nargs='*', type=int, default=None, help='Space-separated list of indexes of queries to test.')
parser.add_argument('--max-query-seconds', type=int, default=15, help='For how many seconds at most a query is allowed to run. The script finishes with error if this time is exceeded.')
parser.add_argument('--prewarm-max-query-seconds', type=int, default=180, help='For how many seconds at most a prewarm (cold storage) query is allowed to run. The script finishes with error if this time is exceeded.')
parser.add_argument('--profile-seconds', type=int, default=0, help='For how many seconds to profile a query for which the performance has changed.')
parser.add_argument('--long', action='store_true', help='Do not skip the tests tagged as long.')
parser.add_argument('--print-queries', action='store_true', help='Print test queries and exit.')

@@ -284,7 +285,7 @@ for query_index in queries_to_run:

            # it makes the results unstable.
            res = c.execute(q, query_id = prewarm_id,
                settings = {
                    'max_execution_time': args.prewarm_max_query_seconds,
                    'query_profiler_real_time_period_ns': 10000000,
                    'memory_profiler_step': '4Mi',
                })
View File
@@ -128,14 +128,35 @@ function start()

    counter=$((counter + 1))
done

# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to the watchdog,
# and clickhouse-server can do fork-exec, for example, to run some bridge.
# Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang;
# explicitly ignore non-fatal signals that are used by the server.
# The number of SIGRTMIN can be determined only at runtime.
RTMIN=$(kill -l SIGRTMIN)
echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
backtrace full
info locals
info registers
disassemble /s
up
info locals
disassemble /s
up
info locals
disassemble /s
p \"done\"
detach
quit
" > script.gdb

@@ -143,7 +164,10 @@ quit

# FIXME Hung check may work incorrectly because of attached gdb
# 1. False positives are possible
# 2. We cannot attach another gdb to get stacktraces if some queries hung
gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
sleep 5
# gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT; wait for it (up to send_timeout, 300s)
time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
}

configure

@@ -214,6 +238,9 @@ zgrep -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.log* > /dev/n

zgrep -Fa "########################################" /test_output/* > /dev/null \
    && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv

zgrep -Fa " received signal " /test_output/gdb.log > /dev/null \
    && echo -e 'Found signal in gdb.log\tFAIL' >> /test_output/test_results.tsv

# Put logs into /test_output/
for log_file in /var/log/clickhouse-server/clickhouse-server.log*
do
View File
@@ -5,8 +5,8 @@ echo "Configure to use Yandex dockerhub-proxy"

mkdir -p /etc/docker/
cat > /etc/docker/daemon.json << EOF
{
    "insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
    "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
}
EOF
View File
@@ -106,20 +106,20 @@ Build ClickHouse. Run ClickHouse from the terminal: change directory to `program

Note that all clickhouse tools (server, client, etc) are just symlinks to a single binary named `clickhouse`. You can find this binary at `programs/clickhouse`. All tools can also be invoked as `clickhouse tool` instead of `clickhouse-tool`.

Alternatively you can install the ClickHouse package: either a stable release from the ClickHouse repository, or a package you build yourself with `./release` in the ClickHouse sources root. Then start the server with `sudo clickhouse start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`.

When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary:

``` bash
$ sudo clickhouse stop
$ sudo cp ./clickhouse /usr/bin/
$ sudo clickhouse start
```

Also you can stop the system clickhouse-server and run your own with the same configuration but with logging to the terminal:

``` bash
$ sudo clickhouse stop
$ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml
```

@@ -257,9 +257,9 @@ There are five variants (Debug, ASan, TSan, MSan, UBSan).

Thread Fuzzer (please don't mix it up with Thread Sanitizer) is another kind of fuzzing that randomizes the order of thread execution. It helps to find even more special cases.

## Security Audit

People from the Yandex Security Team did some basic overview of ClickHouse capabilities from the security standpoint.

## Static Analyzers {#static-analyzers}

@@ -326,15 +326,11 @@ There is automated check for flaky tests. It runs all new tests 100 times (for f

## Testflows

[Testflows](https://testflows.com/) is an enterprise-grade open-source testing framework, which is used to test a subset of ClickHouse.

## Test Automation {#test-automation}

We run tests with [GitHub Actions](https://github.com/features/actions).

Build jobs and tests are run in Sandbox on a per-commit basis. Resulting packages and test results are published in GitHub and can be downloaded by direct links. Artifacts are stored for several months. When you send a pull request on GitHub, we tag it as “can be tested” and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you.
View File
@@ -17,6 +17,7 @@ ClickHouse server works as MySQL replica. It reads binlog and performs DDL and D

``` sql
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster]
ENGINE = MaterializedMySQL('host:port', ['database' | database], 'user', 'password') [SETTINGS ...]
[TABLE OVERRIDE table1 (...), TABLE OVERRIDE table2 (...)]
```

**Engine Parameters**
@@ -109,15 +110,19 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([

- MySQL `DELETE` query is converted into `INSERT` with `_sign=-1`.
- MySQL `UPDATE` query is converted into `INSERT` with `_sign=-1` and `INSERT` with `_sign=1` if the primary key has been changed, or `INSERT` with `_sign=1` if not.

### Selecting from MaterializedMySQL Tables {#select}

`SELECT` queries from `MaterializedMySQL` tables have some specifics (see the sketch below):

- If `_version` is not specified in the `SELECT` query, the [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier is used, so only rows with `MAX(_version)` are returned for each primary key value.
- If `_sign` is not specified in the `SELECT` query, `WHERE _sign=1` is used by default, so the deleted rows are not included into the result set.
- The result includes column comments if they exist in the MySQL database tables.
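A minimal sketch of these defaults, assuming a `MaterializedMySQL` database named `mysql_db` with a table `t` (both names are illustrative):

```bash
# Implicit form: FINAL and WHERE _sign = 1 are applied automatically,
# so only the latest non-deleted row versions come back.
clickhouse-client --query "SELECT * FROM mysql_db.t"

# Referencing _version and _sign explicitly disables those defaults,
# making all row versions, including deleted ones, visible.
clickhouse-client --query "SELECT *, _version, _sign FROM mysql_db.t"
```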
@@ -125,15 +130,77 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([

MySQL `PRIMARY KEY` and `INDEX` clauses are converted into `ORDER BY` tuples in ClickHouse tables.

ClickHouse has only one physical order, which is determined by the `ORDER BY` clause. To create a new physical order, use [materialized views](../../sql-reference/statements/create/view.md#materialized).

**Notes**

- Rows with `_sign=-1` are not deleted physically from the tables.
- Cascade `UPDATE/DELETE` queries are not supported by the `MaterializedMySQL` engine, as they are not visible in the MySQL binlog.
- Replication can be easily broken.
- Manual operations on database and tables are forbidden.
- `MaterializedMySQL` is affected by the [optimize_on_insert](../../operations/settings/settings.md#optimize-on-insert) setting. Data is merged in the corresponding table in the `MaterializedMySQL` database when a table in the MySQL server changes.
### Table Overrides {#table-overrides}
Table overrides can be used to customize the ClickHouse DDL queries, allowing you to make schema optimizations for your
application. This is especially useful for controlling partitioning, which is important for the overall performance of
MaterializedMySQL.
```sql
CREATE DATABASE db_name ENGINE = MaterializedMySQL(...)
[SETTINGS ...]
[TABLE OVERRIDE table_name (
[COLUMNS (
[name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], ...]
[INDEX index_name1 expr1 TYPE type1(...) GRANULARITY value1, ...]
[PROJECTION projection_name_1 (SELECT <COLUMN LIST EXPR> [GROUP BY] [ORDER BY]), ...]
)]
[ORDER BY expr]
[PRIMARY KEY expr]
[PARTITION BY expr]
[SAMPLE BY expr]
[TTL expr]
), ...]
```
Example:
```sql
CREATE DATABASE db_name ENGINE = MaterializedMySQL(...)
TABLE OVERRIDE table1 (
COLUMNS (
userid UUID,
category LowCardinality(String),
timestamp DateTime CODEC(Delta, Default)
)
PARTITION BY toYear(timestamp)
),
TABLE OVERRIDE table2 (
COLUMNS (
ip_hash UInt32 MATERIALIZED xxHash32(client_ip),
client_ip String TTL created + INTERVAL 72 HOUR
)
SAMPLE BY ip_hash
)
```
The `COLUMNS` list is sparse; it contains only modified or extra (MATERIALIZED or ALIAS) columns. Modified columns with
a different type must be assignable from the original type. There is currently no validation of this or similar issues
when the `CREATE DATABASE` query executes, so extra care needs to be taken.
You may specify overrides for tables that do not exist yet.
!!! note "Warning"
It is easy to break replication with TABLE OVERRIDEs if not used with care. For example:
* If a column is added with a table override, but then later added to the source MySQL table, the converted ALTER TABLE
query in ClickHouse will fail because the column already exists.
* It is currently possible to add overrides that reference nullable columns where not-nullable are required, such as in
`ORDER BY` or `PARTITION BY`.
## Examples of Use {#examples-of-use}
View File
@@ -5,8 +5,7 @@ toc_title: HDFS

# HDFS {#table_engines-hdfs}

This engine provides integration with the [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) to be managed via ClickHouse. This engine is similar to the [File](../../../engines/table-engines/special/file.md#table_engines-file) and [URL](../../../engines/table-engines/special/url.md#table_engines-url) engines, but provides Hadoop-specific features.

## Usage {#usage}

@@ -14,12 +13,13 @@ to the [File](../../../engines/table-engines/special/file.md#table_engines-file)

ENGINE = HDFS(URI, format)
```

**Engine Parameters**

- `URI` - the whole file URI in HDFS. The path part of `URI` may contain globs; in this case the table would be read-only.
- `format` - specifies one of the available file formats. To perform `SELECT` queries, the format must be supported for input, and to perform `INSERT` queries, for output. The available formats are listed in the [Formats](../../../interfaces/formats.md#formats) section.

**Example:**
@@ -71,12 +71,12 @@ Constructions with `{}` are similar to the [remote](../../../sql-reference/table

1. Suppose we have several files in TSV format with the following URIs on HDFS:

    - 'hdfs://hdfs1:9000/some_dir/some_file_1'
    - 'hdfs://hdfs1:9000/some_dir/some_file_2'
    - 'hdfs://hdfs1:9000/some_dir/some_file_3'
    - 'hdfs://hdfs1:9000/another_dir/some_file_1'
    - 'hdfs://hdfs1:9000/another_dir/some_file_2'
    - 'hdfs://hdfs1:9000/another_dir/some_file_3'

1. There are several ways to make a table consisting of all six files, for example:
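    One hedged sketch uses glob patterns over both directories and file numbers (the column names are illustrative):

    ```bash
    # {some,another}_dir matches both directories, some_file_{1..3} the three files in each.
    clickhouse-client --query "
        CREATE TABLE hdfs_table (name String, value UInt32)
        ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV')"
    ```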
@@ -132,6 +132,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us

| **parameter** | **default value** |
| - | - |
| rpc\_client\_connect\_tcpnodelay | true |
| dfs\_client\_read\_shortcircuit | true |
| output\_replace-datanode-on-failure | true |
@@ -181,25 +182,26 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us

#### ClickHouse extras {#clickhouse-extras}

| **parameter** | **default value** |
| - | - |
|hadoop\_kerberos\_keytab | "" |
|hadoop\_kerberos\_principal | "" |
|hadoop\_kerberos\_kinit\_command | kinit |
|libhdfs3\_conf | "" |

### Limitations {#limitations}

* `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific

## Kerberos support {#kerberos-support}

If the `hadoop_security_authentication` parameter has the value `kerberos`, ClickHouse authenticates via Kerberos.
The parameters described [here](#clickhouse-extras), as well as `hadoop_security_kerberos_ticket_cache_path`, may be of help.
Note that due to libhdfs3 limitations, only the old-fashioned approach is supported: datanode communications are not secured by SASL (`HADOOP_SECURE_DN_USER` is a reliable indicator of this security approach). Use `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` for reference.

If `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` or `hadoop_kerberos_kinit_command` is specified, `kinit` will be invoked. `hadoop_kerberos_keytab` and `hadoop_kerberos_principal` are mandatory in this case. The `kinit` tool and krb5 configuration files are required.

## HDFS Namenode HA support {#namenode-ha}

libhdfs3 supports HDFS namenode HA.
View File
@@ -37,6 +37,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]

[rabbitmq_skip_broken_messages = N,]
[rabbitmq_max_block_size = N,]
[rabbitmq_flush_interval_ms = N,]
[rabbitmq_queue_settings_list = 'x-dead-letter-exchange=my-dlx,x-max-length=10,x-overflow=reject-publish']
```

Required parameters:
@@ -59,6 +60,7 @@ Optional parameters:

- `rabbitmq_skip_broken_messages` – RabbitMQ message parser tolerance to schema-incompatible messages per block. Default: `0`. If `rabbitmq_skip_broken_messages = N` then the engine skips *N* RabbitMQ messages that cannot be parsed (a message equals a row of data).
- `rabbitmq_max_block_size`
- `rabbitmq_flush_interval_ms`
- `rabbitmq_queue_settings_list` - allows setting RabbitMQ settings when creating a queue. Available settings: `x-max-length`, `x-max-length-bytes`, `x-message-ttl`, `x-expires`, `x-priority`, `x-max-priority`, `x-overflow`, `x-dead-letter-exchange`, `x-queue-type`. The `durable` setting is enabled automatically for the queue.
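A minimal sketch of a table that uses this setting; the host, exchange, and table names are illustrative assumptions:

```bash
# Declares the queue with a length limit and a dead-letter exchange.
clickhouse-client --query "
    CREATE TABLE rabbit_events (payload String)
    ENGINE = RabbitMQ
    SETTINGS rabbitmq_host_port = 'rabbitmq:5672',
             rabbitmq_exchange_name = 'events-exchange',
             rabbitmq_format = 'JSONEachRow',
             rabbitmq_queue_settings_list = 'x-max-length=10,x-dead-letter-exchange=my-dlx'"
```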
SSL connection:
View File
@@ -66,9 +66,9 @@ WHERE table = 'visits'

└───────────┴────────────────┴────────┘
```

The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md) queries.

The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query.

Let's break down the name of the first part: `201901_1_3_1`:
View File
@@ -8,24 +8,43 @@ toc_title: Distributed

Tables with Distributed engine do not store any data of their own, but allow distributed query processing on multiple servers.
Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any.

## Creating a Table {#distributed-creating-a-table}

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
    ...
) ENGINE = Distributed(cluster, database, table[, sharding_key[, policy_name]])
[SETTINGS name=value, ...]
```

### From a Table {#distributed-from-a-table}

When the `Distributed` table is pointing to a table on the current server you can adopt that table's schema:

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2 ENGINE = Distributed(cluster, database, table[, sharding_key[, policy_name]]) [SETTINGS name=value, ...]
```

**Distributed Parameters**

- `cluster` - the cluster name in the server's config file
- `database` - the name of a remote database
- `table` - the name of a remote table
- `sharding_key` - (optional) sharding key
- `policy_name` - (optional) policy name; it will be used to store temporary files for async send

**See Also**

- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting
- [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) for the examples

**Distributed Settings**

- `fsync_after_insert` - do the `fsync` for the file data after asynchronous insert to Distributed. Guarantees that the OS flushed the whole inserted data to a file **on the initiator node** disk.
@@ -59,24 +78,25 @@ Also, it accepts the following settings:

- [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting
- `bytes_to_throw_insert` is handled before `bytes_to_delay_insert`, so you should not set it to a value less than `bytes_to_delay_insert`

**Example**

``` sql
CREATE TABLE hits_all AS hits
ENGINE = Distributed(logs, default, hits[, sharding_key[, policy_name]])
SETTINGS
    fsync_after_insert=0,
    fsync_directories=0;
```

Data will be read from all servers in the `logs` cluster, from the `default.hits` table located on every server in the cluster.
Data is not only read but is partially processed on the remote servers (to the extent that this is possible).
For example, for a query with `GROUP BY`, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated.

Instead of the database name, you can use a constant expression that returns a string. For example: `currentDatabase()`.
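A hedged illustration of that flow, reusing the `hits_all` table from the example above (the `CounterID` column is an assumption):

```bash
# Partial aggregates are computed on every shard of the logs cluster;
# the initiator only merges the intermediate states.
clickhouse-client --query "
    SELECT CounterID, count() AS hits
    FROM hits_all
    GROUP BY CounterID
    ORDER BY hits DESC
    LIMIT 10"
```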
## Clusters {#distributed-clusters}

Clusters are configured in the [server configuration file](../../../operations/configuration-files.md):

``` xml
<remote_servers>
@@ -132,12 +152,13 @@ Replicas are duplicating servers (in order to read all the data, you can access

Cluster names must not contain dots.

The parameters `host`, `port`, and optionally `user`, `password`, `secure`, `compression` are specified for each server:

- `host` – the address of the remote server. You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server does not start. If you change the DNS record, restart the server.
- `port` – the TCP port for messenger activity (`tcp_port` in the config, usually set to 9000). Not to be confused with `http_port`.
- `user` – the name of the user for connecting to a remote server. Default value is the `default` user. This user must have access to connect to the specified server. Access is configured in the `users.xml` file. For more information, see the section [Access rights](../../../operations/access-rights.md).
- `password` – the password for connecting to a remote server (not masked). Default value: empty string.
- `secure` – whether to use a secure SSL/TLS connection. Usually also requires specifying the port (the default secure port is `9440`). The server should listen on `<tcp_port_secure>9440</tcp_port_secure>` and be configured with correct certificates.
- `compression` – use data compression. Default value: `true`.

When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access); see the [load_balancing](../../../operations/settings/settings.md#settings-load_balancing) setting.
If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times.
@@ -149,40 +170,42 @@ You can specify as many clusters as you wish in the configuration.

To view your clusters, use the `system.clusters` table.

The `Distributed` engine allows working with a cluster like a local server. However, the cluster's configuration cannot be specified dynamically, it has to be configured in the server config file. Usually, all servers in a cluster will have the same cluster config (though this is not required). Clusters from the config file are updated on the fly, without restarting the server.

If you need to send a query to an unknown set of shards and replicas each time, you do not need to create a `Distributed` table; use the `remote` table function instead. See the section [Table functions](../../../sql-reference/table-functions/index.md).

## Writing data {#distributed-writing-data}

There are two methods for writing data to a cluster:

First, you can define which servers to write which data to and perform the write directly on each shard. In other words, perform direct `INSERT` statements on the remote tables in the cluster that the `Distributed` table is pointing to. This is the most flexible solution as you can use any sharding scheme, even one that is non-trivial due to the requirements of the subject area. This is also the most optimal solution since data can be written to different shards completely independently.

Second, you can perform `INSERT` statements on a `Distributed` table. In this case, the table will distribute the inserted data across the servers itself. In order to write to a `Distributed` table, it must have the `sharding_key` parameter configured (except if there is only one shard).

Each shard can have a `<weight>` defined in the config file. By default, the weight is `1`. Data is distributed across shards in the amount proportional to the shard weight. All shard weights are summed up, then each shard's weight is divided by the total to determine each shard's proportion. For example, if there are two shards and the first has a weight of 1 while the second has a weight of 2, the first will be sent one third (1 / 3) of inserted rows and the second will be sent two thirds (2 / 3).

Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this if the tables underlying the `Distributed` table are replicated tables (e.g. any of the `Replicated*MergeTree` table engines). One of the table replicas will receive the write and it will be replicated to the other replicas automatically.

If `internal_replication` is set to `false` (the default), data is written to all replicas. In this case, the `Distributed` table replicates data itself. This is worse than using replicated tables because the consistency of replicas is not checked and, over time, they will contain slightly different data.

To select the shard that a row of data is sent to, the sharding expression is analyzed, and its remainder is taken from dividing it by the total weight of the shards. The row is sent to the shard that corresponds to the half-interval of the remainders from `prev_weights` to `prev_weights + weight`, where `prev_weights` is the total weight of the shards with the smallest number, and `weight` is the weight of this shard. For example, if there are two shards, and the first has a weight of 9 while the second has a weight of 10, the row will be sent to the first shard for the remainders from the range \[0, 9), and to the second for the remainders from the range \[9, 19).
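A small sketch of that remainder rule, using the 9-and-10 weights from the example above (the key value is made up):

```bash
# Weights of the two shards described above.
w1=9; w2=10
total=$((w1 + w2))   # 19
key=12345            # hypothetical value of the sharding expression for a row
rem=$((key % total)) # 12345 % 19 = 14
if (( rem < w1 )); then
    echo "row goes to shard 1: remainder $rem is in [0, $w1)"
else
    echo "row goes to shard 2: remainder $rem is in [$w1, $total)"
fi
```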
The sharding expression can be any expression from constants and table columns that returns an integer. For example, you can use the expression `rand()` for random distribution of data, or `UserID` for distribution by the remainder from dividing the user's ID (then the data of a single user will reside on a single shard, which simplifies running `IN` and `JOIN` by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function, e.g. `intHash64(UserID)`.

A simple remainder from the division is a limited solution for sharding and isn't always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more). In the latter case, use the sharding scheme required by the subject area rather than using entries in `Distributed` tables.

You should be concerned about the sharding scheme in the following cases:

- Queries are used that require joining data (`IN` or `JOIN`) by a specific key. If data is sharded by this key, you can use local `IN` or `JOIN` instead of `GLOBAL IN` or `GLOBAL JOIN`, which is much more efficient.
- A large number of servers is used (hundreds or more) with a large number of small queries, for example, queries for data of individual clients (e.g. websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we've done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. `Distributed` tables are created for each layer, and a single shared distributed table is created for global queries.

Data is written asynchronously. When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The periodicity for sending data is managed by the [distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`. The number of threads performing background tasks can be set by [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size) setting.
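A hedged way to perform that check from the shell, with `database` and `table` standing in for real names:

```bash
# Files still queued for background sending appear as .bin files
# in per-shard subdirectories of the Distributed table's data path.
ls -lR /var/lib/clickhouse/data/database/table/
```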
If the server ceased to exist or had a rough restart (for example, due to a hardware failure) after an `INSERT` to a `Distributed` table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the `broken` subdirectory and no longer used.

## Reading data {#distributed-reading-data}

When querying a `Distributed` table, `SELECT` queries are sent to all shards and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you do not have to transfer old data into it. Instead, you can write new data to it by using a heavier weight; the data will be distributed slightly unevenly, but queries will work correctly and efficiently.

When the `max_parallel_replicas` option is enabled, query processing is parallelized across all replicas within a single shard. For more information, see the section [max_parallel_replicas](../../../operations/settings/settings.md#settings-max_parallel_replicas).
View File
@@ -204,7 +204,7 @@ When parsing with this format, tabs or linefeeds are not allowed in each field.

This format is also available under the name `TSVRawWithNames`.

## TabSeparatedRawWithNamesAndTypes {#tabseparatedrawwithnamesandtypes}

Differs from `TabSeparatedWithNamesAndTypes` format in that the rows are written without escaping.
When parsing with this format, tabs or linefeeds are not allowed in each field.
View File
@@ -178,5 +178,9 @@ toc_title: Adopters

| <a href="https://promo.croc.ru/digitalworker" class="favicon">Цифровой Рабочий</a> | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) |
| <a href="https://shop.okraina.ru/" class="favicon">ООО «МПЗ Богородский»</a> | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) |
| <a href="https://domclick.ru/" class="favicon">ДомКлик</a> | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) |
| <a href="https://futurragroup.com/" class="favicon">Futurra Group</a> | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) |
| <a href="https://usetech.com/" class="favicon">UseTech</a> | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) |
| <a href="https://lookforsale.ru/" class="favicon">Lookforsale</a> | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) |
| <a href="https://rvision.pro/en/" class="favicon">R-Vision</a> | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) |
[Original article](https://clickhouse.com/docs/en/introduction/adopters/) <!--hide-->

View File

@ -16,6 +16,11 @@ ZooKeeper is one of the first well-known open-source coordination systems. It's
By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have a format incompatible with ZooKeeper, but the `clickhouse-keeper-converter` tool allows converting ZooKeeper data to a ClickHouse Keeper snapshot. The interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper, so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.
ClickHouse Keeper supports Access Control Lists (ACLs) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. The digest authentication scheme uses the pair `username:password`; the password is encoded in Base64.
!!! info "Note"
External integrations are not supported.
## Configuration
ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server; in both cases the configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Keeper configuration has the following parameters:
@ -118,13 +123,13 @@ echo mntr | nc localhost 9181
Below are the detailed 4lw commands:

- `ruok`: Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state with respect to quorum and client connection information.

```
imok
```

- `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster.

```
zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
@ -146,12 +151,11 @@ zk_followers 0
zk_synced_followers 0
```
- `srvr`: Lists full details for the server.

```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Latency min/avg/max: 0/0/0
Received: 2
Sent : 2
Connections: 1
@ -161,16 +165,14 @@ Mode: leader
Node count: 4
```

- `stat`: Lists brief details for the server and connected clients.

```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Clients:
192.168.1.1:52852(recved=0,sent=0)
192.168.1.1:52042(recved=24,sent=48)
Latency min/avg/max: 0/0/0
Received: 4
Sent : 4
Connections: 1
@ -178,16 +180,15 @@ Outstanding: 0
Zxid: 36
Mode: leader
Node count: 4
```

- `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`.

```
Server stats reset.
```

- `conf`: Print details about serving configuration.

```
server_id=1
@ -220,20 +221,20 @@ compress_snapshots_with_zstd_format=true
configuration_change_tries_count=20
```

- `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc.

```
192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0)
192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0)
```

- `crst`: Reset connection/session statistics for all connections.

```
Connection stats reset.
```

- `envi`: Print details about the serving environment.

```
Environment:
@ -250,41 +251,41 @@ user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/
```

- `dirs`: Shows the total size of snapshot and log files in bytes.

```
snapshot_dir_size: 0
log_dir_size: 3875
```

- `isro`: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode.

```
rw
```

- `wchs`: Lists brief information on watches for the server.

```
1 connections watching 1 paths
Total watches:1
```

- `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (i.e. impact server performance), use it carefully.

```
0x0000000000000001
/clickhouse/task_queue/ddl
```

- `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (i.e. impact server performance), use it carefully.

```
/clickhouse/task_queue/ddl
0x0000000000000001
```

- `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader.

```
Sessions dump (2):

View File

@ -505,7 +505,7 @@ Keys:
- `level` – Logging level. Acceptable values: `trace`, `debug`, `information`, `warning`, `error`.
- `log` – The log file. Contains all the entries according to `level`.
- `errorlog` – Error log file.
- `size` – Size of the file. Applies to `log` and `errorlog`. Once the file reaches `size`, ClickHouse archives and renames it, and creates a new log file in its place.
- `count` – The number of archived log files that ClickHouse stores.

**Example**
@ -750,9 +750,13 @@ The value 0 means that you can delete all tables without any restrictions.
## max_thread_pool_size {#max-thread-pool-size}

ClickHouse uses threads from the Global Thread pool to process queries. If there is no idle thread to process a query, then a new thread is created in the pool. `max_thread_pool_size` limits the maximum number of threads in the pool.

Possible values:
- Positive integer.
Default value: `10000`.
**Example**
@ -762,9 +766,13 @@ Default value: 10000.
## max_thread_pool_free_size {#max-thread-pool-free-size}

If the number of **idle** threads in the Global Thread pool is greater than `max_thread_pool_free_size`, then ClickHouse releases resources occupied by some threads and the pool size is decreased. Threads can be created again if necessary.

Possible values:
- Positive integer.
Default value: `1000`.
**Example**
@ -774,9 +782,13 @@ Default value: 1000.
## thread_pool_queue_size {#thread-pool-queue-size}

The maximum number of jobs that can be scheduled on the Global Thread pool. Increasing queue size leads to larger memory usage. It is recommended to keep this value equal to [max_thread_pool_size](#max-thread-pool-size).

Possible values:
- Positive integer.
Default value: `10000`.
**Example**
@ -1443,7 +1455,7 @@ You can also define sections `memory` — means storing information only in memo
To add an LDAP server as a remote user directory of users that are not defined locally, define a single `ldap` section with the following parameters:

- `server` — one of LDAP server names defined in `ldap_servers` config section. This parameter is mandatory and cannot be empty.
- `roles` — section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. If no roles are specified, the user will not be able to perform any actions after authentication. If any of the listed roles is not defined locally at the time of authentication, the authentication attempt will fail as if the provided password was incorrect.
**Example**
@ -1507,3 +1519,4 @@ Possible values:
- Positive integer.

Default value: `10000`.

View File

@ -1687,18 +1687,17 @@ Quorum writes
`INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written.
When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#settings-select_sequential_consistency).

ClickHouse generates an exception:

- If the number of available replicas at the time of the query is less than the `insert_quorum`.
- When `insert_quorum_parallel` is disabled and an attempt to write data is made when the previous block has not yet been inserted in `insert_quorum` of replicas. This situation may occur if the user tries to perform another `INSERT` query to the same table before the previous one with `insert_quorum` is completed.

See also:

- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
- [select_sequential_consistency](#settings-select_sequential_consistency)
## insert_quorum_timeout {#settings-insert_quorum_timeout}
@ -1710,11 +1709,29 @@ Default value: 600 000 milliseconds (ten minutes).
See also:

- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
- [select_sequential_consistency](#settings-select_sequential_consistency)
## insert_quorum_parallel {#settings-insert_quorum_parallel}
Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected.
Possible values:
- 0 — Disabled.
- 1 — Enabled.
Default value: 1.
See also:
- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [select_sequential_consistency](#settings-select_sequential_consistency)
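A combined sketch of the write path (the table name is hypothetical; quorum settings apply only to replicated tables):

``` sql
-- Each INSERT must be acknowledged by 2 replicas within insert_quorum_timeout;
-- with parallelism disabled, the INSERT sequence is linearized and a concurrent
-- quorum INSERT into the same table is rejected.
SET insert_quorum = 2;
SET insert_quorum_parallel = 0;

INSERT INTO replicated_table VALUES (1, 'a');
```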
## select_sequential_consistency {#settings-select_sequential_consistency}

Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default).
Possible values:
@ -1727,10 +1744,13 @@ Usage
When sequential consistency is enabled, ClickHouse allows the client to execute the `SELECT` query only for those replicas that contain data from all previous `INSERT` queries executed with `insert_quorum`. If the client refers to a partial replica, ClickHouse will generate an exception. The `SELECT` query will not include data that has not yet been written to the quorum of replicas.
When `insert_quorum_parallel` is enabled (the default), then `select_sequential_consistency` does not work. This is because parallel `INSERT` queries can be written to different sets of quorum replicas so there is no guarantee a single replica will have received all writes.
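The read side, as a sketch under the same assumptions (hypothetical table name, `insert_quorum_parallel = 0` on the write path):

``` sql
-- Fail instead of silently reading a replica that has not yet
-- received all previous quorum INSERTs.
SET select_sequential_consistency = 1;

SELECT count() FROM replicated_table;
```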
See also:

- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
## insert_deduplicate {#settings-insert-deduplicate}

View File

@ -41,7 +41,7 @@ Example of a polygon dictionary configuration:
</dictionary>
```

The corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query):

``` sql
CREATE DICTIONARY polygon_dict_name (
key Array(Array(Array(Array(Float64)))),

View File

@ -31,7 +31,7 @@ CREATE ROLE accountant;
GRANT SELECT ON db.* TO accountant;
```

This sequence of queries creates the role `accountant` that has the privilege of reading data from the `db` database.

Assigning the role to the user `mira`:

View File

@ -22,7 +22,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
) ENGINE = engine
```

Creates a table named `table_name` in the `db` database, or the current database if `db` is not set, with the structure specified in brackets and the `engine` engine.

The structure of the table is a list of column descriptions, secondary indexes and constraints. If [primary key](#primary-key) is supported by the engine, it will be indicated as a parameter for the table engine.

A column description is `name type` in the simplest case. Example: `RegionID UInt32`.

View File

@ -298,13 +298,16 @@ Note that elements emitted by a late firing should be treated as updated results
### Monitoring New Windows {#window-view-monitoring}

Window view supports the [WATCH](../../../sql-reference/statements/watch.md) query to monitor changes, or the `TO` syntax to output the results to a table.

``` sql
WATCH [db.]window_view
[EVENTS]
[LIMIT n]
[FORMAT format]
```

The `WATCH` query acts similarly to `LIVE VIEW`. A `LIMIT` can be specified to set the number of updates to receive before terminating the query. The `EVENTS` clause can be used to obtain a short form of the `WATCH` query, where instead of the query result you will just get the latest query watermark.
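A minimal sketch (the window view name `wv` is hypothetical):

``` sql
-- Receive only the watermark after each fired window; stop after two updates.
WATCH wv EVENTS LIMIT 2;
```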
### Settings {#window-view-settings}

View File

@ -206,6 +206,9 @@ This extra row is only produced in `JSON*`, `TabSeparated*`, and `Pretty*` forma
- In `Pretty*` formats, the row is output as a separate table after the main result.
- In the other formats it is not available.
!!! note "Note"
totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`.
`WITH TOTALS` can be run in different ways when [HAVING](../../../sql-reference/statements/select/having.md) is present. The behavior depends on the `totals_mode` setting.
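A short sketch (the table `t` and the threshold are illustrative):

``` sql
-- The extra totals row aggregates across groups; with HAVING present,
-- totals_mode controls which rows it accounts for.
SELECT year, count() AS c
FROM t
GROUP BY year WITH TOTALS
HAVING c > 10;
```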
### Configuring Totals Processing {#configuring-totals-processing}

View File

@ -129,6 +129,9 @@ world
Each element of a [Nested](../sql-reference/data-types/nested-data-structures/nested.md) structure is represented as a separate array.

Incoming `ENUM` values can be passed as the values themselves or as ordinal numbers. The passed value is first matched against the enumeration elements; if no match is found and the value is a number, it is treated as an ordinal number within the enumeration.
If the incoming `ENUM` data contains only ordinal numbers, it is recommended to enable the [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) setting to speed up parsing.

For example:
``` sql
@ -362,6 +365,9 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
If the [input_format_defaults_for_omitted_fields = 1](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) setting is enabled and the column type is not `Nullable(T)`, empty unquoted values are replaced with the default values for the column data type.

Incoming `ENUM` values can be passed as the values themselves or as ordinal numbers. The passed value is first matched against the enumeration elements; if no match is found and the value is a number, it is treated as an ordinal number within the enumeration.
If the incoming `ENUM` data contains only ordinal numbers, it is recommended to enable the [input_format_csv_enum_as_number](../operations/settings/settings.md#settings-input_format_csv_enum_as_number) setting to speed up parsing.

The CSV format supports the output of totals and extremes the same way as `TabSeparated`.

## CSVWithNames {#csvwithnames}
@ -693,7 +699,7 @@ CREATE TABLE IF NOT EXISTS example_table
- If `input_format_defaults_for_omitted_fields = 1`, the default value for `x` is `0`, and the default value for `a` is `x * 2`.

!!! note "Warning"
    When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computing resources than with `input_format_defaults_for_omitted_fields = 0`.

### Selecting Data {#vyborka-dannykh}

View File

@ -16,12 +16,17 @@ ZooKeeper — один из первых широко известных сер
By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, sequentially consistent reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have a format incompatible with ZooKeeper, but ZooKeeper data can be converted to a ClickHouse Keeper snapshot with `clickhouse-keeper-converter`. The interserver protocol of ClickHouse Keeper is also incompatible with ZooKeeper, so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.

ClickHouse Keeper implements Access Control Lists (ACLs) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. The digest scheme uses a `username:password` pair for authentication; the password is encoded in Base64.

!!! info "Note"
    External integrations are not supported.

## Configuration

ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server; in both cases the configuration is an `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Configuration parameters:

- `tcp_port` — port for client connections (default for ZooKeeper: `2181`).
- `tcp_port_secure` — secure port for an SSL connection between client and server.
- `server_id` — unique server id; each participant of the cluster must have a unique number (1, 2, 3, and so on).
- `log_storage_path` — path to coordination logs; as with ZooKeeper, it is best to store them on a device that is not busy.
- `snapshot_storage_path` — path to coordination snapshots.
@ -50,7 +55,11 @@ ClickHouse Keeper может использоваться как равноце
- `shutdown_timeout` — time to wait for internal connections to finish and for the server to shut down, in milliseconds (default: 5000).
- `startup_timeout` — time after which the server shuts down if it does not connect to other quorum participants, in milliseconds (default: 30000).

The quorum configuration is located in `<keeper_server>.<raft_configuration>` and contains a description of the servers.

The only parameter for the whole quorum is `secure`, which enables an encrypted connection between quorum participants. Set it to `true` if SSL connections are required for internal communication between the nodes; otherwise leave it unspecified.

Parameters for each `<server>`:

- `id` — server id in the quorum.
- `hostname` — name of the host where the server is located.

View File

@ -52,7 +52,7 @@ ClickHouse перезагружает встроенные словари с з
ClickHouse checks the `min_part_size` and `min_part_size_ratio` conditions and executes the `case` blocks whose conditions are met.

- If a data part matches the conditions, ClickHouse uses the specified compression method.
- If a data part matches several `case` blocks, ClickHouse uses the first matching block of conditions.

If no `<case>` matches, ClickHouse applies the `lz4` compression algorithm.
@ -554,13 +554,13 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
Keys:

- `enabled` – Boolean flag to enable the feature, `false` by default. Set to `true` to allow sending crash reports.
- `endpoint` – You can override the URL to which crash reports are sent and use your own Sentry installation. Use the [Sentry DSN](https://docs.sentry.io/error-reporting/quickstart/?platform=native#configure-the-sdk) URL syntax.
- `anonymize` - Disable sending the server hostname in the crash report.
- `http_proxy` - HTTP proxy configuration for sending crash reports.
- `debug` - Put the Sentry client library into debug mode.
- `tmp_path` - Filesystem path for temporary storage of crash report state before sending it to the Sentry server.

**Recommended settings**
``` xml
<send_crash_reports>
@ -751,9 +751,13 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
## max_thread_pool_size {#max-thread-pool-size}

The maximum number of threads in the Global Thread pool. ClickHouse uses threads from this pool to process queries. If there is no free thread in the pool, a new one is created. `max_thread_pool_size` limits the maximum number of threads in the pool.

Possible values:

- Positive integer.

Default value: `10000`.

**Example**
@ -761,6 +765,38 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
<max_thread_pool_size>12000</max_thread_pool_size>
```
## max_thread_pool_free_size {#max-thread-pool-free-size}
If the number of **idle** threads in the Global Thread pool exceeds `max_thread_pool_free_size`, ClickHouse releases the resources occupied by some of the threads, and the pool size decreases. Threads are created again when necessary.

Possible values:

- Positive integer.

Default value: `1000`.

**Example**
``` xml
<max_thread_pool_free_size>1200</max_thread_pool_free_size>
```
## thread_pool_queue_size {#thread-pool-queue-size}
The maximum number of jobs that can be scheduled in the Global Thread pool. Increasing this parameter increases memory usage. It is recommended to keep this value equal to [max_thread_pool_size](#max-thread-pool-size).

Possible values:

- Positive integer.

Default value: `10000`.

**Example**
``` xml
<thread_pool_queue_size>12000</thread_pool_queue_size>
```
## merge_tree {#server_configuration_parameters-merge_tree}

Fine-tuning of tables in the [MergeTree](../../operations/server-configuration-parameters/settings.md) family.
@ -1011,7 +1047,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
If the table does not exist, ClickHouse creates it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed and a new table is created automatically.

**Example**
``` xml
<query_views_log>
@ -1075,9 +1111,8 @@ Parameters:
## query_masking_rules {#query-masking-rules}

Regex-based rules that are applied to all queries, as well as to all messages, before storing them in server logs, in the `system.query_log`, `system.text_log` and `system.processes` tables, and in logs sent to the client. This prevents leaking sensitive data from SQL queries (such as names, emails, personal identifiers or credit card numbers) into the logs.
**Example**
@ -1096,7 +1131,7 @@ Parameters:
- `regexp` - RE2-compatible regular expression (mandatory)
- `replace` - replacement string for sensitive data (optional; six asterisks by default)

Masking rules are applied to the whole query (to prevent leaking sensitive data from malformed / non-parsable queries).

The `system.events` table contains the `QueryMaskingRulesMatch` counter, which holds the total number of query masking rule matches.
@ -1418,7 +1453,7 @@ ClickHouse использует ZooKeeper для хранения метадан
You can also add the `memory` section, which means storing information only in memory, without writing to disk, and the `ldap` section, which means storing the information on an [LDAP server](https://en.wikipedia.org/wiki/Lightweight_Directory_Access_Protocol).

To add an LDAP server as a remote user directory for users that are not defined locally, define a single `ldap` section with the following parameters:

- `server` — name of one of the LDAP servers defined in the `ldap_servers` section of the configuration file. This parameter is mandatory and cannot be empty.
- `roles` — section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. If no roles are specified, the user will not be able to perform any actions after authentication. If any of the listed roles is not defined locally at the time of authentication, the authentication attempt will fail as if the provided password was incorrect.

**Example**

View File

@ -391,12 +391,14 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (
## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number}

Enables or disables parsing of enum values as ordinal numbers.

When enabled, incoming `ENUM` values in `TSV` data are always treated as ordinal numbers rather than as enumeration elements. It is recommended to enable this setting to speed up parsing if the `ENUM` data contains only ordinal numbers.

Possible values:

- 0 — incoming `ENUM` values are first matched against the enumeration elements; if no match is found, they are treated as ordinal numbers.
- 1 — incoming `ENUM` values are treated as ordinal numbers right away.

Default value: 0.
@ -410,10 +412,39 @@ CREATE TABLE table_with_enum_column_for_tsv_insert (Id Int32,Value Enum('first'
When the `input_format_tsv_enum_as_number` setting is enabled:

Query:
```sql
SET input_format_tsv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
SELECT * FROM table_with_enum_column_for_tsv_insert;
```
Result:
```text
┌──Id─┬─Value──┐
│ 102 │ second │
└─────┴────────┘
```
Query:
```sql
SET input_format_tsv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first';
```
throws an exception.

When the `input_format_tsv_enum_as_number` setting is disabled:

Query:
```sql
SET input_format_tsv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first';
SELECT * FROM table_with_enum_column_for_tsv_insert;
```
@ -428,15 +459,6 @@ SELECT * FROM table_with_enum_column_for_tsv_insert;
└─────┴────────┘
```
## input_format_null_as_default {#settings-input-format-null-as-default}

Enables or disables the initialization of [NULL](../../sql-reference/syntax.md#null-literal) cells with [default values](../../sql-reference/statements/create/table.md#create-default-values) if the column data type does not allow [storing NULL](../../sql-reference/data-types/nullable.md#data_type-nullable).
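A minimal sketch (the table and column names are hypothetical):

``` sql
-- n is UInt8 (not Nullable); with the setting enabled, the incoming NULL
-- is replaced by the column's default value instead of raising an error.
SET input_format_null_as_default = 1;
INSERT INTO t_defaults (n) VALUES (NULL);
```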
@ -1511,12 +1533,13 @@ SELECT area/period FROM account_orders FORMAT JSON;
## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number}

Enables or disables parsing of enum values as ordinal numbers.

When enabled, incoming `ENUM` values in `CSV` data are always treated as ordinal numbers rather than as enumeration elements. It is recommended to enable this setting to speed up parsing if the `ENUM` data contains only ordinal numbers.

Possible values:

- 0 — incoming `ENUM` values are first matched against the enumeration elements; if no match is found, they are treated as ordinal numbers.
- 1 — incoming `ENUM` values are treated as ordinal numbers right away.

Default value: 0.
@ -1530,10 +1553,11 @@ CREATE TABLE table_with_enum_column_for_csv_insert (Id Int32,Value Enum('first'
When the `input_format_csv_enum_as_number` setting is enabled:

Query:

```sql
SET input_format_csv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2;
SELECT * FROM table_with_enum_column_for_csv_insert;
```

Result:
@ -1544,15 +1568,37 @@ SELECT * FROM table_with_enum_column_for_csv_insert;
└─────┴────────┘
```
Query:

```sql
SET input_format_csv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first'
```

throws an exception.

When the `input_format_csv_enum_as_number` setting is disabled:

Query:
```sql
SET input_format_csv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first'
SELECT * FROM table_with_enum_column_for_csv_insert;
```
Result:
```text
┌──Id─┬─Value──┐
│ 102 │ second │
└─────┴────────┘
┌──Id─┬─Value─┐
│ 103 │ first │
└─────┴───────┘
```
## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line}

Use CRLF (DOS/Windows style) instead of LF (Unix style) as the line separator for the CSV format.
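A minimal sketch:

``` sql
-- Rows in the CSV output now end with \r\n instead of \n.
SET output_format_csv_crlf_end_of_line = 1;
SELECT 1 AS a, 2 AS b FORMAT CSV;
```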

View File

@ -203,6 +203,9 @@ SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE;
- In `Pretty*` formats, the row is output as a separate table after the main result.
- In other formats it is not available.

!!! note "Note"
    totals is output only in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`.

When the [HAVING](having.md) clause is used, the behavior of `WITH TOTALS` is controlled by the `totals_mode` setting.

### Configuring Totals Processing {#configuring-totals-processing}

View File

@ -271,7 +271,7 @@ SELECT * FROM collate_test ORDER BY s ASC COLLATE 'en';
## ORDER BY expr WITH FILL Modifier {#orderby-with-fill}

This modifier can also be combined with the [LIMIT ... WITH TIES](../../../sql-reference/statements/select/limit.md#limit-with-ties) modifier.

The `WITH FILL` modifier can be set after `ORDER BY expr` with the optional parameters `FROM expr`, `TO expr` and `STEP expr`.
All missing values in the `expr` column are filled with values corresponding to the presumed sequence of the column values; the other columns are filled with their default values.

View File

@ -1,16 +1,55 @@
# system.merge_tree_settings {#system-merge_tree_settings}

Contains information about settings for `MergeTree` tables.

Columns:

- `name` (String) — Setting name.
- `value` (String) — Setting value.
- `description` (String) — Setting description.
- `type` (String) — Setting type (implementation-specific string value).
- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed.

**Example**
```sql
:) SELECT * FROM system.merge_tree_settings LIMIT 4 FORMAT Vertical;
```
```text
Row 1:
──────
name: index_granularity
value: 8192
changed: 0
description: How many rows correspond to one primary key value.
type: SettingUInt64
Row 2:
──────
name: min_bytes_for_wide_part
value: 0
changed: 0
description: Minimal uncompressed size in bytes to create part in wide format instead of compact
type: SettingUInt64
Row 3:
──────
name: min_rows_for_wide_part
value: 0
changed: 0
description: Minimal number of rows to create part in wide format instead of compact
type: SettingUInt64
Row 4:
──────
name: merge_max_block_size
value: 8192
changed: 0
description: How many rows in blocks should be formed for merge operations.
type: SettingUInt64
4 rows in set. Elapsed: 0.001 sec.
```
[Original article](https://clickhouse.com/docs/zh/operations/system-tables/merge_tree_settings) <!--hide-->

View File

@ -1,58 +1,128 @@
# system.tables {#system-tables}

Contains metadata of each table that the server knows about. [Detached](../../sql-reference/statements/detach.md) tables are not shown in `system.tables`.

[Temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in `system.tables` only in the sessions where they were created. Their `database` field is shown as empty, and the `is_temporary` flag is on.

This table contains the following columns (the column type is shown in brackets):

- `database` ([String](../../sql-reference/data-types/string.md)) — name of the database the table is in.
- `name` ([String](../../sql-reference/data-types/string.md)) — table name.
- `engine` ([String](../../sql-reference/data-types/string.md)) — table engine name (without parameters).
- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) - flag indicating whether the table is temporary.
- `data_path` ([String](../../sql-reference/data-types/string.md)) - path to the table data in the file system.
- `metadata_path` ([String](../../sql-reference/data-types/string.md)) - path to the table metadata in the file system.
- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - time of the latest modification of the table metadata.
- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - database dependencies.
- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - table dependencies ([materialized view](../../engines/table-engines/special/materializedview.md) tables based on the current table).
- `create_table_query` ([String](../../sql-reference/data-types/string.md)) - the SQL statement used to create the table.
- `engine_full` ([String](../../sql-reference/data-types/string.md)) - parameters of the table engine.
- `as_select` ([String](../../sql-reference/data-types/string.md)) - the `SELECT` statement of a view.
- `partition_key` ([String](../../sql-reference/data-types/string.md)) - the partition key expression specified in the table.
- `sorting_key` ([String](../../sql-reference/data-types/string.md)) - the sorting key expression specified in the table.
- `primary_key` ([String](../../sql-reference/data-types/string.md)) - the primary key expression specified in the table.
- `sampling_key` ([String](../../sql-reference/data-types/string.md)) - the sampling key expression specified in the table.
- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - the storage policy:
    - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)
    - [Distributed](../../engines/table-engines/special/distributed.md#distributed)
- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - total number of rows; `NULL` if the exact number of rows in the table cannot be determined quickly (including the underlying `Buffer` table).
- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - total number of bytes; `NULL` if the exact number of bytes stored for the table cannot be determined quickly (does **not** include any underlying storage).
    - If the table stores data on disk, this is the actual disk space used (compressed).
    - If the table stores data in memory, this is the approximate number of bytes used in memory.
- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - total number of rows inserted since server start (only for `Buffer` tables).
- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - total number of bytes inserted since server start (only for `Buffer` tables).
- `comment` ([String](../../sql-reference/data-types/string.md)) - comment on the table.
- `has_own_data` ([UInt8](../../sql-reference/data-types/int-uint.md)) — flag indicating whether the table itself stores data on disk or accesses another source.

The `system.tables` table is used in the implementation of `SHOW TABLES` queries.

**Example**
```sql
SELECT * FROM system.tables LIMIT 2 FORMAT Vertical;
```
```text
Row 1:
──────
database: base
name: t1
uuid: 81b1c20a-b7c6-4116-a2ce-7583fb6b6736
engine: MergeTree
is_temporary: 0
data_paths: ['/var/lib/clickhouse/store/81b/81b1c20a-b7c6-4116-a2ce-7583fb6b6736/']
metadata_path: /var/lib/clickhouse/store/461/461cf698-fd0b-406d-8c01-5d8fd5748a91/t1.sql
metadata_modification_time: 2021-01-25 19:14:32
dependencies_database: []
dependencies_table: []
create_table_query: CREATE TABLE base.t1 (`n` UInt64) ENGINE = MergeTree ORDER BY n SETTINGS index_granularity = 8192
engine_full: MergeTree ORDER BY n SETTINGS index_granularity = 8192
as_select: SELECT database AS table_catalog
partition_key:
sorting_key: n
primary_key: n
sampling_key:
storage_policy: default
total_rows: 1
total_bytes: 99
lifetime_rows: ᴺᵁᴸᴸ
lifetime_bytes: ᴺᵁᴸᴸ
comment:
has_own_data: 0
Row 2:
──────
database: default
name: 53r93yleapyears
uuid: 00000000-0000-0000-0000-000000000000
engine: MergeTree
is_temporary: 0
data_paths: ['/var/lib/clickhouse/data/default/53r93yleapyears/']
metadata_path: /var/lib/clickhouse/metadata/default/53r93yleapyears.sql
metadata_modification_time: 2020-09-23 09:05:36
dependencies_database: []
dependencies_table: []
create_table_query: CREATE TABLE default.`53r93yleapyears` (`id` Int8, `febdays` Int8) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192
engine_full: MergeTree ORDER BY id SETTINGS index_granularity = 8192
as_select: SELECT name AS catalog_name
partition_key:
sorting_key: id
primary_key: id
sampling_key:
storage_policy: default
total_rows: 2
total_bytes: 155
lifetime_rows: ᴺᵁᴸᴸ
lifetime_bytes: ᴺᵁᴸᴸ
comment:
has_own_data: 0
```
[Original article](https://clickhouse.com/docs/zh/operations/system-tables/tables) <!--hide-->
View File
@ -727,7 +727,6 @@ void LocalServer::printHelpMessage([[maybe_unused]] const OptionsDescription & o
void LocalServer::addOptions(OptionsDescription & options_description)
{
    options_description.main_description->add_options()
        ("database,d", po::value<std::string>(), "database")
        ("table,N", po::value<std::string>(), "name of the initial table")
        /// If structure argument is omitted then initial query is not generated
View File
@ -152,6 +152,7 @@
This setting could be used to switch replication to another network interface
(the server may be connected to multiple networks via multiple addresses)
-->
<!--
<interserver_http_host>example.yandex.ru</interserver_http_host>
-->
@ -177,6 +178,7 @@
-->
<!-- <listen_host>::</listen_host> -->
<!-- Same for hosts without support for IPv6: -->
<!-- <listen_host>0.0.0.0</listen_host> -->
@ -293,6 +295,10 @@
<max_thread_pool_size>10000</max_thread_pool_size>

<!-- Number of workers to recycle connections in background (see also drain_timeout).
     If the pool is full, connection will be drained synchronously. -->
<!-- <max_threads_for_connection_collector>10</max_threads_for_connection_collector> -->

<!-- On memory constrained environments you may have to set this to value larger than 1.
-->
<max_server_memory_usage_to_ram_ratio>0.9</max_server_memory_usage_to_ram_ratio>
View File
@ -87,7 +87,7 @@ if [ -z "$NO_BUILD" ] ; then
# Build (only binary packages).
debuild --preserve-env -e PATH \
    -e DEB_CC=$DEB_CC -e DEB_CXX=$DEB_CXX -e CMAKE_FLAGS="$CMAKE_FLAGS" \
    -b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS}
    -b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS} ${DEB_ARCH_FLAG}
fi
if [ -n "$MAKE_RPM" ]; then
View File
@ -54,7 +54,7 @@ namespace
const Poco::SHA1Engine::Digest & digest = engine.digest();
Poco::SHA1Engine::Digest calculated_password_sha1(sha1_size);

for (size_t i = 0; i < sha1_size; i++)
for (size_t i = 0; i < sha1_size; ++i)
    calculated_password_sha1[i] = scrambled_password[i] ^ digest[i];

auto calculated_password_double_sha1 = Util::encodeSHA1(calculated_password_sha1);
View File
@ -448,7 +448,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
    vals = nullptr;
});

for (std::size_t i = 0; vals[i]; i++)
for (size_t i = 0; vals[i]; ++i)
{
    if (vals[i]->bv_val && vals[i]->bv_len > 0)
        result.emplace(vals[i]->bv_val, vals[i]->bv_len);
@ -473,7 +473,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
    referrals = nullptr;
});

for (std::size_t i = 0; referrals[i]; i++)
for (size_t i = 0; referrals[i]; ++i)
{
    LOG_WARNING(&Poco::Logger::get("LDAPClient"), "Received reference during LDAP search but not following it: {}", referrals[i]);
}
View File
@ -15,6 +15,7 @@ namespace ErrorCodes
extern const int READONLY;
extern const int QUERY_IS_PROHIBITED;
extern const int SETTING_CONSTRAINT_VIOLATION;
extern const int UNKNOWN_SETTING;
}
@ -200,7 +201,23 @@ bool SettingsConstraints::checkImpl(const Settings & current_settings, SettingCh
};

if (reaction == THROW_ON_VIOLATION)
{
    try
    {
        access_control->checkSettingNameIsAllowed(setting_name);
    }
    catch (Exception & e)
    {
        if (e.code() == ErrorCodes::UNKNOWN_SETTING)
        {
            if (const auto hints = current_settings.getHints(change.name); !hints.empty())
            {
                e.addMessage(fmt::format("Maybe you meant {}", toString(hints)));
            }
        }
        throw;
    }
}
else if (!access_control->isSettingNameAllowed(setting_name))
    return false;
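For context, the hint machinery used here follows a common pattern: when a name lookup fails, compare the unknown name against the set of valid names and suggest near matches. A minimal standalone sketch of that idea, assuming a plain edit-distance cutoff of 2 (ClickHouse's actual `IHints`/`getHints` machinery may differ in details):

```cpp
// Illustrative sketch, not the ClickHouse implementation: "Maybe you meant ..."
// hints boil down to picking known names within a small edit distance.
#include <algorithm>
#include <string>
#include <vector>

static size_t editDistance(const std::string & a, const std::string & b)
{
    /// Classic two-row Levenshtein dynamic programming.
    std::vector<size_t> prev(b.size() + 1), cur(b.size() + 1);
    for (size_t j = 0; j <= b.size(); ++j)
        prev[j] = j;
    for (size_t i = 1; i <= a.size(); ++i)
    {
        cur[0] = i;
        for (size_t j = 1; j <= b.size(); ++j)
            cur[j] = std::min({prev[j] + 1, cur[j - 1] + 1,
                               prev[j - 1] + (a[i - 1] != b[j - 1] ? 1 : 0)});
        std::swap(prev, cur);
    }
    return prev[b.size()];
}

/// Return candidates within edit distance 2 of the unknown name.
std::vector<std::string> getHints(const std::string & unknown,
                                  const std::vector<std::string> & known)
{
    std::vector<std::string> hints;
    for (const auto & name : known)
        if (editDistance(unknown, name) <= 2)
            hints.push_back(name);
    return hints;
}
```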
View File
@ -90,7 +90,7 @@ private:
    throw;
}

for (i = 0; i < old_size; i++)
for (i = 0; i < old_size; ++i)
{
    nested_func->merge(&new_state[i * nested_size_of_data],
        &old_state[i * nested_size_of_data],
View File
@ -54,6 +54,8 @@ public:
template <typename T, typename Data, typename Policy>
class AggregateFunctionBitmapL2 final : public IAggregateFunctionDataHelper<Data, AggregateFunctionBitmapL2<T, Data, Policy>>
{
private:
    static constexpr auto STATE_VERSION_1_MIN_REVISION = 54455;

public:
    AggregateFunctionBitmapL2(const DataTypePtr & type)
        : IAggregateFunctionDataHelper<Data, AggregateFunctionBitmapL2<T, Data, Policy>>({type}, {})
@ -105,9 +107,38 @@
    }
}

void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override { this->data(place).rbs.write(buf); }
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override { this->data(place).rbs.read(buf); }

bool isVersioned() const override { return true; }

size_t getDefaultVersion() const override { return 1; }

size_t getVersionFromRevision(size_t revision) const override
{
    if (revision >= STATE_VERSION_1_MIN_REVISION)
        return 1;
    else
        return 0;
}

void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> version) const override
{
    if (!version)
        version = getDefaultVersion();

    if (*version >= 1)
        DB::writeBoolText(this->data(place).init, buf);

    this->data(place).rbs.write(buf);
}

void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> version, Arena *) const override
{
    if (!version)
        version = getDefaultVersion();

    if (*version >= 1)
        DB::readBoolText(this->data(place).init, buf);

    this->data(place).rbs.read(buf);
}

void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
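The pattern above, versioned aggregate-function state, gates each format change behind a minimum server revision so that old and new servers can still exchange states. A minimal sketch of the idea with hypothetical names and a toy payload (not the actual `groupBitmap` state):

```cpp
// Sketch of revision-gated state versioning. MIN_REVISION_WITH_INIT_FLAG and
// State are hypothetical stand-ins for the real aggregate state.
#include <cstdint>
#include <istream>
#include <optional>
#include <ostream>

static constexpr size_t MIN_REVISION_WITH_INIT_FLAG = 54455;

struct State { bool init = false; uint64_t payload = 0; };

size_t versionFromRevision(size_t revision)
{
    return revision >= MIN_REVISION_WITH_INIT_FLAG ? 1 : 0;
}

void serialize(const State & s, std::ostream & out, std::optional<size_t> version)
{
    if (!version)
        version = 1;  /// no negotiated version: default to the newest format
    if (*version >= 1)
        out.write(reinterpret_cast<const char *>(&s.init), sizeof(s.init));
    out.write(reinterpret_cast<const char *>(&s.payload), sizeof(s.payload));
}

void deserialize(State & s, std::istream & in, std::optional<size_t> version)
{
    if (!version)
        version = 1;
    if (*version >= 1)
        in.read(reinterpret_cast<char *>(&s.init), sizeof(s.init));
    in.read(reinterpret_cast<char *>(&s.payload), sizeof(s.payload));
}
```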
View File
@ -271,7 +271,7 @@ public:
{
    lower_bound = std::min(lower_bound, other.lower_bound);
    upper_bound = std::max(upper_bound, other.upper_bound);

    for (size_t i = 0; i < other.size; i++)
    for (size_t i = 0; i < other.size; ++i)
        add(other.points[i].mean, other.points[i].weight, max_bins);
}
View File
@ -56,7 +56,7 @@ static bool ALWAYS_INLINE inline is_all_zeros(const UInt8 * flags, size_t size)
    i += 8;
}

for (; i < size; i++)
for (; i < size; ++i)
    if (flags[i])
        return false;
View File
@ -7,18 +7,20 @@
#include <DataTypes/DataTypeDateTime.h>

#define TOP_K_MAX_SIZE 0xFFFFFF
static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF;

namespace DB
{
struct Settings;

namespace ErrorCodes
{
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int LOGICAL_ERROR;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
@ -42,19 +44,22 @@ class AggregateFunctionTopKDateTime : public AggregateFunctionTopK<DataTypeDateT
template <bool is_weighted>
static IAggregateFunction * createWithExtraTypes(const DataTypePtr & argument_type, UInt64 threshold, UInt64 load_factor, const Array & params)
static IAggregateFunction * createWithExtraTypes(const DataTypes & argument_types, UInt64 threshold, UInt64 load_factor, const Array & params)
{
    WhichDataType which(argument_type);
    if (argument_types.empty())
        throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Got empty arguments list");

    WhichDataType which(argument_types[0]);
    if (which.idx == TypeIndex::Date)
        return new AggregateFunctionTopKDate<is_weighted>(threshold, load_factor, {argument_type}, params);
        return new AggregateFunctionTopKDate<is_weighted>(threshold, load_factor, argument_types, params);
    if (which.idx == TypeIndex::DateTime)
        return new AggregateFunctionTopKDateTime<is_weighted>(threshold, load_factor, {argument_type}, params);
        return new AggregateFunctionTopKDateTime<is_weighted>(threshold, load_factor, argument_types, params);

    /// Check that we can use plain version of AggregateFunctionTopKGeneric
    if (argument_type->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
    if (argument_types[0]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
        return new AggregateFunctionTopKGeneric<true, is_weighted>(threshold, load_factor, argument_type, params);
        return new AggregateFunctionTopKGeneric<true, is_weighted>(threshold, load_factor, argument_types, params);
    else
        return new AggregateFunctionTopKGeneric<false, is_weighted>(threshold, load_factor, argument_type, params);
        return new AggregateFunctionTopKGeneric<false, is_weighted>(threshold, load_factor, argument_types, params);
}
@ -78,40 +83,37 @@ AggregateFunctionPtr createAggregateFunctionTopK(const std::string & name, const
if (!params.empty())
{
    if (params.size() > 2)
        throw Exception("Aggregate function " + name + " requires two parameters or less.",
            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Aggregate function '{}' requires two parameters or less", name);

    UInt64 k = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
    if (params.size() == 2)
    {
        load_factor = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[1]);
        if (load_factor < 1)
            throw Exception("Too small parameter 'load_factor' for aggregate function " + name + ". Minimum: 1",
                ErrorCodes::ARGUMENT_OUT_OF_BOUND);
            throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
                "Too small parameter 'load_factor' for aggregate function '{}' (got {}, minimum is 1)", name, load_factor);
    }

    if (k > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || k * load_factor > TOP_K_MAX_SIZE)
        throw Exception("Too large parameter(s) for aggregate function " + name + ". Maximum: " + toString(TOP_K_MAX_SIZE),
            ErrorCodes::ARGUMENT_OUT_OF_BOUND);
    if (k == 0)
        throw Exception("Parameter 0 is illegal for aggregate function " + name,
            ErrorCodes::ARGUMENT_OUT_OF_BOUND);
    threshold = k;
    threshold = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);

    if (threshold > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || threshold * load_factor > TOP_K_MAX_SIZE)
        throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
            "Too large parameter(s) for aggregate function '{}' (maximum is {})", name, toString(TOP_K_MAX_SIZE));

    if (threshold == 0)
        throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Parameter 0 is illegal for aggregate function '{}'", name);
}

AggregateFunctionPtr res(createWithNumericType<AggregateFunctionTopK, is_weighted>(
    *argument_types[0], threshold, load_factor, argument_types, params));

if (!res)
    res = AggregateFunctionPtr(createWithExtraTypes<is_weighted>(argument_types[0], threshold, load_factor, params));
    res = AggregateFunctionPtr(createWithExtraTypes<is_weighted>(argument_types, threshold, load_factor, params));

if (!res)
    throw Exception("Illegal type " + argument_types[0]->getName() +
        " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
        "Illegal type {} of argument for aggregate function '{}'", argument_types[0]->getName(), name);

return res;
}
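The rewritten validation enforces three bounds on the topK parameters: `load_factor >= 1`, each parameter at most `TOP_K_MAX_SIZE`, and their product at most `TOP_K_MAX_SIZE` (the reserved capacity is `threshold * load_factor`). A compact standalone sketch of the same checks, assuming plain standard-library exceptions instead of `DB::Exception`:

```cpp
// Standalone sketch of the topK parameter checks shown above.
#include <cstdint>
#include <stdexcept>
#include <string>

static constexpr uint64_t TOP_K_MAX_SIZE = 0xFFFFFF;

void validateTopKParams(uint64_t threshold, uint64_t load_factor)
{
    if (load_factor < 1)
        throw std::invalid_argument("load_factor must be at least 1, got " + std::to_string(load_factor));
    if (threshold == 0)
        throw std::invalid_argument("threshold must be positive");
    /// The per-parameter bounds run first, so the product cannot overflow here.
    if (threshold > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE
        || threshold * load_factor > TOP_K_MAX_SIZE)
        throw std::invalid_argument("parameters must not exceed " + std::to_string(TOP_K_MAX_SIZE));
}
```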
View File
@ -132,8 +132,8 @@ private:
public:
    AggregateFunctionTopKGeneric(
        UInt64 threshold_, UInt64 load_factor, const DataTypePtr & input_data_type_, const Array & params)
        UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params)
        : IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>({input_data_type_}, params)
        : IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>(argument_types_, params)
        , threshold(threshold_), reserved(load_factor * threshold), input_data_type(this->argument_types[0]) {}

    String getName() const override { return is_weighted ? "topKWeighted" : "topK"; }
View File
@ -2,6 +2,7 @@
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnSparse.h>
#include <Core/Block.h>
#include <Core/ColumnNumbers.h>
#include <Core/Field.h>
@ -181,6 +182,13 @@
    Arena * arena,
    ssize_t if_argument_pos = -1) const = 0;
/// The version of "addBatch", that handle sparse columns as arguments.
virtual void addBatchSparse(
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena) const = 0;
virtual void mergeBatch(
    size_t batch_size,
    AggregateDataPtr * places,
@ -193,6 +201,10 @@
virtual void addBatchSinglePlace(
    size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const = 0;
/// The version of "addBatchSinglePlace", that handle sparse columns as arguments.
virtual void addBatchSparseSinglePlace(
AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0;
/** The same for single place when need to aggregate only filtered data.
  * Instead of using an if-column, the condition is combined inside the null_map
  */
@ -367,6 +379,22 @@
    }
}
void addBatchSparse(
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena) const override
{
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
size_t batch_size = column_sparse.size();
auto offset_it = column_sparse.begin();
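        /// Every row is added: the offset iterator resolves default rows to value
        /// index 0 (the shared default stored at position 0 of 'values') and
        /// non-default rows to the index of their explicitly stored value.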
for (size_t i = 0; i < batch_size; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(places[offset_it.getCurrentRow()] + place_offset,
&values, offset_it.getValueIndex(), arena);
}
void mergeBatch(
    size_t batch_size,
    AggregateDataPtr * places,
@ -398,6 +426,19 @@
    }
}
void addBatchSparseSinglePlace(
AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
{
/// TODO: add values and defaults separately if order of adding isn't important.
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
size_t batch_size = column_sparse.size();
auto offset_it = column_sparse.begin();
for (size_t i = 0; i < batch_size; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(place, &values, offset_it.getValueIndex(), arena);
}
void addBatchSinglePlaceNotNull(
    size_t batch_size,
    AggregateDataPtr place,
View File
@ -106,6 +106,10 @@ if (USE_AWS_S3)
    add_headers_and_sources(dbms Disks/S3)
endif()

if (USE_AZURE_BLOB_STORAGE)
    add_headers_and_sources(dbms Disks/AzureBlobStorage)
endif()

if (USE_HDFS)
    add_headers_and_sources(dbms Storages/HDFS)
    add_headers_and_sources(dbms Disks/HDFS)
@ -450,6 +454,11 @@
    target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${AWS_S3_INCLUDE_DIR})
endif()

if (USE_AZURE_BLOB_STORAGE)
    target_link_libraries (clickhouse_common_io PUBLIC ${AZURE_BLOB_STORAGE_LIBRARY})
    target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${AZURE_SDK_INCLUDES})
endif()

if (USE_S2_GEOMETRY)
    dbms_target_link_libraries (PUBLIC ${S2_GEOMETRY_LIBRARY})
    dbms_target_include_directories (SYSTEM BEFORE PUBLIC ${S2_GEOMETRY_INCLUDE_DIR})
View File
@ -4,6 +4,8 @@
#include <iomanip>
#include <string_view>
#include <filesystem>
#include <map>
#include <unordered_map>

#include <base/argsToConfig.h>
#include <base/DateLUT.h>
@ -52,6 +54,7 @@
#include <Processors/Executors/PullingAsyncPipelineExecutor.h>
#include <Processors/Transforms/AddingDefaultsTransform.h>
#include <Interpreters/ReplaceQueryParameterVisitor.h>
#include <Interpreters/ProfileEventsExt.h>
#include <IO/WriteBufferFromOStream.h>
#include <IO/CompressionMethod.h>
#include <Client/InternalTextLogs.h>
@ -105,6 +108,99 @@ namespace ProfileEvents
namespace DB
{
static void incrementProfileEventsBlock(Block & dst, const Block & src)
{
if (!dst)
{
dst = src;
return;
}
assertBlocksHaveEqualStructure(src, dst, "ProfileEvents");
std::unordered_map<String, size_t> name_pos;
for (size_t i = 0; i < dst.columns(); ++i)
name_pos[dst.getByPosition(i).name] = i;
size_t dst_rows = dst.rows();
MutableColumns mutable_columns = dst.mutateColumns();
auto & dst_column_host_name = typeid_cast<ColumnString &>(*mutable_columns[name_pos["host_name"]]);
auto & dst_array_current_time = typeid_cast<ColumnUInt32 &>(*mutable_columns[name_pos["current_time"]]).getData();
auto & dst_array_thread_id = typeid_cast<ColumnUInt64 &>(*mutable_columns[name_pos["thread_id"]]).getData();
auto & dst_array_type = typeid_cast<ColumnInt8 &>(*mutable_columns[name_pos["type"]]).getData();
auto & dst_column_name = typeid_cast<ColumnString &>(*mutable_columns[name_pos["name"]]);
auto & dst_array_value = typeid_cast<ColumnInt64 &>(*mutable_columns[name_pos["value"]]).getData();
const auto & src_column_host_name = typeid_cast<const ColumnString &>(*src.getByName("host_name").column);
const auto & src_array_current_time = typeid_cast<const ColumnUInt32 &>(*src.getByName("current_time").column).getData();
const auto & src_array_thread_id = typeid_cast<const ColumnUInt64 &>(*src.getByName("thread_id").column).getData();
const auto & src_column_name = typeid_cast<const ColumnString &>(*src.getByName("name").column);
const auto & src_array_value = typeid_cast<const ColumnInt64 &>(*src.getByName("value").column).getData();
struct Id
{
StringRef name;
StringRef host_name;
UInt64 thread_id;
bool operator<(const Id & rhs) const
{
return std::tie(name, host_name, thread_id)
< std::tie(rhs.name, rhs.host_name, rhs.thread_id);
}
};
std::map<Id, UInt64> rows_by_name;
for (size_t src_row = 0; src_row < src.rows(); ++src_row)
{
Id id{
src_column_name.getDataAt(src_row),
src_column_host_name.getDataAt(src_row),
src_array_thread_id[src_row],
};
rows_by_name[id] = src_row;
}
/// Merge src into dst.
for (size_t dst_row = 0; dst_row < dst_rows; ++dst_row)
{
Id id{
dst_column_name.getDataAt(dst_row),
dst_column_host_name.getDataAt(dst_row),
dst_array_thread_id[dst_row],
};
if (auto it = rows_by_name.find(id); it != rows_by_name.end())
{
size_t src_row = it->second;
dst_array_current_time[dst_row] = src_array_current_time[src_row];
switch (dst_array_type[dst_row])
{
case ProfileEvents::Type::INCREMENT:
dst_array_value[dst_row] += src_array_value[src_row];
break;
case ProfileEvents::Type::GAUGE:
dst_array_value[dst_row] = src_array_value[src_row];
break;
}
rows_by_name.erase(it);
}
}
/// Copy rows from src that dst does not contains.
for (const auto & [id, pos] : rows_by_name)
{
for (size_t col = 0; col < src.columns(); ++col)
{
mutable_columns[col]->insert((*src.getByPosition(col).column)[pos]);
}
}
dst.setColumns(std::move(mutable_columns));
}
std::atomic_flag exit_on_signal = ATOMIC_FLAG_INIT;
@ -753,7 +849,7 @@ void ClientBase::onProfileEvents(Block & block)
    }
    else
    {
        profile_events.last_block = block;
        incrementProfileEventsBlock(profile_events.last_block, block);
    }
}
profile_events.watch.restart();
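`incrementProfileEventsBlock` above merges a fresh ProfileEvents packet into the accumulated one, keyed by (name, host, thread): INCREMENT counters are added, GAUGE values are overwritten, and keys that appear only in the new packet are appended. The same merge logic on a simplified in-memory representation (hypothetical `Key`/`Event` types, no Block machinery):

```cpp
// Simplified model of the ProfileEvents merge: counters add up, gauges are
// replaced, unseen keys are appended.
#include <cstdint>
#include <map>
#include <string>
#include <tuple>

enum class Type { INCREMENT, GAUGE };

struct Key
{
    std::string name, host;
    uint64_t thread_id;
    bool operator<(const Key & rhs) const
    {
        return std::tie(name, host, thread_id) < std::tie(rhs.name, rhs.host, rhs.thread_id);
    }
};

struct Event { Type type; int64_t value; };

void merge(std::map<Key, Event> & dst, const std::map<Key, Event> & src)
{
    for (const auto & [key, event] : src)
    {
        auto [it, inserted] = dst.try_emplace(key, event);
        if (!inserted)
        {
            if (event.type == Type::INCREMENT)
                it->second.value += event.value;   /// counters accumulate
            else
                it->second.value = event.value;    /// gauges take the latest reading
        }
    }
}
```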
@ -1635,7 +1731,13 @@ void ClientBase::parseAndCheckOptions(OptionsDescription & options_description,
/// Check unrecognized options without positional options.
auto unrecognized_options = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::exclude_positional);
if (!unrecognized_options.empty())
{
auto hints = this->getHints(unrecognized_options[0]);
if (!hints.empty())
throw Exception(ErrorCodes::UNRECOGNIZED_ARGUMENTS, "Unrecognized option '{}'. Maybe you meant {}", unrecognized_options[0], toString(hints));
    throw Exception(ErrorCodes::UNRECOGNIZED_ARGUMENTS, "Unrecognized option '{}'", unrecognized_options[0]);
}

/// Check positional options (options after ' -- ', ex: clickhouse-client -- <options>).
unrecognized_options = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::include_positional);
@ -1713,6 +1815,25 @@
    ;

addOptions(options_description);
auto getter = [](const auto & op)
{
String op_long_name = op->long_name();
return "--" + String(op_long_name);
};
if (options_description.main_description)
{
const auto & main_options = options_description.main_description->options();
std::transform(main_options.begin(), main_options.end(), std::back_inserter(cmd_options), getter);
}
if (options_description.external_description)
{
const auto & external_options = options_description.external_description->options();
std::transform(external_options.begin(), external_options.end(), std::back_inserter(cmd_options), getter);
}
parseAndCheckOptions(options_description, options, common_arguments);
po::notify(options);
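The block above registers every `--long-option` name in `cmd_options` so that an unrecognized argument can later be matched against them for a suggestion. A small sketch of that registration step with `boost::program_options`, mirroring the `getter` lambda (names are illustrative):

```cpp
// Collecting "--<long_name>" strings from a boost::program_options
// description, as the getter lambda above does.
#include <boost/program_options.hpp>
#include <string>
#include <vector>

namespace po = boost::program_options;

std::vector<std::string> collectOptionNames(const po::options_description & desc)
{
    std::vector<std::string> names;
    for (const auto & opt : desc.options())
        names.push_back("--" + opt->long_name());
    return names;
}
```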
View File
@ -1,5 +1,6 @@
#pragma once

#include "Common/NamePrompter.h"
#include <Common/ProgressIndication.h>
#include <Common/InterruptListener.h>
#include <Common/ShellCommand.h>
@ -37,7 +38,7 @@ void interruptSignalHandler(int signum);
class InternalTextLogs;

class ClientBase : public Poco::Util::Application
class ClientBase : public Poco::Util::Application, public IHints<2, ClientBase>
{
public:
@ -48,6 +49,8 @@ public:
    void init(int argc, char ** argv);

    std::vector<String> getAllRegisteredNames() const override { return cmd_options; }

protected:
    void runInteractive();
    void runNonInteractive();
@ -145,6 +148,7 @@ protected:
    std::vector<String> queries_files; /// If not empty, queries will be read from these files
    std::vector<String> interleave_queries_files; /// If not empty, run queries from these files before processing every file from 'queries_files'.
    std::vector<String> cmd_options;

    bool stdin_is_a_tty = false; /// stdin is a terminal.
    bool stdout_is_a_tty = false; /// stdout is a terminal.
View File
@ -25,7 +25,12 @@ struct PocoSocketWrapper : public Poco::Net::SocketImpl
void IConnections::DrainCallback::operator()(int fd, Poco::Timespan, const std::string fd_description) const
{
    if (!PocoSocketWrapper(fd).poll(drain_timeout, Poco::Net::Socket::SELECT_READ))
        throw Exception(ErrorCodes::SOCKET_TIMEOUT, "Read timeout while draining from {}", fd_description);
    {
        throw Exception(ErrorCodes::SOCKET_TIMEOUT,
            "Read timeout ({} ms) while draining from {}",
            drain_timeout.totalMilliseconds(),
            fd_description);
    }
}
}
View File
@ -395,17 +395,17 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead
    read_list.push_back(*connection->socket);
}

auto timeout = is_draining ? drain_timeout : receive_timeout;
int n = Poco::Net::Socket::select(
    read_list,
    write_list,
    except_list,
    is_draining ? drain_timeout : receive_timeout);
    timeout);

/// We treat any error as timeout for simplicity.
/// And we also check if read_list is still empty just in case.
if (n <= 0 || read_list.empty())
{
    auto err_msg = fmt::format("Timeout exceeded while reading from {}", dumpAddressesUnlocked());
    for (ReplicaState & state : replica_states)
    {
        Connection * connection = state.connection;
@ -415,7 +415,10 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead
        invalidateReplica(state);
    }
}
throw Exception(err_msg, ErrorCodes::TIMEOUT_EXCEEDED);
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED,
    "Timeout ({} ms) exceeded while reading from {}",
    timeout.totalMilliseconds(),
    dumpAddressesUnlocked());
}
}
View File
@ -133,6 +133,11 @@ public:
void get(size_t n, Field & res) const override;

bool isDefaultAt(size_t) const override
{
    throw Exception("Method isDefaultAt is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}

StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
@ -208,6 +213,16 @@
    throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}

double getRatioOfDefaultRows(double) const override
{
    throw Exception("Method getRatioOfDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}

void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
    throw Exception("Method getIndicesOfNonDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}

void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;
View File
@ -182,6 +182,13 @@ StringRef ColumnArray::getDataAt(size_t n) const
}
bool ColumnArray::isDefaultAt(size_t n) const
{
const auto & offsets_data = getOffsets();
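    /// Row n occupies the slice [offsets[n - 1], offsets[n]) of the nested data;
    /// it is default exactly when that slice is empty, i.e. the array is empty.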
return offsets_data[n] == offsets_data[static_cast<ssize_t>(n) - 1];
}
void ColumnArray::insertData(const char * pos, size_t length)
{
    /** Similarly - only for arrays of fixed length values.
@ -576,7 +583,8 @@ void ColumnArray::expand(const IColumn::Filter & mask, bool inverted)
}

if (from != -1)
    throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);}
    throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
}

template <typename T>
ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const
@ -868,6 +876,16 @@ ColumnPtr ColumnArray::compress() const
    });
}
double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const
{
return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
}
void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
}
ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
{
View File
@ -60,6 +60,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
StringRef getDataAt(size_t n) const override;
bool isDefaultAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
@ -143,6 +144,10 @@
    return false;
}

double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;

bool isCollationSupported() const override { return getData().isCollationSupported(); }
View File
@ -82,6 +82,7 @@ public:
Field operator[](size_t) const override { throwMustBeDecompressed(); }
void get(size_t, Field &) const override { throwMustBeDecompressed(); }
StringRef getDataAt(size_t) const override { throwMustBeDecompressed(); }
bool isDefaultAt(size_t) const override { throwMustBeDecompressed(); }
void insert(const Field &) override { throwMustBeDecompressed(); }
void insertRangeFrom(const IColumn &, size_t, size_t) override { throwMustBeDecompressed(); }
void insertData(const char *, size_t) override { throwMustBeDecompressed(); }
@ -113,6 +114,8 @@
void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); }
void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); }
size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); }
double getRatioOfDefaultRows(double) const override { throwMustBeDecompressed(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); }

protected:
    size_t rows;
View File
@ -5,6 +5,7 @@
#include <Columns/IColumn.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/PODArray.h>

namespace DB
@ -115,6 +116,11 @@
    return data->getFloat32(0);
}
bool isDefaultAt(size_t) const override
{
return data->isDefaultAt(0);
}
bool isNullAt(size_t) const override
{
    return data->isNullAt(0);
@ -239,6 +245,27 @@
    return false;
}
double getRatioOfDefaultRows(double) const override
{
return data->isDefaultAt(0) ? 1.0 : 0.0;
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
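        /// A constant column repeats one value, so either every row is
        /// non-default or none is, depending only on that single value.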
if (!data->isDefaultAt(0))
{
size_t to = limit && from + limit < size() ? from + limit : size();
indices.reserve(indices.size() + to - from);
for (size_t i = from; i < to; ++i)
indices.push_back(i);
}
}
SerializationInfoPtr getSerializationInfo() const override
{
return data->getSerializationInfo();
}
bool isNullable() const override { return isColumnNullable(*data); }
bool onlyNull() const override { return data->isNullAt(0); }
bool isNumeric() const override { return data->isNumeric(); }
View File
@ -331,7 +331,8 @@ void ColumnDecimal<T>::gather(ColumnGathererStream & gatherer)
template <is_decimal T>
ColumnPtr ColumnDecimal<T>::compress() const
{
    size_t source_size = data.size() * sizeof(T);
    const size_t data_size = data.size();
    const size_t source_size = data_size * sizeof(T);

    /// Don't compress small blocks.
    if (source_size < 4096) /// A wild guess.
@ -342,8 +343,9 @@ ColumnPtr ColumnDecimal<T>::compress() const
    if (!compressed)
        return ColumnCompressed::wrap(this->getPtr());

    return ColumnCompressed::create(data.size(), compressed->size(),
        [compressed = std::move(compressed), column_size = data.size(), scale = this->scale]
    const size_t compressed_size = compressed->size();
    return ColumnCompressed::create(data_size, compressed_size,
        [compressed = std::move(compressed), column_size = data_size, scale = this->scale]
    {
        auto res = ColumnDecimal<T>::create(column_size, scale);
        ColumnCompressed::decompressBuffer(
View File
@ -177,8 +177,17 @@ public:
    return false;
}

ColumnPtr compress() const override;

double getRatioOfDefaultRows(double sample_ratio) const override
{
    return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
}

void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
{
    return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
}

ColumnPtr compress() const override;

void insertValue(const T value) { data.push_back(value); }
Container & getData() { return data; }
View File
@ -51,6 +51,12 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const
    return new_col_holder;
}
bool ColumnFixedString::isDefaultAt(size_t index) const
{
assert(index < size());
return memoryIsZero(chars.data() + index * n, n);
}
void ColumnFixedString::insert(const Field & x)
{
    const String & s = DB::get<const String &>(x);
@ -409,9 +415,9 @@ ColumnPtr ColumnFixedString::compress() const
    if (!compressed)
        return ColumnCompressed::wrap(this->getPtr());

    size_t column_size = size();
    const size_t column_size = size();
    const size_t compressed_size = compressed->size();
    return ColumnCompressed::create(column_size, compressed->size(),
    return ColumnCompressed::create(column_size, compressed_size,
        [compressed = std::move(compressed), column_size, n = n]
    {
        size_t chars_size = n * column_size;
View File
@ -88,6 +88,8 @@ public:
    return StringRef(&chars[n * index], n);
}

bool isDefaultAt(size_t index) const override;

void insert(const Field & x) override;
void insertFrom(const IColumn & src_, size_t index) override;
@ -182,6 +184,16 @@
    return false;
}

double getRatioOfDefaultRows(double sample_ratio) const override
{
    return getRatioOfDefaultRowsImpl<ColumnFixedString>(sample_ratio);
}

void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
    return getIndicesOfNonDefaultRowsImpl<ColumnFixedString>(indices, from, limit);
}

bool canBeInsideNullable() const override { return true; }
bool isFixedAndContiguous() const override { return true; }
View File
@ -24,7 +24,12 @@ class ColumnFunction final : public COWHelper<IColumn, ColumnFunction>
private:
    friend class COWHelper<IColumn, ColumnFunction>;

    ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_ = false, bool is_function_compiled_ = false);
    ColumnFunction(
        size_t size,
        FunctionBasePtr function_,
        const ColumnsWithTypeAndName & columns_to_capture,
        bool is_short_circuit_argument_ = false,
        bool is_function_compiled_ = false);

public:
    const char * getFamilyName() const override { return "Function"; }
@ -68,6 +73,11 @@
    throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}

bool isDefaultAt(size_t) const override
{
    throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}

void insert(const Field &) override
{
    throw Exception("Cannot insert into " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -153,6 +163,16 @@
    throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}

double getRatioOfDefaultRows(double) const override
{
    throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}

void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
    throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}

bool isShortCircuitArgument() const { return is_short_circuit_argument; }
DataTypePtr getResultType() const;
View File
@ -64,6 +64,7 @@ public:
    return getDictionary().getDataAtWithTerminatingZero(getIndexes().getUInt(n));
}

bool isDefaultAt(size_t n) const override { return getDictionary().isDefaultAt(getIndexes().getUInt(n)); }
UInt64 get64(size_t n) const override { return getDictionary().get64(getIndexes().getUInt(n)); }
UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); }
Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); }
@ -180,6 +181,16 @@
    return false;
}

double getRatioOfDefaultRows(double sample_ratio) const override
{
    return getIndexes().getRatioOfDefaultRows(sample_ratio);
}

void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
    return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit);
}

bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); }
bool isFixedAndContiguous() const override { return false; }
size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); }
View File
@ -81,6 +81,11 @@ void ColumnMap::get(size_t n, Field & res) const
    getNestedData().get(offset + i, map[i]);
}

bool ColumnMap::isDefaultAt(size_t n) const
{
    return nested->isDefaultAt(n);
}

StringRef ColumnMap::getDataAt(size_t) const
{
    throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -273,6 +278,16 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const
    return false;
}

double ColumnMap::getRatioOfDefaultRows(double sample_ratio) const
{
    return getRatioOfDefaultRowsImpl<ColumnMap>(sample_ratio);
}

void ColumnMap::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
    return getIndicesOfNonDefaultRowsImpl<ColumnMap>(indices, from, limit);
}

ColumnPtr ColumnMap::compress() const
{
    auto compressed = nested->compress();
View File
@ -51,6 +51,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
@ -85,6 +86,8 @@
void protect() override;
void forEachSubcolumn(ColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;

const ColumnArray & getNestedColumn() const { return assert_cast<const ColumnArray &>(*nested); }
ColumnArray & getNestedColumn() { return assert_cast<ColumnArray &>(*nested); }
View File
@ -648,6 +648,29 @@ void ColumnNullable::checkConsistency() const
        ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT);
}
ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
ColumnPtr new_values;
ColumnPtr new_null_map;
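    /// A NULL default means the filler rows must be flagged in the null map;
    /// a non-NULL default keeps the null map clear and stores the value itself.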
if (default_field.getType() == Field::Types::Null)
{
auto default_column = nested_column->cloneEmpty();
default_column->insertDefault();
/// Value in main column, when null map is 1 is implementation defined. So, take any value.
new_values = nested_column->createWithOffsets(offsets, (*default_column)[0], total_rows, shift);
new_null_map = null_map->createWithOffsets(offsets, Field(1u), total_rows, shift);
}
else
{
new_values = nested_column->createWithOffsets(offsets, default_field, total_rows, shift);
new_null_map = null_map->createWithOffsets(offsets, Field(0u), total_rows, shift);
}
return ColumnNullable::create(new_values, new_null_map);
}
ColumnPtr makeNullable(const ColumnPtr & column)
{
    if (isColumnNullable(*column))
View File
@ -54,6 +54,7 @@ public:
void get(size_t n, Field & res) const override;
bool getBool(size_t n) const override { return isNullAt(n) ? false : nested_column->getBool(n); }
UInt64 get64(size_t n) const override { return nested_column->get64(n); }
bool isDefaultAt(size_t n) const override { return isNullAt(n); }

/**
 * If isNullAt(n) returns false, returns the nested column's getDataAt(n), otherwise returns a special value
@ -137,6 +138,18 @@
    return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return null_map->getRatioOfDefaultRows(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
null_map->getIndicesOfNonDefaultRows(indices, from, limit);
}
ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
bool isNullable() const override { return true; }
bool isFixedAndContiguous() const override { return false; }
bool valuesHaveFixedSize() const override { return nested_column->valuesHaveFixedSize(); }
View File
@ -0,0 +1,779 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/ColumnTuple.h>
#include <Common/WeakHash.h>
#include <Common/SipHash.h>
#include <Common/HashTable/Hash.h>
#include <Processors/Transforms/ColumnGathererTransform.h>
#include <algorithm>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
ColumnSparse::ColumnSparse(MutableColumnPtr && values_)
: values(std::move(values_)), _size(0)
{
if (!values->empty())
throw Exception("Not empty values passed to ColumnSparse, but no offsets passed", ErrorCodes::LOGICAL_ERROR);
values->insertDefault();
offsets = ColumnUInt64::create();
}
ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_)
: values(std::move(values_)), offsets(std::move(offsets_)), _size(size_)
{
const ColumnUInt64 * offsets_concrete = typeid_cast<const ColumnUInt64 *>(offsets.get());
if (!offsets_concrete)
throw Exception(ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName());
/// 'values' should contain one extra element: default value at 0 position.
if (offsets->size() + 1 != values->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size());
if (_size < offsets->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());
if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) should be greater than last position of non-default value ({})",
_size, offsets_concrete->getData().back());
#ifndef NDEBUG
const auto & offsets_data = getOffsetsData();
const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal<UInt64>());
if (it != offsets_data.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Offsets of ColumnSparse must be strictly sorted");
#endif
}
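/// Example of the representation: the dense column [0, 5, 0, 0, 7] is stored as
///     values  = [default, 5, 7]  (one extra default value at position 0),
///     offsets = [1, 4]           (rows that hold non-default values),
///     _size   = 5.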
MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const
{
if (new_size == 0)
return ColumnSparse::create(values->cloneEmpty());
if (new_size >= _size)
return ColumnSparse::create(IColumn::mutate(values), IColumn::mutate(offsets), new_size);
auto res = ColumnSparse::create(values->cloneEmpty());
res->insertRangeFrom(*this, 0, new_size);
return res;
}
bool ColumnSparse::isDefaultAt(size_t n) const
{
return getValueIndex(n) == 0;
}
bool ColumnSparse::isNullAt(size_t n) const
{
return values->isNullAt(getValueIndex(n));
}
Field ColumnSparse::operator[](size_t n) const
{
return (*values)[getValueIndex(n)];
}
void ColumnSparse::get(size_t n, Field & res) const
{
values->get(getValueIndex(n), res);
}
bool ColumnSparse::getBool(size_t n) const
{
return values->getBool(getValueIndex(n));
}
Float64 ColumnSparse::getFloat64(size_t n) const
{
return values->getFloat64(getValueIndex(n));
}
Float32 ColumnSparse::getFloat32(size_t n) const
{
return values->getFloat32(getValueIndex(n));
}
UInt64 ColumnSparse::getUInt(size_t n) const
{
return values->getUInt(getValueIndex(n));
}
Int64 ColumnSparse::getInt(size_t n) const
{
return values->getInt(getValueIndex(n));
}
UInt64 ColumnSparse::get64(size_t n) const
{
return values->get64(getValueIndex(n));
}
StringRef ColumnSparse::getDataAt(size_t n) const
{
return values->getDataAt(getValueIndex(n));
}
ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
{
return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1);
}
void ColumnSparse::insertSingleValue(const Inserter & inserter)
{
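    /// Append via the callback, then keep the new value only if it is non-default;
    /// default rows are represented implicitly by not having an offset.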
inserter(*values);
size_t last_idx = values->size() - 1;
if (values->isDefaultAt(last_idx))
values->popBack(1);
else
getOffsetsData().push_back(_size);
++_size;
}
void ColumnSparse::insertData(const char * pos, size_t length)
{
insertSingleValue([&](IColumn & column) { column.insertData(pos, length); });
}
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
return values->serializeValueIntoArena(getValueIndex(n), arena, begin);
}
const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos)
{
const char * res = nullptr;
insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); });
return res;
}
const char * ColumnSparse::skipSerializedInArena(const char * pos) const
{
return values->skipSerializedInArena(pos);
}
void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
if (length == 0)
return;
if (start + length > src.size())
throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.",
ErrorCodes::LOGICAL_ERROR);
auto & offsets_data = getOffsetsData();
size_t end = start + length;
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
{
const auto & src_offsets = src_sparse->getOffsetsData();
const auto & src_values = src_sparse->getValuesColumn();
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
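        /// [offset_start, offset_end) bounds the source offsets that fall into
        /// the copied range [start, end).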
assert(offset_start <= offset_end);
if (offset_start != offset_end)
{
offsets_data.reserve(offsets_data.size() + offset_end - offset_start);
insertManyDefaults(src_offsets[offset_start] - start);
offsets_data.push_back(_size);
++_size;
for (size_t i = offset_start + 1; i < offset_end; ++i)
{
size_t current_diff = src_offsets[i] - src_offsets[i - 1];
insertManyDefaults(current_diff - 1);
offsets_data.push_back(_size);
++_size;
}
/// 'end' <= 'src_offsets[offsets_end]', but end is excluded, so index is 'offsets_end' - 1.
/// Since 'end' is excluded, need to subtract one more row from result.
insertManyDefaults(end - src_offsets[offset_end - 1] - 1);
values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start);
}
else
{
insertManyDefaults(length);
}
}
else
{
for (size_t i = start; i < end; ++i)
{
if (!src.isDefaultAt(i))
{
values->insertFrom(src, i);
offsets_data.push_back(_size);
}
++_size;
}
}
}
void ColumnSparse::insert(const Field & x)
{
insertSingleValue([&](IColumn & column) { column.insert(x); });
}
void ColumnSparse::insertFrom(const IColumn & src, size_t n)
{
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
{
if (size_t value_index = src_sparse->getValueIndex(n))
{
getOffsetsData().push_back(_size);
values->insertFrom(src_sparse->getValuesColumn(), value_index);
}
}
else
{
if (!src.isDefaultAt(n))
{
values->insertFrom(src, n);
getOffsetsData().push_back(_size);
}
}
++_size;
}
void ColumnSparse::insertDefault()
{
++_size;
}
void ColumnSparse::insertManyDefaults(size_t length)
{
_size += length;
}
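/// Removes the last n rows. Offsets pointing at removed rows are popped, and
/// the same number of values is removed from 'values'.
/// Example: _size = 6, offsets = [1, 4]; popBack(3) gives new_size = 3, the
/// offset 4 (>= 3) is dropped together with one value, and offsets become [1].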
void ColumnSparse::popBack(size_t n)
{
assert(n < _size);
auto & offsets_data = getOffsetsData();
size_t new_size = _size - n;
size_t removed_values = 0;
while (!offsets_data.empty() && offsets_data.back() >= new_size)
{
offsets_data.pop_back();
++removed_values;
}
if (removed_values)
values->popBack(removed_values);
_size = new_size;
}
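/// The filter for 'values' always keeps index 0 (the shared default value), so
/// 'values_filter' starts as [1]. A non-default row contributes 1 or 0 to it
/// depending on 'filt[i]'; surviving default rows only advance the result size.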
ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const
{
if (_size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
if (offsets->empty())
{
auto res = cloneEmpty();
res->insertManyDefaults(countBytesInFilter(filt));
return res;
}
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
Filter values_filter;
values_filter.reserve(values->size());
values_filter.push_back(1);
size_t values_result_size_hint = 1;
size_t res_offset = 0;
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
if (!offset_it.isDefault())
{
if (filt[i])
{
res_offsets_data.push_back(res_offset);
values_filter.push_back(1);
++res_offset;
++values_result_size_hint;
}
else
{
values_filter.push_back(0);
}
}
else
{
res_offset += filt[i] != 0;
}
}
auto res_values = values->filter(values_filter, values_result_size_hint);
return this->create(std::move(res_values), std::move(res_offsets), res_offset);
}
void ColumnSparse::expand(const Filter & mask, bool inverted)
{
    if (mask.size() < _size)
        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
    auto res_offsets = offsets->cloneEmpty();
    auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
    auto it = begin();
    for (size_t i = 0; i < mask.size(); ++i)
    {
        if (!!mask[i] ^ inverted)
        {
            if (it.getCurrentRow() == _size)
                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
            /// Non-default rows are visited in order, so their new positions can be appended.
            if (!it.isDefault())
                res_offsets_data.push_back(i);
            ++it;
        }
    }
    offsets = std::move(res_offsets);
    _size = mask.size();
}
ColumnPtr ColumnSparse::permute(const Permutation & perm, size_t limit) const
{
return permuteImpl(*this, perm, limit);
}
ColumnPtr ColumnSparse::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename Type>
ColumnPtr ColumnSparse::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
assert(limit <= indexes.size());
if (limit == 0)
return ColumnSparse::create(values->cloneEmpty());
if (offsets->empty())
{
auto res = cloneEmpty();
res->insertManyDefaults(limit);
return res;
}
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
auto res_values = values->cloneEmpty();
res_values->insertDefault();
/// If we need to permute the full column, or if the limit is large enough,
/// it's better to save the indexes of values in O(size)
/// and avoid a binary search for every index.
/// The factor of 3 is just a guess for the overhead of copying indexes.
bool execute_linear =
limit == _size || limit * std::bit_width(offsets->size()) > _size * 3;
if (execute_linear)
{
PaddedPODArray<UInt64> values_index(_size);
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
values_index[i] = offset_it.getValueIndex();
for (size_t i = 0; i < limit; ++i)
{
size_t index = values_index[indexes[i]];
if (index != 0)
{
res_values->insertFrom(*values, index);
res_offsets_data.push_back(i);
}
}
}
else
{
for (size_t i = 0; i < limit; ++i)
{
size_t index = getValueIndex(indexes[i]);
if (index != 0)
{
res_values->insertFrom(*values, index);
res_offsets_data.push_back(i);
}
}
}
return ColumnSparse::create(std::move(res_values), std::move(res_offsets), limit);
}
int ColumnSparse::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
{
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs_))
return values->compareAt(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint);
return values->compareAt(getValueIndex(n), m, rhs_, null_direction_hint);
}
void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const
{
if (row_indexes)
{
/// TODO: implement without conversion to full column.
auto this_full = convertToFullColumnIfSparse();
auto rhs_full = rhs.convertToFullColumnIfSparse();
this_full->compareColumn(*rhs_full, rhs_row_num, row_indexes, compare_results, direction, nan_direction_hint);
}
else
{
const auto & rhs_sparse = assert_cast<const ColumnSparse &>(rhs);
PaddedPODArray<Int8> nested_result;
values->compareColumn(rhs_sparse.getValuesColumn(), rhs_sparse.getValueIndex(rhs_row_num),
nullptr, nested_result, direction, nan_direction_hint);
const auto & offsets_data = getOffsetsData();
compare_results.resize_fill(_size, nested_result[0]);
for (size_t i = 0; i < offsets_data.size(); ++i)
compare_results[offsets_data[i]] = nested_result[i + 1];
}
}
int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const
{
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
return values->compareAtWithCollation(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint, collator);
return values->compareAtWithCollation(getValueIndex(n), m, rhs, null_direction_hint, collator);
}
bool ColumnSparse::hasEqualValues() const
{
size_t num_defaults = getNumberOfDefaults();
if (num_defaults == _size)
return true;
/// There is at least 1 default and 1 non-default value.
if (num_defaults != 0)
return false;
/// Check that all non-default values are equal.
/// It's suboptimal, but it's a rare case.
for (size_t i = 2; i < values->size(); ++i)
if (values->compareAt(1, i, *values, 1) != 0)
return false;
return true;
}
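/// Sort the values column first; 'limit + 1' accounts for the artificial default
/// stored at index 0. Then map sorted value indexes back to rows: a permutation
/// entry equal to 0 stands for the default value, whose rows are taken from the
/// iterator; any other entry k corresponds to row offsets_data[k - 1].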
void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
{
if (_size == 0)
return;
res.resize(_size);
if (offsets->empty())
{
for (size_t i = 0; i < _size; ++i)
res[i] = i;
return;
}
if (limit == 0 || limit > _size)
limit = _size;
Permutation perm;
/// First, we sort all values.
/// limit + 1 covers the case when there are 0 default values.
if (collator)
values->getPermutationWithCollation(*collator, reverse, limit + 1, null_direction_hint, perm);
else
values->getPermutation(reverse, limit + 1, null_direction_hint, perm);
size_t num_of_defaults = getNumberOfDefaults();
size_t row = 0;
const auto & offsets_data = getOffsetsData();
/// Fill the permutation.
for (size_t i = 0; i < perm.size() && row < limit; ++i)
{
if (perm[i] == 0)
{
if (!num_of_defaults)
continue;
/// Fill the required number of positions with default values.
auto offset_it = begin();
while (row < limit)
{
while (offset_it.getCurrentRow() < _size && !offset_it.isDefault())
++offset_it;
if (offset_it.getCurrentRow() == _size)
break;
res[row++] = offset_it.getCurrentRow();
++offset_it;
}
}
else
{
res[row++] = offsets_data[perm[i] - 1];
}
}
assert(row == limit);
}
void ColumnSparse::getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
return getPermutationImpl(reverse, limit, null_direction_hint, res, nullptr);
}
void ColumnSparse::updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
auto this_full = convertToFullColumnIfSparse();
this_full->updatePermutation(reverse, limit, null_direction_hint, res, equal_range);
}
void ColumnSparse::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
return getPermutationImpl(reverse, limit, null_direction_hint, res, &collator);
}
void ColumnSparse::updatePermutationWithCollation(
const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const
{
auto this_full = convertToFullColumnIfSparse();
this_full->updatePermutationWithCollation(collator, reverse, limit, null_direction_hint, res, equal_range);
}
size_t ColumnSparse::byteSize() const
{
return values->byteSize() + offsets->byteSize() + sizeof(_size);
}
size_t ColumnSparse::byteSizeAt(size_t n) const
{
size_t index = getValueIndex(n);
size_t res = values->byteSizeAt(index);
if (index)
res += sizeof(UInt64);
return res;
}
size_t ColumnSparse::allocatedBytes() const
{
return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size);
}
void ColumnSparse::protect()
{
values->protect();
offsets->protect();
}
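/// Each non-default row i is repeated 'replicate_offsets[i] - replicate_offsets[i - 1]' times.
/// Note: reading 'replicate_offsets[i - 1]' at i == 0 appears to rely on the left padding
/// of PaddedPODArray (reading the -1-th element of offsets yields 0), the same idiom
/// that other offset-based columns use.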
ColumnPtr ColumnSparse::replicate(const Offsets & replicate_offsets) const
{
/// TODO: implement specializations.
if (_size != replicate_offsets.size())
throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
if (_size == 0)
return ColumnSparse::create(values->cloneEmpty());
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
auto res_values = values->cloneEmpty();
res_values->insertDefault();
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
if (!offset_it.isDefault())
{
size_t replicate_size = replicate_offsets[i] - replicate_offsets[i - 1];
res_offsets_data.reserve(res_offsets_data.size() + replicate_size);
for (size_t row = replicate_offsets[i - 1]; row < replicate_offsets[i]; ++row)
{
res_offsets_data.push_back(row);
res_values->insertFrom(*values, offset_it.getValueIndex());
}
}
}
return ColumnSparse::create(std::move(res_values), std::move(res_offsets), replicate_offsets.back());
}
void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const
{
values->updateHashWithValue(getValueIndex(n), hash);
}
void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const
{
if (hash.getData().size() != _size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", _size, hash.getData().size());
auto offset_it = begin();
auto & hash_data = hash.getData();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
size_t value_index = offset_it.getValueIndex();
auto data_ref = values->getDataAt(value_index);
hash_data[i] = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(data_ref.data), data_ref.size, hash_data[i]);
}
}
void ColumnSparse::updateHashFast(SipHash & hash) const
{
values->updateHashFast(hash);
offsets->updateHashFast(hash);
hash.update(_size);
}
void ColumnSparse::getExtremes(Field & min, Field & max) const
{
if (_size == 0)
{
values->get(0, min);
values->get(0, max);
return;
}
if (getNumberOfDefaults() == 0)
{
size_t min_idx = 1;
size_t max_idx = 1;
for (size_t i = 2; i < values->size(); ++i)
{
if (values->compareAt(i, min_idx, *values, 1) < 0)
min_idx = i;
else if (values->compareAt(i, max_idx, *values, 1) > 0)
max_idx = i;
}
values->get(min_idx, min);
values->get(max_idx, max);
return;
}
values->getExtremes(min, max);
}
void ColumnSparse::getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const
{
const auto & offsets_data = getOffsetsData();
const auto * start = from ? std::lower_bound(offsets_data.begin(), offsets_data.end(), from) : offsets_data.begin();
const auto * end = limit ? std::lower_bound(offsets_data.begin(), offsets_data.end(), from + limit) : offsets_data.end();
indices.insert(start, end);
}
double ColumnSparse::getRatioOfDefaultRows(double) const
{
return static_cast<double>(getNumberOfDefaults()) / _size;
}
MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const
{
return scatterImpl<ColumnSparse>(num_columns, selector);
}
void ColumnSparse::gather(ColumnGathererStream & gatherer_stream)
{
gatherer_stream.gather(*this);
}
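/// 'values' and 'offsets' are compressed independently; the returned column holds
/// only the lambda below, which decompresses both parts and rebuilds the sparse
/// column on demand.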
ColumnPtr ColumnSparse::compress() const
{
auto values_compressed = values->compress();
auto offsets_compressed = offsets->compress();
size_t byte_size = values_compressed->byteSize() + offsets_compressed->byteSize();
return ColumnCompressed::create(size(), byte_size,
[values_compressed = std::move(values_compressed), offsets_compressed = std::move(offsets_compressed), size = size()]
{
return ColumnSparse::create(values_compressed->decompress(), offsets_compressed->decompress(), size);
});
}
bool ColumnSparse::structureEquals(const IColumn & rhs) const
{
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
return values->structureEquals(*rhs_sparse->values);
return false;
}
void ColumnSparse::forEachSubcolumn(ColumnCallback callback)
{
callback(values);
callback(offsets);
}
const IColumn::Offsets & ColumnSparse::getOffsetsData() const
{
return assert_cast<const ColumnUInt64 &>(*offsets).getData();
}
IColumn::Offsets & ColumnSparse::getOffsetsData()
{
return assert_cast<ColumnUInt64 &>(*offsets).getData();
}
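/// Example: offsets = [1, 3]. getValueIndex(3) finds offsets[1] == 3 and
/// returns 1 + 1 = 2 (values[2] holds the value of row 3), while
/// getValueIndex(2) finds offsets[1] == 3 != 2 and returns 0 (the shared default).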
size_t ColumnSparse::getValueIndex(size_t n) const
{
assert(n < _size);
const auto & offsets_data = getOffsetsData();
const auto * it = std::lower_bound(offsets_data.begin(), offsets_data.end(), n);
if (it == offsets_data.end() || *it != n)
return 0;
return it - offsets_data.begin() + 1;
}
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
{
if (!column)
return column;
if (const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get()))
{
auto columns = column_tuple->getColumns();
for (auto & element : columns)
element = recursiveRemoveSparse(element);
return ColumnTuple::create(columns);
}
return column->convertToFullColumnIfSparse();
}
}
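For readers who want to experiment with the encoding outside the ClickHouse tree, here is a minimal, self-contained sketch of the values/offsets layout described above. It is an editorial illustration, not part of this commit: ToySparse and its members are made-up names, and plain std::vector stands in for the IColumn machinery.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

/// Toy stand-in for the sparse layout: values[0] is the shared default,
/// offsets[i] is the row of values[i + 1] in the full column.
struct ToySparse
{
    std::vector<int64_t> values{0};
    std::vector<uint64_t> offsets;
    size_t size = 0;

    /// Mirrors ColumnSparse::getValueIndex: binary search over sorted offsets.
    size_t valueIndex(size_t n) const
    {
        auto it = std::lower_bound(offsets.begin(), offsets.end(), n);
        if (it == offsets.end() || *it != n)
            return 0;
        return static_cast<size_t>(it - offsets.begin()) + 1;
    }

    int64_t at(size_t n) const { return values[valueIndex(n)]; }
};

int main()
{
    ToySparse col;
    col.values = {0, 42, 7};   /// default, then the two non-default values
    col.offsets = {1, 3};      /// rows of 42 and 7
    col.size = 5;              /// full column: [0, 42, 0, 7, 0]

    assert(col.at(0) == 0 && col.at(1) == 42 && col.at(2) == 0);
    assert(col.at(3) == 7 && col.at(4) == 0);
    return 0;
}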

src/Columns/ColumnSparse.h Normal file
View File

@ -0,0 +1,231 @@
#pragma once
#include <Columns/IColumn.h>
#include <Columns/IColumnImpl.h>
#include <Columns/ColumnsNumber.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
class Collator;
namespace DB
{
/** Column for sparse representation.
 *  It stores a column with non-default values and a column
 *  with their sorted positions in the original column. The column
 *  with values also contains one default value at position 0 to make
 *  execution of functions and sorting more convenient.
 */
class ColumnSparse final : public COWHelper<IColumn, ColumnSparse>
{
private:
friend class COWHelper<IColumn, ColumnSparse>;
explicit ColumnSparse(MutableColumnPtr && values_);
ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_);
ColumnSparse(const ColumnSparse &) = default;
public:
static constexpr auto DEFAULT_ROWS_SEARCH_SAMPLE_RATIO = 0.1;
static constexpr auto DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION = 0.95;
using Base = COWHelper<IColumn, ColumnSparse>;
static Ptr create(const ColumnPtr & values_, const ColumnPtr & offsets_, size_t size_)
{
return Base::create(values_->assumeMutable(), offsets_->assumeMutable(), size_);
}
template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
static MutablePtr create(TColumnPtr && values_, TColumnPtr && offsets_, size_t size_)
{
return Base::create(std::move(values_), std::move(offsets_), size_);
}
static Ptr create(const ColumnPtr & values_)
{
return Base::create(values_->assumeMutable());
}
template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
static MutablePtr create(TColumnPtr && values_)
{
return Base::create(std::forward<TColumnPtr>(values_));
}
bool isSparse() const override { return true; }
const char * getFamilyName() const override { return "Sparse"; }
std::string getName() const override { return "Sparse(" + values->getName() + ")"; }
TypeIndex getDataType() const override { return values->getDataType(); }
MutableColumnPtr cloneResized(size_t new_size) const override;
size_t size() const override { return _size; }
bool isDefaultAt(size_t n) const override;
bool isNullAt(size_t n) const override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
bool getBool(size_t n) const override;
Float64 getFloat64(size_t n) const override;
Float32 getFloat32(size_t n) const override;
UInt64 getUInt(size_t n) const override;
Int64 getInt(size_t n) const override;
UInt64 get64(size_t n) const override;
StringRef getDataAt(size_t n) const override;
ColumnPtr convertToFullColumnIfSparse() const override;
/// Will insert null value if pos=nullptr
void insertData(const char * pos, size_t length) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char *) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insert(const Field & x) override;
void insertFrom(const IColumn & src, size_t n) override;
void insertDefault() override;
void insertManyDefaults(size_t length) override;
void popBack(size_t n) override;
ColumnPtr filter(const Filter & filt, ssize_t) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type>
ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override;
void compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const override;
bool hasEqualValues() const override;
void getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const;
void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutationWithCollation(
const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const override;
size_t byteSize() const override;
size_t byteSizeAt(size_t n) const override;
size_t allocatedBytes() const override;
void protect() override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void updateHashFast(SipHash & hash) const override;
void getExtremes(Field & min, Field & max) const override;
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
void gather(ColumnGathererStream & gatherer_stream) override;
ColumnPtr compress() const override;
void forEachSubcolumn(ColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
bool isNullable() const override { return values->isNullable(); }
bool isFixedAndContiguous() const override { return false; }
bool valuesHaveFixedSize() const override { return values->valuesHaveFixedSize(); }
size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + sizeof(UInt64); }
bool isCollationSupported() const override { return values->isCollationSupported(); }
size_t getNumberOfDefaults() const { return _size - offsets->size(); }
size_t getNumberOfTrailingDefaults() const
{
return offsets->empty() ? _size : _size - getOffsetsData().back() - 1;
}
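/// Example: _size = 6, offsets = [1, 3] -> 6 - 3 - 1 = 2 trailing default rows (rows 4 and 5).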
/// Return position of element in the 'values' column
/// that corresponds to the n-th element of the full column.
/// O(log(offsets.size())) complexity.
size_t getValueIndex(size_t n) const;
const IColumn & getValuesColumn() const { return *values; }
IColumn & getValuesColumn() { return *values; }
const ColumnPtr & getValuesPtr() const { return values; }
ColumnPtr & getValuesPtr() { return values; }
const IColumn::Offsets & getOffsetsData() const;
IColumn::Offsets & getOffsetsData();
const ColumnPtr & getOffsetsPtr() const { return offsets; }
ColumnPtr & getOffsetsPtr() { return offsets; }
const IColumn & getOffsetsColumn() const { return *offsets; }
IColumn & getOffsetsColumn() { return *offsets; }
/// This class helps to iterate over all values in ColumnSparse.
class Iterator
{
public:
Iterator(const PaddedPODArray<UInt64> & offsets_, size_t size_, size_t current_offset_, size_t current_row_)
: offsets(offsets_), size(size_), current_offset(current_offset_), current_row(current_row_)
{
}
bool ALWAYS_INLINE isDefault() const { return current_offset == offsets.size() || current_row != offsets[current_offset]; }
size_t ALWAYS_INLINE getValueIndex() const { return isDefault() ? 0 : current_offset + 1; }
size_t ALWAYS_INLINE getCurrentRow() const { return current_row; }
size_t ALWAYS_INLINE getCurrentOffset() const { return current_offset; }
bool operator==(const Iterator & other) const
{
return size == other.size
&& current_offset == other.current_offset
&& current_row == other.current_row;
}
bool operator!=(const Iterator & other) const { return !(*this == other); }
Iterator operator++()
{
if (!isDefault())
++current_offset;
++current_row;
return *this;
}
private:
const PaddedPODArray<UInt64> & offsets;
const size_t size;
size_t current_offset;
size_t current_row;
};
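/// Example: offsets = [1, 3], _size = 5. Advancing from begin() visits rows 0..4;
/// isDefault() is false only at rows 1 and 3, where getValueIndex() returns 1 and 2.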
Iterator begin() const { return Iterator(getOffsetsData(), _size, 0, 0); }
Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }
private:
using Inserter = std::function<void(IColumn &)>;
/// Inserts a value into the 'values' column via callback.
/// Properly handles the case when the inserted value is default.
/// Used when it's unknown in advance whether the inserted value is default.
void insertSingleValue(const Inserter & inserter);
/// Contains the default value at position 0.
/// It's convenient, because it allows executing, e.g., functions or sorting
/// for this column without handling special cases.
WrappedPtr values;
/// Sorted offsets of non-default values in the full column.
/// 'offsets[i]' corresponds to 'values[i + 1]'.
WrappedPtr offsets;
size_t _size;
};
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column);
}

src/Columns/ColumnString.cpp
View File

@ -474,8 +474,9 @@ void ColumnString::getExtremes(Field & min, Field & max) const
 ColumnPtr ColumnString::compress() const
 {
-    size_t source_chars_size = chars.size();
-    size_t source_offsets_size = offsets.size() * sizeof(Offset);
+    const size_t source_chars_size = chars.size();
+    const size_t source_offsets_elements = offsets.size();
+    const size_t source_offsets_size = source_offsets_elements * sizeof(Offset);

     /// Don't compress small blocks.
     if (source_chars_size < 4096) /// A wild guess.
@ -489,12 +490,14 @@ ColumnPtr ColumnString::compress() const
     auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true);

-    return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(),
+    const size_t chars_compressed_size = chars_compressed->size();
+    const size_t offsets_compressed_size = offsets_compressed->size();
+    return ColumnCompressed::create(source_offsets_elements, chars_compressed_size + offsets_compressed_size,
         [
             chars_compressed = std::move(chars_compressed),
             offsets_compressed = std::move(offsets_compressed),
             source_chars_size,
-            source_offsets_elements = offsets.size()
+            source_offsets_elements
         ]
         {
             auto res = ColumnString::create();

src/Columns/ColumnString.h
View File

@ -107,6 +107,12 @@ public:
         return StringRef(&chars[offsetAt(n)], sizeAt(n));
     }

+    bool isDefaultAt(size_t n) const override
+    {
+        assert(n < size());
+        return sizeAt(n) == 1;
+    }
+
     /// Suppress gcc 7.3.1 warning: '*((void*)&<anonymous> +8)' may be used uninitialized in this function
 #if !defined(__clang__)
 #pragma GCC diagnostic push
@ -278,6 +284,16 @@ public:
         return typeid(rhs) == typeid(ColumnString);
     }

+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return getRatioOfDefaultRowsImpl<ColumnString>(sample_ratio);
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        return getIndicesOfNonDefaultRowsImpl<ColumnString>(indices, from, limit);
+    }
+
     Chars & getChars() { return chars; }
     const Chars & getChars() const { return chars; }

src/Columns/ColumnTuple.cpp
View File

@ -12,6 +12,7 @@
 #include <base/sort.h>
 #include <base/map.h>
 #include <base/range.h>
+#include <DataTypes/Serializations/SerializationInfoTuple.h>

 namespace DB
@ -113,6 +114,15 @@ void ColumnTuple::get(size_t n, Field & res) const
     res = tuple;
 }

+bool ColumnTuple::isDefaultAt(size_t n) const
+{
+    const size_t tuple_size = columns.size();
+    for (size_t i = 0; i < tuple_size; ++i)
+        if (!columns[i]->isDefaultAt(n))
+            return false;
+    return true;
+}
+
 StringRef ColumnTuple::getDataAt(size_t) const
 {
     throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -536,4 +546,25 @@ ColumnPtr ColumnTuple::compress() const
     });
 }

+double ColumnTuple::getRatioOfDefaultRows(double sample_ratio) const
+{
+    return getRatioOfDefaultRowsImpl<ColumnTuple>(sample_ratio);
+}
+
+void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
+{
+    return getIndicesOfNonDefaultRowsImpl<ColumnTuple>(indices, from, limit);
+}
+
+SerializationInfoPtr ColumnTuple::getSerializationInfo() const
+{
+    MutableSerializationInfos infos;
+    infos.reserve(columns.size());
+
+    for (const auto & column : columns)
+        infos.push_back(const_pointer_cast<SerializationInfo>(column->getSerializationInfo()));
+
+    return std::make_shared<SerializationInfoTuple>(std::move(infos), SerializationInfo::Settings{});
+}
+
 }

src/Columns/ColumnTuple.h
View File

@ -53,6 +53,7 @@ public:
     Field operator[](size_t n) const override;
     void get(size_t n, Field & res) const override;
+    bool isDefaultAt(size_t n) const override;
     StringRef getDataAt(size_t n) const override;
     void insertData(const char * pos, size_t length) override;
     void insert(const Field & x) override;
@ -93,6 +94,9 @@ public:
     bool structureEquals(const IColumn & rhs) const override;
     bool isCollationSupported() const override;
     ColumnPtr compress() const override;
+    double getRatioOfDefaultRows(double sample_ratio) const override;
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
+    SerializationInfoPtr getSerializationInfo() const override;

     size_t tupleSize() const { return columns.size(); }

Some files were not shown because too many files have changed in this diff.