Merge branch 'master' into MDB-15474

This commit is contained in:
ianton-ru 2021-12-21 17:38:36 +03:00 committed by GitHub
commit e6fd4bfb50
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
562 changed files with 12058 additions and 3597 deletions

View File

@ -8,6 +8,10 @@
name: Docker Container Scan (clickhouse-server)
env:
# Force the stdout and stderr streams to be unbuffered
PYTHONUNBUFFERED: 1
"on":
pull_request:
paths:

View File

@ -1,4 +1,9 @@
name: CherryPick
env:
# Force the stdout and stderr streams to be unbuffered
PYTHONUNBUFFERED: 1
concurrency:
group: cherry-pick
on: # yamllint disable-line rule:truthy
@ -8,18 +13,24 @@ jobs:
CherryPick:
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
# https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/cherry_pick
ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
RCSK
REPO_OWNER=ClickHouse
REPO_NAME=ClickHouse
REPO_TEAM=core
EOF
- name: Check out repository code
uses: actions/checkout@v2
with:
token: ${{secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN}}
fetch-depth: 0
- name: Cherry pick
env:
TEMP_PATH: ${{runner.temp}}/cherry_pick
ROBOT_CLICKHOUSE_SSH_KEY: ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
REPO_OWNER: "ClickHouse"
REPO_NAME: "ClickHouse"
REPO_TEAM: "core"
run: |
sudo pip install GitPython
cd $GITHUB_WORKSPACE/tests/ci

View File

@ -1,4 +1,9 @@
name: BackportPR
env:
# Force the stdout and stderr streams to be unbuffered
PYTHONUNBUFFERED: 1
on: # yamllint disable-line rule:truthy
push:
branches:
@ -7,6 +12,9 @@ jobs:
DockerHubPush:
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Images check
@ -22,17 +30,23 @@ jobs:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/compatibility_check
REPO_COPY=${{runner.temp}}/compatibility_check/ClickHouse
REPORTS_PATH=${{runner.temp}}/reports_dir
EOF
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
path: ${{ env.REPORTS_PATH }}
- name: CompatibilityCheck
env:
TEMP_PATH: ${{runner.temp}}/compatibility_check
REPO_COPY: ${{runner.temp}}/compatibility_check/ClickHouse
REPORTS_PATH: ${{runner.temp}}/reports_dir
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -51,24 +65,30 @@ jobs:
needs: [DockerHubPush]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/build_check
IMAGES_PATH=${{runner.temp}}/images_path
REPO_COPY=${{runner.temp}}/build_check/ClickHouse
CACHES_PATH=${{runner.temp}}/../ccaches
CHECK_NAME=ClickHouse build check (actions)
BUILD_NAME=package_release
EOF
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ runner.temp }}/images_path
path: ${{ env.IMAGES_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
with:
submodules: 'recursive'
submodules: 'true'
fetch-depth: 0 # otherwise we will have no info about contributors
- name: Build
env:
TEMP_PATH: ${{runner.temp}}/build_check
IMAGES_PATH: ${{runner.temp}}/images_path
REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
CACHES_PATH: ${{runner.temp}}/../ccaches
CHECK_NAME: 'ClickHouse build check (actions)'
BUILD_NAME: 'package_release'
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -78,35 +98,41 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: ${{ env.BUILD_NAME }}
path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
sudo rm -fr $TEMP_PATH $CACHES_PATH
BuilderDebAsan:
needs: [DockerHubPush]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/build_check
IMAGES_PATH=${{runner.temp}}/images_path
REPO_COPY=${{runner.temp}}/build_check/ClickHouse
CACHES_PATH=${{runner.temp}}/../ccaches
CHECK_NAME=ClickHouse build check (actions)
BUILD_NAME=package_asan
EOF
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ runner.temp }}/images_path
path: ${{ env.IMAGES_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
with:
submodules: 'recursive'
submodules: 'true'
fetch-depth: 0 # otherwise we will have no info about contributors
- name: Build
env:
TEMP_PATH: ${{runner.temp}}/build_check
IMAGES_PATH: ${{runner.temp}}/images_path
REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
CACHES_PATH: ${{runner.temp}}/../ccaches
CHECK_NAME: 'ClickHouse build check (actions)'
BUILD_NAME: 'package_asan'
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -116,35 +142,41 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: ${{ env.BUILD_NAME }}
path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
sudo rm -fr $TEMP_PATH $CACHES_PATH
BuilderDebTsan:
needs: [DockerHubPush]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/build_check
IMAGES_PATH=${{runner.temp}}/images_path
REPO_COPY=${{runner.temp}}/build_check/ClickHouse
CACHES_PATH=${{runner.temp}}/../ccaches
CHECK_NAME=ClickHouse build check (actions)
BUILD_NAME=package_tsan
EOF
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ runner.temp }}/images_path
path: ${{ env.IMAGES_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
with:
submodules: 'recursive'
submodules: 'true'
fetch-depth: 0 # otherwise we will have no info about contributors
- name: Build
env:
TEMP_PATH: ${{runner.temp}}/build_check
IMAGES_PATH: ${{runner.temp}}/images_path
REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
CACHES_PATH: ${{runner.temp}}/../ccaches
CHECK_NAME: 'ClickHouse build check (actions)'
BUILD_NAME: 'package_tsan'
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -154,35 +186,41 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: ${{ env.BUILD_NAME }}
path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
sudo rm -fr $TEMP_PATH $CACHES_PATH
BuilderDebDebug:
needs: [DockerHubPush]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/build_check
IMAGES_PATH=${{runner.temp}}/images_path
REPO_COPY=${{runner.temp}}/build_check/ClickHouse
CACHES_PATH=${{runner.temp}}/../ccaches
CHECK_NAME=ClickHouse build check (actions)
BUILD_NAME=package_debug
EOF
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ runner.temp }}/images_path
path: ${{ env.IMAGES_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
with:
submodules: 'recursive'
submodules: 'true'
fetch-depth: 0 # otherwise we will have no info about contributors
- name: Build
env:
TEMP_PATH: ${{runner.temp}}/build_check
IMAGES_PATH: ${{runner.temp}}/images_path
REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
CACHES_PATH: ${{runner.temp}}/../ccaches
CHECK_NAME: 'ClickHouse build check (actions)'
BUILD_NAME: 'package_debug'
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -192,13 +230,13 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: ${{ env.BUILD_NAME }}
path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
path: ${{ env.TEMP_PATH }}/${{ env.BUILD_NAME }}.json
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
sudo rm -fr $TEMP_PATH $CACHES_PATH
############################################################################################
##################################### BUILD REPORTER #######################################
############################################################################################
@ -210,17 +248,23 @@ jobs:
- BuilderDebDebug
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/report_check
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=ClickHouse build check (actions)
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Report Builder
env:
TEMP_PATH: ${{runner.temp}}/report_check
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'ClickHouse build check (actions)'
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -239,19 +283,25 @@ jobs:
needs: [BuilderDebAsan]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_debug
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (address, actions)
REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
KILL_TIMEOUT=10800
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
env:
TEMP_PATH: ${{runner.temp}}/stateless_debug
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'Stateless tests (address, actions)'
REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse
KILL_TIMEOUT: 10800
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -271,19 +321,25 @@ jobs:
needs: [BuilderDebDebug]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateful_debug
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateful tests (debug, actions)
REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse
KILL_TIMEOUT=3600
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
env:
TEMP_PATH: ${{runner.temp}}/stateful_debug
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'Stateful tests (debug, actions)'
REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse
KILL_TIMEOUT: 3600
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -301,20 +357,30 @@ jobs:
##############################################################################################
StressTestTsan:
needs: [BuilderDebTsan]
runs-on: [self-hosted, stress-tester]
# func testers have 16 cores + 128 GB memory
# while stress testers have 36 cores + 72 memory
# It would be better to have something like 32 + 128,
# but such servers almost unavailable as spot instances.
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stress_thread
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stress test (thread, actions)
REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Stress test
env:
TEMP_PATH: ${{runner.temp}}/stress_thread
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'Stress test (thread, actions)'
REPO_COPY: ${{runner.temp}}/stress_thread/ClickHouse
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -334,18 +400,24 @@ jobs:
needs: [BuilderDebRelease]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/integration_tests_release
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (release, actions)
REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Integration test
env:
TEMP_PATH: ${{runner.temp}}/integration_tests_release
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'Integration tests (release, actions)'
REPO_COPY: ${{runner.temp}}/integration_tests_release/ClickHouse
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
@ -369,6 +441,9 @@ jobs:
- CompatibilityCheck
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Finish label

View File

@ -1,4 +1,9 @@
name: Cancel
env:
# Force the stdout and stderr streams to be unbuffered
PYTHONUNBUFFERED: 1
on: # yamllint disable-line rule:truthy
workflow_run:
workflows: ["CIGithubActions", "ReleaseCI", "DocsCheck", "BackportPR"]

View File

@ -1,4 +1,9 @@
name: DocsCheck
env:
# Force the stdout and stderr streams to be unbuffered
PYTHONUNBUFFERED: 1
on: # yamllint disable-line rule:truthy
pull_request:
types:
@ -14,6 +19,9 @@ jobs:
CheckLabels:
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository
run: |
sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Labels check
@ -24,6 +32,9 @@ jobs:
needs: CheckLabels
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository
run: |
sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Images check
@ -39,17 +50,23 @@ jobs:
needs: DockerHubPush
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/docs_check
REPO_COPY=${{runner.temp}}/docs_check/ClickHouse
EOF
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ runner.temp }}/docs_check
path: ${{ env.TEMP_PATH }}
- name: Clear repository
run: |
sudo rm -rf $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Docs Check
env:
TEMP_PATH: ${{runner.temp}}/docs_check
REPO_COPY: ${{runner.temp}}/docs_check/ClickHouse
run: |
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,9 @@
name: DocsReleaseChecks
env:
# Force the stdout and stderr streams to be unbuffered
PYTHONUNBUFFERED: 1
concurrency:
group: master-release
cancel-in-progress: true
@ -11,10 +16,15 @@ on: # yamllint disable-line rule:truthy
- 'website/**'
- 'benchmark/**'
- 'docker/**'
- '.github/**'
workflow_dispatch:
jobs:
DockerHubPush:
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Images check
@ -30,20 +40,31 @@ jobs:
needs: DockerHubPush
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
# https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/docs_release
REPO_COPY=${{runner.temp}}/docs_release/ClickHouse
CLOUDFLARE_TOKEN=${{secrets.CLOUDFLARE}}
ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
RCSK
EOF
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Download changed images
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{runner.temp}}/docs_release
path: ${{ env.TEMP_PATH }}
- name: Docs Release
env:
TEMP_PATH: ${{runner.temp}}/docs_release
REPO_COPY: ${{runner.temp}}/docs_release/ClickHouse
CLOUDFLARE_TOKEN: ${{secrets.CLOUDFLARE}}
ROBOT_CLICKHOUSE_SSH_KEY: ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 docs_release.py

File diff suppressed because it is too large Load Diff

View File

@ -71,8 +71,8 @@
* Fix the issue that `LowCardinality` of `Int256` cannot be created. [#31832](https://github.com/ClickHouse/ClickHouse/pull/31832) ([alexey-milovidov](https://github.com/alexey-milovidov)).
* Recreate `system.*_log` tables in case of different engine/partition_by. [#31824](https://github.com/ClickHouse/ClickHouse/pull/31824) ([Azat Khuzhin](https://github.com/azat)).
* `MaterializedMySQL`: Fix issue with table named 'table'. [#31781](https://github.com/ClickHouse/ClickHouse/pull/31781) ([Håvard Kvålen](https://github.com/havardk)).
* ClickHouse dictionary source: support named collections. Closes [#31705](https://github.com/ClickHouse/ClickHouse/issues/31705). [#31749](https://github.com/ClickHouse/ClickHouse/pull/31749) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Allow to use named collections configuration for Kafka and RabbitMQ engines (the same way as for other integration table engines). [#31691](https://github.com/ClickHouse/ClickHouse/pull/31691) ([Kseniia Sumarokova](https://github.com/kssenii)).
* ClickHouse dictionary source: support predefined connections. Closes [#31705](https://github.com/ClickHouse/ClickHouse/issues/31705). [#31749](https://github.com/ClickHouse/ClickHouse/pull/31749) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Allow to use predefined connections configuration for Kafka and RabbitMQ engines (the same way as for other integration table engines). [#31691](https://github.com/ClickHouse/ClickHouse/pull/31691) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Always re-render prompt while navigating history in clickhouse-client. This will improve usability of manipulating very long queries that don't fit on screen. [#31675](https://github.com/ClickHouse/ClickHouse/pull/31675) ([alexey-milovidov](https://github.com/alexey-milovidov)) (author: Amos Bird).
* Add key bindings for navigating through history (instead of lines/history). [#31641](https://github.com/ClickHouse/ClickHouse/pull/31641) ([Azat Khuzhin](https://github.com/azat)).
* Improve the `max_execution_time` checks. Fixed some cases when timeout checks do not happen and query could run too long. [#31636](https://github.com/ClickHouse/ClickHouse/pull/31636) ([Raúl Marín](https://github.com/Algunenano)).

View File

@ -27,8 +27,7 @@ execute_process(COMMAND uname -m OUTPUT_VARIABLE ARCH)
if (OS MATCHES "Linux"
AND NOT DEFINED CMAKE_TOOLCHAIN_FILE
AND NOT DISABLE_HERMETIC_BUILD
AND ($ENV{CC} MATCHES ".*clang.*" OR CMAKE_C_COMPILER MATCHES ".*clang.*")
AND (USE_STATIC_LIBRARIES OR NOT DEFINED USE_STATIC_LIBRARIES))
AND ($ENV{CC} MATCHES ".*clang.*" OR CMAKE_C_COMPILER MATCHES ".*clang.*"))
if (ARCH MATCHES "amd64|x86_64")
set (CMAKE_TOOLCHAIN_FILE "cmake/linux/toolchain-x86_64.cmake" CACHE INTERNAL "" FORCE)

View File

@ -82,7 +82,9 @@ PoolWithFailover::PoolWithFailover(
unsigned default_connections_,
unsigned max_connections_,
size_t max_tries_,
uint64_t wait_timeout_)
uint64_t wait_timeout_,
size_t connect_timeout_,
size_t rw_timeout_)
: max_tries(max_tries_)
, shareable(false)
, wait_timeout(wait_timeout_)
@ -93,8 +95,8 @@ PoolWithFailover::PoolWithFailover(
replicas_by_priority[0].emplace_back(std::make_shared<Pool>(database,
host, user, password, port,
/* socket_ = */ "",
MYSQLXX_DEFAULT_TIMEOUT,
MYSQLXX_DEFAULT_RW_TIMEOUT,
connect_timeout_,
rw_timeout_,
default_connections_,
max_connections_));
}

View File

@ -6,6 +6,7 @@
#define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS 1
#define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS 16
#define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES 3
#define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT 5 /// in seconds
namespace mysqlxx
@ -121,7 +122,9 @@ namespace mysqlxx
unsigned default_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS,
unsigned max_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS,
size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES,
uint64_t wait_timeout_ = UINT64_MAX);
uint64_t wait_timeout_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT,
size_t connect_timeout = MYSQLXX_DEFAULT_TIMEOUT,
size_t rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT);
PoolWithFailover(const PoolWithFailover & other);

View File

@ -1,8 +1,10 @@
option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES})
option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY
"Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)"
ON)
if (USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY)
if (ENABLE_AZURE_BLOB_STORAGE)
set(USE_AZURE_BLOB_STORAGE 1)
set(AZURE_BLOB_STORAGE_LIBRARY azure_sdk)
endif()

View File

@ -14,9 +14,12 @@ set (TOOLCHAIN_PATH "${CMAKE_CURRENT_LIST_DIR}/../../contrib/sysroot/linux-x86_6
set (CMAKE_SYSROOT "${TOOLCHAIN_PATH}/x86_64-linux-gnu/libc")
set (CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_ASM_FLAGS_INIT "${CMAKE_ASM_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (HAS_PRE_1970_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE)
set (HAS_PRE_1970_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE)

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit d10351f312c1ae1ca3fdda433693dfbef3acfece
Subproject commit bb69d48e0ee35c87a0f19e509a09a914f71f0cff

View File

@ -46,14 +46,17 @@ include("${AZURE_DIR}/cmake-modules/AzureTransportAdapters.cmake")
add_library(azure_sdk ${AZURE_SDK_UNIFIED_SRC})
if (COMPILER_CLANG)
target_compile_options(azure_sdk PUBLIC
target_compile_options(azure_sdk PRIVATE
-Wno-deprecated-copy-dtor
-Wno-extra-semi
-Wno-suggest-destructor-override
-Wno-inconsistent-missing-destructor-override
-Wno-error=unknown-warning-option
-Wno-reserved-identifier
)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
target_compile_options(azure_sdk PRIVATE -Wno-reserved-identifier)
endif()
endif()
# Originally, on Windows azure-core is built with bcrypt and crypt32 by default
@ -68,4 +71,4 @@ endif()
target_link_libraries(azure_sdk PRIVATE ${LIBXML2_LIBRARIES})
target_include_directories(azure_sdk PUBLIC ${AZURE_SDK_INCLUDES})
target_include_directories(azure_sdk SYSTEM PUBLIC ${AZURE_SDK_INCLUDES})

View File

@ -268,7 +268,7 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
*
* Whether iconv support is available
*/
#if 1
#if 0
#define LIBXML_ICONV_ENABLED
#endif

2
contrib/sysroot vendored

@ -1 +1 @@
Subproject commit 410845187f582c5e6692b53dddbe43efbb728734
Subproject commit bbcac834526d90d1e764164b861be426891d1743

14
debian/rules vendored
View File

@ -45,6 +45,10 @@ ifdef DEB_CXX
ifeq ($(DEB_BUILD_GNU_TYPE),$(DEB_HOST_GNU_TYPE))
CC := $(DEB_CC)
CXX := $(DEB_CXX)
else ifeq (clang,$(findstring clang,$(DEB_CXX)))
# If we crosscompile with clang, it knows what to do
CC := $(DEB_CC)
CXX := $(DEB_CXX)
else
CC := $(DEB_HOST_GNU_TYPE)-$(DEB_CC)
CXX := $(DEB_HOST_GNU_TYPE)-$(DEB_CXX)
@ -77,10 +81,6 @@ else
THREADS_COUNT = 1
endif
ifneq ($(THREADS_COUNT),)
THREADS_COUNT:=-j$(THREADS_COUNT)
endif
%:
dh $@ $(DH_FLAGS) --buildsystem=cmake
@ -89,11 +89,11 @@ override_dh_auto_configure:
override_dh_auto_build:
# Fix for ninja. Do not add -O.
$(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) $(MAKE_TARGET)
$(MAKE) -j$(THREADS_COUNT) -C $(BUILDDIR) $(MAKE_TARGET)
override_dh_auto_test:
ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
cd $(BUILDDIR) && ctest $(THREADS_COUNT) -V
cd $(BUILDDIR) && ctest -j$(THREADS_COUNT) -V
endif
override_dh_clean:
@ -120,7 +120,7 @@ override_dh_install:
dh_install --list-missing --sourcedir=$(DESTDIR)
override_dh_auto_install:
env DESTDIR=$(DESTDIR) $(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) install
env DESTDIR=$(DESTDIR) $(MAKE) -j$(THREADS_COUNT) -C $(BUILDDIR) install
override_dh_shlibdeps:
true # We depend only on libc and dh_shlibdeps gives us wrong (too strict) dependency.

View File

@ -24,40 +24,34 @@ RUN apt-get update \
&& apt-key add /tmp/llvm-snapshot.gpg.key \
&& export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \
&& echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \
/etc/apt/sources.list
/etc/apt/sources.list \
&& apt-get clean
# initial packages
RUN apt-get update \
&& apt-get install \
bash \
fakeroot \
ccache \
curl \
software-properties-common \
--yes --no-install-recommends
RUN apt-get update \
&& apt-get install \
bash \
build-essential \
ccache \
clang-${LLVM_VERSION} \
clang-tidy-${LLVM_VERSION} \
cmake \
curl \
fakeroot \
gdb \
git \
gperf \
clang-${LLVM_VERSION} \
clang-tidy-${LLVM_VERSION} \
lld-${LLVM_VERSION} \
llvm-${LLVM_VERSION} \
llvm-${LLVM_VERSION}-dev \
libicu-dev \
moreutils \
ninja-build \
pigz \
rename \
software-properties-common \
tzdata \
--yes --no-install-recommends
--yes --no-install-recommends \
&& apt-get clean
# This symlink required by gcc to find lld compiler
RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
@ -66,7 +60,7 @@ ENV CC=clang-${LLVM_VERSION}
ENV CXX=clang++-${LLVM_VERSION}
# libtapi is required to support .tbh format from recent MacOS SDKs
RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \
RUN git clone --depth 1 https://github.com/tpoechtrager/apple-libtapi.git \
&& cd apple-libtapi \
&& INSTALLPREFIX=/cctools ./build.sh \
&& ./install.sh \
@ -74,7 +68,7 @@ RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \
&& rm -rf apple-libtapi
# Build and install tools for cross-linking to Darwin (x86-64)
RUN git clone https://github.com/tpoechtrager/cctools-port.git \
RUN git clone --depth 1 https://github.com/tpoechtrager/cctools-port.git \
&& cd cctools-port/cctools \
&& ./configure --prefix=/cctools --with-libtapi=/cctools \
--target=x86_64-apple-darwin \
@ -83,7 +77,7 @@ RUN git clone https://github.com/tpoechtrager/cctools-port.git \
&& rm -rf cctools-port
# Build and install tools for cross-linking to Darwin (aarch64)
RUN git clone https://github.com/tpoechtrager/cctools-port.git \
RUN git clone --depth 1 https://github.com/tpoechtrager/cctools-port.git \
&& cd cctools-port/cctools \
&& ./configure --prefix=/cctools --with-libtapi=/cctools \
--target=aarch64-apple-darwin \
@ -97,7 +91,8 @@ RUN wget -nv https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacO
# NOTE: Seems like gcc-11 is too new for ubuntu20 repository
RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
&& apt-get update \
&& apt-get install gcc-11 g++-11 --yes
&& apt-get install gcc-11 g++-11 --yes \
&& apt-get clean
COPY build.sh /

View File

@ -64,8 +64,14 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
&& apt-get install gcc-11 g++-11 --yes
# This symlink required by gcc to find lld compiler
RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
# These symlinks are required:
# /usr/bin/ld.lld: by gcc to find lld compiler
# /usr/bin/aarch64-linux-gnu-obj*: for debug symbols stripping
RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld \
&& ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-strip \
&& ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-objcopy \
&& ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objdump /usr/bin/aarch64-linux-gnu-objdump
COPY build.sh /

View File

@ -29,7 +29,13 @@ def pull_image(image_name):
return False
def build_image(image_name, filepath):
subprocess.check_call("docker build --network=host -t {} -f {} .".format(image_name, filepath), shell=True)
context = os.path.dirname(filepath)
subprocess.check_call(
"docker build --network=host -t {} -f {} {}".format(
image_name, filepath, context
),
shell=True,
)
def run_docker_image_with_env(image_name, output, env_variables, ch_root, ccache_dir, docker_image_version):
env_part = " -e ".join(env_variables)
@ -90,6 +96,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ
elif is_cross_arm:
cc = compiler[:-len(ARM_SUFFIX)]
cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-aarch64.cmake")
result.append("DEB_ARCH_FLAG=-aarm64")
elif is_cross_freebsd:
cc = compiler[:-len(FREEBSD_SUFFIX)]
cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/freebsd/toolchain-x86_64.cmake")
@ -98,6 +105,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ
cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake")
else:
cc = compiler
result.append("DEB_ARCH_FLAG=-aamd64")
cxx = cc.replace('gcc', 'g++').replace('clang', 'clang++')

View File

@ -173,6 +173,8 @@ function clone_submodules
contrib/dragonbox
contrib/fast_float
contrib/NuRaft
contrib/jemalloc
contrib/replxx
)
git submodule sync
@ -193,6 +195,8 @@ function run_cmake
"-DENABLE_THINLTO=0"
"-DUSE_UNWIND=1"
"-DENABLE_NURAFT=1"
"-DENABLE_JEMALLOC=1"
"-DENABLE_REPLXX=1"
)
# TODO remove this? we don't use ccache anyway. An option would be to download it

View File

@ -175,6 +175,15 @@ info signals
continue
backtrace full
info locals
info registers
disassemble /s
up
info locals
disassemble /s
up
info locals
disassemble /s
p \"done\"
detach
quit
" > script.gdb

View File

@ -72,11 +72,13 @@ RUN python3 -m pip install \
grpcio-tools \
kafka-python \
kazoo \
lz4 \
minio \
protobuf \
psycopg2-binary==2.8.6 \
pymongo==3.11.0 \
pytest \
pytest-order==1.0.0 \
pytest-timeout \
pytest-xdist \
pytest-repeat \

View File

@ -8,8 +8,8 @@ echo '{
"ip-forward": true,
"log-level": "debug",
"storage-driver": "overlay2",
"insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"]
"insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
}' | dd of=/etc/docker/daemon.json 2>/dev/null
dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &

View File

@ -11,6 +11,20 @@ if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then
COMMON_BUILD_PREFIX=""
fi
# Sometimes AWS responde with DNS error and it's impossible to retry it with
# current curl version options.
function curl_with_retry
{
for _ in 1 2 3 4; do
if curl --fail --head "$1";then
return 0
else
sleep 0.5
fi
done
return 1
}
# Use the packaged repository to find the revision we will compare to.
function find_reference_sha
{
@ -55,7 +69,7 @@ function find_reference_sha
)
for path in "${urls_to_try[@]}"
do
if curl --fail --head "$path"
if curl_with_retry "$path"
then
found="$path"
break
@ -76,7 +90,7 @@ chmod 777 workspace output
cd workspace
# Download the package for the version we are going to test.
if curl --fail --head "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
if curl_with_retry "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
then
right_path="$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
fi

View File

@ -148,6 +148,15 @@ info signals
continue
backtrace full
info locals
info registers
disassemble /s
up
info locals
disassemble /s
up
info locals
disassemble /s
p \"done\"
detach
quit
" > script.gdb

View File

@ -5,8 +5,8 @@ echo "Configure to use Yandex dockerhub-proxy"
mkdir -p /etc/docker/
cat > /etc/docker/daemon.json << EOF
{
"insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"]
"insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
}
EOF

View File

@ -106,20 +106,20 @@ Build ClickHouse. Run ClickHouse from the terminal: change directory to `program
Note that all clickhouse tools (server, client, etc) are just symlinks to a single binary named `clickhouse`. You can find this binary at `programs/clickhouse`. All tools can also be invoked as `clickhouse tool` instead of `clickhouse-tool`.
Alternatively you can install ClickHouse package: either stable release from Yandex repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo service clickhouse-server start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`.
Alternatively you can install ClickHouse package: either stable release from ClickHouse repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo clickhouse start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`.
When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary:
``` bash
$ sudo service clickhouse-server stop
$ sudo clickhouse stop
$ sudo cp ./clickhouse /usr/bin/
$ sudo service clickhouse-server start
$ sudo clickhouse start
```
Also you can stop system clickhouse-server and run your own with the same configuration but with logging to terminal:
``` bash
$ sudo service clickhouse-server stop
$ sudo clickhouse stop
$ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml
```
@ -257,9 +257,9 @@ There are five variants (Debug, ASan, TSan, MSan, UBSan).
Thread Fuzzer (please don't mix up with Thread Sanitizer) is another kind of fuzzing that allows to randomize thread order of execution. It helps to find even more special cases.
## Security Audit {#security-audit}
## Security Audit
People from Yandex Security Team do some basic overview of ClickHouse capabilities from the security standpoint.
People from Yandex Security Team did some basic overview of ClickHouse capabilities from the security standpoint.
## Static Analyzers {#static-analyzers}
@ -326,15 +326,11 @@ There is automated check for flaky tests. It runs all new tests 100 times (for f
## Testflows
[Testflows](https://testflows.com/) is an enterprise-grade testing framework. It is used by Altinity for some of the tests and we run these tests in our CI.
## Yandex Checks (only for Yandex employees)
These checks are importing ClickHouse code into Yandex internal monorepository, so ClickHouse codebase can be used as a library by other products at Yandex (YT and YDB). Note that clickhouse-server itself is not being build from internal repo and unmodified open-source build is used for Yandex applications.
[Testflows](https://testflows.com/) is an enterprise-grade open-source testing framework, which is used to test a subset of ClickHouse.
## Test Automation {#test-automation}
We run tests with Yandex internal CI and job automation system named “Sandbox”.
We run tests with [GitHub Actions](https://github.com/features/actions).
Build jobs and tests are run in Sandbox on per commit basis. Resulting packages and test results are published in GitHub and can be downloaded by direct links. Artifacts are stored for several months. When you send a pull request on GitHub, we tag it as “can be tested” and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you.

View File

@ -5,8 +5,7 @@ toc_title: HDFS
# HDFS {#table_engines-hdfs}
This engine provides integration with [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) via ClickHouse. This engine is similar
to the [File](../../../engines/table-engines/special/file.md#table_engines-file) and [URL](../../../engines/table-engines/special/url.md#table_engines-url) engines, but provides Hadoop-specific features.
This engine provides integration with the [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) via ClickHouse. This engine is similar to the [File](../../../engines/table-engines/special/file.md#table_engines-file) and [URL](../../../engines/table-engines/special/url.md#table_engines-url) engines, but provides Hadoop-specific features.
## Usage {#usage}
@ -14,12 +13,13 @@ to the [File](../../../engines/table-engines/special/file.md#table_engines-file)
ENGINE = HDFS(URI, format)
```
The `URI` parameter is the whole file URI in HDFS.
The `format` parameter specifies one of the available file formats. To perform
**Engine Parameters**
- `URI` - whole file URI in HDFS. The path part of `URI` may contain globs. In this case the table would be readonly.
- `format` - specifies one of the available file formats. To perform
`SELECT` queries, the format must be supported for input, and to perform
`INSERT` queries for output. The available formats are listed in the
[Formats](../../../interfaces/formats.md#formats) section.
The path part of `URI` may contain globs. In this case the table would be readonly.
**Example:**
@ -71,12 +71,12 @@ Constructions with `{}` are similar to the [remote](../../../sql-reference/table
1. Suppose we have several files in TSV format with the following URIs on HDFS:
- 'hdfs://hdfs1:9000/some_dir/some_file_1'
- 'hdfs://hdfs1:9000/some_dir/some_file_2'
- 'hdfs://hdfs1:9000/some_dir/some_file_3'
- 'hdfs://hdfs1:9000/another_dir/some_file_1'
- 'hdfs://hdfs1:9000/another_dir/some_file_2'
- 'hdfs://hdfs1:9000/another_dir/some_file_3'
- 'hdfs://hdfs1:9000/some_dir/some_file_1'
- 'hdfs://hdfs1:9000/some_dir/some_file_2'
- 'hdfs://hdfs1:9000/some_dir/some_file_3'
- 'hdfs://hdfs1:9000/another_dir/some_file_1'
- 'hdfs://hdfs1:9000/another_dir/some_file_2'
- 'hdfs://hdfs1:9000/another_dir/some_file_3'
1. There are several ways to make a table consisting of all six files:
@ -132,6 +132,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us
| **parameter** | **default value** |
| - | - |
| rpc\_client\_connect\_tcpnodelay | true |
| dfs\_client\_read\_shortcircuit | true |
| output\_replace-datanode-on-failure | true |
@ -181,25 +182,26 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us
#### ClickHouse extras {#clickhouse-extras}
| **parameter** | **default value** |
| - | - |
|hadoop\_kerberos\_keytab | "" |
|hadoop\_kerberos\_principal | "" |
|hadoop\_kerberos\_kinit\_command | kinit |
|libhdfs3\_conf | "" |
### Limitations {#limitations}
* hadoop\_security\_kerberos\_ticket\_cache\_path and libhdfs3\_conf can be global only, not user specific
* `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific
## Kerberos support {#kerberos-support}
If hadoop\_security\_authentication parameter has value 'kerberos', ClickHouse authentifies via Kerberos facility.
Parameters [here](#clickhouse-extras) and hadoop\_security\_kerberos\_ticket\_cache\_path may be of help.
If the `hadoop_security_authentication` parameter has the value `kerberos`, ClickHouse authenticates via Kerberos.
Parameters are [here](#clickhouse-extras) and `hadoop_security_kerberos_ticket_cache_path` may be of help.
Note that due to libhdfs3 limitations only old-fashioned approach is supported,
datanode communications are not secured by SASL (HADOOP\_SECURE\_DN\_USER is a reliable indicator of such
security approach). Use tests/integration/test\_storage\_kerberized\_hdfs/hdfs_configs/bootstrap.sh for reference.
datanode communications are not secured by SASL (`HADOOP_SECURE_DN_USER` is a reliable indicator of such
security approach). Use `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` for reference.
If hadoop\_kerberos\_keytab, hadoop\_kerberos\_principal or hadoop\_kerberos\_kinit\_command is specified, kinit will be invoked. hadoop\_kerberos\_keytab and hadoop\_kerberos\_principal are mandatory in this case. kinit tool and krb5 configuration files are required.
If `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` or `hadoop_kerberos_kinit_command` is specified, `kinit` will be invoked. `hadoop_kerberos_keytab` and `hadoop_kerberos_principal` are mandatory in this case. `kinit` tool and krb5 configuration files are required.
## HDFS Namenode HA support{#namenode-ha}
## HDFS Namenode HA support {#namenode-ha}
libhdfs3 support HDFS namenode HA.

View File

@ -37,6 +37,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[rabbitmq_skip_broken_messages = N,]
[rabbitmq_max_block_size = N,]
[rabbitmq_flush_interval_ms = N]
[rabbitmq_queue_settings_list = 'x-dead-letter-exchange=my-dlx,x-max-length=10,x-overflow=reject-publish']
```
Required parameters:
@ -59,6 +60,7 @@ Optional parameters:
- `rabbitmq_skip_broken_messages` RabbitMQ message parser tolerance to schema-incompatible messages per block. Default: `0`. If `rabbitmq_skip_broken_messages = N` then the engine skips *N* RabbitMQ messages that cannot be parsed (a message equals a row of data).
- `rabbitmq_max_block_size`
- `rabbitmq_flush_interval_ms`
- `rabbitmq_queue_settings_list` - allows to set RabbitMQ settings when creating a queue. Available settings: `x-max-length`, `x-max-length-bytes`, `x-message-ttl`, `x-expires`, `x-priority`, `x-max-priority`, `x-overflow`, `x-dead-letter-exchange`, `x-queue-type`. The `durable` setting is enabled automatically for the queue.
SSL connection:

View File

@ -66,9 +66,9 @@ WHERE table = 'visits'
└───────────┴────────────────┴────────┘
```
The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](#alter_manipulations-with-partitions) queries.
The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md) queries.
The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](#alter_attach-partition) query.
The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query.
Lets break down the name of the first part: `201901_1_3_1`:

View File

@ -39,10 +39,10 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2
- `policy_name` - (optionally) policy name, it will be used to store temporary files for async send
See also:
**See Also**
- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting
- [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) for the examples
- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting
- [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) for the examples
**Distributed Settings**

View File

@ -204,7 +204,7 @@ When parsing with this format, tabs or linefeeds are not allowed in each field.
This format is also available under the name `TSVRawWithNames`.
## TabSeparatedWithNamesAndTypes {#tabseparatedrawwithnamesandtypes}
## TabSeparatedRawWithNamesAndTypes {#tabseparatedrawwithnamesandtypes}
Differs from `TabSeparatedWithNamesAndTypes` format in that the rows are written without escaping.
When parsing with this format, tabs or linefeeds are not allowed in each field.

View File

@ -178,5 +178,9 @@ toc_title: Adopters
| <a href="https://promo.croc.ru/digitalworker" class="favicon">Цифровой Рабочий</a> | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) |
| <a href="https://shop.okraina.ru/" class="favicon">ООО «МПЗ Богородский»</a> | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) |
| <a href="https://domclick.ru/" class="favicon">ДомКлик</a> | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) |
| <a href="https://futurragroup.com/" class="favicon">Futurra Group</a> | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) |
| <a href="https://usetech.com/" class="favicon">UseTech</a> | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) |
| <a href="https://lookforsale.ru/" class="favicon">Lookforsale</a> | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) |
| <a href="https://rvision.pro/en/" class="favicon">R-Vision</a> | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) |
[Original article](https://clickhouse.com/docs/en/introduction/adopters/) <!--hide-->

View File

@ -16,6 +16,11 @@ ZooKeeper is one of the first well-known open-source coordination systems. It's
By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but `clickhouse-keeper-converter` tool allows to convert ZooKeeper data to ClickHouse Keeper snapshot. Interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper so mixed ZooKeeper / ClickHouse Keeper cluster is impossible.
ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64.
!!! info "Note"
External integrations are not supported.
## Configuration
ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Keeper configuration has the following parameters:
@ -118,13 +123,13 @@ echo mntr | nc localhost 9181
Bellow is the detailed 4lw commands:
- ruok : Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.
- `ruok`: Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.
```
imok
```
- mntr : Outputs a list of variables that could be used for monitoring the health of the cluster.
- `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster.
```
zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
@ -146,12 +151,11 @@ zk_followers 0
zk_synced_followers 0
```
- srvr : Lists full details for the server.
- `srvr`: Lists full details for the server.
```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Latency min/avg/max: 0/0/0
Received: 2
Sent : 2
Connections: 1
@ -161,16 +165,14 @@ Mode: leader
Node count: 4
```
- stat : Lists brief details for the server and connected clients.
- `stat`: Lists brief details for the server and connected clients.
```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Clients:
192.168.1.1:52852(recved=0,sent=0)
192.168.1.1:52042(recved=24,sent=48)
Latency min/avg/max: 0/0/0
Received: 4
Sent : 4
Connections: 1
@ -178,16 +180,15 @@ Outstanding: 0
Zxid: 36
Mode: leader
Node count: 4
```
- srst : Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`.
- `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`.
```
Server stats reset.
```
- conf : Print details about serving configuration.
- `conf`: Print details about serving configuration.
```
server_id=1
@ -220,20 +221,20 @@ compress_snapshots_with_zstd_format=true
configuration_change_tries_count=20
```
- cons : List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc...
- `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc...
```
192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0)
192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0)
```
- crst : Reset connection/session statistics for all connections.
- `crst`: Reset connection/session statistics for all connections.
```
Connection stats reset.
```
- envi : Print details about serving environment
- `envi`: Print details about serving environment
```
Environment:
@ -250,41 +251,41 @@ user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/
```
- dirs : Shows the total size of snapshot and log files in bytes
- `dirs`: Shows the total size of snapshot and log files in bytes
```
snapshot_dir_size: 0
log_dir_size: 3875
```
- isro: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode.
- `isro`: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode.
```
rw
```
- wchs : Lists brief information on watches for the server.
- `wchs`: Lists brief information on watches for the server.
```
1 connections watching 1 paths
Total watches:1
```
- wchc : Lists detailed information on watches for the server, by session. This outputs a list of sessions(connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully.
- `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully.
```
0x0000000000000001
/clickhouse/task_queue/ddl
```
- wchp : Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully.
- `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (i. e. impact server performance), use it carefully.
```
/clickhouse/task_queue/ddl
0x0000000000000001
```
- dump : Lists the outstanding sessions and ephemeral nodes. This only works on the leader.
- `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader.
```
Sessions dump (2):

View File

@ -1687,18 +1687,17 @@ Quorum writes
`INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written.
All the replicas in the quorum are consistent, i.e., they contain data from all previous `INSERT` queries. The `INSERT` sequence is linearized.
When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#settings-select_sequential_consistency).
When reading the data written from the `insert_quorum`, you can use the [select_sequential_consistency](#settings-select_sequential_consistency) option.
ClickHouse generates an exception
ClickHouse generates an exception:
- If the number of available replicas at the time of the query is less than the `insert_quorum`.
- At an attempt to write data when the previous block has not yet been inserted in the `insert_quorum` of replicas. This situation may occur if the user tries to perform an `INSERT` before the previous one with the `insert_quorum` is completed.
- When `insert_quorum_parallel` is disabled and an attempt to write data is made when the previous block has not yet been inserted in `insert_quorum` of replicas. This situation may occur if the user tries to perform another `INSERT` query to the same table before the previous one with `insert_quorum` is completed.
See also:
- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
- [select_sequential_consistency](#settings-select_sequential_consistency)
## insert_quorum_timeout {#settings-insert_quorum_timeout}
@ -1710,11 +1709,29 @@ Default value: 600 000 milliseconds (ten minutes).
See also:
- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
- [select_sequential_consistency](#settings-select_sequential_consistency)
## insert_quorum_parallel {#settings-insert_quorum_parallel}
Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected.
Possible values:
- 0 — Disabled.
- 1 — Enabled.
Default value: 1.
See also:
- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [select_sequential_consistency](#settings-select_sequential_consistency)
## select_sequential_consistency {#settings-select_sequential_consistency}
Enables or disables sequential consistency for `SELECT` queries:
Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default).
Possible values:
@ -1727,10 +1744,13 @@ Usage
When sequential consistency is enabled, ClickHouse allows the client to execute the `SELECT` query only for those replicas that contain data from all previous `INSERT` queries executed with `insert_quorum`. If the client refers to a partial replica, ClickHouse will generate an exception. The SELECT query will not include data that has not yet been written to the quorum of replicas.
When `insert_quorum_parallel` is enabled (the default), then `select_sequential_consistency` does not work. This is because parallel `INSERT` queries can be written to different sets of quorum replicas so there is no guarantee a single replica will have received all writes.
See also:
- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
## insert_deduplicate {#settings-insert-deduplicate}

View File

@ -41,7 +41,7 @@ Example of a polygon dictionary configuration:
</dictionary>
```
Tne corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query):
The corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query):
``` sql
CREATE DICTIONARY polygon_dict_name (
key Array(Array(Array(Array(Float64)))),

View File

@ -31,7 +31,7 @@ CREATE ROLE accountant;
GRANT SELECT ON db.* TO accountant;
```
This sequence of queries creates the role `accountant` that has the privilege of reading data from the `accounting` database.
This sequence of queries creates the role `accountant` that has the privilege of reading data from the `db` database.
Assigning the role to the user `mira`:

View File

@ -22,7 +22,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
) ENGINE = engine
```
Creates a table named `name` in the `db` database or the current database if `db` is not set, with the structure specified in brackets and the `engine` engine.
Creates a table named `table_name` in the `db` database or the current database if `db` is not set, with the structure specified in brackets and the `engine` engine.
The structure of the table is a list of column descriptions, secondary indexes and constraints . If [primary key](#primary-key) is supported by the engine, it will be indicated as parameter for the table engine.
A column description is `name type` in the simplest case. Example: `RegionID UInt32`.

View File

@ -206,6 +206,9 @@ This extra row is only produced in `JSON*`, `TabSeparated*`, and `Pretty*` forma
- In `Pretty*` formats, the row is output as a separate table after the main result.
- In the other formats it is not available.
!!! note "Note"
totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`.
`WITH TOTALS` can be run in different ways when [HAVING](../../../sql-reference/statements/select/having.md) is present. The behavior depends on the `totals_mode` setting.
### Configuring Totals Processing {#configuring-totals-processing}

View File

@ -129,6 +129,9 @@ world
Каждый элемент структуры типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) представляется как отдельный массив.
Входящие параметры типа "перечисление" (`ENUM`) могут передаваться в виде значений или порядковых номеров. Сначала переданное значение будет сопоставляться с элементами перечисления. Если совпадение не будет найдено и при этом переданное значение является числом, оно будет трактоваться как порядковый номер в перечислении.
Если входящие параметры типа `ENUM` содержат только порядковые номера, рекомендуется включить настройку [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) для ускорения парсинга.
Например:
``` sql
@ -362,6 +365,9 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
Если установлена настройка [input_format_defaults_for_omitted_fields = 1](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) и тип столбца не `Nullable(T)`, то пустые значения без кавычек заменяются значениями по умолчанию для типа данных столбца.
Входящие параметры типа "перечисление" (`ENUM`) могут передаваться в виде значений или порядковых номеров. Сначала переданное значение будет сопоставляться с элементами перечисления. Если совпадение не будет найдено и при этом переданное значение является числом, оно будет трактоваться как порядковый номер в перечислении.
Если входящие параметры типа `ENUM` содержат только порядковые номера, рекомендуется включить настройку [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) для ускорения парсинга.
Формат CSV поддерживает вывод totals и extremes аналогично `TabSeparated`.
## CSVWithNames {#csvwithnames}
@ -693,7 +699,7 @@ CREATE TABLE IF NOT EXISTS example_table
- Если `input_format_defaults_for_omitted_fields = 1`, то значение по умолчанию для `x` равно `0`, а значение по умолчанию `a` равно `x * 2`.
!!! note "Предупреждение"
Если `input_format_defaults_for_omitted_fields = 1`, то при обработке запросов ClickHouse потребляет больше вычислительных ресурсов, чем если `input_format_defaults_for_omitted_fields = 0`.
При добавлении данных с помощью `input_format_defaults_for_omitted_fields = 1`, ClickHouse потребляет больше вычислительных ресурсов по сравнению с `input_format_defaults_for_omitted_fields = 0`.
### Выборка данных {#vyborka-dannykh}

View File

@ -16,12 +16,17 @@ ZooKeeper — один из первых широко известных сер
По умолчанию ClickHouse Keeper предоставляет те же гарантии, что и ZooKeeper (линеаризуемость записей, последовательная согласованность чтений). У него есть совместимый клиент-серверный протокол, поэтому любой стандартный клиент ZooKeeper может использоваться для взаимодействия с ClickHouse Keeper. Снэпшоты и журналы имеют несовместимый с ZooKeeper формат, однако можно конвертировать данные Zookeeper в снэпшот ClickHouse Keeper с помощью `clickhouse-keeper-converter`. Межсерверный протокол ClickHouse Keeper также несовместим с ZooKeeper, поэтому создание смешанного кластера ZooKeeper / ClickHouse Keeper невозможно.
Система управления доступом (ACL) ClickHouse Keeper реализована так же, как в [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper поддерживает тот же набор разрешений и идентичные схемы: `world`, `auth`, `digest`, `host` и `ip`. Digest для аутентификации использует пару значений `username:password`. Пароль кодируется в Base64.
!!! info "Примечание"
Внешние интеграции не поддерживаются.
## Конфигурация
ClickHouse Keeper может использоваться как равноценная замена ZooKeeper или как внутренняя часть сервера ClickHouse, но в обоих случаях конфигурация представлена файлом `.xml`. Главный тег конфигурации ClickHouse Keeper — это `<keeper_server>`. Параметры конфигурации:
- `tcp_port` — порт для подключения клиента (по умолчанию для ZooKeeper: `2181`).
- `tcp_port_secure` — зашифрованный порт для подключения клиента.
- `tcp_port_secure` — зашифрованный порт для SSL-соединения между клиентом и сервером сервиса.
- `server_id` — уникальный идентификатор сервера, каждый участник кластера должен иметь уникальный номер&nbsp;(1,&nbsp;2,&nbsp;3&nbsp;и&nbsp;т.&nbsp;д.).
- `log_storage_path` — путь к журналам координации, лучше хранить их на незанятом устройстве (актуально и для ZooKeeper).
- `snapshot_storage_path` — путь к снэпшотам координации.
@ -50,7 +55,11 @@ ClickHouse Keeper может использоваться как равноце
- `shutdown_timeout` — время ожидания завершения внутренних подключений и выключения, в миллисекундах (по умолчанию: 5000).
- `startup_timeout` — время отключения сервера, если он не подключается к другим участникам кворума, в миллисекундах (по умолчанию: 30000).
Конфигурация кворума находится в `<keeper_server>.<raft_configuration>` и содержит описание серверов. Единственный параметр для всего кворума — `secure`, который включает зашифрованное соединение для связи между участниками кворума. Параметры для каждого `<server>`:
Конфигурация кворума находится в `<keeper_server>.<raft_configuration>` и содержит описание серверов.
Единственный параметр для всего кворума — `secure`, который включает зашифрованное соединение для связи между участниками кворума. Параметру можно задать значение `true`, если для внутренней коммуникации между узлами требуется SSL-соединение, в ином случае не указывайте ничего.
Параметры для каждого `<server>`:
- `id` — идентификатор сервера в кворуме.
- `hostname` — имя хоста, на котором размещен сервер.

View File

@ -391,12 +391,14 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (
## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number}
Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата TSV.
Включает или отключает парсинг значений перечислений как порядковых номеров.
Если режим включен, то во входящих данных в формате `TCV` значения перечисления (тип `ENUM`) всегда трактуются как порядковые номера, а не как элементы перечисления. Эту настройку рекомендуется включать для оптимизации парсинга, если данные типа `ENUM` содержат только порядковые номера, а не сами элементы перечисления.
Возможные значения:
- 0 — парсинг значений перечисления как значений.
- 1 — парсинг значений перечисления как идентификаторов перечисления.
- 0 — входящие значения типа `ENUM` сначала сопоставляются с элементами перечисления, а если совпадений не найдено, то трактуются как порядковые номера.
- 1 — входящие значения типа `ENUM` сразу трактуются как порядковые номера.
Значение по умолчанию: 0.
@ -410,10 +412,39 @@ CREATE TABLE table_with_enum_column_for_tsv_insert (Id Int32,Value Enum('first'
При включенной настройке `input_format_tsv_enum_as_number`:
Запрос:
```sql
SET input_format_tsv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 1;
SELECT * FROM table_with_enum_column_for_tsv_insert;
```
Результат:
```text
┌──Id─┬─Value──┐
│ 102 │ second │
└─────┴────────┘
```
Запрос:
```sql
SET input_format_tsv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first';
```
сгенерирует исключение.
При отключенной настройке `input_format_tsv_enum_as_number`:
Запрос:
```sql
SET input_format_tsv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first';
SELECT * FROM table_with_enum_column_for_tsv_insert;
```
@ -428,15 +459,6 @@ SELECT * FROM table_with_enum_column_for_tsv_insert;
└─────┴────────┘
```
При отключенной настройке `input_format_tsv_enum_as_number` запрос `INSERT`:
```sql
SET input_format_tsv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
```
сгенерирует исключение.
## input_format_null_as_default {#settings-input-format-null-as-default}
Включает или отключает инициализацию [значениями по умолчанию](../../sql-reference/statements/create/table.md#create-default-values) ячеек с [NULL](../../sql-reference/syntax.md#null-literal), если тип данных столбца не позволяет [хранить NULL](../../sql-reference/data-types/nullable.md#data_type-nullable).
@ -1511,12 +1533,13 @@ SELECT area/period FROM account_orders FORMAT JSON;
## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number}
Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата CSV.
Включает или отключает парсинг значений перечислений как порядковых номеров.
Если режим включен, то во входящих данных в формате `CSV` значения перечисления (тип `ENUM`) всегда трактуются как порядковые номера, а не как элементы перечисления. Эту настройку рекомендуется включать для оптимизации парсинга, если данные типа `ENUM` содержат только порядковые номера, а не сами элементы перечисления.
Возможные значения:
- 0 — парсинг значений перечисления как значений.
- 1 — парсинг значений перечисления как идентификаторов перечисления.
- 0 — входящие значения типа `ENUM` сначала сопоставляются с элементами перечисления, а если совпадений не найдено, то трактуются как порядковые номера.
- 1 — входящие значения типа `ENUM` сразу трактуются как порядковые номера.
Значение по умолчанию: 0.
@ -1530,10 +1553,11 @@ CREATE TABLE table_with_enum_column_for_csv_insert (Id Int32,Value Enum('first'
При включенной настройке `input_format_csv_enum_as_number`:
Запрос:
```sql
SET input_format_csv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2;
SELECT * FROM table_with_enum_column_for_csv_insert;
```
Результат:
@ -1544,15 +1568,37 @@ SELECT * FROM table_with_enum_column_for_csv_insert;
└─────┴────────┘
```
При отключенной настройке `input_format_csv_enum_as_number` запрос `INSERT`:
Запрос:
```sql
SET input_format_csv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2;
SET input_format_csv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first'
```
сгенерирует исключение.
При отключенной настройке `input_format_csv_enum_as_number`:
Запрос:
```sql
SET input_format_csv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first'
SELECT * FROM table_with_enum_column_for_csv_insert;
```
Результат:
```text
┌──Id─┬─Value──┐
│ 102 │ second │
└─────┴────────┘
┌──Id─┬─Value─┐
│ 103 │ first │
└─────┴───────┘
```
## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line}
Использовать в качестве разделителя строк для CSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль).

View File

@ -203,6 +203,9 @@ SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE;
- В `Pretty*` форматах, строка выводится в виде отдельной таблицы после основного результата.
- В других форматах она не доступна.
!!! note "Примечание"
totals выводится только в результатах запросов `SELECT`, и не вывоводится в `INSERT INTO ... SELECT`.
При использовании секции [HAVING](having.md) поведение `WITH TOTALS` контролируется настройкой `totals_mode`.
### Настройка обработки итогов {#configuring-totals-processing}

View File

@ -1,16 +1,55 @@
---
machine_translated: true
machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3
---
# system.merge_tree_settings {#system-merge_tree_settings}
# 系统。merge_tree_settings {#system-merge_tree_settings}
包含有关以下设置的信息 `MergeTree` 桌子
包含 `MergeTree` 表的设置 (Setting) 信息。
列:
- `name` (String) — Setting name.
- `value` (String) — Setting value.
- `description` (String) — Setting description.
- `type` (String) — Setting type (implementation specific string value).
- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed.
- `name` (String) — 设置名称。
- `value` (String) — 设置的值。
- `description` (String) — 设置描述。
- `type` (String) — 设置类型 (执行特定的字符串值)。
- `changed` (UInt8) — 该设置是否在配置中明确定义或是明确改变。
**示例**
```sql
:) SELECT * FROM system.merge_tree_settings LIMIT 4 FORMAT Vertical;
```
```text
Row 1:
──────
name: index_granularity
value: 8192
changed: 0
description: How many rows correspond to one primary key value.
type: SettingUInt64
Row 2:
──────
name: min_bytes_for_wide_part
value: 0
changed: 0
description: Minimal uncompressed size in bytes to create part in wide format instead of compact
type: SettingUInt64
Row 3:
──────
name: min_rows_for_wide_part
value: 0
changed: 0
description: Minimal number of rows to create part in wide format instead of compact
type: SettingUInt64
Row 4:
──────
name: merge_max_block_size
value: 8192
changed: 0
description: How many rows in blocks should be formed for merge operations.
type: SettingUInt64
4 rows in set. Elapsed: 0.001 sec.
```
[原文](https://clickhouse.com/docs/zh/operations/system-tables/merge_tree_settings) <!--hide-->

View File

@ -1,58 +1,128 @@
---
machine_translated: true
machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3
---
# system.tables {#system-tables}
# 系统。表 {#system-tables}
包含服务器知道的每个表的元数据。 [分离的](../../sql-reference/statements/detach.md)表不在 `system.tables` 显示。
包含服务器知道的每个表的元数据。 分离的表不显示在 `system.tables`
[临时表](../../sql-reference/statements/create/table.md#temporary-tables)只在创建它们的会话中的 `system.tables` 中才可见。它们的数据库字段显示为空,并且 `is_temporary` 标志显示为开启
此表包含以下列列类型显示在括号中):
此表包含以下列 (列类型显示在括号中):
- `database` (String) — 表所在的数据库名。
- `database` ([String](../../sql-reference/data-types/string.md)) — 表所在的数据库名。
- `name` (String) — 表名。
- `name` ([String](../../sql-reference/data-types/string.md)) — 表名。
- `engine` (String) — 表引擎名 (不包含参数)。
- `engine` ([String](../../sql-reference/data-types/string.md)) — 表引擎名 (不包含参数)。
- `is_temporary` (UInt8)-指示表是否是临时的标志。
- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) - 指示表是否是临时的标志。
- `data_path` (String)-文件系统中表数据的路径。
- `data_path` ([String](../../sql-reference/data-types/string.md)) - 表数据在文件系统中的路径。
- `metadata_path` (String)-文件系统中表元数据的路径。
- `metadata_path` ([String](../../sql-reference/data-types/string.md)) - 表元数据在文件系统中的路径。
- `metadata_modification_time` (DateTime)-表元数据的最新修改时间。
- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - 表元数据的最新修改时间。
- `dependencies_database` (数组(字符串))-数据库依赖关系。
- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - 数据库依赖关系。
- `dependencies_table` (数组(字符串))-表依赖关系 ([MaterializedView](../../engines/table-engines/special/materializedview.md) 基于当前表的表)
- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - 表依赖关系 (基于当前表的 [物化视图](../../engines/table-engines/special/materializedview.md) 表)
- `create_table_query` (String)-用于创建表的SQL语句。
- `create_table_query` ([String](../../sql-reference/data-types/string.md)) - 用于创建表的 SQL 语句。
- `engine_full` (String)-表引擎的参数。
- `engine_full` ([String](../../sql-reference/data-types/string.md)) - 表引擎的参数。
- `partition_key` (String)-表中指定的分区键表达式
- `as_select` ([String](../../sql-reference/data-types/string.md)) - 视图的 `SELECT` 语句
- `sorting_key` (String)-表中指定的排序键表达式。
- `partition_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的分区键表达式。
- `primary_key` (String)-表中指定的主键表达式。
- `sorting_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的排序键表达式。
- `sampling_key` (String)-表中指定的采样键表达式。
- `primary_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的主键表达式。
- `storage_policy` (字符串)-存储策略:
- `sampling_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的采样键表达式。
- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - 存储策略:
- [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)
- [分布](../../engines/table-engines/special/distributed.md#distributed)
- [Distributed](../../engines/table-engines/special/distributed.md#distributed)
- `total_rows` (Nullable(UInt64))-总行数,如果可以快速确定表中的确切行数,否则行数为`Null`(包括底层 `Buffer` 表)
- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 总行数,如果无法快速确定表中的确切行数,则行数返回为 `NULL` (包括底层 `Buffer` 表)
- `total_bytes` (Nullable(UInt64))-总字节数,如果可以快速确定存储表的确切字节数,否则字节数为`Null` (即**不** 包括任何底层存储)
- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 总字节数,如果无法快速确定存储表的确切字节数,则字节数返回为 `NULL` ( **不** 包括任何底层存储)
- 如果表将数据存在磁盘上,返回实际使用的磁盘空间(压缩后)
- 如果表将数据存在磁盘上,返回实际使用的磁盘空间 (压缩后)
- 如果表在内存中存储数据,返回在内存中使用的近似字节数。
- `lifetime_rows` (Nullbale(UInt64))-服务启动后插入的总行数(只针对`Buffer`表)。
- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 服务启动后插入的总行数(只针对 `Buffer` 表) 。
- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 服务启动后插入的总字节数(只针对 `Buffer` 表) 。
- `comment` ([String](../../sql-reference/data-types/string.md)) - 表的注释。
- `has_own_data` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 标志,表示表本身是否在磁盘上存储数据,或者访问其他来源。
`system.tables` 表被用于 `SHOW TABLES` 的查询实现中。
**示例**
```sql
SELECT * FROM system.tables LIMIT 2 FORMAT Vertical;
```
```text
Row 1:
──────
database: base
name: t1
uuid: 81b1c20a-b7c6-4116-a2ce-7583fb6b6736
engine: MergeTree
is_temporary: 0
data_paths: ['/var/lib/clickhouse/store/81b/81b1c20a-b7c6-4116-a2ce-7583fb6b6736/']
metadata_path: /var/lib/clickhouse/store/461/461cf698-fd0b-406d-8c01-5d8fd5748a91/t1.sql
metadata_modification_time: 2021-01-25 19:14:32
dependencies_database: []
dependencies_table: []
create_table_query: CREATE TABLE base.t1 (`n` UInt64) ENGINE = MergeTree ORDER BY n SETTINGS index_granularity = 8192
engine_full: MergeTree ORDER BY n SETTINGS index_granularity = 8192
as_select: SELECT database AS table_catalog
partition_key:
sorting_key: n
primary_key: n
sampling_key:
storage_policy: default
total_rows: 1
total_bytes: 99
lifetime_rows: ᴺᵁᴸᴸ
lifetime_bytes: ᴺᵁᴸᴸ
comment:
has_own_data: 0
Row 2:
──────
database: default
name: 53r93yleapyears
uuid: 00000000-0000-0000-0000-000000000000
engine: MergeTree
is_temporary: 0
data_paths: ['/var/lib/clickhouse/data/default/53r93yleapyears/']
metadata_path: /var/lib/clickhouse/metadata/default/53r93yleapyears.sql
metadata_modification_time: 2020-09-23 09:05:36
dependencies_database: []
dependencies_table: []
create_table_query: CREATE TABLE default.`53r93yleapyears` (`id` Int8, `febdays` Int8) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192
engine_full: MergeTree ORDER BY id SETTINGS index_granularity = 8192
as_select: SELECT name AS catalog_name
partition_key:
sorting_key: id
primary_key: id
sampling_key:
storage_policy: default
total_rows: 2
total_bytes: 155
lifetime_rows: ᴺᵁᴸᴸ
lifetime_bytes: ᴺᵁᴸᴸ
comment:
has_own_data: 0
```
[原文](https://clickhouse.com/docs/zh/operations/system-tables/tables) <!--hide-->

View File

@ -727,7 +727,6 @@ void LocalServer::printHelpMessage([[maybe_unused]] const OptionsDescription & o
void LocalServer::addOptions(OptionsDescription & options_description)
{
options_description.main_description->add_options()
("database,d", po::value<std::string>(), "database")
("table,N", po::value<std::string>(), "name of the initial table")
/// If structure argument is omitted then initial query is not generated

View File

@ -152,6 +152,7 @@
This setting could be used to switch replication to another network interface
(the server may be connected to multiple networks via multiple addresses)
-->
<!--
<interserver_http_host>example.yandex.ru</interserver_http_host>
-->
@ -177,6 +178,7 @@
-->
<!-- <listen_host>::</listen_host> -->
<!-- Same for hosts without support for IPv6: -->
<!-- <listen_host>0.0.0.0</listen_host> -->
@ -293,6 +295,10 @@
<max_thread_pool_size>10000</max_thread_pool_size>
<!-- Number of workers to recycle connections in background (see also drain_timeout).
If the pool is full, connection will be drained synchronously. -->
<!-- <max_threads_for_connection_collector>10</max_threads_for_connection_collector> -->
<!-- On memory constrained environments you may have to set this to value larger than 1.
-->
<max_server_memory_usage_to_ram_ratio>0.9</max_server_memory_usage_to_ram_ratio>

View File

@ -87,7 +87,7 @@ if [ -z "$NO_BUILD" ] ; then
# Build (only binary packages).
debuild --preserve-env -e PATH \
-e DEB_CC=$DEB_CC -e DEB_CXX=$DEB_CXX -e CMAKE_FLAGS="$CMAKE_FLAGS" \
-b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS}
-b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS} ${DEB_ARCH_FLAG}
fi
if [ -n "$MAKE_RPM" ]; then

View File

@ -54,7 +54,7 @@ namespace
const Poco::SHA1Engine::Digest & digest = engine.digest();
Poco::SHA1Engine::Digest calculated_password_sha1(sha1_size);
for (size_t i = 0; i < sha1_size; i++)
for (size_t i = 0; i < sha1_size; ++i)
calculated_password_sha1[i] = scrambled_password[i] ^ digest[i];
auto calculated_password_double_sha1 = Util::encodeSHA1(calculated_password_sha1);

View File

@ -448,7 +448,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
vals = nullptr;
});
for (std::size_t i = 0; vals[i]; i++)
for (size_t i = 0; vals[i]; ++i)
{
if (vals[i]->bv_val && vals[i]->bv_len > 0)
result.emplace(vals[i]->bv_val, vals[i]->bv_len);
@ -473,7 +473,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
referrals = nullptr;
});
for (std::size_t i = 0; referrals[i]; i++)
for (size_t i = 0; referrals[i]; ++i)
{
LOG_WARNING(&Poco::Logger::get("LDAPClient"), "Received reference during LDAP search but not following it: {}", referrals[i]);
}

View File

@ -15,6 +15,7 @@ namespace ErrorCodes
extern const int READONLY;
extern const int QUERY_IS_PROHIBITED;
extern const int SETTING_CONSTRAINT_VIOLATION;
extern const int UNKNOWN_SETTING;
}
@ -200,7 +201,23 @@ bool SettingsConstraints::checkImpl(const Settings & current_settings, SettingCh
};
if (reaction == THROW_ON_VIOLATION)
access_control->checkSettingNameIsAllowed(setting_name);
{
try
{
access_control->checkSettingNameIsAllowed(setting_name);
}
catch (Exception & e)
{
if (e.code() == ErrorCodes::UNKNOWN_SETTING)
{
if (const auto hints = current_settings.getHints(change.name); !hints.empty())
{
e.addMessage(fmt::format("Maybe you meant {}", toString(hints)));
}
}
throw;
}
}
else if (!access_control->isSettingNameAllowed(setting_name))
return false;

View File

@ -90,7 +90,7 @@ private:
throw;
}
for (i = 0; i < old_size; i++)
for (i = 0; i < old_size; ++i)
{
nested_func->merge(&new_state[i * nested_size_of_data],
&old_state[i * nested_size_of_data],

View File

@ -54,6 +54,8 @@ public:
template <typename T, typename Data, typename Policy>
class AggregateFunctionBitmapL2 final : public IAggregateFunctionDataHelper<Data, AggregateFunctionBitmapL2<T, Data, Policy>>
{
private:
static constexpr auto STATE_VERSION_1_MIN_REVISION = 54455;
public:
AggregateFunctionBitmapL2(const DataTypePtr & type)
: IAggregateFunctionDataHelper<Data, AggregateFunctionBitmapL2<T, Data, Policy>>({type}, {})
@ -105,9 +107,38 @@ public:
}
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override { this->data(place).rbs.write(buf); }
bool isVersioned() const override { return true; }
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override { this->data(place).rbs.read(buf); }
size_t getDefaultVersion() const override { return 1; }
size_t getVersionFromRevision(size_t revision) const override
{
if (revision >= STATE_VERSION_1_MIN_REVISION)
return 1;
else
return 0;
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> version) const override
{
if (!version)
version = getDefaultVersion();
if (*version >= 1)
DB::writeBoolText(this->data(place).init, buf);
this->data(place).rbs.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> version, Arena *) const override
{
if (!version)
version = getDefaultVersion();
if (*version >= 1)
DB::readBoolText(this->data(place).init, buf);
this->data(place).rbs.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{

View File

@ -271,7 +271,7 @@ public:
{
lower_bound = std::min(lower_bound, other.lower_bound);
upper_bound = std::max(upper_bound, other.upper_bound);
for (size_t i = 0; i < other.size; i++)
for (size_t i = 0; i < other.size; ++i)
add(other.points[i].mean, other.points[i].weight, max_bins);
}

View File

@ -56,7 +56,7 @@ static bool ALWAYS_INLINE inline is_all_zeros(const UInt8 * flags, size_t size)
i += 8;
}
for (; i < size; i++)
for (; i < size; ++i)
if (flags[i])
return false;

View File

@ -7,18 +7,20 @@
#include <DataTypes/DataTypeDateTime.h>
#define TOP_K_MAX_SIZE 0xFFFFFF
static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF;
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int LOGICAL_ERROR;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
@ -42,19 +44,22 @@ class AggregateFunctionTopKDateTime : public AggregateFunctionTopK<DataTypeDateT
template <bool is_weighted>
static IAggregateFunction * createWithExtraTypes(const DataTypePtr & argument_type, UInt64 threshold, UInt64 load_factor, const Array & params)
static IAggregateFunction * createWithExtraTypes(const DataTypes & argument_types, UInt64 threshold, UInt64 load_factor, const Array & params)
{
WhichDataType which(argument_type);
if (argument_types.empty())
throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Got empty arguments list");
WhichDataType which(argument_types[0]);
if (which.idx == TypeIndex::Date)
return new AggregateFunctionTopKDate<is_weighted>(threshold, load_factor, {argument_type}, params);
return new AggregateFunctionTopKDate<is_weighted>(threshold, load_factor, argument_types, params);
if (which.idx == TypeIndex::DateTime)
return new AggregateFunctionTopKDateTime<is_weighted>(threshold, load_factor, {argument_type}, params);
return new AggregateFunctionTopKDateTime<is_weighted>(threshold, load_factor, argument_types, params);
/// Check that we can use plain version of AggregateFunctionTopKGeneric
if (argument_type->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
return new AggregateFunctionTopKGeneric<true, is_weighted>(threshold, load_factor, argument_type, params);
if (argument_types[0]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
return new AggregateFunctionTopKGeneric<true, is_weighted>(threshold, load_factor, argument_types, params);
else
return new AggregateFunctionTopKGeneric<false, is_weighted>(threshold, load_factor, argument_type, params);
return new AggregateFunctionTopKGeneric<false, is_weighted>(threshold, load_factor, argument_types, params);
}
@ -78,40 +83,37 @@ AggregateFunctionPtr createAggregateFunctionTopK(const std::string & name, const
if (!params.empty())
{
if (params.size() > 2)
throw Exception("Aggregate function " + name + " requires two parameters or less.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Aggregate function '{}' requires two parameters or less", name);
UInt64 k = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
if (params.size() == 2)
{
load_factor = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[1]);
if (load_factor < 1)
throw Exception("Too small parameter 'load_factor' for aggregate function " + name + ". Minimum: 1",
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"Too small parameter 'load_factor' for aggregate function '{}' (got {}, minimum is 1)", name, load_factor);
}
if (k > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || k * load_factor > TOP_K_MAX_SIZE)
throw Exception("Too large parameter(s) for aggregate function " + name + ". Maximum: " + toString(TOP_K_MAX_SIZE),
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
threshold = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
if (k == 0)
throw Exception("Parameter 0 is illegal for aggregate function " + name,
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
if (threshold > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || threshold * load_factor > TOP_K_MAX_SIZE)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"Too large parameter(s) for aggregate function '{}' (maximum is {})", name, toString(TOP_K_MAX_SIZE));
threshold = k;
if (threshold == 0)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Parameter 0 is illegal for aggregate function '{}'", name);
}
AggregateFunctionPtr res(createWithNumericType<AggregateFunctionTopK, is_weighted>(
*argument_types[0], threshold, load_factor, argument_types, params));
if (!res)
res = AggregateFunctionPtr(createWithExtraTypes<is_weighted>(argument_types[0], threshold, load_factor, params));
res = AggregateFunctionPtr(createWithExtraTypes<is_weighted>(argument_types, threshold, load_factor, params));
if (!res)
throw Exception("Illegal type " + argument_types[0]->getName() +
" of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument for aggregate function '{}'", argument_types[0]->getName(), name);
return res;
}

View File

@ -132,8 +132,8 @@ private:
public:
AggregateFunctionTopKGeneric(
UInt64 threshold_, UInt64 load_factor, const DataTypePtr & input_data_type_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>({input_data_type_}, params)
UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>(argument_types_, params)
, threshold(threshold_), reserved(load_factor * threshold), input_data_type(this->argument_types[0]) {}
String getName() const override { return is_weighted ? "topKWeighted" : "topK"; }

View File

@ -2,6 +2,7 @@
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnSparse.h>
#include <Core/Block.h>
#include <Core/ColumnNumbers.h>
#include <Core/Field.h>
@ -181,6 +182,13 @@ public:
Arena * arena,
ssize_t if_argument_pos = -1) const = 0;
/// The version of "addBatch", that handle sparse columns as arguments.
virtual void addBatchSparse(
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena) const = 0;
virtual void mergeBatch(
size_t batch_size,
AggregateDataPtr * places,
@ -193,6 +201,10 @@ public:
virtual void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const = 0;
/// The version of "addBatchSinglePlace", that handle sparse columns as arguments.
virtual void addBatchSparseSinglePlace(
AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0;
/** The same for single place when need to aggregate only filtered data.
* Instead of using an if-column, the condition is combined inside the null_map
*/
@ -367,6 +379,22 @@ public:
}
}
void addBatchSparse(
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena) const override
{
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
size_t batch_size = column_sparse.size();
auto offset_it = column_sparse.begin();
for (size_t i = 0; i < batch_size; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(places[offset_it.getCurrentRow()] + place_offset,
&values, offset_it.getValueIndex(), arena);
}
void mergeBatch(
size_t batch_size,
AggregateDataPtr * places,
@ -398,6 +426,19 @@ public:
}
}
void addBatchSparseSinglePlace(
AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
{
/// TODO: add values and defaults separately if order of adding isn't important.
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
size_t batch_size = column_sparse.size();
auto offset_it = column_sparse.begin();
for (size_t i = 0; i < batch_size; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(place, &values, offset_it.getValueIndex(), arena);
}
void addBatchSinglePlaceNotNull(
size_t batch_size,
AggregateDataPtr place,

View File

@ -107,7 +107,7 @@ if (USE_AWS_S3)
endif()
if (USE_AZURE_BLOB_STORAGE)
add_headers_and_sources(dbms Disks/BlobStorage)
add_headers_and_sources(dbms Disks/AzureBlobStorage)
endif()
if (USE_HDFS)

View File

@ -4,6 +4,8 @@
#include <iomanip>
#include <string_view>
#include <filesystem>
#include <map>
#include <unordered_map>
#include <base/argsToConfig.h>
#include <base/DateLUT.h>
@ -52,6 +54,7 @@
#include <Processors/Executors/PullingAsyncPipelineExecutor.h>
#include <Processors/Transforms/AddingDefaultsTransform.h>
#include <Interpreters/ReplaceQueryParameterVisitor.h>
#include <Interpreters/ProfileEventsExt.h>
#include <IO/WriteBufferFromOStream.h>
#include <IO/CompressionMethod.h>
#include <Client/InternalTextLogs.h>
@ -105,6 +108,99 @@ namespace ProfileEvents
namespace DB
{
static void incrementProfileEventsBlock(Block & dst, const Block & src)
{
if (!dst)
{
dst = src;
return;
}
assertBlocksHaveEqualStructure(src, dst, "ProfileEvents");
std::unordered_map<String, size_t> name_pos;
for (size_t i = 0; i < dst.columns(); ++i)
name_pos[dst.getByPosition(i).name] = i;
size_t dst_rows = dst.rows();
MutableColumns mutable_columns = dst.mutateColumns();
auto & dst_column_host_name = typeid_cast<ColumnString &>(*mutable_columns[name_pos["host_name"]]);
auto & dst_array_current_time = typeid_cast<ColumnUInt32 &>(*mutable_columns[name_pos["current_time"]]).getData();
auto & dst_array_thread_id = typeid_cast<ColumnUInt64 &>(*mutable_columns[name_pos["thread_id"]]).getData();
auto & dst_array_type = typeid_cast<ColumnInt8 &>(*mutable_columns[name_pos["type"]]).getData();
auto & dst_column_name = typeid_cast<ColumnString &>(*mutable_columns[name_pos["name"]]);
auto & dst_array_value = typeid_cast<ColumnInt64 &>(*mutable_columns[name_pos["value"]]).getData();
const auto & src_column_host_name = typeid_cast<const ColumnString &>(*src.getByName("host_name").column);
const auto & src_array_current_time = typeid_cast<const ColumnUInt32 &>(*src.getByName("current_time").column).getData();
const auto & src_array_thread_id = typeid_cast<const ColumnUInt64 &>(*src.getByName("thread_id").column).getData();
const auto & src_column_name = typeid_cast<const ColumnString &>(*src.getByName("name").column);
const auto & src_array_value = typeid_cast<const ColumnInt64 &>(*src.getByName("value").column).getData();
struct Id
{
StringRef name;
StringRef host_name;
UInt64 thread_id;
bool operator<(const Id & rhs) const
{
return std::tie(name, host_name, thread_id)
< std::tie(rhs.name, rhs.host_name, rhs.thread_id);
}
};
std::map<Id, UInt64> rows_by_name;
for (size_t src_row = 0; src_row < src.rows(); ++src_row)
{
Id id{
src_column_name.getDataAt(src_row),
src_column_host_name.getDataAt(src_row),
src_array_thread_id[src_row],
};
rows_by_name[id] = src_row;
}
/// Merge src into dst.
for (size_t dst_row = 0; dst_row < dst_rows; ++dst_row)
{
Id id{
dst_column_name.getDataAt(dst_row),
dst_column_host_name.getDataAt(dst_row),
dst_array_thread_id[dst_row],
};
if (auto it = rows_by_name.find(id); it != rows_by_name.end())
{
size_t src_row = it->second;
dst_array_current_time[dst_row] = src_array_current_time[src_row];
switch (dst_array_type[dst_row])
{
case ProfileEvents::Type::INCREMENT:
dst_array_value[dst_row] += src_array_value[src_row];
break;
case ProfileEvents::Type::GAUGE:
dst_array_value[dst_row] = src_array_value[src_row];
break;
}
rows_by_name.erase(it);
}
}
/// Copy rows from src that dst does not contains.
for (const auto & [id, pos] : rows_by_name)
{
for (size_t col = 0; col < src.columns(); ++col)
{
mutable_columns[col]->insert((*src.getByPosition(col).column)[pos]);
}
}
dst.setColumns(std::move(mutable_columns));
}
std::atomic_flag exit_on_signal = ATOMIC_FLAG_INIT;
@ -753,7 +849,7 @@ void ClientBase::onProfileEvents(Block & block)
}
else
{
profile_events.last_block = block;
incrementProfileEventsBlock(profile_events.last_block, block);
}
}
profile_events.watch.restart();
@ -1635,7 +1731,13 @@ void ClientBase::parseAndCheckOptions(OptionsDescription & options_description,
/// Check unrecognized options without positional options.
auto unrecognized_options = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::exclude_positional);
if (!unrecognized_options.empty())
{
auto hints = this->getHints(unrecognized_options[0]);
if (!hints.empty())
throw Exception(ErrorCodes::UNRECOGNIZED_ARGUMENTS, "Unrecognized option '{}'. Maybe you meant {}", unrecognized_options[0], toString(hints));
throw Exception(ErrorCodes::UNRECOGNIZED_ARGUMENTS, "Unrecognized option '{}'", unrecognized_options[0]);
}
/// Check positional options (options after ' -- ', ex: clickhouse-client -- <options>).
unrecognized_options = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::include_positional);
@ -1713,6 +1815,25 @@ void ClientBase::init(int argc, char ** argv)
;
addOptions(options_description);
auto getter = [](const auto & op)
{
String op_long_name = op->long_name();
return "--" + String(op_long_name);
};
if (options_description.main_description)
{
const auto & main_options = options_description.main_description->options();
std::transform(main_options.begin(), main_options.end(), std::back_inserter(cmd_options), getter);
}
if (options_description.external_description)
{
const auto & external_options = options_description.external_description->options();
std::transform(external_options.begin(), external_options.end(), std::back_inserter(cmd_options), getter);
}
parseAndCheckOptions(options_description, options, common_arguments);
po::notify(options);

View File

@ -1,5 +1,6 @@
#pragma once
#include "Common/NamePrompter.h"
#include <Common/ProgressIndication.h>
#include <Common/InterruptListener.h>
#include <Common/ShellCommand.h>
@ -37,7 +38,7 @@ void interruptSignalHandler(int signum);
class InternalTextLogs;
class ClientBase : public Poco::Util::Application
class ClientBase : public Poco::Util::Application, public IHints<2, ClientBase>
{
public:
@ -48,6 +49,8 @@ public:
void init(int argc, char ** argv);
std::vector<String> getAllRegisteredNames() const override { return cmd_options; }
protected:
void runInteractive();
void runNonInteractive();
@ -145,6 +148,7 @@ protected:
std::vector<String> queries_files; /// If not empty, queries will be read from these files
std::vector<String> interleave_queries_files; /// If not empty, run queries from these files before processing every file from 'queries_files'.
std::vector<String> cmd_options;
bool stdin_is_a_tty = false; /// stdin is a terminal.
bool stdout_is_a_tty = false; /// stdout is a terminal.

View File

@ -25,7 +25,12 @@ struct PocoSocketWrapper : public Poco::Net::SocketImpl
void IConnections::DrainCallback::operator()(int fd, Poco::Timespan, const std::string fd_description) const
{
if (!PocoSocketWrapper(fd).poll(drain_timeout, Poco::Net::Socket::SELECT_READ))
throw Exception(ErrorCodes::SOCKET_TIMEOUT, "Read timeout while draining from {}", fd_description);
{
throw Exception(ErrorCodes::SOCKET_TIMEOUT,
"Read timeout ({} ms) while draining from {}",
drain_timeout.totalMilliseconds(),
fd_description);
}
}
}

View File

@ -395,17 +395,17 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead
read_list.push_back(*connection->socket);
}
auto timeout = is_draining ? drain_timeout : receive_timeout;
int n = Poco::Net::Socket::select(
read_list,
write_list,
except_list,
is_draining ? drain_timeout : receive_timeout);
timeout);
/// We treat any error as timeout for simplicity.
/// And we also check if read_list is still empty just in case.
if (n <= 0 || read_list.empty())
{
auto err_msg = fmt::format("Timeout exceeded while reading from {}", dumpAddressesUnlocked());
for (ReplicaState & state : replica_states)
{
Connection * connection = state.connection;
@ -415,7 +415,10 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead
invalidateReplica(state);
}
}
throw Exception(err_msg, ErrorCodes::TIMEOUT_EXCEEDED);
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED,
"Timeout ({} ms) exceeded while reading from {}",
timeout.totalMilliseconds(),
dumpAddressesUnlocked());
}
}

View File

@ -133,6 +133,11 @@ public:
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t) const override
{
throw Exception("Method isDefaultAt is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
@ -208,6 +213,16 @@ public:
throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
double getRatioOfDefaultRows(double) const override
{
throw Exception("Method getRatioOfDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
throw Exception("Method getIndicesOfNonDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;

View File

@ -182,6 +182,13 @@ StringRef ColumnArray::getDataAt(size_t n) const
}
bool ColumnArray::isDefaultAt(size_t n) const
{
const auto & offsets_data = getOffsets();
return offsets_data[n] == offsets_data[static_cast<ssize_t>(n) - 1];
}
void ColumnArray::insertData(const char * pos, size_t length)
{
/** Similarly - only for arrays of fixed length values.
@ -576,7 +583,8 @@ void ColumnArray::expand(const IColumn::Filter & mask, bool inverted)
}
if (from != -1)
throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);}
throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
}
template <typename T>
ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const
@ -868,6 +876,16 @@ ColumnPtr ColumnArray::compress() const
});
}
double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const
{
return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
}
void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
}
ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
{

View File

@ -60,6 +60,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
StringRef getDataAt(size_t n) const override;
bool isDefaultAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
@ -143,6 +144,10 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
bool isCollationSupported() const override { return getData().isCollationSupported(); }
private:

View File

@ -82,6 +82,7 @@ public:
Field operator[](size_t) const override { throwMustBeDecompressed(); }
void get(size_t, Field &) const override { throwMustBeDecompressed(); }
StringRef getDataAt(size_t) const override { throwMustBeDecompressed(); }
bool isDefaultAt(size_t) const override { throwMustBeDecompressed(); }
void insert(const Field &) override { throwMustBeDecompressed(); }
void insertRangeFrom(const IColumn &, size_t, size_t) override { throwMustBeDecompressed(); }
void insertData(const char *, size_t) override { throwMustBeDecompressed(); }
@ -113,6 +114,8 @@ public:
void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); }
void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); }
size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); }
double getRatioOfDefaultRows(double) const override { throwMustBeDecompressed(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); }
protected:
size_t rows;

View File

@ -5,6 +5,7 @@
#include <Columns/IColumn.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/PODArray.h>
namespace DB
@ -115,6 +116,11 @@ public:
return data->getFloat32(0);
}
bool isDefaultAt(size_t) const override
{
return data->isDefaultAt(0);
}
bool isNullAt(size_t) const override
{
return data->isNullAt(0);
@ -239,6 +245,27 @@ public:
return false;
}
double getRatioOfDefaultRows(double) const override
{
return data->isDefaultAt(0) ? 1.0 : 0.0;
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
if (!data->isDefaultAt(0))
{
size_t to = limit && from + limit < size() ? from + limit : size();
indices.reserve(indices.size() + to - from);
for (size_t i = from; i < to; ++i)
indices.push_back(i);
}
}
SerializationInfoPtr getSerializationInfo() const override
{
return data->getSerializationInfo();
}
bool isNullable() const override { return isColumnNullable(*data); }
bool onlyNull() const override { return data->isNullAt(0); }
bool isNumeric() const override { return data->isNumeric(); }

View File

@ -331,7 +331,8 @@ void ColumnDecimal<T>::gather(ColumnGathererStream & gatherer)
template <is_decimal T>
ColumnPtr ColumnDecimal<T>::compress() const
{
size_t source_size = data.size() * sizeof(T);
const size_t data_size = data.size();
const size_t source_size = data_size * sizeof(T);
/// Don't compress small blocks.
if (source_size < 4096) /// A wild guess.
@ -342,8 +343,9 @@ ColumnPtr ColumnDecimal<T>::compress() const
if (!compressed)
return ColumnCompressed::wrap(this->getPtr());
return ColumnCompressed::create(data.size(), compressed->size(),
[compressed = std::move(compressed), column_size = data.size(), scale = this->scale]
const size_t compressed_size = compressed->size();
return ColumnCompressed::create(data_size, compressed_size,
[compressed = std::move(compressed), column_size = data_size, scale = this->scale]
{
auto res = ColumnDecimal<T>::create(column_size, scale);
ColumnCompressed::decompressBuffer(

View File

@ -177,8 +177,17 @@ public:
return false;
}
ColumnPtr compress() const override;
double getRatioOfDefaultRows(double sample_ratio) const override
{
return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
}
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
{
return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
}
ColumnPtr compress() const override;
void insertValue(const T value) { data.push_back(value); }
Container & getData() { return data; }

View File

@ -51,6 +51,12 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const
return new_col_holder;
}
bool ColumnFixedString::isDefaultAt(size_t index) const
{
assert(index < size());
return memoryIsZero(chars.data() + index * n, n);
}
void ColumnFixedString::insert(const Field & x)
{
const String & s = DB::get<const String &>(x);
@ -409,9 +415,9 @@ ColumnPtr ColumnFixedString::compress() const
if (!compressed)
return ColumnCompressed::wrap(this->getPtr());
size_t column_size = size();
return ColumnCompressed::create(column_size, compressed->size(),
const size_t column_size = size();
const size_t compressed_size = compressed->size();
return ColumnCompressed::create(column_size, compressed_size,
[compressed = std::move(compressed), column_size, n = n]
{
size_t chars_size = n * column_size;

View File

@ -88,6 +88,8 @@ public:
return StringRef(&chars[n * index], n);
}
bool isDefaultAt(size_t index) const override;
void insert(const Field & x) override;
void insertFrom(const IColumn & src_, size_t index) override;
@ -182,6 +184,16 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getRatioOfDefaultRowsImpl<ColumnFixedString>(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndicesOfNonDefaultRowsImpl<ColumnFixedString>(indices, from, limit);
}
bool canBeInsideNullable() const override { return true; }
bool isFixedAndContiguous() const override { return true; }

View File

@ -24,7 +24,12 @@ class ColumnFunction final : public COWHelper<IColumn, ColumnFunction>
private:
friend class COWHelper<IColumn, ColumnFunction>;
ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_ = false, bool is_function_compiled_ = false);
ColumnFunction(
size_t size,
FunctionBasePtr function_,
const ColumnsWithTypeAndName & columns_to_capture,
bool is_short_circuit_argument_ = false,
bool is_function_compiled_ = false);
public:
const char * getFamilyName() const override { return "Function"; }
@ -68,6 +73,11 @@ public:
throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
bool isDefaultAt(size_t) const override
{
throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void insert(const Field &) override
{
throw Exception("Cannot insert into " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -153,6 +163,16 @@ public:
throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
double getRatioOfDefaultRows(double) const override
{
throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
bool isShortCircuitArgument() const { return is_short_circuit_argument; }
DataTypePtr getResultType() const;

View File

@ -64,6 +64,7 @@ public:
return getDictionary().getDataAtWithTerminatingZero(getIndexes().getUInt(n));
}
bool isDefaultAt(size_t n) const override { return getDictionary().isDefaultAt(getIndexes().getUInt(n)); }
UInt64 get64(size_t n) const override { return getDictionary().get64(getIndexes().getUInt(n)); }
UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); }
Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); }
@ -180,6 +181,16 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getIndexes().getRatioOfDefaultRows(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit);
}
bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); }
bool isFixedAndContiguous() const override { return false; }
size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); }

View File

@ -81,6 +81,11 @@ void ColumnMap::get(size_t n, Field & res) const
getNestedData().get(offset + i, map[i]);
}
bool ColumnMap::isDefaultAt(size_t n) const
{
return nested->isDefaultAt(n);
}
StringRef ColumnMap::getDataAt(size_t) const
{
throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -273,6 +278,16 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const
return false;
}
double ColumnMap::getRatioOfDefaultRows(double sample_ratio) const
{
return getRatioOfDefaultRowsImpl<ColumnMap>(sample_ratio);
}
void ColumnMap::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
return getIndicesOfNonDefaultRowsImpl<ColumnMap>(indices, from, limit);
}
ColumnPtr ColumnMap::compress() const
{
auto compressed = nested->compress();

View File

@ -51,6 +51,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
@ -85,6 +86,8 @@ public:
void protect() override;
void forEachSubcolumn(ColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
const ColumnArray & getNestedColumn() const { return assert_cast<const ColumnArray &>(*nested); }
ColumnArray & getNestedColumn() { return assert_cast<ColumnArray &>(*nested); }

View File

@ -648,6 +648,29 @@ void ColumnNullable::checkConsistency() const
ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT);
}
ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
ColumnPtr new_values;
ColumnPtr new_null_map;
if (default_field.getType() == Field::Types::Null)
{
auto default_column = nested_column->cloneEmpty();
default_column->insertDefault();
/// Value in main column, when null map is 1 is implementation defined. So, take any value.
new_values = nested_column->createWithOffsets(offsets, (*default_column)[0], total_rows, shift);
new_null_map = null_map->createWithOffsets(offsets, Field(1u), total_rows, shift);
}
else
{
new_values = nested_column->createWithOffsets(offsets, default_field, total_rows, shift);
new_null_map = null_map->createWithOffsets(offsets, Field(0u), total_rows, shift);
}
return ColumnNullable::create(new_values, new_null_map);
}
ColumnPtr makeNullable(const ColumnPtr & column)
{
if (isColumnNullable(*column))

View File

@ -54,6 +54,7 @@ public:
void get(size_t n, Field & res) const override;
bool getBool(size_t n) const override { return isNullAt(n) ? false : nested_column->getBool(n); }
UInt64 get64(size_t n) const override { return nested_column->get64(n); }
bool isDefaultAt(size_t n) const override { return isNullAt(n); }
/**
* If isNullAt(n) returns false, returns the nested column's getDataAt(n), otherwise returns a special value
@ -137,6 +138,18 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return null_map->getRatioOfDefaultRows(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
null_map->getIndicesOfNonDefaultRows(indices, from, limit);
}
ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
bool isNullable() const override { return true; }
bool isFixedAndContiguous() const override { return false; }
bool valuesHaveFixedSize() const override { return nested_column->valuesHaveFixedSize(); }

View File

@ -0,0 +1,779 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/ColumnTuple.h>
#include <Common/WeakHash.h>
#include <Common/SipHash.h>
#include <Common/HashTable/Hash.h>
#include <Processors/Transforms/ColumnGathererTransform.h>
#include <algorithm>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
ColumnSparse::ColumnSparse(MutableColumnPtr && values_)
: values(std::move(values_)), _size(0)
{
if (!values->empty())
throw Exception("Not empty values passed to ColumnSparse, but no offsets passed", ErrorCodes::LOGICAL_ERROR);
values->insertDefault();
offsets = ColumnUInt64::create();
}
ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_)
: values(std::move(values_)), offsets(std::move(offsets_)), _size(size_)
{
const ColumnUInt64 * offsets_concrete = typeid_cast<const ColumnUInt64 *>(offsets.get());
if (!offsets_concrete)
throw Exception(ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName());
/// 'values' should contain one extra element: default value at 0 position.
if (offsets->size() + 1 != values->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size());
if (_size < offsets->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());
if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) should be greater than last position of non-default value ({})",
_size, offsets_concrete->getData().back());
#ifndef NDEBUG
const auto & offsets_data = getOffsetsData();
const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal<UInt64>());
if (it != offsets_data.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Offsets of ColumnSparse must be strictly sorted");
#endif
}
MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const
{
if (new_size == 0)
return ColumnSparse::create(values->cloneEmpty());
if (new_size >= _size)
return ColumnSparse::create(IColumn::mutate(values), IColumn::mutate(offsets), new_size);
auto res = ColumnSparse::create(values->cloneEmpty());
res->insertRangeFrom(*this, 0, new_size);
return res;
}
bool ColumnSparse::isDefaultAt(size_t n) const
{
return getValueIndex(n) == 0;
}
bool ColumnSparse::isNullAt(size_t n) const
{
return values->isNullAt(getValueIndex(n));
}
Field ColumnSparse::operator[](size_t n) const
{
return (*values)[getValueIndex(n)];
}
void ColumnSparse::get(size_t n, Field & res) const
{
values->get(getValueIndex(n), res);
}
bool ColumnSparse::getBool(size_t n) const
{
return values->getBool(getValueIndex(n));
}
Float64 ColumnSparse::getFloat64(size_t n) const
{
return values->getFloat64(getValueIndex(n));
}
Float32 ColumnSparse::getFloat32(size_t n) const
{
return values->getFloat32(getValueIndex(n));
}
UInt64 ColumnSparse::getUInt(size_t n) const
{
return values->getUInt(getValueIndex(n));
}
Int64 ColumnSparse::getInt(size_t n) const
{
return values->getInt(getValueIndex(n));
}
UInt64 ColumnSparse::get64(size_t n) const
{
return values->get64(getValueIndex(n));
}
StringRef ColumnSparse::getDataAt(size_t n) const
{
return values->getDataAt(getValueIndex(n));
}
ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
{
return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1);
}
void ColumnSparse::insertSingleValue(const Inserter & inserter)
{
inserter(*values);
size_t last_idx = values->size() - 1;
if (values->isDefaultAt(last_idx))
values->popBack(1);
else
getOffsetsData().push_back(_size);
++_size;
}
void ColumnSparse::insertData(const char * pos, size_t length)
{
insertSingleValue([&](IColumn & column) { column.insertData(pos, length); });
}
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
return values->serializeValueIntoArena(getValueIndex(n), arena, begin);
}
const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos)
{
const char * res = nullptr;
insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); });
return res;
}
const char * ColumnSparse::skipSerializedInArena(const char * pos) const
{
return values->skipSerializedInArena(pos);
}
void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
if (length == 0)
return;
if (start + length > src.size())
throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.",
ErrorCodes::LOGICAL_ERROR);
auto & offsets_data = getOffsetsData();
size_t end = start + length;
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
{
const auto & src_offsets = src_sparse->getOffsetsData();
const auto & src_values = src_sparse->getValuesColumn();
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
assert(offset_start <= offset_end);
if (offset_start != offset_end)
{
offsets_data.reserve(offsets_data.size() + offset_end - offset_start);
insertManyDefaults(src_offsets[offset_start] - start);
offsets_data.push_back(_size);
++_size;
for (size_t i = offset_start + 1; i < offset_end; ++i)
{
size_t current_diff = src_offsets[i] - src_offsets[i - 1];
insertManyDefaults(current_diff - 1);
offsets_data.push_back(_size);
++_size;
}
/// 'end' <= 'src_offsets[offsets_end]', but end is excluded, so index is 'offsets_end' - 1.
/// Since 'end' is excluded, need to subtract one more row from result.
insertManyDefaults(end - src_offsets[offset_end - 1] - 1);
values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start);
}
else
{
insertManyDefaults(length);
}
}
else
{
for (size_t i = start; i < end; ++i)
{
if (!src.isDefaultAt(i))
{
values->insertFrom(src, i);
offsets_data.push_back(_size);
}
++_size;
}
}
}
void ColumnSparse::insert(const Field & x)
{
insertSingleValue([&](IColumn & column) { column.insert(x); });
}
void ColumnSparse::insertFrom(const IColumn & src, size_t n)
{
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
{
if (size_t value_index = src_sparse->getValueIndex(n))
{
getOffsetsData().push_back(_size);
values->insertFrom(src_sparse->getValuesColumn(), value_index);
}
}
else
{
if (!src.isDefaultAt(n))
{
values->insertFrom(src, n);
getOffsetsData().push_back(_size);
}
}
++_size;
}
void ColumnSparse::insertDefault()
{
++_size;
}
void ColumnSparse::insertManyDefaults(size_t length)
{
_size += length;
}
void ColumnSparse::popBack(size_t n)
{
assert(n < _size);
auto & offsets_data = getOffsetsData();
size_t new_size = _size - n;
size_t removed_values = 0;
while (!offsets_data.empty() && offsets_data.back() >= new_size)
{
offsets_data.pop_back();
++removed_values;
}
if (removed_values)
values->popBack(removed_values);
_size = new_size;
}
ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const
{
if (_size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
if (offsets->empty())
{
auto res = cloneEmpty();
res->insertManyDefaults(countBytesInFilter(filt));
return res;
}
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
Filter values_filter;
values_filter.reserve(values->size());
values_filter.push_back(1);
size_t values_result_size_hint = 1;
size_t res_offset = 0;
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
if (!offset_it.isDefault())
{
if (filt[i])
{
res_offsets_data.push_back(res_offset);
values_filter.push_back(1);
++res_offset;
++values_result_size_hint;
}
else
{
values_filter.push_back(0);
}
}
else
{
res_offset += filt[i] != 0;
}
}
auto res_values = values->filter(values_filter, values_result_size_hint);
return this->create(std::move(res_values), std::move(res_offsets), res_offset);
}
void ColumnSparse::expand(const Filter & mask, bool inverted)
{
if (mask.size() < _size)
throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
auto it = begin();
for (size_t i = 0; i < mask.size(); ++i)
{
if (!!mask[i] ^ inverted)
{
if (it.getCurrentRow() == _size)
throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
if (!it.isDefault())
res_offsets_data[it.getCurrentOffset()] = i;
++it;
}
}
_size = mask.size();
}
ColumnPtr ColumnSparse::permute(const Permutation & perm, size_t limit) const
{
return permuteImpl(*this, perm, limit);
}
ColumnPtr ColumnSparse::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename Type>
ColumnPtr ColumnSparse::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
assert(limit <= indexes.size());
if (limit == 0)
return ColumnSparse::create(values->cloneEmpty());
if (offsets->empty())
{
auto res = cloneEmpty();
res->insertManyDefaults(limit);
return res;
}
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
auto res_values = values->cloneEmpty();
res_values->insertDefault();
/// If we need to permute full column, or if limit is large enough,
/// it's better to save indexes of values in O(size)
/// and avoid binary search for obtaining every index.
/// 3 is just a guess for overhead on copying indexes.
bool execute_linear =
limit == _size || limit * std::bit_width(offsets->size()) > _size * 3;
if (execute_linear)
{
PaddedPODArray<UInt64> values_index(_size);
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
values_index[i] = offset_it.getValueIndex();
for (size_t i = 0; i < limit; ++i)
{
size_t index = values_index[indexes[i]];
if (index != 0)
{
res_values->insertFrom(*values, index);
res_offsets_data.push_back(i);
}
}
}
else
{
for (size_t i = 0; i < limit; ++i)
{
size_t index = getValueIndex(indexes[i]);
if (index != 0)
{
res_values->insertFrom(*values, index);
res_offsets_data.push_back(i);
}
}
}
return ColumnSparse::create(std::move(res_values), std::move(res_offsets), limit);
}
int ColumnSparse::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
{
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs_))
return values->compareAt(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint);
return values->compareAt(getValueIndex(n), m, rhs_, null_direction_hint);
}
void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const
{
if (row_indexes)
{
/// TODO: implement without conversion to full column.
auto this_full = convertToFullColumnIfSparse();
auto rhs_full = rhs.convertToFullColumnIfSparse();
this_full->compareColumn(*rhs_full, rhs_row_num, row_indexes, compare_results, direction, nan_direction_hint);
}
else
{
const auto & rhs_sparse = assert_cast<const ColumnSparse &>(rhs);
PaddedPODArray<Int8> nested_result;
values->compareColumn(rhs_sparse.getValuesColumn(), rhs_sparse.getValueIndex(rhs_row_num),
nullptr, nested_result, direction, nan_direction_hint);
const auto & offsets_data = getOffsetsData();
compare_results.resize_fill(_size, nested_result[0]);
for (size_t i = 0; i < offsets_data.size(); ++i)
compare_results[offsets_data[i]] = nested_result[i + 1];
}
}
int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const
{
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
return values->compareAtWithCollation(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint, collator);
return values->compareAtWithCollation(getValueIndex(n), m, rhs, null_direction_hint, collator);
}
bool ColumnSparse::hasEqualValues() const
{
size_t num_defaults = getNumberOfDefaults();
if (num_defaults == _size)
return true;
/// Have at least 1 default and 1 non-default values.
if (num_defaults != 0)
return false;
/// Check that probably all non-default values are equal.
/// It's suboptiomal, but it's a rare case.
for (size_t i = 2; i < values->size(); ++i)
if (values->compareAt(1, i, *values, 1) != 0)
return false;
return true;
}
void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
{
if (_size == 0)
return;
res.resize(_size);
if (offsets->empty())
{
for (size_t i = 0; i < _size; ++i)
res[i] = i;
return;
}
if (limit == 0 || limit > _size)
limit = _size;
Permutation perm;
/// Firstly we sort all values.
/// limit + 1 for case when there are 0 default values.
if (collator)
values->getPermutationWithCollation(*collator, reverse, limit + 1, null_direction_hint, perm);
else
values->getPermutation(reverse, limit + 1, null_direction_hint, perm);
size_t num_of_defaults = getNumberOfDefaults();
size_t row = 0;
const auto & offsets_data = getOffsetsData();
/// Fill the permutation.
for (size_t i = 0; i < perm.size() && row < limit; ++i)
{
if (perm[i] == 0)
{
if (!num_of_defaults)
continue;
/// Fill the positions of default values in the required quantity.
auto offset_it = begin();
while (row < limit)
{
while (offset_it.getCurrentRow() < _size && !offset_it.isDefault())
++offset_it;
if (offset_it.getCurrentRow() == _size)
break;
res[row++] = offset_it.getCurrentRow();
++offset_it;
}
}
else
{
res[row++] = offsets_data[perm[i] - 1];
}
}
assert(row == limit);
}
void ColumnSparse::getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
return getPermutationImpl(reverse, limit, null_direction_hint, res, nullptr);
}
void ColumnSparse::updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
auto this_full = convertToFullColumnIfSparse();
this_full->updatePermutation(reverse, limit, null_direction_hint, res, equal_range);
}
void ColumnSparse::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
return getPermutationImpl(reverse, limit, null_direction_hint, res, &collator);
}
void ColumnSparse::updatePermutationWithCollation(
const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const
{
auto this_full = convertToFullColumnIfSparse();
this_full->updatePermutationWithCollation(collator, reverse, limit, null_direction_hint, res, equal_range);
}
size_t ColumnSparse::byteSize() const
{
return values->byteSize() + offsets->byteSize() + sizeof(_size);
}
size_t ColumnSparse::byteSizeAt(size_t n) const
{
size_t index = getValueIndex(n);
size_t res = values->byteSizeAt(index);
if (index)
res += sizeof(UInt64);
return res;
}
size_t ColumnSparse::allocatedBytes() const
{
return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size);
}
void ColumnSparse::protect()
{
values->protect();
offsets->protect();
}
ColumnPtr ColumnSparse::replicate(const Offsets & replicate_offsets) const
{
/// TODO: implement specializations.
if (_size != replicate_offsets.size())
throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
if (_size == 0)
return ColumnSparse::create(values->cloneEmpty());
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
auto res_values = values->cloneEmpty();
res_values->insertDefault();
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
if (!offset_it.isDefault())
{
size_t replicate_size = replicate_offsets[i] - replicate_offsets[i - 1];
res_offsets_data.reserve(res_offsets_data.size() + replicate_size);
for (size_t row = replicate_offsets[i - 1]; row < replicate_offsets[i]; ++row)
{
res_offsets_data.push_back(row);
res_values->insertFrom(*values, offset_it.getValueIndex());
}
}
}
return ColumnSparse::create(std::move(res_values), std::move(res_offsets), replicate_offsets.back());
}
void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const
{
values->updateHashWithValue(getValueIndex(n), hash);
}
void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const
{
if (hash.getData().size() != _size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", _size, hash.getData().size());
auto offset_it = begin();
auto & hash_data = hash.getData();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
size_t value_index = offset_it.getValueIndex();
auto data_ref = values->getDataAt(value_index);
hash_data[i] = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(data_ref.data), data_ref.size, hash_data[i]);
}
}
void ColumnSparse::updateHashFast(SipHash & hash) const
{
values->updateHashFast(hash);
offsets->updateHashFast(hash);
hash.update(_size);
}
void ColumnSparse::getExtremes(Field & min, Field & max) const
{
if (_size == 0)
{
values->get(0, min);
values->get(0, max);
return;
}
if (getNumberOfDefaults() == 0)
{
size_t min_idx = 1;
size_t max_idx = 1;
for (size_t i = 2; i < values->size(); ++i)
{
if (values->compareAt(i, min_idx, *values, 1) < 0)
min_idx = i;
else if (values->compareAt(i, max_idx, *values, 1) > 0)
max_idx = i;
}
values->get(min_idx, min);
values->get(max_idx, max);
return;
}
values->getExtremes(min, max);
}
void ColumnSparse::getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const
{
const auto & offsets_data = getOffsetsData();
const auto * start = from ? std::lower_bound(offsets_data.begin(), offsets_data.end(), from) : offsets_data.begin();
const auto * end = limit ? std::lower_bound(offsets_data.begin(), offsets_data.end(), from + limit) : offsets_data.end();
indices.insert(start, end);
}
double ColumnSparse::getRatioOfDefaultRows(double) const
{
return static_cast<double>(getNumberOfDefaults()) / _size;
}
MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const
{
return scatterImpl<ColumnSparse>(num_columns, selector);
}
void ColumnSparse::gather(ColumnGathererStream & gatherer_stream)
{
gatherer_stream.gather(*this);
}
ColumnPtr ColumnSparse::compress() const
{
auto values_compressed = values->compress();
auto offsets_compressed = offsets->compress();
size_t byte_size = values_compressed->byteSize() + offsets_compressed->byteSize();
return ColumnCompressed::create(size(), byte_size,
[values_compressed = std::move(values_compressed), offsets_compressed = std::move(offsets_compressed), size = size()]
{
return ColumnSparse::create(values_compressed->decompress(), offsets_compressed->decompress(), size);
});
}
bool ColumnSparse::structureEquals(const IColumn & rhs) const
{
if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
return values->structureEquals(*rhs_sparse->values);
return false;
}
void ColumnSparse::forEachSubcolumn(ColumnCallback callback)
{
callback(values);
callback(offsets);
}
const IColumn::Offsets & ColumnSparse::getOffsetsData() const
{
return assert_cast<const ColumnUInt64 &>(*offsets).getData();
}
IColumn::Offsets & ColumnSparse::getOffsetsData()
{
return assert_cast<ColumnUInt64 &>(*offsets).getData();
}
size_t ColumnSparse::getValueIndex(size_t n) const
{
assert(n < _size);
const auto & offsets_data = getOffsetsData();
const auto * it = std::lower_bound(offsets_data.begin(), offsets_data.end(), n);
if (it == offsets_data.end() || *it != n)
return 0;
return it - offsets_data.begin() + 1;
}
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
{
if (!column)
return column;
if (const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get()))
{
auto columns = column_tuple->getColumns();
for (auto & element : columns)
element = recursiveRemoveSparse(element);
return ColumnTuple::create(columns);
}
return column->convertToFullColumnIfSparse();
}
}

231
src/Columns/ColumnSparse.h Normal file
View File

@ -0,0 +1,231 @@
#pragma once
#include <Columns/IColumn.h>
#include <Columns/IColumnImpl.h>
#include <Columns/ColumnsNumber.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
class Collator;
namespace DB
{
/** Column for spare representation.
* It stores column with non-default values and column
* with their sorted positions in original column. Column with
* values contains also one default value at 0 position to make
* implementation of execution of functions and sorting more convenient.
*/
class ColumnSparse final : public COWHelper<IColumn, ColumnSparse>
{
private:
friend class COWHelper<IColumn, ColumnSparse>;
explicit ColumnSparse(MutableColumnPtr && values_);
ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_);
ColumnSparse(const ColumnSparse &) = default;
public:
static constexpr auto DEFAULT_ROWS_SEARCH_SAMPLE_RATIO = 0.1;
static constexpr auto DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION = 0.95;
using Base = COWHelper<IColumn, ColumnSparse>;
static Ptr create(const ColumnPtr & values_, const ColumnPtr & offsets_, size_t size_)
{
return Base::create(values_->assumeMutable(), offsets_->assumeMutable(), size_);
}
template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
static MutablePtr create(TColumnPtr && values_, TColumnPtr && offsets_, size_t size_)
{
return Base::create(std::move(values_), std::move(offsets_), size_);
}
static Ptr create(const ColumnPtr & values_)
{
return Base::create(values_->assumeMutable());
}
template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
static MutablePtr create(TColumnPtr && values_)
{
return Base::create(std::forward<TColumnPtr>(values_));
}
bool isSparse() const override { return true; }
const char * getFamilyName() const override { return "Sparse"; }
std::string getName() const override { return "Sparse(" + values->getName() + ")"; }
TypeIndex getDataType() const override { return values->getDataType(); }
MutableColumnPtr cloneResized(size_t new_size) const override;
size_t size() const override { return _size; }
bool isDefaultAt(size_t n) const override;
bool isNullAt(size_t n) const override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
bool getBool(size_t n) const override;
Float64 getFloat64(size_t n) const override;
Float32 getFloat32(size_t n) const override;
UInt64 getUInt(size_t n) const override;
Int64 getInt(size_t n) const override;
UInt64 get64(size_t n) const override;
StringRef getDataAt(size_t n) const override;
ColumnPtr convertToFullColumnIfSparse() const override;
/// Will insert null value if pos=nullptr
void insertData(const char * pos, size_t length) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char *) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insert(const Field & x) override;
void insertFrom(const IColumn & src, size_t n) override;
void insertDefault() override;
void insertManyDefaults(size_t length) override;
void popBack(size_t n) override;
ColumnPtr filter(const Filter & filt, ssize_t) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type>
ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override;
void compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const override;
bool hasEqualValues() const override;
void getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const;
void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutationWithCollation(
const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const override;
size_t byteSize() const override;
size_t byteSizeAt(size_t n) const override;
size_t allocatedBytes() const override;
void protect() override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void updateHashFast(SipHash & hash) const override;
void getExtremes(Field & min, Field & max) const override;
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
void gather(ColumnGathererStream & gatherer_stream) override;
ColumnPtr compress() const override;
void forEachSubcolumn(ColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
bool isNullable() const override { return values->isNullable(); }
bool isFixedAndContiguous() const override { return false; }
bool valuesHaveFixedSize() const override { return values->valuesHaveFixedSize(); }
size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + values->sizeOfValueIfFixed(); }
bool isCollationSupported() const override { return values->isCollationSupported(); }
size_t getNumberOfDefaults() const { return _size - offsets->size(); }
size_t getNumberOfTrailingDefaults() const
{
return offsets->empty() ? _size : _size - getOffsetsData().back() - 1;
}
/// Return position of element in 'values' columns,
/// that corresponds to n-th element of full column.
/// O(log(offsets.size())) complexity,
size_t getValueIndex(size_t n) const;
const IColumn & getValuesColumn() const { return *values; }
IColumn & getValuesColumn() { return *values; }
const ColumnPtr & getValuesPtr() const { return values; }
ColumnPtr & getValuesPtr() { return values; }
const IColumn::Offsets & getOffsetsData() const;
IColumn::Offsets & getOffsetsData();
const ColumnPtr & getOffsetsPtr() const { return offsets; }
ColumnPtr & getOffsetsPtr() { return offsets; }
const IColumn & getOffsetsColumn() const { return *offsets; }
IColumn & getOffsetsColumn() { return *offsets; }
/// This class helps to iterate over all values in ColumnSparse.
class Iterator
{
public:
Iterator(const PaddedPODArray<UInt64> & offsets_, size_t size_, size_t current_offset_, size_t current_row_)
: offsets(offsets_), size(size_), current_offset(current_offset_), current_row(current_row_)
{
}
bool ALWAYS_INLINE isDefault() const { return current_offset == offsets.size() || current_row != offsets[current_offset]; }
size_t ALWAYS_INLINE getValueIndex() const { return isDefault() ? 0 : current_offset + 1; }
size_t ALWAYS_INLINE getCurrentRow() const { return current_row; }
size_t ALWAYS_INLINE getCurrentOffset() const { return current_offset; }
bool operator==(const Iterator & other) const
{
return size == other.size
&& current_offset == other.current_offset
&& current_row == other.current_row;
}
bool operator!=(const Iterator & other) const { return !(*this == other); }
Iterator operator++()
{
if (!isDefault())
++current_offset;
++current_row;
return *this;
}
private:
const PaddedPODArray<UInt64> & offsets;
const size_t size;
size_t current_offset;
size_t current_row;
};
Iterator begin() const { return Iterator(getOffsetsData(), _size, 0, 0); }
Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }
private:
using Inserter = std::function<void(IColumn &)>;
/// Inserts value to 'values' column via callback.
/// Properly handles cases, when inserted value is default.
/// Used, when it's unknown in advance if inserted value is default.
void insertSingleValue(const Inserter & inserter);
/// Contains default value at 0 position.
/// It's convenient, because it allows to execute, e.g functions or sorting,
/// for this column without handling different cases.
WrappedPtr values;
/// Sorted offsets of non-default values in the full column.
/// 'offsets[i]' corresponds to 'values[i + 1]'.
WrappedPtr offsets;
size_t _size;
};
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column);
}

View File

@ -474,8 +474,9 @@ void ColumnString::getExtremes(Field & min, Field & max) const
ColumnPtr ColumnString::compress() const
{
size_t source_chars_size = chars.size();
size_t source_offsets_size = offsets.size() * sizeof(Offset);
const size_t source_chars_size = chars.size();
const size_t source_offsets_elements = offsets.size();
const size_t source_offsets_size = source_offsets_elements * sizeof(Offset);
/// Don't compress small blocks.
if (source_chars_size < 4096) /// A wild guess.
@ -489,12 +490,14 @@ ColumnPtr ColumnString::compress() const
auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true);
return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(),
const size_t chars_compressed_size = chars_compressed->size();
const size_t offsets_compressed_size = offsets_compressed->size();
return ColumnCompressed::create(source_offsets_elements, chars_compressed_size + offsets_compressed_size,
[
chars_compressed = std::move(chars_compressed),
offsets_compressed = std::move(offsets_compressed),
source_chars_size,
source_offsets_elements = offsets.size()
source_offsets_elements
]
{
auto res = ColumnString::create();

View File

@ -107,6 +107,12 @@ public:
return StringRef(&chars[offsetAt(n)], sizeAt(n));
}
bool isDefaultAt(size_t n) const override
{
assert(n < size());
return sizeAt(n) == 1;
}
/// Suppress gcc 7.3.1 warning: '*((void*)&<anonymous> +8)' may be used uninitialized in this function
#if !defined(__clang__)
#pragma GCC diagnostic push
@ -278,6 +284,16 @@ public:
return typeid(rhs) == typeid(ColumnString);
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getRatioOfDefaultRowsImpl<ColumnString>(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndicesOfNonDefaultRowsImpl<ColumnString>(indices, from, limit);
}
Chars & getChars() { return chars; }
const Chars & getChars() const { return chars; }

View File

@ -12,6 +12,7 @@
#include <base/sort.h>
#include <base/map.h>
#include <base/range.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
namespace DB
@ -113,6 +114,15 @@ void ColumnTuple::get(size_t n, Field & res) const
res = tuple;
}
bool ColumnTuple::isDefaultAt(size_t n) const
{
const size_t tuple_size = columns.size();
for (size_t i = 0; i < tuple_size; ++i)
if (!columns[i]->isDefaultAt(n))
return false;
return true;
}
StringRef ColumnTuple::getDataAt(size_t) const
{
throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -536,4 +546,25 @@ ColumnPtr ColumnTuple::compress() const
});
}
double ColumnTuple::getRatioOfDefaultRows(double sample_ratio) const
{
return getRatioOfDefaultRowsImpl<ColumnTuple>(sample_ratio);
}
void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
return getIndicesOfNonDefaultRowsImpl<ColumnTuple>(indices, from, limit);
}
SerializationInfoPtr ColumnTuple::getSerializationInfo() const
{
MutableSerializationInfos infos;
infos.reserve(columns.size());
for (const auto & column : columns)
infos.push_back(const_pointer_cast<SerializationInfo>(column->getSerializationInfo()));
return std::make_shared<SerializationInfoTuple>(std::move(infos), SerializationInfo::Settings{});
}
}

View File

@ -53,6 +53,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
@ -93,6 +94,9 @@ public:
bool structureEquals(const IColumn & rhs) const override;
bool isCollationSupported() const override;
ColumnPtr compress() const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
SerializationInfoPtr getSerializationInfo() const override;
size_t tupleSize() const { return columns.size(); }

View File

@ -68,6 +68,7 @@ public:
Field operator[](size_t n) const override { return (*getNestedColumn())[n]; }
void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); }
bool isDefaultAt(size_t n) const override { return n == 0; }
StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); }
StringRef getDataAtWithTerminatingZero(size_t n) const override
{
@ -122,6 +123,16 @@ public:
return false;
}
double getRatioOfDefaultRows(double) const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemented for ColumnUnique");
}
void getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getIndicesOfNonDefaultRows' not implemented for ColumnUnique");
}
const UInt64 * tryGetSavedHash() const override { return reverse_index.tryGetSavedHash(); }
UInt128 getHash() const override { return hash.getHash(*getRawColumnPtr()); }

View File

@ -481,7 +481,8 @@ void ColumnVector<T>::getExtremes(Field & min, Field & max) const
template <typename T>
ColumnPtr ColumnVector<T>::compress() const
{
size_t source_size = data.size() * sizeof(T);
const size_t data_size = data.size();
const size_t source_size = data_size * sizeof(T);
/// Don't compress small blocks.
if (source_size < 4096) /// A wild guess.
@ -492,8 +493,9 @@ ColumnPtr ColumnVector<T>::compress() const
if (!compressed)
return ColumnCompressed::wrap(this->getPtr());
return ColumnCompressed::create(data.size(), compressed->size(),
[compressed = std::move(compressed), column_size = data.size()]
const size_t compressed_size = compressed->size();
return ColumnCompressed::create(data_size, compressed_size,
[compressed = std::move(compressed), column_size = data_size]
{
auto res = ColumnVector<T>::create(column_size);
ColumnCompressed::decompressBuffer(
@ -502,6 +504,24 @@ ColumnPtr ColumnVector<T>::compress() const
});
}
template <typename T>
ColumnPtr ColumnVector<T>::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
if (offsets.size() + shift != size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size());
auto res = this->create();
auto & res_data = res->getData();
T default_value = safeGet<T>(default_field);
res_data.resize_fill(total_rows, default_value);
for (size_t i = 0; i < offsets.size(); ++i)
res_data[offsets[i]] = data[i + shift];
return res;
}
/// Explicit template instantiations - to avoid code bloat in headers.
template class ColumnVector<UInt8>;
template class ColumnVector<UInt16>;

View File

@ -328,11 +328,25 @@ public:
return StringRef(reinterpret_cast<const char *>(&data[n]), sizeof(data[n]));
}
bool isDefaultAt(size_t n) const override { return data[n] == T{}; }
bool structureEquals(const IColumn & rhs) const override
{
return typeid(rhs) == typeid(ColumnVector<T>);
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
}
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
{
return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
}
ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
ColumnPtr compress() const override;
/// Replace elements that match the filter with zeroes. If inverted replaces not matched elements.

View File

@ -4,6 +4,7 @@
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnSparse.h>
#include <Core/ColumnWithTypeAndName.h>
@ -50,6 +51,9 @@ ConstantFilterDescription::ConstantFilterDescription(const IColumn & column)
FilterDescription::FilterDescription(const IColumn & column_)
{
if (column_.isSparse())
data_holder = recursiveRemoveSparse(column_.getPtr());
if (column_.lowCardinality())
data_holder = column_.convertToFullColumnIfLowCardinality();

View File

@ -4,11 +4,17 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
String IColumn::dumpStructure() const
{
WriteBufferFromOwnString res;
@ -30,6 +36,39 @@ void IColumn::insertFrom(const IColumn & src, size_t n)
insert(src[n]);
}
ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
if (offsets.size() + shift != size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size());
auto res = cloneEmpty();
res->reserve(total_rows);
ssize_t current_offset = -1;
for (size_t i = 0; i < offsets.size(); ++i)
{
ssize_t offsets_diff = static_cast<ssize_t>(offsets[i]) - current_offset;
current_offset = offsets[i];
if (offsets_diff > 1)
res->insertMany(default_field, offsets_diff - 1);
res->insertFrom(*this, i + shift);
}
ssize_t offsets_diff = static_cast<ssize_t>(total_rows) - current_offset;
if (offsets_diff > 1)
res->insertMany(default_field, offsets_diff - 1);
return res;
}
SerializationInfoPtr IColumn::getSerializationInfo() const
{
return std::make_shared<SerializationInfo>(ISerialization::getKind(*this), SerializationInfo::Settings{});
}
bool isColumnNullable(const IColumn & column)
{
return checkColumn<ColumnNullable>(column);

View File

@ -26,9 +26,8 @@ class ColumnGathererStream;
class Field;
class WeakHash32;
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;
class SerializationInfo;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
/*
* Represents a set of equal ranges in previous column to perform sorting in current column.
@ -64,9 +63,18 @@ public:
virtual Ptr convertToFullColumnIfConst() const { return getPtr(); }
/// If column isn't ColumnLowCardinality, return itself.
/// If column is ColumnLowCardinality, transforms is to full column.
/// If column is ColumnLowCardinality, transforms it to full column.
virtual Ptr convertToFullColumnIfLowCardinality() const { return getPtr(); }
/// If column isn't ColumnSparse, return itself.
/// If column is ColumnSparse, transforms it to full column.
virtual Ptr convertToFullColumnIfSparse() const { return getPtr(); }
Ptr convertToFullIfNeeded() const
{
return convertToFullColumnIfSparse()->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality();
}
/// Creates empty column with the same type.
virtual MutablePtr cloneEmpty() const { return cloneResized(0); }
@ -133,7 +141,7 @@ public:
throw Exception("Method getInt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
virtual bool isDefaultAt(size_t n) const { return get64(n) == 0; }
virtual bool isDefaultAt(size_t n) const = 0;
virtual bool isNullAt(size_t /*n*/) const { return false; }
/** If column is numeric, return value of n-th element, casted to bool.
@ -173,6 +181,13 @@ public:
insertFrom(src, position);
}
/// Appends one field multiple times. Can be optimized in inherited classes.
virtual void insertMany(const Field & field, size_t length)
{
for (size_t i = 0; i < length; ++i)
insert(field);
}
/// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented).
/// Is used to optimize some computations (in aggregation, for example).
/// Parameter length could be ignored if column values have fixed size.
@ -375,6 +390,22 @@ public:
throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
/// Returns ration of values in column, that equal to default value of column.
/// Checks only @sample_ratio ratio of rows.
virtual double getRatioOfDefaultRows(double sample_ratio = 1.0) const = 0;
/// Returns indices of values in column, that not equal to default value of column.
virtual void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const = 0;
/// Returns column with @total_size elements.
/// In result column values from current column are at positions from @offsets.
/// Other values are filled by @default_value.
/// @shift means how much rows to skip from the beginning of current column.
/// Used to create full column from sparse.
virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const;
virtual SerializationInfoPtr getSerializationInfo() const;
/// Compress column in memory to some representation that allows to decompress it back.
/// Return itself if compression is not applicable for this column type.
virtual Ptr compress() const
@ -457,6 +488,8 @@ public:
virtual bool lowCardinality() const { return false; }
virtual bool isSparse() const { return false; }
virtual bool isCollationSupported() const { return false; }
virtual ~IColumn() = default;
@ -468,7 +501,6 @@ public:
String dumpStructure() const;
protected:
/// Template is to devirtualize calls to insertFrom method.
/// In derived classes (that use final keyword), implement scatter method as call to scatterImpl.
template <typename Derived>
@ -489,6 +521,13 @@ protected:
template <typename Derived>
bool hasEqualValuesImpl() const;
/// Template is to devirtualize calls to 'isDefaultAt' method.
template <typename Derived>
double getRatioOfDefaultRowsImpl(double sample_ratio) const;
template <typename Derived>
void getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const;
/// Uses std::sort and partial_sort as default algorithms.
/// Implements 'less' and 'equals' via comparator.
/// If 'less' and 'equals' can be implemented more optimal

View File

@ -46,6 +46,7 @@ public:
Field operator[](size_t) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
void get(size_t, Field &) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
void insert(const Field &) override { throw Exception("Cannot insert element into " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
bool isDefaultAt(size_t) const override { throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
StringRef getDataAt(size_t) const override
{
@ -161,6 +162,16 @@ public:
return res;
}
double getRatioOfDefaultRows(double) const override
{
throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void gather(ColumnGathererStream &) override
{
throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);

View File

@ -16,6 +16,7 @@ namespace DB
namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int LOGICAL_ERROR;
}
template <typename Derived>
@ -141,6 +142,56 @@ bool IColumn::hasEqualValuesImpl() const
return true;
}
template <typename Derived>
double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const
{
if (sample_ratio <= 0.0 || sample_ratio > 1.0)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio);
/// Randomize a little to avoid boundary effects.
std::uniform_int_distribution<size_t> dist(1, static_cast<size_t>(1.0 / sample_ratio));
size_t num_rows = size();
size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);
size_t num_checked_rows = dist(thread_local_rng);
num_sampled_rows = std::min(num_sampled_rows + dist(thread_local_rng), num_rows);
size_t res = 0;
if (num_sampled_rows == num_rows)
{
for (size_t i = 0; i < num_rows; ++i)
res += static_cast<const Derived &>(*this).isDefaultAt(i);
num_checked_rows = num_rows;
}
else if (num_sampled_rows != 0)
{
for (size_t i = num_checked_rows; i < num_rows; ++i)
{
if (num_checked_rows * num_rows <= i * num_sampled_rows)
{
res += static_cast<const Derived &>(*this).isDefaultAt(i);
++num_checked_rows;
}
}
}
return static_cast<double>(res) / num_checked_rows;
}
template <typename Derived>
void IColumn::getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const
{
size_t to = limit && from + limit < size() ? from + limit : size();
indices.reserve(indices.size() + to - from);
for (size_t i = from; i < to; ++i)
{
if (!static_cast<const Derived &>(*this).isDefaultAt(i))
indices.push_back(i);
}
}
template <typename Comparator>
void IColumn::updatePermutationImpl(
size_t limit,

View File

@ -293,7 +293,7 @@ void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty)
column.column = column_function->getResultType()->createColumn();
}
int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments)
int checkShortCircuitArguments(const ColumnsWithTypeAndName & arguments)
{
int last_short_circuit_argument_index = -1;
for (size_t i = 0; i != arguments.size(); ++i)

View File

@ -66,7 +66,7 @@ void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty = false);
/// Check if arguments contain lazy executed argument. If contain, return index of the last one,
/// otherwise return -1.
int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments);
int checkShortCircuitArguments(const ColumnsWithTypeAndName & arguments);
void copyMask(const PaddedPODArray<UInt8> & from, PaddedPODArray<UInt8> & to);

View File

@ -0,0 +1,327 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsNumber.h>
#include <Common/randomSeed.h>
#include <pcg_random.hpp>
#include <gtest/gtest.h>
#include <algorithm>
#include <numeric>
#include <Common/FieldVisitors.h>
using namespace DB;
pcg64 rng(randomSeed());
std::pair<MutableColumnPtr, MutableColumnPtr> createColumns(size_t n, size_t k)
{
auto values = ColumnVector<UInt64>::create();
auto offsets = ColumnVector<UInt64>::create();
auto full = ColumnVector<UInt64>::create();
auto & values_data = values->getData();
auto & offsets_data = offsets->getData();
auto & full_data = full->getData();
values_data.push_back(0);
for (size_t i = 0; i < n; ++i)
{
bool not_zero = rng() % k == 0;
size_t value = not_zero ? rng() % 1000000 : 0;
full_data.push_back(value);
if (not_zero)
{
values_data.push_back(value);
offsets_data.push_back(i);
}
}
auto sparse = ColumnSparse::create(std::move(values), std::move(offsets), n);
return std::make_pair(std::move(sparse), std::move(full));
}
bool checkEquals(const IColumn & lhs, const IColumn & rhs)
{
if (lhs.size() != rhs.size())
return false;
for (size_t i = 0; i < lhs.size(); ++i)
if (lhs.compareAt(i, i, rhs, 0) != 0)
return false;
return true;
}
// Can't use ErrorCodes, because of 'using namespace DB'.
constexpr int error_code = 12345;
constexpr size_t T = 5000;
constexpr size_t MAX_ROWS = 10000;
constexpr size_t sparse_ratios[] = {1, 2, 5, 10, 32, 50, 64, 100, 256, 500, 1000, 5000, 10000};
constexpr size_t K = sizeof(sparse_ratios) / sizeof(sparse_ratios[0]);
#define DUMP_COLUMN(column) std::cerr << #column << ": " << (column)->dumpStructure() << "\n"
TEST(ColumnSparse, InsertRangeFrom)
{
auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t from, size_t len)
{
auto [sparse_dst, full_dst] = createColumns(n1, k1);
auto [sparse_src, full_src] = createColumns(n2, k2);
sparse_dst->insertRangeFrom(*sparse_src, from, len);
full_dst->insertRangeFrom(*full_src, from, len);
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
for (size_t i = 0; i < T; ++i)
{
size_t n1 = rng() % MAX_ROWS + 1;
size_t k1 = sparse_ratios[rng() % K];
size_t n2 = rng() % MAX_ROWS + 1;
size_t k2 = sparse_ratios[rng() % K];
size_t from = rng() % n2;
size_t to = rng() % n2;
if (from > to)
std::swap(from, to);
test_case(n1, k1, n2, k2, from, to - from);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnSparse, PopBack)
{
auto test_case = [&](size_t n, size_t k, size_t m)
{
auto [sparse_dst, full_dst] = createColumns(n, k);
sparse_dst->popBack(m);
full_dst->popBack(m);
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t m = rng() % n;
test_case(n, k, m);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnSparse, Filter)
{
auto test_case = [&](size_t n, size_t k, size_t m)
{
auto [sparse_src, full_src] = createColumns(n, k);
PaddedPODArray<UInt8> filt(n);
for (size_t i = 0; i < n; ++i)
filt[i] = rng() % m == 0;
auto sparse_dst = sparse_src->filter(filt, -1);
auto full_dst = full_src->filter(filt, -1);
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t m = sparse_ratios[rng() % K];
test_case(n, k, m);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnSparse, Permute)
{
auto test_case = [&](size_t n, size_t k, size_t limit)
{
auto [sparse_src, full_src] = createColumns(n, k);
IColumn::Permutation perm(n);
std::iota(perm.begin(), perm.end(), 0);
std::shuffle(perm.begin(), perm.end(), rng);
auto sparse_dst = sparse_src->permute(perm, limit);
auto full_dst = full_src->permute(perm, limit);
if (limit)
{
sparse_dst = sparse_dst->cut(0, limit);
full_dst = full_dst->cut(0, limit);
}
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t limit = rng() % 2 ? 0 : rng() % n;
test_case(n, k, limit);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnSparse, CompareColumn)
{
auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t row_num)
{
auto [sparse_src1, full_src1] = createColumns(n1, k1);
auto [sparse_src2, full_src2] = createColumns(n2, k2);
PaddedPODArray<Int8> comp_sparse;
PaddedPODArray<Int8> comp_full;
sparse_src1->compareColumn(*sparse_src2, row_num, nullptr, comp_sparse, 1, 1);
full_src1->compareColumn(*full_src2, row_num, nullptr, comp_full, 1, 1);
if (comp_sparse != comp_full)
{
DUMP_COLUMN(sparse_src1);
DUMP_COLUMN(full_src1);
DUMP_COLUMN(sparse_src2);
DUMP_COLUMN(full_src2);
throw Exception(error_code, "Compare results are unequal");
}
};
try
{
for (size_t i = 0; i < T; ++i)
{
size_t n1 = rng() % MAX_ROWS + 1;
size_t k1 = sparse_ratios[rng() % K];
size_t n2 = rng() % MAX_ROWS + 1;
size_t k2 = sparse_ratios[rng() % K];
size_t row_num = rng() % n2;
test_case(n1, k1, n2, k2, row_num);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnSparse, GetPermutation)
{
auto test_case = [&](size_t n, size_t k, size_t limit, bool reverse)
{
auto [sparse_src, full_src] = createColumns(n, k);
IColumn::Permutation perm_sparse;
IColumn::Permutation perm_full;
sparse_src->getPermutation(reverse, limit, 1, perm_sparse);
full_src->getPermutation(reverse, limit, 1, perm_full);
auto sparse_sorted = sparse_src->permute(perm_sparse, limit);
auto full_sorted = full_src->permute(perm_full, limit);
if (limit)
{
sparse_sorted = sparse_sorted->cut(0, limit);
full_sorted = full_sorted->cut(0, limit);
}
if (!checkEquals(*sparse_sorted->convertToFullColumnIfSparse(), *full_sorted))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_sorted);
DUMP_COLUMN(full_sorted);
throw Exception(error_code, "Sorted columns are unequal");
}
};
try
{
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t limit = rng() % 2 ? 0 : rng() % n;
bool reverse = rng() % 2;
test_case(n, k, limit, reverse);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
#undef DUMP_COLUMN
#undef DUMP_NON_DEFAULTS

View File

@ -838,7 +838,7 @@ bool Dwarf::findLocation(
// The next inlined subroutine's call file and call line is the current
// caller's location.
for (size_t i = 0; i < num_found - 1; i++)
for (size_t i = 0; i < num_found - 1; ++i)
{
call_locations[i].file = call_locations[i + 1].file;
call_locations[i].line = call_locations[i + 1].line;

Some files were not shown because too many files have changed in this diff Show More