Merge branch 'master' into update-arrow

Alexey Milovidov 2022-07-31 03:05:53 +03:00 committed by GitHub
commit 075ff5005e
792 changed files with 4908 additions and 3718 deletions


@@ -1126,6 +1126,84 @@ jobs:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
FunctionalStatelessTestReleaseDatabaseReplicated0:
needs: [BuilderDebRelease]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_database_replicated
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (release, DatabaseReplicated)
REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=2
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
- name: Cleanup
if: always()
run: |
# shellcheck disable=SC2046
docker kill $(docker ps -q) ||:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
FunctionalStatelessTestReleaseDatabaseReplicated1:
needs: [BuilderDebRelease]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_database_replicated
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (release, DatabaseReplicated)
REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=2
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
- name: Cleanup
if: always()
run: |
# shellcheck disable=SC2046
docker kill $(docker ps -q) ||:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
FunctionalStatelessTestReleaseS3:
needs: [BuilderDebRelease]
runs-on: [self-hosted, func-tester]
@@ -1706,43 +1784,6 @@ jobs:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
FunctionalStatefulTestReleaseDatabaseOrdinary:
needs: [BuilderDebRelease]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateful_release_database_ordinary
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateful tests (release, DatabaseOrdinary)
REPO_COPY=${{runner.temp}}/stateful_release_database_ordinary/ClickHouse
KILL_TIMEOUT=3600
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
- name: Cleanup
if: always()
run: |
# shellcheck disable=SC2046
docker kill $(docker ps -q) ||:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
FunctionalStatefulTestAarch64:
needs: [BuilderDebAarch64]
runs-on: [self-hosted, func-tester-aarch64]
@@ -3063,6 +3104,8 @@ jobs:
- FunctionalStatelessTestDebug2
- FunctionalStatelessTestRelease
- FunctionalStatelessTestReleaseDatabaseOrdinary
- FunctionalStatelessTestReleaseDatabaseReplicated0
- FunctionalStatelessTestReleaseDatabaseReplicated1
- FunctionalStatelessTestAarch64
- FunctionalStatelessTestAsan0
- FunctionalStatelessTestAsan1
@@ -3075,7 +3118,6 @@ jobs:
- FunctionalStatelessTestUBsan
- FunctionalStatefulTestDebug
- FunctionalStatefulTestRelease
- FunctionalStatefulTestReleaseDatabaseOrdinary
- FunctionalStatelessTestReleaseS3
- FunctionalStatefulTestAarch64
- FunctionalStatefulTestAsan
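
The two `FunctionalStatelessTestReleaseDatabaseReplicated` jobs added above are identical except for `RUN_BY_HASH_NUM`, which selects one of `RUN_BY_HASH_TOTAL` shards of the test suite so the jobs can run in parallel. A minimal sketch of how a runner could consume these variables, assuming a stable hash over test names (the helper and hashing scheme are illustrative, not the actual logic of `functional_test_check.py`):

```python
import os
import zlib

def select_shard(tests, shard_num, shard_total):
    """Keep only the tests whose stable hash falls into this job's bucket."""
    return [t for t in tests
            if zlib.crc32(t.encode()) % shard_total == shard_num]

# Each job exports these via $GITHUB_ENV, e.g. RUN_BY_HASH_NUM=0, RUN_BY_HASH_TOTAL=2.
num = int(os.environ.get("RUN_BY_HASH_NUM", "0"))
total = int(os.environ.get("RUN_BY_HASH_TOTAL", "1"))
my_tests = select_shard(["00001_select", "00002_insert"], num, total)
```

Because the hash is stable across jobs, every test lands in exactly one bucket, so the two shards are disjoint and together cover the whole suite.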


@@ -254,7 +254,7 @@ jobs:
#################################### ORDINARY BUILDS ####################################
#########################################################################################
BuilderDebRelease:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -301,7 +301,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
BuilderBinRelease:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -345,53 +345,8 @@ jobs:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
# BuilderBinGCC:
# needs: [DockerHubPush, FastTest]
# runs-on: [self-hosted, builder]
# steps:
# - name: Set envs
# run: |
# cat >> "$GITHUB_ENV" << 'EOF'
# TEMP_PATH=${{runner.temp}}/build_check
# IMAGES_PATH=${{runner.temp}}/images_path
# REPO_COPY=${{runner.temp}}/build_check/ClickHouse
# CACHES_PATH=${{runner.temp}}/../ccaches
# BUILD_NAME=binary_gcc
# EOF
# - name: Download changed images
# uses: actions/download-artifact@v2
# with:
# name: changed_images
# path: ${{ runner.temp }}/images_path
# - name: Clear repository
# run: |
# sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
# - name: Check out repository code
# uses: actions/checkout@v2
# - name: Build
# run: |
# git -C "$GITHUB_WORKSPACE" submodule sync --recursive
# git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
# sudo rm -fr "$TEMP_PATH"
# mkdir -p "$TEMP_PATH"
# cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
# cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
# - name: Upload build URLs to artifacts
# if: ${{ success() || failure() }}
# uses: actions/upload-artifact@v2
# with:
# name: ${{ env.BUILD_URLS }}
# path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
# - name: Cleanup
# if: always()
# run: |
# # shellcheck disable=SC2046
# docker kill $(docker ps -q) ||:
# # shellcheck disable=SC2046
# docker rm -f $(docker ps -a -q) ||:
# sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderDebAarch64:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -438,7 +393,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderDebAsan:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -483,7 +438,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderDebUBsan:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -528,7 +483,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderDebTsan:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -573,7 +528,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderDebMsan:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -618,7 +573,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderDebDebug:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -666,7 +621,7 @@ jobs:
##################################### SPECIAL BUILDS #####################################
##########################################################################################
BuilderDebSplitted:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -711,7 +666,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderBinClangTidy:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -756,7 +711,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderBinDarwin:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -801,7 +756,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderBinAarch64:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -846,7 +801,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderBinFreeBSD:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -891,7 +846,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderBinDarwinAarch64:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -936,7 +891,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
BuilderBinPPC64:
needs: [DockerHubPush, FastTest]
needs: [DockerHubPush, FastTest, StyleCheck]
runs-on: [self-hosted, builder]
steps:
- name: Set envs
@@ -2974,42 +2929,6 @@ jobs:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
# UnitTestsReleaseGCC:
# needs: [BuilderBinGCC]
# runs-on: [self-hosted, fuzzer-unit-tester]
# steps:
# - name: Set envs
# run: |
# cat >> "$GITHUB_ENV" << 'EOF'
# TEMP_PATH=${{runner.temp}}/unit_tests_asan
# REPORTS_PATH=${{runner.temp}}/reports_dir
# CHECK_NAME=Unit tests (release-gcc)
# REPO_COPY=${{runner.temp}}/unit_tests_asan/ClickHouse
# EOF
# - name: Download json reports
# uses: actions/download-artifact@v2
# with:
# path: ${{ env.REPORTS_PATH }}
# - name: Clear repository
# run: |
# sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
# - name: Check out repository code
# uses: actions/checkout@v2
# - name: Unit test
# run: |
# sudo rm -fr "$TEMP_PATH"
# mkdir -p "$TEMP_PATH"
# cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
# cd "$REPO_COPY/tests/ci"
# python3 unit_tests_check.py "$CHECK_NAME"
# - name: Cleanup
# if: always()
# run: |
# # shellcheck disable=SC2046
# docker kill $(docker ps -q) ||:
# # shellcheck disable=SC2046
# docker rm -f $(docker ps -a -q) ||:
# sudo rm -fr "$TEMP_PATH"
UnitTestsTsan:
needs: [BuilderDebTsan]
runs-on: [self-hosted, fuzzer-unit-tester]


@@ -12,11 +12,7 @@ ClickHouse® is an open-source column-oriented database management system that a
* [Blog](https://clickhouse.com/blog/en/) contains various ClickHouse-related articles, as well as announcements and reports about events.
* [Code Browser (Woboq)](https://clickhouse.com/codebrowser/ClickHouse/index.html) with syntax highlight and navigation.
* [Code Browser (github.dev)](https://github.dev/ClickHouse/ClickHouse) with syntax highlight, powered by github.dev.
* [Contacts](https://clickhouse.com/company/#contact) can help to get your questions answered if there are any.
* [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any.
## Upcoming events
* [v22.7 Release Webinar](https://clickhouse.com/company/events/v22-7-release-webinar/) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
* [ClickHouse Meetup at the Cloudflare office in London](https://www.meetup.com/clickhouse-london-user-group/events/286891586/) ClickHouse meetup at the Cloudflare office space in central London
* [ClickHouse Meetup at the Metoda office in Munich](https://www.meetup.com/clickhouse-meetup-munich/events/286891667/) ClickHouse meetup at the Metoda office in Munich
* **v22.8 Release Webinar** Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.


@@ -16,7 +16,7 @@ option (ENABLE_SSE41 "Use SSE4.1 instructions on x86_64" 1)
option (ENABLE_SSE42 "Use SSE4.2 instructions on x86_64" 1)
option (ENABLE_PCLMULQDQ "Use pclmulqdq instructions on x86_64" 1)
option (ENABLE_POPCNT "Use popcnt instructions on x86_64" 1)
option (ENABLE_AVX "Use AVX instructions on x86_64" 0)
option (ENABLE_AVX "Use AVX instructions on x86_64" 1)
option (ENABLE_AVX2 "Use AVX2 instructions on x86_64" 0)
option (ENABLE_AVX512 "Use AVX512 instructions on x86_64" 0)
option (ENABLE_AVX512_VBMI "Use AVX512_VBMI instruction on x86_64 (depends on ENABLE_AVX512)" 0)

2
contrib/NuRaft vendored

@@ -1 +1 @@
Subproject commit 1334b9ae72576821a698d657d08838861cf33007
Subproject commit e1dc47c1cfd529801a8c94a396a3921a71ae3ccf

2
contrib/libgsasl vendored

@@ -1 +1 @@
Subproject commit 383ee28e82f69fa16ed43b48bd9c8ee5b313ab84
Subproject commit 0324680f13f22bb43df5353a08e26453d7d640ac


@@ -55,7 +55,7 @@ ccache --zero-stats ||:
if [ "$BUILD_MUSL_KEEPER" == "1" ]
then
# build keeper with musl separately
cmake --debug-trycompile --verbose=1 -DBUILD_STANDALONE_KEEPER=1 -DENABLE_CLICKHOUSE_KEEPER=1 -DCMAKE_VERBOSE_MAKEFILE=1 -DUSE_MUSL=1 -LA -DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-x86_64-musl.cmake "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
cmake --debug-trycompile -DBUILD_STANDALONE_KEEPER=1 -DENABLE_CLICKHOUSE_KEEPER=1 -DCMAKE_VERBOSE_MAKEFILE=1 -DUSE_MUSL=1 -LA -DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-x86_64-musl.cmake "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
# shellcheck disable=SC2086 # No quotes because I want it to expand to nothing if empty.
ninja $NINJA_FLAGS clickhouse-keeper
@@ -70,10 +70,10 @@ then
rm -f CMakeCache.txt
# Build the rest of binaries
cmake --debug-trycompile --verbose=1 -DBUILD_STANDALONE_KEEPER=0 -DCREATE_KEEPER_SYMLINK=0 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
cmake --debug-trycompile -DBUILD_STANDALONE_KEEPER=0 -DCREATE_KEEPER_SYMLINK=0 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
else
# Build everything
cmake --debug-trycompile --verbose=1 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
cmake --debug-trycompile -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
fi
if [ "coverity" == "$COMBINED_OUTPUT" ]
@@ -88,7 +88,7 @@ fi
# No quotes because I want it to expand to nothing if empty.
# shellcheck disable=SC2086 # No quotes because I want it to expand to nothing if empty.
$SCAN_WRAPPER ninja $NINJA_FLAGS clickhouse-bundle
$SCAN_WRAPPER ninja $NINJA_FLAGS $BUILD_TARGET
ls -la ./programs


@@ -134,6 +134,7 @@ def parse_env_variables(
result = []
result.append("OUTPUT_DIR=/output")
cmake_flags = ["$CMAKE_FLAGS"]
build_target = "clickhouse-bundle"
is_cross_darwin = compiler.endswith(DARWIN_SUFFIX)
is_cross_darwin_arm = compiler.endswith(DARWIN_ARM_SUFFIX)
@@ -236,7 +237,11 @@ def parse_env_variables(
result.append("CCACHE_BASEDIR=/build")
result.append("CCACHE_NOHASHDIR=true")
result.append("CCACHE_COMPILERCHECK=content")
result.append("CCACHE_MAXSIZE=15G")
cache_maxsize = "15G"
if clang_tidy:
# 15G is not enough for tidy build
cache_maxsize = "25G"
result.append(f"CCACHE_MAXSIZE={cache_maxsize}")
# result.append("CCACHE_UMASK=777")
if distcc_hosts:
@@ -268,15 +273,20 @@ def parse_env_variables(
# we have to build them at least in some way in CI. The split build is
# probably the least heavy disk-wise.
cmake_flags.append("-DENABLE_UTILS=1")
# utils are not included into clickhouse-bundle, so build everything
build_target = "all"
if clang_tidy:
cmake_flags.append("-DENABLE_CLANG_TIDY=1")
cmake_flags.append("-DENABLE_UTILS=1")
cmake_flags.append("-DENABLE_TESTS=1")
cmake_flags.append("-DENABLE_EXAMPLES=1")
# Don't stop on first error to find more clang-tidy errors in one run.
result.append("NINJA_FLAGS=-k0")
cmake_flags.append("-DENABLE_UTILS=1")
# utils are not included into clickhouse-bundle, so build everything
build_target = "all"
if with_coverage:
cmake_flags.append("-DWITH_COVERAGE=1")
@@ -290,6 +300,7 @@ def parse_env_variables(
cmake_flags.append("-DCLICKHOUSE_OFFICIAL_BUILD=1")
result.append('CMAKE_FLAGS="' + " ".join(cmake_flags) + '"')
result.append(f"BUILD_TARGET={build_target}")
return result


@@ -34,13 +34,14 @@ RUN apt-get update \
ARG TARGETARCH
# Install MySQL ODBC driver from RHEL rpm
# For reference https://downloads.mysql.com/archives/c-odbc/ RHEL
RUN arch=${TARGETARCH:-amd64} \
&& case $arch in \
amd64) rarch=x86_64 ;; \
arm64) rarch=aarch64 ;; \
esac \
&& cd /tmp \
&& curl -o mysql-odbc.rpm "https://cdn.mysql.com/Downloads/Connector-ODBC/8.0/mysql-connector-odbc-8.0.27-1.el8.${rarch}.rpm" \
&& curl -o mysql-odbc.rpm "https://cdn.mysql.com/archives/mysql-connector-odbc-8.0/mysql-connector-odbc-8.0.27-1.el8.${rarch}.rpm" \
&& rpm2archive mysql-odbc.rpm \
&& tar xf mysql-odbc.rpm.tgz -C / ./usr/lib64/ \
&& LINK_DIR=$(dpkg -L libodbc1 | grep '^/usr/lib/.*-linux-gnu/odbc$') \


@@ -149,7 +149,7 @@ Features:
### ClickCat {#clickcat}
[ClickCat](https://github.com/open-botech/ClickCat) is a firendly user interface that lets you search, explore and visualize your ClickHouse Data.
[ClickCat](https://github.com/clickcat-project/ClickCat) is a friendly user interface that lets you search, explore and visualize your ClickHouse Data.
Features:


@@ -5,12 +5,12 @@ sidebar_label: Testing Hardware
# How to Test Your Hardware with ClickHouse
You can run basic ClickHouse performance test on any server without installation of ClickHouse packages.
You can run a basic ClickHouse performance test on any server without installation of ClickHouse packages.
## Automated Run
You can run benchmark with a single script.
You can run the benchmark with a single script.
1. Download the script.
```
@@ -26,58 +26,3 @@ chmod a+x ./hardware.sh
3. Copy the output and send it to feedback@clickhouse.com
All the results are published here: https://clickhouse.com/benchmark/hardware/
## Manual Run
Alternatively, you can perform the benchmark in the following steps.
1. ssh to the server and download the binary with wget:
```bash
# For amd64:
wget https://builds.clickhouse.com/master/amd64/clickhouse
# For aarch64:
wget https://builds.clickhouse.com/master/aarch64/clickhouse
# For powerpc64le:
wget https://builds.clickhouse.com/master/powerpc64le/clickhouse
# For freebsd:
wget https://builds.clickhouse.com/master/freebsd/clickhouse
# For freebsd-aarch64:
wget https://builds.clickhouse.com/master/freebsd-aarch64/clickhouse
# For freebsd-powerpc64le:
wget https://builds.clickhouse.com/master/freebsd-powerpc64le/clickhouse
# For macos:
wget https://builds.clickhouse.com/master/macos/clickhouse
# For macos-aarch64:
wget https://builds.clickhouse.com/master/macos-aarch64/clickhouse
# Then do:
chmod a+x clickhouse
```
2. Download benchmark files:
```bash
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/hardware/benchmark-new.sh
chmod a+x benchmark-new.sh
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/hardware/queries.sql
```
3. Download the [web analytics dataset](../getting-started/example-datasets/metrica.md) (“hits” table containing 100 million rows).
```bash
wget https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz
tar xvf hits_100m_obfuscated_v1.tar.xz -C .
mv hits_100m_obfuscated_v1/* .
```
4. Run the server:
```bash
./clickhouse server
```
5. Check the data: ssh to the server in another terminal
```bash
./clickhouse client --query "SELECT count() FROM hits_100m_obfuscated"
100000000
```
6. Run the benchmark:
```bash
./benchmark-new.sh hits_100m_obfuscated
```
7. Send the numbers and the info about your hardware configuration to feedback@clickhouse.com
All the results are published here: https://clickhouse.com/benchmark/hardware/


@@ -197,7 +197,7 @@ Default value: `480` (8 minute).
Parameter of a task that cleans up garbage from `store/` directory.
If some subdirectory is not used by clickhouse-server and this directory was not modified for last
`database_catalog_unused_dir_hide_timeout_sec` seconds, the task will "hide" this directory by
removing all access rights. It also works for directories that clickhouse-server does not
expect to see inside `store/`. Zero means "immediately".
@@ -206,10 +206,10 @@ Default value: `3600` (1 hour).
## database_catalog_unused_dir_rm_timeout_sec {#database_catalog_unused_dir_rm_timeout_sec}
Parameter of a task that cleans up garbage from `store/` directory.
If some subdirectory is not used by clickhouse-server and it was previously "hidden"
(see [database_catalog_unused_dir_hide_timeout_sec](../../operations/server-configuration-parameters/settings.md#database_catalog_unused_dir_hide_timeout_sec))
and this directory was not modified for last
`database_catalog_unused_dir_rm_timeout_sec` seconds, the task will remove this directory.
It also works for directories that clickhouse-server does not
expect to see inside `store/`. Zero means "never".
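
Together these two settings describe a two-phase cleanup of `store/`: an unused subdirectory is first "hidden" (all access rights removed) after `database_catalog_unused_dir_hide_timeout_sec` seconds of inactivity, and actually deleted only after a further `database_catalog_unused_dir_rm_timeout_sec` seconds. A rough sketch of that decision logic, as an illustration only (the helper and the timeout values are assumptions, not the server's implementation):

```python
import os
import shutil
import time

HIDE_TIMEOUT = 3600      # database_catalog_unused_dir_hide_timeout_sec
RM_TIMEOUT = 30 * 86400  # database_catalog_unused_dir_rm_timeout_sec (example value)

def cleanup_unused_dir(path: str) -> None:
    st = os.stat(path)
    age = time.time() - st.st_mtime
    hidden = (st.st_mode & 0o777) == 0  # "hidden" = all access rights dropped
    if hidden and age > RM_TIMEOUT:
        shutil.rmtree(path)   # phase 2: remove the long-hidden directory
    elif not hidden and age > HIDE_TIMEOUT:
        os.chmod(path, 0)     # phase 1: hide it by removing all access rights
```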
@@ -731,6 +731,16 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa
- [max_server_memory_usage](#max_server_memory_usage)
## concurrent_threads_soft_limit {#concurrent_threads_soft_limit}
The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. If the limit is reached, the query will still get one thread to run.
Possible values:
- Positive integer.
- 0 — No limit.
- -1 — The parameter is initialized to the number of logical cores multiplied by 3, which is a good heuristic for CPU-bound tasks.
Default value: `0`.
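
The `-1` case matches the heuristic in the `Server.cpp` change later in this commit: the limit is derived from the number of logical cores. A sketch of the same resolution logic:

```python
import os
from typing import Optional

def effective_soft_limit(configured: int) -> Optional[int]:
    """Resolve concurrent_threads_soft_limit; None means unlimited."""
    THREAD_FACTOR = 3  # ~3 threads per logical core suits CPU-bound queries
    if configured == -1:
        return (os.cpu_count() or 1) * THREAD_FACTOR
    return configured or None  # 0 -> unlimited
```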
## max_concurrent_queries {#max-concurrent-queries}
The maximum number of simultaneously processed queries.


@@ -4,7 +4,7 @@ sidebar_position: 6
# any
Selects the first encountered value.
Selects the first encountered (non-NULL) value, unless all rows have NULL values in that column.
The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate.
To get a determinate result, you can use the min or max function instead of any.


@@ -956,9 +956,28 @@ SELECT
## timeSlots(StartTime, Duration,\[, Size\])
For a time interval starting at StartTime and continuing for Duration seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the Size in seconds. Size is an optional parameter: a constant UInt32, set to 1800 by default.
For example, `timeSlots(toDateTime('2012-01-01 12:20:00'), 600) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]`.
This is necessary for searching for pageviews in the corresponding session.
For a time interval starting at StartTime and continuing for Duration seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the Size in seconds. Size is an optional parameter set to 1800 seconds (30 minutes) by default.
This is necessary, for example, when searching for pageviews in the corresponding session.
Accepts DateTime and DateTime64 as StartTime argument. For DateTime, Duration and Size arguments must be `UInt32`. For DateTime64 they must be `Decimal64`.
Returns an array of DateTime/DateTime64 (return type matches the type of StartTime). For DateTime64, the return value's scale can differ from the scale of StartTime --- the highest scale among all given arguments is taken.
Example:
```sql
SELECT timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600));
SELECT timeSlots(toDateTime('1980-12-12 21:01:02', 'UTC'), toUInt32(600), 299);
SELECT timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64(600.1, 1), toDecimal64(299, 0));
```
``` text
┌─timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600))─┐
│ ['2012-01-01 12:00:00','2012-01-01 12:30:00'] │
└─────────────────────────────────────────────────────────────┘
┌─timeSlots(toDateTime('1980-12-12 21:01:02', 'UTC'), toUInt32(600), 299)─┐
│ ['1980-12-12 20:56:13','1980-12-12 21:01:12','1980-12-12 21:06:11'] │
└─────────────────────────────────────────────────────────────────────────┘
┌─timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64(600.1, 1), toDecimal64(299, 0))─┐
│ ['1980-12-12 20:56:13.0000','1980-12-12 21:01:12.0000','1980-12-12 21:06:11.0000'] │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
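
The rounding rule behind these results can be reproduced outside ClickHouse. A small sketch working in whole epoch seconds (it ignores DateTime64 sub-second scale, so it only approximates the third example):

```python
from datetime import datetime, timezone

def time_slots(start: datetime, duration: int, size: int = 1800):
    """Every multiple of `size` seconds from floor(start) up to start + duration."""
    start_ts = int(start.timestamp())
    first = start_ts - start_ts % size  # round start down to a slot boundary
    return [datetime.fromtimestamp(t, tz=timezone.utc)
            for t in range(first, start_ts + duration + 1, size)]

# Reproduces the first example above:
print(time_slots(datetime(2012, 1, 1, 12, 20, tzinfo=timezone.utc), 600))
# -> [2012-01-01 12:00:00, 2012-01-01 12:30:00]
```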
## formatDateTime


@@ -11,7 +11,7 @@ Compressed files are supported. Compression type is detected by the extension of
**Syntax**
```sql
SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
SELECT <expr_list> INTO OUTFILE file_name [AND STDOUT] [COMPRESSION type [LEVEL level]]
```
`file_name` and `type` are string literals. Supported compression types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`.
@@ -23,6 +23,7 @@ SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
- This functionality is available in the [command-line client](../../../interfaces/cli.md) and [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Thus a query sent via [HTTP interface](../../../interfaces/http.md) will fail.
- The query will fail if a file with the same file name already exists.
- The default [output format](../../../interfaces/formats.md) is `TabSeparated` (like in the command-line client batch mode). Use [FORMAT](format.md) clause to change it.
- If `AND STDOUT` is mentioned in the query then the output that is written to the file is also displayed on standard output. If used with compression, the plaintext is displayed on standard output.
**Example**


@@ -944,14 +944,31 @@ SELECT now('Europe/Moscow');
## timeSlot {#timeslot}
Rounds the time down to the half hour.
This function is specific to Yandex.Metrica, since half an hour is the minimum time such that, if two time-adjacent hits of one visitor on one counter are separated by strictly more than this time, the visit may be split into two visits. That is, tuples (counter number, visitor id, time slot) can be used to find the hits that belong to the corresponding visit.
## timeSlots(StartTime, Duration,\[, Size\]) {#timeslotsstarttime-duration-size}
For a time interval starting at `StartTime` and lasting `Duration` seconds, returns an array of moments in time that are multiples of `Size`. `Size` is an optional parameter, set to 1800 seconds (30 minutes) by default.
This function can be used, for example, to analyze the number of pageviews within the corresponding session.
The `StartTime` argument can have the type `DateTime` or `DateTime64`. If `DateTime` is used, the `Duration` and `Size` arguments must have the type `UInt32`; for DateTime64 they must be `Decimal64`.
Returns an array of DateTime/DateTime64 (the type matches the type of the StartTime parameter). For DateTime64, the scale of the returned value can differ from the scale of the StartTime argument: the result takes the highest scale among all given arguments.
For a time interval starting at StartTime and continuing for Duration seconds, returns an array of moments in time consisting of points from this interval rounded down to Size in seconds. Size is an optional parameter, a constant UInt32, set to 1800 by default.
For example, `timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600)) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]`.
This is needed to find the hits that belong to the corresponding visit.
Usage example:
```sql
SELECT timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600));
SELECT timeSlots(toDateTime('1980-12-12 21:01:02', 'UTC'), toUInt32(600), 299);
SELECT timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64(600.1, 1), toDecimal64(299, 0));
```
``` text
┌─timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600))─┐
│ ['2012-01-01 12:00:00','2012-01-01 12:30:00'] │
└─────────────────────────────────────────────────────────────┘
┌─timeSlots(toDateTime('1980-12-12 21:01:02', 'UTC'), toUInt32(600), 299)─┐
│ ['1980-12-12 20:56:13','1980-12-12 21:01:12','1980-12-12 21:06:11'] │
└─────────────────────────────────────────────────────────────────────────┘
┌─timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64(600.1, 1), toDecimal64(299, 0))─┐
│ ['1980-12-12 20:56:13.0000','1980-12-12 21:01:12.0000','1980-12-12 21:06:11.0000'] │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
## toYYYYMM


@@ -62,6 +62,8 @@ ORDER BY expr
- `PARTITION BY` - the [partitioning key](custom-partitioning-key.md). Optional.
In most cases you do not need a partitioning key, and if you do, you rarely need one more fine-grained than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). Never use a partitioning key that is too fine-grained. Do not partition your data by client identifiers or names (instead, make the client identifier or name the first column of the ORDER BY expression).
To partition by month, use the expression `toYYYYMM(date_column)`, where `date_column` is a column of type [Date](../../../engines/table-engines/mergetree-family/mergetree.md). The partition names here have the `"YYYYMM"` format.
- `PRIMARY KEY` - specify it here if you need [a primary key that differs from the sorting key](#choosing-a-primary-key-that-differs-from-the-sorting-key). Optional.


@@ -195,6 +195,12 @@ else()
message(STATUS "ClickHouse disks mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_SU)
message(STATUS "ClickHouse su: ON")
else()
message(STATUS "ClickHouse su: OFF")
endif()
configure_file (config_tools.h.in ${ConfigIncludePath}/config_tools.h)
macro(clickhouse_target_link_split_lib target name)
@@ -367,6 +373,10 @@ if (CLICKHOUSE_SPLIT_BINARY)
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-keeper-converter)
endif ()
if (ENABLE_CLICKHOUSE_SU)
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-su)
endif ()
set_target_properties(${CLICKHOUSE_ALL_TARGETS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..)
add_custom_target (clickhouse-bundle ALL DEPENDS ${CLICKHOUSE_ALL_TARGETS})


@@ -39,14 +39,19 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
try
{
DB::KeeperStorage storage(500, "", true);
auto keeper_context = std::make_shared<KeeperContext>();
keeper_context->digest_enabled = true;
DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false);
DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as<std::string>(), logger);
storage.initializeSystemNodes();
DB::deserializeLogsAndApplyToStorage(storage, options["zookeeper-logs-dir"].as<std::string>(), logger);
DB::SnapshotMetadataPtr snapshot_meta = std::make_shared<DB::SnapshotMetadata>(storage.getZXID(), 1, std::make_shared<nuraft::cluster_config>());
DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta);
DB::KeeperSnapshotManager manager(options["output-dir"].as<std::string>(), 1);
DB::KeeperSnapshotManager manager(options["output-dir"].as<std::string>(), 1, keeper_context);
auto snp = manager.serializeSnapshotToBuffer(snapshot);
auto path = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID());
std::cout << "Snapshot serialized to path:" << path << std::endl;


@@ -29,6 +29,7 @@
#include <Common/ClickHouseRevision.h>
#include <Common/DNSResolver.h>
#include <Common/CurrentMetrics.h>
#include <Common/ConcurrencyControl.h>
#include <Common/Macros.h>
#include <Common/ShellCommand.h>
#include <Common/StringUtils/StringUtils.h>
@@ -1124,6 +1125,23 @@ int Server::main(const std::vector<std::string> & /*args*/)
if (config->has("max_partition_size_to_drop"))
global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop"));
if (config->has("concurrent_threads_soft_limit"))
{
auto concurrent_threads_soft_limit = config->getInt("concurrent_threads_soft_limit", 0);
if (concurrent_threads_soft_limit == -1)
{
// Based on tests, concurrent_threads_soft_limit has an optimal value at about 3 times the number of logical CPU cores
constexpr size_t thread_factor = 3;
concurrent_threads_soft_limit = std::thread::hardware_concurrency() * thread_factor;
}
if (concurrent_threads_soft_limit)
ConcurrencyControl::instance().setMaxConcurrency(concurrent_threads_soft_limit);
else
ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited);
}
else
ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited);
if (config->has("max_concurrent_queries"))
global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0));


@@ -269,6 +269,13 @@
<http_server_default_response><![CDATA[<html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>]]></http_server_default_response>
-->
<!-- Maximum number of query processing threads to run all queries.
Note that this is not a hard limit. If the limit is reached, the query will still get one thread to run.
If the value equals -1, this parameter is initialized to the number of logical cores multiplied by 3.
This is a good heuristic for CPU-bound tasks.
-->
<concurrent_threads_soft_limit>0</concurrent_threads_soft_limit>
<!-- Maximum number of concurrent queries. -->
<max_concurrent_queries>100</max_concurrent_queries>
@@ -604,7 +611,7 @@
if this setting is true the user B will see all rows, and if this setting is false the user B will see no rows.
By default this setting is false for compatibility with earlier access configurations. -->
<users_without_row_policies_can_read_rows>false</users_without_row_policies_can_read_rows>
<!-- By default, for backward compatibility ON CLUSTER queries ignore CLUSTER grant,
however you can change this behaviour by setting this to true -->
<on_cluster_queries_require_cluster_grant>false</on_cluster_queries_require_cluster_grant>


@@ -1,3 +1,3 @@
set (CLICKHOUSE_SU_SOURCES clickhouse-su.cpp)
set (CLICKHOUSE_SU_SOURCES clickhouse-su.cpp su.cpp)
set (CLICKHOUSE_SU_LINK PRIVATE dbms)
clickhouse_program_add(su)


@@ -1,145 +1,2 @@
#include <Common/Exception.h>
#include <IO/ReadHelpers.h>
#include <fmt/format.h>
#include <vector>
#include <sys/types.h>
#include <unistd.h>
#include <pwd.h>
#include <grp.h>
/// "su" means "set user"
/// In fact, this program can set Unix user and group.
///
/// Usage:
/// clickhouse su user[:group] args...
///
/// - will set user and, optionally, group and exec the remaining args.
/// user and group can be numeric identifiers or strings.
///
/// The motivation for this tool is very obscure and idiosyncratic. It is needed for Docker.
/// People want to run programs inside Docker with dropped privileges (less than root).
/// But the standard Linux "su" program is not suitable for usage inside Docker,
/// because it is creating pseudoterminals to avoid hijacking input from the terminal, for security,
/// but Docker is also doing something with the terminal and it is incompatible.
/// For this reason, people use alternative and less "secure" versions of "su" tools like "gosu" or "su-exec".
/// But it would be very strange to use 3rd-party software only to do two-three syscalls.
/// That's why we provide this tool.
///
/// Note: ClickHouse does not need Docker at all and works better without Docker.
/// ClickHouse has no dependencies, it is packaged and distributed in single binary.
/// There is no reason to use Docker unless you are already running all your software in Docker.
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int SYSTEM_ERROR;
}
void setUserAndGroup(std::string arg_uid, std::string arg_gid)
{
static constexpr size_t buf_size = 16384; /// Linux man page says it is enough. Nevertheless, we will check if it's not enough and throw.
std::unique_ptr<char[]> buf(new char[buf_size]);
/// Set the group first, because if we set user, the privileges will be already dropped and we will not be able to set the group later.
if (!arg_gid.empty())
{
gid_t gid = 0;
if (!tryParse(gid, arg_gid) || gid == 0)
{
group entry{};
group * result{};
if (0 != getgrnam_r(arg_gid.data(), &entry, buf.get(), buf_size, &result))
throwFromErrno(fmt::format("Cannot do 'getgrnam_r' to obtain gid from group name ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);
if (!result)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Group {} is not found in the system", arg_gid);
gid = entry.gr_gid;
}
if (gid == 0 && getgid() != 0)
throw Exception("Group has id 0, but dropping privileges to gid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);
if (0 != setgid(gid))
throwFromErrno(fmt::format("Cannot do 'setgid' to user ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);
}
if (!arg_uid.empty())
{
/// Is it numeric id or name?
uid_t uid = 0;
if (!tryParse(uid, arg_uid) || uid == 0)
{
passwd entry{};
passwd * result{};
if (0 != getpwnam_r(arg_uid.data(), &entry, buf.get(), buf_size, &result))
throwFromErrno(fmt::format("Cannot do 'getpwnam_r' to obtain uid from user name ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);
if (!result)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "User {} is not found in the system", arg_uid);
uid = entry.pw_uid;
}
if (uid == 0 && getuid() != 0)
throw Exception("User has id 0, but dropping privileges to uid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);
if (0 != setuid(uid))
throwFromErrno(fmt::format("Cannot do 'setuid' to user ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);
}
}
}
int mainEntryClickHouseSU(int argc, char ** argv)
try
{
using namespace DB;
if (argc < 3)
{
std::cout << "Usage: ./clickhouse su user:group ..." << std::endl;
exit(0);
}
std::string_view user_and_group = argv[1];
std::string user;
std::string group;
auto pos = user_and_group.find(':');
if (pos == std::string_view::npos)
{
user = user_and_group;
}
else
{
user = user_and_group.substr(0, pos);
group = user_and_group.substr(pos + 1);
}
setUserAndGroup(std::move(user), std::move(group));
std::vector<char *> new_argv;
new_argv.reserve(argc - 1);
new_argv.insert(new_argv.begin(), argv + 2, argv + argc);
new_argv.push_back(nullptr);
execvp(new_argv.front(), new_argv.data());
throwFromErrno("Cannot execvp", ErrorCodes::SYSTEM_ERROR);
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(false) << '\n';
return 1;
}
int mainEntryClickHouseSU(int argc, char ** argv);
int main(int argc_, char ** argv_) { return mainEntryClickHouseSU(argc_, argv_); }

145
programs/su/su.cpp Normal file

@@ -0,0 +1,145 @@
#include <Common/Exception.h>
#include <IO/ReadHelpers.h>
#include <fmt/format.h>
#include <vector>
#include <sys/types.h>
#include <unistd.h>
#include <pwd.h>
#include <grp.h>
/// "su" means "set user"
/// In fact, this program can set Unix user and group.
///
/// Usage:
/// clickhouse su user[:group] args...
///
/// - will set user and, optionally, group and exec the remaining args.
/// user and group can be numeric identifiers or strings.
///
/// The motivation for this tool is very obscure and idiosyncratic. It is needed for Docker.
/// People want to run programs inside Docker with dropped privileges (less than root).
/// But the standard Linux "su" program is not suitable for usage inside Docker,
/// because it is creating pseudoterminals to avoid hijacking input from the terminal, for security,
/// but Docker is also doing something with the terminal and it is incompatible.
/// For this reason, people use alternative and less "secure" versions of "su" tools like "gosu" or "su-exec".
/// But it would be very strange to use 3rd-party software only to do two-three syscalls.
/// That's why we provide this tool.
///
/// Note: ClickHouse does not need Docker at all and works better without Docker.
/// ClickHouse has no dependencies, it is packaged and distributed in single binary.
/// There is no reason to use Docker unless you are already running all your software in Docker.
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int SYSTEM_ERROR;
}
void setUserAndGroup(std::string arg_uid, std::string arg_gid)
{
static constexpr size_t buf_size = 16384; /// Linux man page says it is enough. Nevertheless, we will check if it's not enough and throw.
std::unique_ptr<char[]> buf(new char[buf_size]);
/// Set the group first, because if we set user, the privileges will be already dropped and we will not be able to set the group later.
if (!arg_gid.empty())
{
gid_t gid = 0;
if (!tryParse(gid, arg_gid) || gid == 0)
{
group entry{};
group * result{};
if (0 != getgrnam_r(arg_gid.data(), &entry, buf.get(), buf_size, &result))
throwFromErrno(fmt::format("Cannot do 'getgrnam_r' to obtain gid from group name ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);
if (!result)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Group {} is not found in the system", arg_gid);
gid = entry.gr_gid;
}
if (gid == 0 && getgid() != 0)
throw Exception("Group has id 0, but dropping privileges to gid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);
if (0 != setgid(gid))
throwFromErrno(fmt::format("Cannot do 'setgid' to user ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);
}
if (!arg_uid.empty())
{
/// Is it numeric id or name?
uid_t uid = 0;
if (!tryParse(uid, arg_uid) || uid == 0)
{
passwd entry{};
passwd * result{};
if (0 != getpwnam_r(arg_uid.data(), &entry, buf.get(), buf_size, &result))
throwFromErrno(fmt::format("Cannot do 'getpwnam_r' to obtain uid from user name ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);
if (!result)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "User {} is not found in the system", arg_uid);
uid = entry.pw_uid;
}
if (uid == 0 && getuid() != 0)
throw Exception("User has id 0, but dropping privileges to uid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);
if (0 != setuid(uid))
throwFromErrno(fmt::format("Cannot do 'setuid' to user ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);
}
}
}
int mainEntryClickHouseSU(int argc, char ** argv)
try
{
using namespace DB;
if (argc < 3)
{
std::cout << "Usage: ./clickhouse su user:group ..." << std::endl;
exit(0);
}
std::string_view user_and_group = argv[1];
std::string user;
std::string group;
auto pos = user_and_group.find(':');
if (pos == std::string_view::npos)
{
user = user_and_group;
}
else
{
user = user_and_group.substr(0, pos);
group = user_and_group.substr(pos + 1);
}
setUserAndGroup(std::move(user), std::move(group));
std::vector<char *> new_argv;
new_argv.reserve(argc - 1);
new_argv.insert(new_argv.begin(), argv + 2, argv + argc);
new_argv.push_back(nullptr);
execvp(new_argv.front(), new_argv.data());
throwFromErrno("Cannot execvp", ErrorCodes::SYSTEM_ERROR);
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(false) << '\n';
return 1;
}


@@ -25,7 +25,7 @@ int main(int argc, char ** argv)
return 0;
}
String cache_name = "";
const char * cache_name = "";
if (argc == 4)
cache_name = argv[3];


@@ -8,21 +8,22 @@ class SeekableReadBuffer;
class WriteBuffer;
/// Represents operations of loading from disk or downloading for reading a backup.
class IBackupReader /// BackupReaderFile, BackupReaderDisk, BackupReaderS3
class IBackupReader /// BackupReaderFile, BackupReaderDisk
{
public:
virtual ~IBackupReader() = default;
virtual bool fileExists(const String & file_name) = 0;
virtual size_t getFileSize(const String & file_name) = 0;
virtual UInt64 getFileSize(const String & file_name) = 0;
virtual std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) = 0;
};
/// Represents operations of storing to disk or uploading for writing a backup.
class IBackupWriter /// BackupWriterFile, BackupWriterDisk, BackupWriterS3
class IBackupWriter /// BackupWriterFile, BackupWriterDisk
{
public:
virtual ~IBackupWriter() = default;
virtual bool fileExists(const String & file_name) = 0;
virtual UInt64 getFileSize(const String & file_name) = 0;
virtual bool fileContentsEqual(const String & file_name, const String & expected_file_contents) = 0;
virtual std::unique_ptr<WriteBuffer> writeFile(const String & file_name) = 0;
virtual void removeFiles(const Strings & file_names) = 0;


@@ -17,7 +17,7 @@ bool BackupReaderDisk::fileExists(const String & file_name)
return disk->exists(path / file_name);
}
size_t BackupReaderDisk::getFileSize(const String & file_name)
UInt64 BackupReaderDisk::getFileSize(const String & file_name)
{
return disk->getFileSize(path / file_name);
}
@@ -38,6 +38,11 @@ bool BackupWriterDisk::fileExists(const String & file_name)
return disk->exists(path / file_name);
}
UInt64 BackupWriterDisk::getFileSize(const String & file_name)
{
return disk->getFileSize(path / file_name);
}
bool BackupWriterDisk::fileContentsEqual(const String & file_name, const String & expected_file_contents)
{
if (!disk->exists(path / file_name))


@@ -15,7 +15,7 @@ public:
~BackupReaderDisk() override;
bool fileExists(const String & file_name) override;
size_t getFileSize(const String & file_name) override;
UInt64 getFileSize(const String & file_name) override;
std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) override;
private:
@@ -30,6 +30,7 @@ public:
~BackupWriterDisk() override;
bool fileExists(const String & file_name) override;
UInt64 getFileSize(const String & file_name) override;
bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override;
std::unique_ptr<WriteBuffer> writeFile(const String & file_name) override;
void removeFiles(const Strings & file_names) override;


@@ -18,7 +18,7 @@ bool BackupReaderFile::fileExists(const String & file_name)
return fs::exists(path / file_name);
}
size_t BackupReaderFile::getFileSize(const String & file_name)
UInt64 BackupReaderFile::getFileSize(const String & file_name)
{
return fs::file_size(path / file_name);
}
@@ -39,6 +39,11 @@ bool BackupWriterFile::fileExists(const String & file_name)
return fs::exists(path / file_name);
}
UInt64 BackupWriterFile::getFileSize(const String & file_name)
{
return fs::file_size(path / file_name);
}
bool BackupWriterFile::fileContentsEqual(const String & file_name, const String & expected_file_contents)
{
if (!fs::exists(path / file_name))


@@ -13,7 +13,7 @@ public:
~BackupReaderFile() override;
bool fileExists(const String & file_name) override;
size_t getFileSize(const String & file_name) override;
UInt64 getFileSize(const String & file_name) override;
std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) override;
private:
@@ -27,6 +27,7 @@ public:
~BackupWriterFile() override;
bool fileExists(const String & file_name) override;
UInt64 getFileSize(const String & file_name) override;
bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override;
std::unique_ptr<WriteBuffer> writeFile(const String & file_name) override;
void removeFiles(const Strings & file_names) override;


@@ -219,10 +219,7 @@ void BackupImpl::open(const ContextPtr & context)
void BackupImpl::close()
{
std::lock_guard lock{mutex};
archive_readers.clear();
for (auto & archive_writer : archive_writers)
archive_writer = {"", nullptr};
closeArchives();
if (!is_internal_backup && writer && !writing_finalized)
removeAllFilesAfterFailure();
@@ -232,10 +229,29 @@
coordination.reset();
}
time_t BackupImpl::getTimestamp() const
void BackupImpl::closeArchives()
{
archive_readers.clear();
for (auto & archive_writer : archive_writers)
archive_writer = {"", nullptr};
}
size_t BackupImpl::getNumFiles() const
{
std::lock_guard lock{mutex};
return timestamp;
return num_files;
}
UInt64 BackupImpl::getUncompressedSize() const
{
std::lock_guard lock{mutex};
return uncompressed_size;
}
UInt64 BackupImpl::getCompressedSize() const
{
std::lock_guard lock{mutex};
return compressed_size;
}
void BackupImpl::writeBackupMetadata()
@@ -290,6 +306,7 @@ void BackupImpl::writeBackupMetadata()
if (info.pos_in_archive != static_cast<size_t>(-1))
config->setUInt64(prefix + "pos_in_archive", info.pos_in_archive);
}
increaseUncompressedSize(info);
++index;
}
@@ -306,6 +323,8 @@
out = writer->writeFile(".backup");
out->write(str.data(), str.size());
out->finalize();
increaseUncompressedSize(str.size());
}
void BackupImpl::readBackupMetadata()
@@ -315,6 +334,7 @@
{
if (!reader->fileExists(archive_params.archive_name))
throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Backup {} not found", backup_name);
setCompressedSize();
in = getArchiveReader("")->readFile(".backup");
}
else
@@ -326,6 +346,7 @@
String str;
readStringUntilEOF(str, *in);
increaseUncompressedSize(str.size());
std::istringstream stream(str); // STYLE_CHECK_ALLOW_STD_STRING_STREAM
Poco::AutoPtr<Poco::Util::XMLConfiguration> config{new Poco::Util::XMLConfiguration()};
config->load(stream);
@@ -382,8 +403,12 @@
}
coordination->addFileInfo(info);
increaseUncompressedSize(info);
}
}
if (!use_archives)
setCompressedSize();
}
void BackupImpl::checkBackupDoesntExist() const
@@ -750,6 +775,8 @@
{
LOG_TRACE(log, "Finalizing backup {}", backup_name);
writeBackupMetadata();
closeArchives();
setCompressedSize();
removeLockFile();
LOG_TRACE(log, "Finalized backup {}", backup_name);
}
@@ -758,12 +785,32 @@
}
void BackupImpl::increaseUncompressedSize(UInt64 file_size)
{
uncompressed_size += file_size;
++num_files;
}
void BackupImpl::increaseUncompressedSize(const FileInfo & info)
{
if ((info.size > info.base_size) && (info.data_file_name.empty() || (info.data_file_name == info.file_name)))
increaseUncompressedSize(info.size - info.base_size);
}
void BackupImpl::setCompressedSize()
{
if (use_archives)
compressed_size = writer ? writer->getFileSize(archive_params.archive_name) : reader->getFileSize(archive_params.archive_name);
else
compressed_size = uncompressed_size;
}
String BackupImpl::getArchiveNameWithSuffix(const String & suffix) const
{
return archive_params.archive_name + (suffix.empty() ? "" : ".") + suffix;
}
std::shared_ptr<IArchiveReader> BackupImpl::getArchiveReader(const String & suffix) const
{
auto it = archive_readers.find(suffix);
@@ -796,6 +843,7 @@ std::shared_ptr<IArchiveWriter> BackupImpl::getArchiveWriter(const String & suff
return new_archive_writer;
}
void BackupImpl::removeAllFilesAfterFailure()
{
if (is_internal_backup)


@@ -55,8 +55,11 @@ public:
const String & getName() const override { return backup_name; }
OpenMode getOpenMode() const override { return open_mode; }
time_t getTimestamp() const override;
time_t getTimestamp() const override { return timestamp; }
UUID getUUID() const override { return *uuid; }
size_t getNumFiles() const override;
UInt64 getUncompressedSize() const override;
UInt64 getCompressedSize() const override;
Strings listFiles(const String & directory, bool recursive) const override;
bool hasFiles(const String & directory) const override;
bool fileExists(const String & file_name) const override;
@@ -76,6 +79,7 @@ private:
void open(const ContextPtr & context);
void close();
void closeArchives();
/// Writes the file ".backup" containing backup's metadata.
void writeBackupMetadata();
@@ -96,6 +100,13 @@ private:
std::shared_ptr<IArchiveReader> getArchiveReader(const String & suffix) const;
std::shared_ptr<IArchiveWriter> getArchiveWriter(const String & suffix);
/// Increases `uncompressed_size` by a specific value and `num_files` by 1.
void increaseUncompressedSize(UInt64 file_size);
void increaseUncompressedSize(const FileInfo & info);
/// Calculates and sets `compressed_size`.
void setCompressedSize();
const String backup_name;
const ArchiveParams archive_params;
const bool use_archives;
@@ -108,6 +119,9 @@ private:
mutable std::mutex mutex;
std::optional<UUID> uuid;
time_t timestamp = 0;
size_t num_files = 0;
UInt64 uncompressed_size = 0;
UInt64 compressed_size = 0;
UInt64 version;
std::optional<BackupInfo> base_backup_info;
std::shared_ptr<const IBackup> base_backup;


@@ -60,6 +60,7 @@ namespace
/// List of backup settings except base_backup_name and cluster_host_ids.
#define LIST_OF_BACKUP_SETTINGS(M) \
M(String, id) \
M(String, compression_method) \
M(Int64, compression_level) \
M(String, password) \


@@ -11,6 +11,9 @@ class ASTBackupQuery;
/// Settings specified in the "SETTINGS" clause of a BACKUP query.
struct BackupSettings
{
/// ID of the backup operation, to identify it in the system.backups table. Auto-generated if not set.
String id;
/// Base backup, if it's set an incremental backup will be built. That means only differences made after the base backup will be put
/// into a new backup.
std::optional<BackupInfo> base_backup_info;


@@ -15,18 +15,18 @@ std::string_view toString(BackupStatus backup_status)
{
switch (backup_status)
{
case BackupStatus::MAKING_BACKUP:
return "MAKING_BACKUP";
case BackupStatus::BACKUP_COMPLETE:
return "BACKUP_COMPLETE";
case BackupStatus::FAILED_TO_BACKUP:
return "FAILED_TO_BACKUP";
case BackupStatus::CREATING_BACKUP:
return "CREATING_BACKUP";
case BackupStatus::BACKUP_CREATED:
return "BACKUP_CREATED";
case BackupStatus::BACKUP_FAILED:
return "BACKUP_FAILED";
case BackupStatus::RESTORING:
return "RESTORING";
case BackupStatus::RESTORED:
return "RESTORED";
case BackupStatus::FAILED_TO_RESTORE:
return "FAILED_TO_RESTORE";
case BackupStatus::RESTORE_FAILED:
return "RESTORE_FAILED";
default:
break;
}


@@ -9,14 +9,14 @@ namespace DB
enum class BackupStatus
{
/// Statuses of making backups
MAKING_BACKUP,
BACKUP_COMPLETE,
FAILED_TO_BACKUP,
CREATING_BACKUP,
BACKUP_CREATED,
BACKUP_FAILED,
/// Status of restoring
RESTORING,
RESTORED,
FAILED_TO_RESTORE,
RESTORE_FAILED,
MAX,
};


@@ -27,9 +27,11 @@ namespace DB
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
using OperationID = BackupsWorker::OperationID;
namespace Stage = BackupCoordinationStage;
namespace
@@ -92,10 +94,21 @@ }
}
}
bool isFinalStatus(BackupStatus status)
{
return (status == BackupStatus::BACKUP_CREATED) || (status == BackupStatus::BACKUP_FAILED) || (status == BackupStatus::RESTORED)
|| (status == BackupStatus::RESTORE_FAILED);
}
bool isErrorStatus(BackupStatus status)
{
return (status == BackupStatus::BACKUP_FAILED) || (status == BackupStatus::RESTORE_FAILED);
}
/// Used to change num_active_backups.
size_t getNumActiveBackupsChange(BackupStatus status)
{
return status == BackupStatus::MAKING_BACKUP;
return status == BackupStatus::CREATING_BACKUP;
}
/// Used to change num_active_restores.
@@ -115,7 +128,7 @@ BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threa
}
std::pair<UUID, bool> BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context)
OperationID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context)
{
const ASTBackupQuery & backup_query = typeid_cast<const ASTBackupQuery &>(*backup_or_restore_query);
if (backup_query.kind == ASTBackupQuery::Kind::BACKUP)
@ -125,17 +138,24 @@ std::pair<UUID, bool> BackupsWorker::start(const ASTPtr & backup_or_restore_quer
}
std::pair<UUID, bool> BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context)
OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context)
{
auto backup_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
auto backup_settings = BackupSettings::fromBackupQuery(*backup_query);
if (!backup_settings.backup_uuid)
backup_settings.backup_uuid = UUIDHelpers::generateV4();
UUID backup_uuid = *backup_settings.backup_uuid;
/// `backup_id` will be used as a key to the `infos` map, so it should be unique.
OperationID backup_id;
if (backup_settings.internal)
backup_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `backup_id` for internal backup to avoid collision if both internal and non-internal backups are on the same host
else if (!backup_settings.id.empty())
backup_id = backup_settings.id;
else
backup_id = toString(*backup_settings.backup_uuid);
std::shared_ptr<IBackupCoordination> backup_coordination;
if (backup_settings.internal)
{
/// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination
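The ID-selection logic above decides the key used for the `infos` map and shown in system.backups: internal operations always get a fresh "internal-"-prefixed ID, an explicit SETTINGS id=... wins otherwise, and a generated UUID is the fallback. A condensed sketch of that precedence, with a stand-in for UUIDHelpers::generateV4() (helper names hypothetical):

#include <random>
#include <string>

using String = std::string;

String newUUIDString() // stand-in for UUIDHelpers::generateV4()
{
    static std::mt19937_64 rng{std::random_device{}()};
    return std::to_string(rng());
}

String chooseOperationID(bool internal, const String & user_supplied_id)
{
    if (internal)
        return "internal-" + newUUIDString(); // always unique: avoids clashing with the outer operation
    if (!user_supplied_id.empty())
        return user_supplied_id;              // explicit SETTINGS id='...' wins
    return newUUIDString();                   // default: a random ID
}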
@ -147,7 +167,7 @@ std::pair<UUID, bool> BackupsWorker::startMakingBackup(const ASTPtr & query, con
try
{
auto backup_info = BackupInfo::fromAST(*backup_query->backup_name);
addInfo(backup_uuid, backup_settings.internal, backup_info.toString(), BackupStatus::MAKING_BACKUP);
addInfo(backup_id, backup_info.toString(), backup_settings.internal, BackupStatus::CREATING_BACKUP);
/// Prepare context to use.
ContextPtr context_in_use = context;
@ -163,10 +183,11 @@ std::pair<UUID, bool> BackupsWorker::startMakingBackup(const ASTPtr & query, con
if (backup_settings.async)
{
backups_thread_pool.scheduleOrThrowOnError(
[this, backup_uuid, backup_query, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context] {
[this, backup_query, backup_id, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context]
{
doBackup(
backup_uuid,
backup_query,
backup_id,
backup_settings,
backup_info,
backup_coordination,
@ -178,8 +199,8 @@ std::pair<UUID, bool> BackupsWorker::startMakingBackup(const ASTPtr & query, con
else
{
doBackup(
backup_uuid,
backup_query,
backup_id,
backup_settings,
backup_info,
backup_coordination,
@ -188,12 +209,12 @@ std::pair<UUID, bool> BackupsWorker::startMakingBackup(const ASTPtr & query, con
/* called_async= */ false);
}
return {backup_uuid, backup_settings.internal};
return backup_id;
}
catch (...)
{
/// Something bad happened, the backup has not been built.
setStatus(backup_uuid, backup_settings.internal, BackupStatus::FAILED_TO_BACKUP);
setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED);
sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id);
throw;
}
@ -201,8 +222,8 @@ std::pair<UUID, bool> BackupsWorker::startMakingBackup(const ASTPtr & query, con
void BackupsWorker::doBackup(
const UUID & backup_uuid,
const std::shared_ptr<ASTBackupQuery> & backup_query,
const OperationID & backup_id,
BackupSettings backup_settings,
const BackupInfo & backup_info,
std::shared_ptr<IBackupCoordination> backup_coordination,
@ -237,7 +258,7 @@ void BackupsWorker::doBackup(
if (backup_settings.coordination_zk_path.empty())
{
String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(backup_uuid);
backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(*backup_settings.backup_uuid);
}
}
@ -256,7 +277,7 @@ void BackupsWorker::doBackup(
backup_create_params.password = backup_settings.password;
backup_create_params.is_internal_backup = backup_settings.internal;
backup_create_params.backup_coordination = backup_coordination;
backup_create_params.backup_uuid = backup_uuid;
backup_create_params.backup_uuid = backup_settings.backup_uuid;
BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params);
/// Write the backup.
@ -297,15 +318,25 @@ void BackupsWorker::doBackup(
backup_coordination->setStage(backup_settings.host_id, Stage::COMPLETED, "");
}
size_t num_files = 0;
UInt64 uncompressed_size = 0;
UInt64 compressed_size = 0;
/// Finalize backup (write its metadata).
if (!backup_settings.internal)
{
backup->finalizeWriting();
num_files = backup->getNumFiles();
uncompressed_size = backup->getUncompressedSize();
compressed_size = backup->getCompressedSize();
}
/// Close the backup.
backup.reset();
LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString());
setStatus(backup_uuid, backup_settings.internal, BackupStatus::BACKUP_COMPLETE);
setStatus(backup_id, BackupStatus::BACKUP_CREATED);
setNumFilesAndSize(backup_id, num_files, uncompressed_size, compressed_size);
}
catch (...)
{
@ -313,7 +344,7 @@ void BackupsWorker::doBackup(
if (called_async)
{
tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_info.toString()));
setStatus(backup_uuid, backup_settings.internal, BackupStatus::FAILED_TO_BACKUP);
setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED);
sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id);
}
else
@ -325,14 +356,21 @@ void BackupsWorker::doBackup(
}
std::pair<UUID, bool> BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context)
OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context)
{
auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
UUID restore_uuid = UUIDHelpers::generateV4();
/// `restore_id` will be used as a key to the `infos` map, so it should be unique.
OperationID restore_id;
if (restore_settings.internal)
restore_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `restore_id` for internal restore to avoid collision if both internal and non-internal restores are on the same host
else if (!restore_settings.id.empty())
restore_id = restore_settings.id;
else
restore_id = toString(UUIDHelpers::generateV4());
std::shared_ptr<IRestoreCoordination> restore_coordination;
if (restore_settings.internal)
{
/// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination
@ -344,7 +382,7 @@ std::pair<UUID, bool> BackupsWorker::startRestoring(const ASTPtr & query, Contex
try
{
auto backup_info = BackupInfo::fromAST(*restore_query->backup_name);
addInfo(restore_uuid, restore_settings.internal, backup_info.toString(), BackupStatus::RESTORING);
addInfo(restore_id, backup_info.toString(), restore_settings.internal, BackupStatus::RESTORING);
/// Prepare context to use.
ContextMutablePtr context_in_use = context;
@ -359,10 +397,10 @@ std::pair<UUID, bool> BackupsWorker::startRestoring(const ASTPtr & query, Contex
if (restore_settings.async)
{
backups_thread_pool.scheduleOrThrowOnError(
[this, restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use] {
[this, restore_query, restore_id, restore_settings, backup_info, restore_coordination, context_in_use] {
doRestore(
restore_uuid,
restore_query,
restore_id,
restore_settings,
backup_info,
restore_coordination,
@ -373,8 +411,8 @@ std::pair<UUID, bool> BackupsWorker::startRestoring(const ASTPtr & query, Contex
else
{
doRestore(
restore_uuid,
restore_query,
restore_id,
restore_settings,
backup_info,
restore_coordination,
@ -382,12 +420,12 @@ std::pair<UUID, bool> BackupsWorker::startRestoring(const ASTPtr & query, Contex
/* called_async= */ false);
}
return {restore_uuid, restore_settings.internal};
return restore_id;
}
catch (...)
{
/// Something bad happened, the restore has not been performed.
setStatus(restore_uuid, restore_settings.internal, BackupStatus::FAILED_TO_RESTORE);
setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED);
sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id);
throw;
}
@ -395,8 +433,8 @@ std::pair<UUID, bool> BackupsWorker::startRestoring(const ASTPtr & query, Contex
void BackupsWorker::doRestore(
const UUID & restore_uuid,
const std::shared_ptr<ASTBackupQuery> & restore_query,
const OperationID & restore_id,
RestoreSettings restore_settings,
const BackupInfo & backup_info,
std::shared_ptr<IRestoreCoordination> restore_coordination,
@ -421,6 +459,8 @@ void BackupsWorker::doRestore(
backup_open_params.password = restore_settings.password;
BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
setNumFilesAndSize(restore_id, backup->getNumFiles(), backup->getUncompressedSize(), backup->getCompressedSize());
String current_database = context->getCurrentDatabase();
/// Checks access rights if this is ON CLUSTER query.
@ -453,7 +493,7 @@ void BackupsWorker::doRestore(
if (on_cluster && restore_settings.coordination_zk_path.empty())
{
String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_uuid);
restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(UUIDHelpers::generateV4());
}
if (!restore_coordination)
@ -500,7 +540,7 @@ void BackupsWorker::doRestore(
}
LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString());
setStatus(restore_uuid, restore_settings.internal, BackupStatus::RESTORED);
setStatus(restore_id, BackupStatus::RESTORED);
}
catch (...)
{
@ -508,7 +548,7 @@ void BackupsWorker::doRestore(
if (called_async)
{
tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()));
setStatus(restore_uuid, restore_settings.internal, BackupStatus::FAILED_TO_RESTORE);
setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED);
sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id);
}
else
@ -520,63 +560,103 @@ void BackupsWorker::doRestore(
}
void BackupsWorker::addInfo(const UUID & uuid, bool internal, const String & backup_name, BackupStatus status)
void BackupsWorker::addInfo(const OperationID & id, const String & name, bool internal, BackupStatus status)
{
Info info;
info.uuid = uuid;
info.backup_name = backup_name;
info.status = status;
info.status_changed_time = time(nullptr);
info.id = id;
info.name = name;
info.internal = internal;
info.status = status;
info.start_time = std::chrono::system_clock::now();
if (isFinalStatus(status))
info.end_time = info.start_time;
std::lock_guard lock{infos_mutex};
bool inserted = infos.try_emplace({uuid, internal}, std::move(info)).second;
if (!inserted)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Pair of UUID={} and internal={} is already in use", uuid, internal);
auto it = infos.find(id);
if (it != infos.end())
{
/// It's better not to allow overwriting the current status while it's still in progress.
auto current_status = it->second.status;
if (!isFinalStatus(current_status))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot start a backup or restore: ID {} is already in use", id);
}
infos[id] = std::move(info);
num_active_backups += getNumActiveBackupsChange(status);
num_active_restores += getNumActiveRestoresChange(status);
}
void BackupsWorker::setStatus(const UUID & uuid, bool internal, BackupStatus status)
void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw_if_error)
{
std::lock_guard lock{infos_mutex};
auto it = infos.find({uuid, internal});
auto it = infos.find(id);
if (it == infos.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", uuid, internal);
{
if (throw_if_error)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
else
return;
}
auto & info = it->second;
auto old_status = info.status;
info.status = status;
info.status_changed_time = time(nullptr);
if (isFinalStatus(status))
info.end_time = std::chrono::system_clock::now();
if (isErrorStatus(status))
{
info.error_message = getCurrentExceptionMessage(false);
info.exception = std::current_exception();
}
num_active_backups += getNumActiveBackupsChange(status) - getNumActiveBackupsChange(old_status);
num_active_restores += getNumActiveRestoresChange(status) - getNumActiveRestoresChange(old_status);
}
void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool internal, bool rethrow_exception)
void BackupsWorker::setNumFilesAndSize(const String & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size)
{
std::lock_guard lock{infos_mutex};
auto it = infos.find(id);
if (it == infos.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
auto & info = it->second;
info.num_files = num_files;
info.uncompressed_size = uncompressed_size;
info.compressed_size = compressed_size;
}
void BackupsWorker::wait(const OperationID & id, bool rethrow_exception)
{
std::unique_lock lock{infos_mutex};
status_changed.wait(lock, [&]
{
auto it = infos.find({backup_or_restore_uuid, internal});
auto it = infos.find(id);
if (it == infos.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", backup_or_restore_uuid, internal);
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
const auto & info = it->second;
auto current_status = info.status;
if (rethrow_exception && ((current_status == BackupStatus::FAILED_TO_BACKUP) || (current_status == BackupStatus::FAILED_TO_RESTORE)))
if (rethrow_exception && isErrorStatus(current_status))
std::rethrow_exception(info.exception);
return (current_status == BackupStatus::BACKUP_COMPLETE) || (current_status == BackupStatus::RESTORED);
return isFinalStatus(current_status);
});
}
BackupsWorker::Info BackupsWorker::getInfo(const UUID & backup_or_restore_uuid, bool internal) const
BackupsWorker::Info BackupsWorker::getInfo(const OperationID & id) const
{
std::lock_guard lock{infos_mutex};
auto it = infos.find({backup_or_restore_uuid, internal});
auto it = infos.find(id);
if (it == infos.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", backup_or_restore_uuid, internal);
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
return it->second;
}
@ -585,20 +665,23 @@ std::vector<BackupsWorker::Info> BackupsWorker::getAllInfos() const
std::vector<Info> res_infos;
std::lock_guard lock{infos_mutex};
for (const auto & info : infos | boost::adaptors::map_values)
res_infos.push_back(info);
{
if (!info.internal)
res_infos.push_back(info);
}
return res_infos;
}
void BackupsWorker::shutdown()
{
bool has_active_backups_or_restores = (num_active_backups || num_active_restores);
if (has_active_backups_or_restores)
bool has_active_backups_and_restores = (num_active_backups || num_active_restores);
if (has_active_backups_and_restores)
LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores);
backups_thread_pool.wait();
restores_thread_pool.wait();
if (has_active_backups_or_restores)
if (has_active_backups_and_restores)
LOG_INFO(log, "All backup and restore tasks have finished");
}

View File

@ -28,55 +28,72 @@ public:
/// Waits until all tasks have been completed.
void shutdown();
/// Starts executing a BACKUP or RESTORE query. Returns UUID of the operation.
std::pair<UUID, bool> start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);
/// Backup's or restore's operation ID; it can either be passed via SETTINGS id=... or be a randomly generated UUID.
using OperationID = String;
/// Starts executing a BACKUP or RESTORE query. Returns ID of the operation.
OperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);
/// Waits until a BACKUP or RESTORE query started by start() is finished.
/// The function returns immediately if the operation is already finished.
void wait(const UUID & backup_or_restore_uuid, bool internal, bool rethrow_exception = true);
void wait(const OperationID & backup_or_restore_id, bool rethrow_exception = true);
/// Information about executing a BACKUP or RESTORE query started by calling start().
struct Info
{
UUID uuid;
/// Backup's or restore's operation ID; it can either be passed via SETTINGS id=... or be a randomly generated UUID.
OperationID id;
/// Backup's name, a string like "Disk('backups', 'my_backup')"
String backup_name;
String name;
BackupStatus status;
time_t status_changed_time;
String error_message;
std::exception_ptr exception;
/// Whether this operation is internal, i.e. caused by another BACKUP or RESTORE operation.
/// For example, BACKUP ON CLUSTER executes an internal BACKUP command on each node.
/// This operation is internal and should not be shown in system.backups
bool internal = false;
/// Status of backup or restore operation.
BackupStatus status;
/// Number of files in the backup (including backup's metadata; only unique files are counted).
size_t num_files = 0;
/// Size of all files in the backup (including backup's metadata; only unique files are counted).
UInt64 uncompressed_size = 0;
/// Size of the backup if it's stored as an archive; or the same as `uncompressed_size` if the backup is stored as a folder.
UInt64 compressed_size = 0;
/// Set only if there was an error.
std::exception_ptr exception;
String error_message;
std::chrono::system_clock::time_point start_time;
std::chrono::system_clock::time_point end_time;
};
Info getInfo(const UUID & backup_or_restore_uuid, bool internal) const;
Info getInfo(const OperationID & id) const;
std::vector<Info> getAllInfos() const;
private:
std::pair<UUID, bool> startMakingBackup(const ASTPtr & query, const ContextPtr & context);
OperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context);
void doBackup(const UUID & backup_uuid, const std::shared_ptr<ASTBackupQuery> & backup_query, BackupSettings backup_settings,
void doBackup(const std::shared_ptr<ASTBackupQuery> & backup_query, const OperationID & backup_id, BackupSettings backup_settings,
const BackupInfo & backup_info, std::shared_ptr<IBackupCoordination> backup_coordination, const ContextPtr & context,
ContextMutablePtr mutable_context, bool called_async);
std::pair<UUID, bool> startRestoring(const ASTPtr & query, ContextMutablePtr context);
OperationID startRestoring(const ASTPtr & query, ContextMutablePtr context);
void doRestore(const UUID & restore_uuid, const std::shared_ptr<ASTBackupQuery> & restore_query, RestoreSettings restore_settings,
const BackupInfo & backup_info, std::shared_ptr<IRestoreCoordination> restore_coordination, ContextMutablePtr context,
bool called_async);
void doRestore(const std::shared_ptr<ASTBackupQuery> & restore_query, const OperationID & restore_id, RestoreSettings restore_settings, const BackupInfo & backup_info,
std::shared_ptr<IRestoreCoordination> restore_coordination, ContextMutablePtr context, bool called_async);
void addInfo(const UUID & uuid, bool internal, const String & backup_name, BackupStatus status);
void setStatus(const UUID & uuid, bool internal, BackupStatus status);
void addInfo(const OperationID & id, const String & name, bool internal, BackupStatus status);
void setStatus(const OperationID & id, BackupStatus status, bool throw_if_error = true);
void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); }
void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size);
ThreadPool backups_thread_pool;
ThreadPool restores_thread_pool;
std::map<std::pair<UUID, bool>, Info> infos;
std::unordered_map<OperationID, Info> infos;
std::condition_variable status_changed;
std::atomic<size_t> num_active_backups = 0;
std::atomic<size_t> num_active_restores = 0;
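A reduced model of how this header's pieces fit together: `infos` keyed by a string operation ID under `infos_mutex`, `status_changed` notified on every transition, and `wait()` blocking on a predicate until a final status is reached. A sketch only, with simplified types; exception propagation and the active counters are omitted:

#include <condition_variable>
#include <mutex>
#include <string>
#include <unordered_map>

enum class Status { CREATING_BACKUP, BACKUP_CREATED, BACKUP_FAILED };

bool isFinal(Status s) { return s != Status::CREATING_BACKUP; }

class Worker
{
public:
    void setStatus(const std::string & id, Status s)
    {
        {
            std::lock_guard lock{mutex};
            infos[id] = s;
        }
        status_changed.notify_all(); // wake every waiter; each rechecks its own ID
    }

    void wait(const std::string & id)
    {
        std::unique_lock lock{mutex};
        // at() throws for an unknown ID, mirroring the LOGICAL_ERROR above
        status_changed.wait(lock, [&] { return isFinal(infos.at(id)); });
    }

private:
    std::mutex mutex;
    std::condition_variable status_changed;
    std::unordered_map<std::string, Status> infos;
};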

View File

@ -36,6 +36,15 @@ public:
/// Returns UUID of the backup.
virtual UUID getUUID() const = 0;
/// Returns the number of unique files in the backup.
virtual size_t getNumFiles() const = 0;
/// Returns the total size of unique files in the backup.
virtual UInt64 getUncompressedSize() const = 0;
/// Returns the compressed size of the backup. If the backup is not stored as an archive it returns the same as getUncompressedSize().
virtual UInt64 getCompressedSize() const = 0;
/// Returns names of entries stored in a specified directory in the backup.
/// If `directory` is empty or '/', the function returns entries in the backup's root.
virtual Strings listFiles(const String & directory, bool recursive = false) const = 0;

View File

@ -143,6 +143,7 @@ namespace
/// List of restore settings except base_backup_name and cluster_host_ids.
#define LIST_OF_RESTORE_SETTINGS(M) \
M(String, id) \
M(String, password) \
M(Bool, structure_only) \
M(RestoreTableCreationMode, create_table) \

View File

@ -41,6 +41,9 @@ using RestoreUDFCreationMode = RestoreAccessCreationMode;
/// Settings specified in the "SETTINGS" clause of a RESTORE query.
struct RestoreSettings
{
/// ID of the restore operation, to identify it in the system.backups table. Auto-generated if not set.
String id;
/// Base backup, with this setting we can override the location of the base backup while restoring.
/// Any incremental backup keeps inside the information about its base backup, so using this setting is optional.
std::optional<BackupInfo> base_backup_info;

View File

@ -144,8 +144,8 @@ endif ()
list (APPEND clickhouse_common_io_sources ${CONFIG_BUILD})
list (APPEND clickhouse_common_io_headers ${CONFIG_VERSION} ${CONFIG_COMMON})
list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp Functions/FunctionsLogical.cpp)
list (APPEND dbms_headers Functions/IFunction.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h Functions/FunctionsLogical.h)
list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/FunctionsLogical.cpp)
list (APPEND dbms_headers Functions/IFunction.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/FunctionsLogical.h)
list (APPEND dbms_sources
AggregateFunctions/IAggregateFunction.cpp

View File

@ -69,6 +69,7 @@
#include <IO/CompressionMethod.h>
#include <Client/InternalTextLogs.h>
#include <boost/algorithm/string/replace.hpp>
#include <IO/ForkWriteBuffer.h>
namespace fs = std::filesystem;
@ -403,7 +404,6 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)
return;
processed_rows += block.rows();
/// Even if all blocks are empty, we still need to initialize the output stream to write an empty result set.
initOutputFormat(block, parsed_query);
@ -414,7 +414,7 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)
return;
/// If results are written INTO OUTFILE, we can avoid clearing progress to avoid flicker.
if (need_render_progress && (stdout_is_a_tty || is_interactive) && !select_into_file)
if (need_render_progress && (stdout_is_a_tty || is_interactive) && (!select_into_file || select_into_file_and_stdout))
progress_indication.clearProgressOutput();
try
@ -434,7 +434,7 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)
/// Restore progress bar after data block.
if (need_render_progress && (stdout_is_a_tty || is_interactive))
{
if (select_into_file)
if (select_into_file && !select_into_file_and_stdout)
std::cerr << "\r";
progress_indication.writeProgress();
}
@ -511,7 +511,7 @@ try
String current_format = format;
select_into_file = false;
select_into_file_and_stdout = false;
/// The query can specify output format or output file.
if (const auto * query_with_output = dynamic_cast<const ASTQueryWithOutput *>(parsed_query.get()))
{
@ -554,6 +554,13 @@ try
compression_level
);
if (query_with_output->is_into_outfile_with_stdout)
{
select_into_file_and_stdout = true;
out_file_buf = std::make_unique<ForkWriteBuffer>(std::vector<WriteBufferPtr>{std::move(out_file_buf),
std::make_shared<WriteBufferFromFileDescriptor>(STDOUT_FILENO)});
}
// We are writing to file, so default format is the same as in non-interactive mode.
if (is_interactive && is_default_format)
current_format = "TabSeparated";
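ForkWriteBuffer duplicates every write into all of its underlying buffers, which is how one result stream lands in both the output file and stdout. A conceptual stand-in built on iostreams (the real class works on DB::WriteBuffer objects, not std::ostream):

#include <fstream>
#include <iostream>
#include <vector>

struct TeeWriter
{
    std::vector<std::ostream *> sinks;

    void write(const char * data, std::streamsize size)
    {
        for (auto * out : sinks) // fan the same bytes out to every sink
            out->write(data, size);
    }
};

int main()
{
    std::ofstream file("result.txt");
    TeeWriter tee{{&file, &std::cout}};
    tee.write("42\n", 3); // appears both in result.txt and on stdout
}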
@ -578,7 +585,7 @@ try
/// It is not clear how to write progress intermixed with data with parallel formatting.
/// It may increase code complexity significantly.
if (!need_render_progress || select_into_file)
if (!need_render_progress || (select_into_file && !select_into_file_and_stdout))
output_format = global_context->getOutputFormatParallelIfPossible(
current_format, out_file_buf ? *out_file_buf : *out_buf, block);
else

View File

@ -181,6 +181,7 @@ protected:
String format; /// Query results output format.
bool select_into_file = false; /// If writing result INTO OUTFILE. It affects progress rendering.
bool select_into_file_and_stdout = false; /// If writing result INTO OUTFILE AND STDOUT. It affects progress rendering.
bool is_default_format = true; /// false, if format is set in the config or command line.
size_t format_max_block_size = 0; /// Max block size for console output.
String insert_format; /// Format of INSERT data that is read from stdin in batch mode.

View File

@ -130,7 +130,7 @@ public:
private:
ReadBuffer & in;
/// The physical location of the current cell.
Locus locus;
Locus locus{};
/// The current position in the file as a cell number.
BucketIndex current_bucket_index = 0;
/// The number of bytes read.
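The switch from `Locus locus;` to `Locus locus{};` value-initializes the member, so trivially constructible fields start zeroed instead of holding indeterminate values. A minimal illustration of the difference (types hypothetical):

#include <cassert>

struct Pod { int a; };

struct Holder
{
    Pod def;    // default-initialized: `def.a` is indeterminate
    Pod val{};  // value-initialized: `val.a` is guaranteed zero
};

int main()
{
    Holder h;
    assert(h.val.a == 0); // always holds
    // reading h.def.a here would be undefined behavior
}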

View File

@ -0,0 +1,266 @@
#pragma once
#include <base/types.h>
#include <boost/core/noncopyable.hpp>
#include <mutex>
#include <memory>
#include <list>
#include <condition_variable>
#include <Common/Exception.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
}
/*
* Controls how many threads can be allocated for a query (or another activity).
* There is a limited amount of slots for threads. It can be set with `setMaxConcurrency(limit)`.
*
* Lifecycle of a slot: free -> granted -> acquired -> free.
* free: slot is available to be allocated by any query.
* granted: slot is allocated by specific query, but not yet acquired by any thread.
* acquired: slot is allocated by specific query and acquired by a thread.
*
* USAGE:
* 1. Create an allocation for a query:
* `auto slots = ConcurrencyControl::instance().allocate(min, max);`
* It will allocate at least `min` and at most `max` slots.
* Note that `min` slots are granted immediately, but the remaining `max - min` may be granted later.
* 2. For every thread a slot has to be acquired from that allocation:
* `while (auto slot = slots->tryAcquire()) createYourThread([slot = std::move(slot)] { ... });`
* This snippet can be used at query startup and for upscaling later.
* (both functions are non-blocking)
*
* Released slots are distributed between waiting allocations in a round-robin manner to provide fairness.
* Oversubscription is possible: the total number of allocated slots can exceed the limit set with `setMaxConcurrency(limit)`,
* because the `min` slots of every query are allocated unconditionally.
*/
class ConcurrencyControl : boost::noncopyable
{
public:
struct Allocation;
using AllocationPtr = std::shared_ptr<Allocation>;
using SlotCount = UInt64;
using Waiters = std::list<Allocation *>;
static constexpr SlotCount Unlimited = std::numeric_limits<SlotCount>::max();
// Scoped guard for acquired slot, see Allocation::tryAcquire()
struct Slot : boost::noncopyable
{
~Slot()
{
allocation->release();
}
private:
friend struct Allocation; // for ctor
explicit Slot(AllocationPtr && allocation_)
: allocation(std::move(allocation_))
{}
AllocationPtr allocation;
};
// FIXME: have to be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet
using SlotPtr = std::shared_ptr<Slot>;
// Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max)
struct Allocation : std::enable_shared_from_this<Allocation>, boost::noncopyable
{
~Allocation()
{
// We have to lock parent's mutex to avoid race with grant()
// NOTE: shortcut can be added, but it requires Allocation::mutex lock even to check if shortcut is possible
parent.free(this);
}
// Take one already granted slot if available. Lock-free iff there is no granted slot.
[[nodiscard]] SlotPtr tryAcquire()
{
SlotCount value = granted.load();
while (value)
{
if (granted.compare_exchange_strong(value, value - 1))
{
std::unique_lock lock{mutex};
return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor
}
}
return {}; // avoid unnecessary locking
}
SlotCount grantedCount() const
{
return granted;
}
private:
friend struct Slot; // for release()
friend class ConcurrencyControl; // for grant(), free() and ctor
Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_ = {})
: parent(parent_)
, limit(limit_)
, allocated(granted_)
, granted(granted_)
, waiter(waiter_)
{
if (allocated < limit)
*waiter = this;
}
auto cancel()
{
std::unique_lock lock{mutex};
return std::pair{allocated - released,
allocated < limit ?
std::optional<Waiters::iterator>(waiter) :
std::optional<Waiters::iterator>()};
}
// Grant single slot to allocation, returns true iff more slot(s) are required
bool grant()
{
std::unique_lock lock{mutex};
granted++;
allocated++;
return allocated < limit;
}
// Release one slot and grant it to another allocation if required
void release()
{
parent.release(1);
std::unique_lock lock{mutex};
released++;
if (released > allocated)
abort();
}
ConcurrencyControl & parent;
const SlotCount limit;
std::mutex mutex; // the following values must be accessed under this mutex
SlotCount allocated; // allocated total (including already `released`)
SlotCount released = 0;
std::atomic<SlotCount> granted; // allocated, but not yet acquired
const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit
};
public:
ConcurrencyControl()
: cur_waiter(waiters.end())
{}
// WARNING: all Allocation objects MUST be destructed before ConcurrencyControl
// NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries
~ConcurrencyControl()
{
if (!waiters.empty())
abort();
}
// Allocate at least `min` and at most `max` slots.
// If not all `max` slots were successfully allocated, a subscription for later allocation is created
// Use `Allocation::tryAcquire()` to acquire an allocated slot before running a thread.
[[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max)
{
if (min > max)
throw DB::Exception("ConcurrencyControl: invalid allocation requirements", DB::ErrorCodes::LOGICAL_ERROR);
std::unique_lock lock{mutex};
// Acquire as many slots as we can, but not fewer than `min`
SlotCount granted = std::max(min, std::min(max, available(lock)));
cur_concurrency += granted;
// Create allocation and start waiting if more slots are required
if (granted < max)
return AllocationPtr(new Allocation(*this, max, granted,
waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */)));
else
return AllocationPtr(new Allocation(*this, max, granted));
}
void setMaxConcurrency(SlotCount value)
{
std::unique_lock lock{mutex};
max_concurrency = std::max<SlotCount>(1, value); // never allow max_concurrency to be zero
schedule(lock);
}
static ConcurrencyControl & instance()
{
static ConcurrencyControl result;
return result;
}
private:
friend struct Allocation; // for free() and release()
void free(Allocation * allocation)
{
// Allocation is allowed to be canceled even if there are:
// - `amount`: granted slots (acquired slots are not possible, because Slot holds AllocationPtr)
// - `waiter`: active waiting for more slots to be allocated
// Thus Allocation destruction may require the following lock, to avoid race conditions
std::unique_lock lock{mutex};
auto [amount, waiter] = allocation->cancel();
cur_concurrency -= amount;
if (waiter)
{
if (cur_waiter == *waiter)
cur_waiter = waiters.erase(*waiter);
else
waiters.erase(*waiter);
}
schedule(lock);
}
void release(SlotCount amount)
{
std::unique_lock lock{mutex};
cur_concurrency -= amount;
schedule(lock);
}
// Round-robin scheduling of available slots among waiting allocations
void schedule(std::unique_lock<std::mutex> &)
{
while (cur_concurrency < max_concurrency && !waiters.empty())
{
cur_concurrency++;
if (cur_waiter == waiters.end())
cur_waiter = waiters.begin();
Allocation * allocation = *cur_waiter;
if (allocation->grant())
++cur_waiter;
else
cur_waiter = waiters.erase(cur_waiter); // last required slot has just been granted -- stop waiting
}
}
SlotCount available(std::unique_lock<std::mutex> &)
{
if (cur_concurrency < max_concurrency)
return max_concurrency - cur_concurrency;
else
return 0;
}
std::mutex mutex;
Waiters waiters;
Waiters::iterator cur_waiter; // round-robin pointer
SlotCount max_concurrency = Unlimited;
SlotCount cur_concurrency = 0;
};
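A hedged usage sketch of the lifecycle described in the comment at the top of this class: allocate a slot range for a query, acquire one slot per worker, and let Slot destructors return capacity to the controller (thread handling simplified; the real code uses ThreadFromGlobalPool):

#include <thread>
#include <vector>

void runQuery(ConcurrencyControl & cc, size_t max_threads)
{
    auto slots = cc.allocate(/* min = */ 1, /* max = */ max_threads);

    std::vector<std::thread> workers;
    while (auto slot = slots->tryAcquire()) // stops once no more slots are granted yet
        workers.emplace_back([slot = std::move(slot)] { /* do a share of the work */ });

    for (auto & w : workers)
        w.join(); // releasing each Slot hands the capacity to waiting allocations
}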

View File

@ -3,12 +3,18 @@
// MemoryTrackerBlockerInThread
thread_local uint64_t MemoryTrackerBlockerInThread::counter = 0;
thread_local VariableContext MemoryTrackerBlockerInThread::level = VariableContext::Global;
MemoryTrackerBlockerInThread::MemoryTrackerBlockerInThread(VariableContext level_)
: previous_level(level)
{
++counter;
level = level_;
}
MemoryTrackerBlockerInThread::MemoryTrackerBlockerInThread() : MemoryTrackerBlockerInThread(VariableContext::User)
{
}
MemoryTrackerBlockerInThread::~MemoryTrackerBlockerInThread()
{
--counter;

View File

@ -11,9 +11,12 @@ private:
static thread_local VariableContext level;
VariableContext previous_level;
public:
/// level_ - block in level and above
explicit MemoryTrackerBlockerInThread(VariableContext level_ = VariableContext::User);
explicit MemoryTrackerBlockerInThread(VariableContext level_);
public:
explicit MemoryTrackerBlockerInThread();
~MemoryTrackerBlockerInThread();
MemoryTrackerBlockerInThread(const MemoryTrackerBlockerInThread &) = delete;
@ -23,4 +26,6 @@ public:
{
return counter > 0 && current_level >= level;
}
friend class MemoryTracker;
};
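The blocker is a scope guard over a thread_local counter: while at least one instance lives on the current thread (at a matching level), allocations bypass the memory tracker. A stripped-down model of the same RAII pattern, with levels omitted:

struct Blocker
{
    Blocker() { ++counter; }
    ~Blocker() { --counter; }

    Blocker(const Blocker &) = delete;
    Blocker & operator=(const Blocker &) = delete;

    static bool isBlocked() { return counter > 0; }

private:
    static thread_local int counter;
};

thread_local int Blocker::counter = 0;

void copyLargeBlock()
{
    Blocker guard;   // tracking paused for this thread...
    /* ...allocate freely... */
}                    // ...and resumed when `guard` goes out of scope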

View File

@ -79,7 +79,7 @@ void SystemLogBase<LogElement>::add(const LogElement & element)
/// The size of the allocation can be on the order of a few megabytes.
/// But this should not be accounted towards query memory usage.
/// Otherwise tests like 01017_uniqCombined_memory_usage.sql will be flaky.
MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global);
MemoryTrackerBlockerInThread temporarily_disable_memory_tracker;
/// Should not log messages under mutex.
bool queue_is_half_full = false;

View File

@ -2,6 +2,7 @@
#include <base/types.h>
#include <Common/Exception.h>
#include <Coordination/KeeperConstants.h>
#include <vector>
#include <memory>
@ -57,6 +58,8 @@ struct Stat
int32_t dataLength{0}; /// NOLINT
int32_t numChildren{0}; /// NOLINT
int64_t pzxid{0};
bool operator==(const Stat &) const = default;
};
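The `= default` here is C++20's defaulted comparison: the compiler generates a member-wise `==` over all of Stat's fields. A minimal illustration:

struct Point
{
    int x{0};
    int y{0};
    bool operator==(const Point &) const = default; // compares x, then y
};

static_assert(Point{1, 2} == Point{1, 2});
static_assert(!(Point{1, 2} == Point{2, 1}));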
enum class Error : int32_t
@ -109,7 +112,6 @@ bool isUserError(Error code);
const char * errorMessage(Error code);
struct Request;
using RequestPtr = std::shared_ptr<Request>;
using Requests = std::vector<RequestPtr>;
@ -516,6 +518,8 @@ public:
const Requests & requests,
MultiCallback callback) = 0;
virtual DB::KeeperApiVersion getApiVersion() = 0;
/// Expire session and finish all pending requests
virtual void finalize(const String & reason) = 0;
};

View File

@ -90,6 +90,11 @@ public:
void finalize(const String & reason) override;
DB::KeeperApiVersion getApiVersion() override
{
return KeeperApiVersion::ZOOKEEPER_COMPATIBLE;
}
struct Node
{
String data;

View File

@ -337,17 +337,17 @@ Coordination::Error ZooKeeper::getChildrenImpl(const std::string & path, Strings
}
}
Strings ZooKeeper::getChildren(const std::string & path, Coordination::Stat * stat, const EventPtr & watch)
Strings ZooKeeper::getChildren(const std::string & path, Coordination::Stat * stat, const EventPtr & watch, Coordination::ListRequestType list_request_type)
{
Strings res;
check(tryGetChildren(path, res, stat, watch), path);
check(tryGetChildren(path, res, stat, watch, list_request_type), path);
return res;
}
Strings ZooKeeper::getChildrenWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback)
Strings ZooKeeper::getChildrenWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback, Coordination::ListRequestType list_request_type)
{
Strings res;
check(tryGetChildrenWatch(path, res, stat, watch_callback), path);
check(tryGetChildrenWatch(path, res, stat, watch_callback, list_request_type), path);
return res;
}
@ -540,7 +540,6 @@ Coordination::Error ZooKeeper::getImpl(const std::string & path, std::string & r
}
}
std::string ZooKeeper::get(const std::string & path, Coordination::Stat * stat, const EventPtr & watch)
{
Coordination::Error code = Coordination::Error::ZOK;
@ -904,6 +903,11 @@ bool ZooKeeper::expired()
return impl->isExpired();
}
DB::KeeperApiVersion ZooKeeper::getApiVersion()
{
return impl->getApiVersion();
}
Int64 ZooKeeper::getClientID()
{
return impl->getSessionID();

View File

@ -127,6 +127,8 @@ public:
/// Returns true, if the session has expired.
bool expired();
DB::KeeperApiVersion getApiVersion();
/// Create a znode.
/// Throw an exception if something went wrong.
std::string create(const std::string & path, const std::string & data, int32_t mode);
@ -184,11 +186,13 @@ public:
Strings getChildren(const std::string & path,
Coordination::Stat * stat = nullptr,
const EventPtr & watch = nullptr);
const EventPtr & watch = nullptr,
Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL);
Strings getChildrenWatch(const std::string & path,
Coordination::Stat * stat,
Coordination::WatchCallback watch_callback);
Coordination::WatchCallback watch_callback,
Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL);
/// Doesn't throw in the following cases:
/// * The node doesn't exist.

View File

@ -724,7 +724,10 @@ void ZooKeeperResponse::fillLogElements(LogElements & elems, size_t idx) const
assert(!elem.xid || elem.xid == xid);
elem.xid = xid;
int32_t response_op = tryGetOpNum();
assert(!elem.op_num || elem.op_num == response_op || response_op < 0);
[[maybe_unused]] const bool is_filtered_list = elem.op_num == static_cast<int32_t>(Coordination::OpNum::FilteredList)
&& response_op == static_cast<int32_t>(Coordination::OpNum::List);
assert(!elem.op_num || elem.op_num == response_op || is_filtered_list || response_op < 0);
elem.op_num = response_op;
elem.zxid = zxid;
@ -892,6 +895,7 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory()
registerZooKeeperRequest<OpNum::SessionID, ZooKeeperSessionIDRequest>(*this);
registerZooKeeperRequest<OpNum::GetACL, ZooKeeperGetACLRequest>(*this);
registerZooKeeperRequest<OpNum::SetACL, ZooKeeperSetACLRequest>(*this);
registerZooKeeperRequest<OpNum::FilteredList, ZooKeeperFilteredListRequest>(*this);
}
}

View File

@ -24,6 +24,7 @@ static const std::unordered_set<int32_t> VALID_OPERATIONS =
static_cast<int32_t>(OpNum::SessionID),
static_cast<int32_t>(OpNum::SetACL),
static_cast<int32_t>(OpNum::GetACL),
static_cast<int32_t>(OpNum::FilteredList),
};
std::string toString(OpNum op_num)

View File

@ -6,6 +6,7 @@
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
#include <Common/logger_useful.h>
@ -352,6 +353,8 @@ ZooKeeper::ZooKeeper(
send_thread = ThreadFromGlobalPool([this] { sendThread(); });
receive_thread = ThreadFromGlobalPool([this] { receiveThread(); });
initApiVersion();
ProfileEvents::increment(ProfileEvents::ZooKeeperInit);
}
@ -698,6 +701,7 @@ void ZooKeeper::receiveEvent()
RequestInfo request_info;
ZooKeeperResponsePtr response;
UInt64 elapsed_ms = 0;
if (xid == PING_XID)
{
@ -758,8 +762,8 @@ void ZooKeeper::receiveEvent()
CurrentMetrics::sub(CurrentMetrics::ZooKeeperRequest);
}
auto elapsed_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - request_info.time).count();
ProfileEvents::increment(ProfileEvents::ZooKeeperWaitMicroseconds, elapsed_microseconds);
elapsed_ms = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - request_info.time).count();
ProfileEvents::increment(ProfileEvents::ZooKeeperWaitMicroseconds, elapsed_ms);
}
try
@ -809,7 +813,7 @@ void ZooKeeper::receiveEvent()
if (length != actual_length)
throw Exception("Response length doesn't match. Expected: " + DB::toString(length) + ", actual: " + DB::toString(actual_length), Error::ZMARSHALLINGERROR);
logOperationIfNeeded(request_info.request, response); //-V614
logOperationIfNeeded(request_info.request, response, /* finalize= */ false, elapsed_ms); //-V614
}
catch (...)
{
@ -828,7 +832,7 @@ void ZooKeeper::receiveEvent()
if (request_info.callback)
request_info.callback(*response);
logOperationIfNeeded(request_info.request, response);
logOperationIfNeeded(request_info.request, response, /* finalize= */ false, elapsed_ms);
}
catch (...)
{
@ -916,13 +920,14 @@ void ZooKeeper::finalize(bool error_send, bool error_receive, const String & rea
? Error::ZCONNECTIONLOSS
: Error::ZSESSIONEXPIRED;
response->xid = request_info.request->xid;
UInt64 elapsed_ms = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - request_info.time).count();
if (request_info.callback)
{
try
{
request_info.callback(*response);
logOperationIfNeeded(request_info.request, response, true);
logOperationIfNeeded(request_info.request, response, true, elapsed_ms);
}
catch (...)
{
@ -982,7 +987,8 @@ void ZooKeeper::finalize(bool error_send, bool error_receive, const String & rea
try
{
info.callback(*response);
logOperationIfNeeded(info.request, response, true);
UInt64 elapsed_ms = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - info.time).count();
logOperationIfNeeded(info.request, response, true, elapsed_ms);
}
catch (...)
{
@ -1057,6 +1063,44 @@ void ZooKeeper::pushRequest(RequestInfo && info)
ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions);
}
KeeperApiVersion ZooKeeper::getApiVersion()
{
return keeper_api_version;
}
void ZooKeeper::initApiVersion()
{
auto promise = std::make_shared<std::promise<Coordination::GetResponse>>();
auto future = promise->get_future();
auto callback = [promise](const Coordination::GetResponse & response) mutable
{
promise->set_value(response);
};
get(keeper_api_version_path, std::move(callback), {});
if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready)
{
LOG_TRACE(log, "Failed to get API version: timeout");
return;
}
auto response = future.get();
if (response.error != Coordination::Error::ZOK)
{
LOG_TRACE(log, "Failed to get API version");
return;
}
uint8_t keeper_version{0};
DB::ReadBufferFromOwnString buf(response.data);
DB::readIntText(keeper_version, buf);
keeper_api_version = static_cast<DB::KeeperApiVersion>(keeper_version);
LOG_TRACE(log, "Detected server's API version: {}", keeper_api_version);
}
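initApiVersion() follows a common async-handshake pattern: the request callback fulfils a std::promise, and the caller bounds the wait with the operation timeout, falling back to the oldest compatible version on timeout or error. A generic sketch of that pattern (function names hypothetical):

#include <chrono>
#include <functional>
#include <future>
#include <memory>

int fetchWithTimeout(const std::function<void(std::function<void(int)>)> & async_get,
                     std::chrono::milliseconds timeout,
                     int fallback)
{
    auto promise = std::make_shared<std::promise<int>>();
    auto future = promise->get_future();

    async_get([promise](int value) { promise->set_value(value); }); // callback fulfils the promise

    if (future.wait_for(timeout) != std::future_status::ready)
        return fallback; // timed out: assume the oldest, compatible version
    return future.get();
}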
void ZooKeeper::executeGenericRequest(
const ZooKeeperRequestPtr & request,
ResponseCallback callback)
@ -1172,14 +1216,27 @@ void ZooKeeper::list(
ListCallback callback,
WatchCallback watch)
{
ZooKeeperFilteredListRequest request;
request.path = path;
request.list_request_type = list_request_type;
std::shared_ptr<ZooKeeperListRequest> request{nullptr};
if (keeper_api_version < Coordination::KeeperApiVersion::WITH_FILTERED_LIST)
{
if (list_request_type != ListRequestType::ALL)
throw Exception("Filtered list request type cannot be used because it's not supported by the server", Error::ZBADARGUMENTS);
request = std::make_shared<ZooKeeperListRequest>();
}
else
{
auto filtered_list_request = std::make_shared<ZooKeeperFilteredListRequest>();
filtered_list_request->list_request_type = list_request_type;
request = std::move(filtered_list_request);
}
request->path = path;
RequestInfo request_info;
request_info.request = std::make_shared<ZooKeeperListRequest>(std::move(request));
request_info.callback = [callback](const Response & response) { callback(dynamic_cast<const ListResponse &>(response)); };
request_info.watch = watch;
request_info.request = std::move(request);
pushRequest(std::move(request_info));
ProfileEvents::increment(ProfileEvents::ZooKeeperList);
@ -1256,7 +1313,7 @@ void ZooKeeper::setZooKeeperLog(std::shared_ptr<DB::ZooKeeperLog> zk_log_)
}
#ifdef ZOOKEEPER_LOG
void ZooKeeper::logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response, bool finalize)
void ZooKeeper::logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response, bool finalize, UInt64 elapsed_ms)
{
auto maybe_zk_log = std::atomic_load(&zk_log);
if (!maybe_zk_log)
@ -1294,6 +1351,7 @@ void ZooKeeper::logOperationIfNeeded(const ZooKeeperRequestPtr & request, const
elem.event_time = event_time;
elem.address = socket_address;
elem.session_id = session_id;
elem.duration_ms = elapsed_ms;
if (request)
{
elem.thread_id = request->thread_id;
@ -1303,7 +1361,7 @@ void ZooKeeper::logOperationIfNeeded(const ZooKeeperRequestPtr & request, const
}
}
#else
void ZooKeeper::logOperationIfNeeded(const ZooKeeperRequestPtr &, const ZooKeeperResponsePtr &, bool)
void ZooKeeper::logOperationIfNeeded(const ZooKeeperRequestPtr &, const ZooKeeperResponsePtr &, bool, UInt64)
{}
#endif

View File

@ -7,6 +7,7 @@
#include <Common/ThreadPool.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Coordination/KeeperConstants.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
@ -181,6 +182,8 @@ public:
const Requests & requests,
MultiCallback callback) override;
DB::KeeperApiVersion getApiVersion() override;
/// Without forcefully invalidating (finalizing) ZooKeeper session before
/// establishing a new one, there was a possibility that server is using
/// two ZooKeeper sessions simultaneously in different parts of code.
@ -273,10 +276,14 @@ private:
template <typename T>
void read(T &);
void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false);
void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false, UInt64 elapsed_ms = 0);
void initApiVersion();
CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession};
std::shared_ptr<ZooKeeperLog> zk_log;
DB::KeeperApiVersion keeper_api_version{DB::KeeperApiVersion::ZOOKEEPER_COMPATIBLE};
};
}

View File

@ -0,0 +1,12 @@
#include <Common/register_objects.h>
namespace DB
{
FunctionRegisterMap & FunctionRegisterMap::instance()
{
static FunctionRegisterMap map;
return map;
}
}

View File

@ -0,0 +1,33 @@
#pragma once
#include <string_view>
#include <unordered_map>
namespace DB
{
class FunctionFactory;
using FunctionRegisterFunctionPtr = void (*)(::DB::FunctionFactory &);
struct FunctionRegisterMap : public std::unordered_map<std::string_view, FunctionRegisterFunctionPtr>
{
static FunctionRegisterMap & instance();
};
struct FunctionRegister
{
FunctionRegister(std::string_view name, FunctionRegisterFunctionPtr func_ptr)
{
FunctionRegisterMap::instance().emplace(std::move(name), func_ptr);
}
};
}
#define REGISTER_FUNCTION_IMPL(fn, func_name, register_name) \
void func_name(::DB::FunctionFactory & factory); \
static ::DB::FunctionRegister register_name(#fn, func_name); \
void func_name(::DB::FunctionFactory & factory)
#define REGISTER_FUNCTION(fn) REGISTER_FUNCTION_IMPL(fn, registerFunction##fn, REGISTER_FUNCTION_##fn)
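Hypothetical use of the macro above: REGISTER_FUNCTION(Foo) declares registerFunctionFoo, records it in FunctionRegisterMap through a static FunctionRegister object, and then opens the function body, so a translation unit registers itself merely by being linked in. FunctionFoo is an illustrative name, and the fragment assumes the usual FunctionFactory registration call:

REGISTER_FUNCTION(Foo)
{
    factory.registerFunction<FunctionFoo>(); // assumed FunctionFactory usage
}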

View File

@ -0,0 +1,289 @@
#include <gtest/gtest.h>
#include <vector>
#include <thread>
#include <pcg_random.hpp>
#include <base/types.h>
#include <base/sleep.h>
#include <Common/ConcurrencyControl.h>
#include <Common/randomSeed.h>
struct ConcurrencyControlTest
{
ConcurrencyControl cc;
explicit ConcurrencyControlTest(ConcurrencyControl::SlotCount limit = ConcurrencyControl::Unlimited)
{
cc.setMaxConcurrency(limit);
}
};
TEST(ConcurrencyControl, Unlimited)
{
ConcurrencyControlTest t; // unlimited number of slots
auto slots = t.cc.allocate(0, 100500);
std::vector<ConcurrencyControl::SlotPtr> acquired;
while (auto slot = slots->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 100500);
}
TEST(ConcurrencyControl, Fifo)
{
ConcurrencyControlTest t(1); // use single slot
std::vector<ConcurrencyControl::AllocationPtr> allocations;
constexpr int count = 42;
allocations.reserve(count);
for (int i = 0; i < count; i++)
allocations.emplace_back(t.cc.allocate(0, 1));
for (int i = 0; i < count; i++)
{
ConcurrencyControl::SlotPtr holder;
for (int j = 0; j < count; j++)
{
auto slot = allocations[j]->tryAcquire();
if (i == j) // check fifo order of allocations
{
ASSERT_TRUE(slot);
holder = std::move(slot);
}
else
ASSERT_TRUE(!slot);
}
holder.reset(); // release slot -- leads to the next allocation
}
}
TEST(ConcurrencyControl, Oversubscription)
{
ConcurrencyControlTest t(10);
std::vector<ConcurrencyControl::AllocationPtr> allocations;
allocations.reserve(10);
for (int i = 0; i < 10; i++)
allocations.emplace_back(t.cc.allocate(1, 2));
std::vector<ConcurrencyControl::SlotPtr> slots;
// Normal allocation using maximum amount of slots
for (int i = 0; i < 5; i++)
{
auto slot1 = allocations[i]->tryAcquire();
ASSERT_TRUE(slot1);
slots.emplace_back(std::move(slot1));
auto slot2 = allocations[i]->tryAcquire();
ASSERT_TRUE(slot2);
slots.emplace_back(std::move(slot2));
ASSERT_TRUE(!allocations[i]->tryAcquire());
}
// Oversubscription: only minimum amount of slots are allocated
for (int i = 5; i < 10; i++)
{
auto slot1 = allocations[i]->tryAcquire();
ASSERT_TRUE(slot1);
slots.emplace_back(std::move(slot1));
ASSERT_TRUE(!allocations[i]->tryAcquire());
}
}
TEST(ConcurrencyControl, ReleaseUnacquiredSlots)
{
ConcurrencyControlTest t(10);
{
std::vector<ConcurrencyControl::AllocationPtr> allocations;
allocations.reserve(10);
for (int i = 0; i < 10; i++)
allocations.emplace_back(t.cc.allocate(1, 2));
// Do not acquire - just destroy allocations with granted slots
}
// Check that slots were actually released
auto allocation = t.cc.allocate(0, 20);
std::vector<ConcurrencyControl::SlotPtr> acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
}
TEST(ConcurrencyControl, DestroyNotFullyAllocatedAllocation)
{
ConcurrencyControlTest t(10);
for (int i = 0; i < 3; i++)
{
auto allocation = t.cc.allocate(5, 20);
std::vector<ConcurrencyControl::SlotPtr> acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
}
}
TEST(ConcurrencyControl, DestroyAllocationBeforeSlots)
{
ConcurrencyControlTest t(10);
for (int i = 0; i < 3; i++)
{
std::vector<ConcurrencyControl::SlotPtr> acquired;
auto allocation = t.cc.allocate(5, 20);
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
allocation.reset(); // slots are still acquired (they should actually hold allocation)
}
}
TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation)
{
ConcurrencyControlTest t(3);
auto allocation = t.cc.allocate(0, 10);
std::list<ConcurrencyControl::SlotPtr> acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 3); // 0 1 2
acquired.clear();
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 3); // 3 4 5
acquired.pop_back();
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 3); // 3 4 6
acquired.pop_front();
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 3); // 4 6 7
acquired.clear();
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 2); // 8 9
}
TEST(ConcurrencyControl, FairGranting)
{
ConcurrencyControlTest t(3);
auto start_busy_period = t.cc.allocate(3, 3);
auto a1 = t.cc.allocate(0, 10);
auto a2 = t.cc.allocate(0, 10);
auto a3 = t.cc.allocate(0, 10);
start_busy_period.reset();
for (int i = 0; i < 10; i++)
{
auto s1 = a1->tryAcquire();
ASSERT_TRUE(s1);
ASSERT_TRUE(!a1->tryAcquire());
auto s2 = a2->tryAcquire();
ASSERT_TRUE(s2);
ASSERT_TRUE(!a2->tryAcquire());
auto s3 = a3->tryAcquire();
ASSERT_TRUE(s3);
ASSERT_TRUE(!a3->tryAcquire());
}
}
TEST(ConcurrencyControl, SetSlotCount)
{
ConcurrencyControlTest t(10);
auto allocation = t.cc.allocate(5, 30);
std::vector<ConcurrencyControl::SlotPtr> acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
t.cc.setMaxConcurrency(15);
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 15);
t.cc.setMaxConcurrency(5);
acquired.clear();
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 5);
// Check that newly added slots are equally distributed over waiting allocations
std::vector<ConcurrencyControl::SlotPtr> acquired2;
auto allocation2 = t.cc.allocate(0, 30);
ASSERT_TRUE(!allocation->tryAcquire());
t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
while (auto slot = allocation2->tryAcquire())
acquired2.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
ASSERT_TRUE(acquired2.size() == 5);
}
TEST(ConcurrencyControl, MultipleThreads)
{
constexpr int cfg_total_queries = 1000; // total number of queries to run
constexpr int cfg_work_us = 49; // max microseconds per single work
constexpr int cfg_concurrent_queries = 8; // do not run more than specified number of concurrent queries
constexpr int cfg_max_threads = 4; // max number of threads a query is allowed to have
constexpr int cfg_max_concurrency = 16; // concurrency control limit (must be >3)
ConcurrencyControlTest t(cfg_max_concurrency);
auto run_query = [&] (size_t max_threads)
{
ConcurrencyControl::AllocationPtr slots = t.cc.allocate(1, max_threads);
std::mutex threads_mutex;
std::vector<std::thread> threads;
threads.reserve(max_threads);
std::function<void()> spawn_threads = [&] ()
{
while (auto slot = slots->tryAcquire())
{
std::unique_lock lock{threads_mutex};
threads.emplace_back([&, slot = std::move(slot)]
{
pcg64 rng(randomSeed());
std::uniform_int_distribution<size_t> distribution(1, cfg_work_us);
size_t steps = distribution(rng);
for (size_t step = 0; step < steps; step++)
{
sleepForMicroseconds(distribution(rng)); // emulate work
spawn_threads(); // upscale
}
});
}
};
spawn_threads();
// graceful shutdown of a query
for (size_t thread_num = 0; ; thread_num++)
{
std::unique_lock lock{threads_mutex};
if (thread_num >= threads.size())
break;
if (threads[thread_num].joinable())
{
auto & thread = threads[thread_num];
lock.unlock(); // to avoid deadlock if thread we are going to join starts spawning threads
thread.join();
}
}
// NOTE: No races: all concurrent spawn_threads() calls are done from `threads`, but they're already joined.
};
pcg64 rng(randomSeed());
std::uniform_int_distribution<size_t> max_threads_distribution(1, cfg_max_threads);
std::vector<std::thread> queries;
std::atomic<int> started = 0; // queries started in total
std::atomic<int> finished = 0; // queries finished in total
while (started < cfg_total_queries)
{
while (started < finished + cfg_concurrent_queries)
{
queries.emplace_back([&, max_threads = max_threads_distribution(rng)]
{
run_query(max_threads);
finished++;
});
started++;
}
sleepForMicroseconds(5); // wait for some queries to finish
t.cc.setMaxConcurrency(cfg_max_concurrency - started % 3); // emulate configuration updates
}
for (auto & query : queries)
query.join();
}

View File

@ -61,8 +61,11 @@ GTEST_TEST(WideInteger, Conversions)
ASSERT_EQ(zero, minus_one);
zero += minus_one;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
ASSERT_EQ(0, memcmp(&zero, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
#else
ASSERT_EQ(0, memcmp(&zero, "\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
#endif
zero += 2;
ASSERT_EQ(zero, 0);
@ -156,8 +159,11 @@ GTEST_TEST(WideInteger, Arithmetic)
ASSERT_EQ(zero, minus_one);
zero += minus_one;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
ASSERT_EQ(0, memcmp(&zero, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
#else
ASSERT_EQ(0, memcmp(&zero, "\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
#endif
zero += 2;
ASSERT_EQ(zero, 0);
@ -236,8 +242,12 @@ GTEST_TEST(WideInteger, Shift)
Int128 x = 1;
auto y = x << 64;
ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", sizeof(Int128)));
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", sizeof(Int128)));
#else
ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", sizeof(Int128)));
#endif
auto z = y << 11;
ASSERT_EQ(toString(z), "37778931862957161709568");
@ -250,8 +260,11 @@ GTEST_TEST(WideInteger, Shift)
x = -1;
y = x << 16;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\xFF\xFF\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
#else
ASSERT_EQ(0, memcmp(&y, "\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
#endif
y >>= 16;
ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
@ -259,10 +272,18 @@ GTEST_TEST(WideInteger, Shift)
ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
y >>= 32;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
#else
ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
#endif
y <<= 64;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\x00\x00\x00\x00", sizeof(Int128)));
#else
ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF", sizeof(Int128)));
#endif
}
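// Note added for illustration: judging by the expected byte patterns, wide_integer
// stores its 64-bit limbs least-significant first, with host byte order inside each
// limb. Hence Int128(1) << 64 has its 0x01 at byte offset 8 on little-endian machines
// and at offset 15 on big-endian ones, which is exactly what the paired
// #if __BYTE_ORDER__ blocks assert.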

View File

@ -36,7 +36,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco
}
const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr";
const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv";
KeeperConfigurationAndSettings::KeeperConfigurationAndSettings()
: server_id(NOT_EXIST)

View File

@ -2,6 +2,7 @@
#include <Coordination/KeeperDispatcher.h>
#include <Server/KeeperTCPHandler.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/logger_useful.h>
#include <Poco/Environment.h>
#include <Poco/Path.h>
@ -132,6 +133,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat
FourLetterCommandPtr recovery_command = std::make_shared<RecoveryCommand>(keeper_dispatcher);
factory.registerCommand(recovery_command);
FourLetterCommandPtr api_version_command = std::make_shared<ApiVersionCommand>(keeper_dispatcher);
factory.registerCommand(api_version_command);
factory.initializeAllowList(keeper_dispatcher);
factory.setInitialize(true);
}
@ -463,4 +467,9 @@ String RecoveryCommand::run()
return "ok";
}
String ApiVersionCommand::run()
{
return toString(static_cast<uint8_t>(Coordination::current_keeper_api_version));
}
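// Like the other four-letter-word commands, "apiv" can be queried over the client
// port, e.g. `echo apiv | nc <keeper-host> 9181` (host and port are deployment
// specific); it returns the numeric version, "1" for WITH_FILTERED_LIST.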
}

View File

@ -315,4 +315,16 @@ struct RecoveryCommand : public IFourLetterCommand
String run() override;
~RecoveryCommand() override = default;
};
struct ApiVersionCommand : public IFourLetterCommand
{
explicit ApiVersionCommand(KeeperDispatcher & keeper_dispatcher_)
: IFourLetterCommand(keeper_dispatcher_)
{
}
String name() override { return "apiv"; }
String run() override;
~ApiVersionCommand() override = default;
};
}

View File

@ -0,0 +1,25 @@
#pragma once
#include <IO/WriteHelpers.h>
namespace DB
{
enum class KeeperApiVersion : uint8_t
{
ZOOKEEPER_COMPATIBLE = 0,
WITH_FILTERED_LIST
};
inline constexpr auto current_keeper_api_version = KeeperApiVersion::WITH_FILTERED_LIST;
const std::string keeper_system_path = "/keeper";
const std::string keeper_api_version_path = keeper_system_path + "/api_version";
using PathWithData = std::pair<std::string_view, std::string>;
const std::vector<PathWithData> child_system_paths_with_data
{
{keeper_api_version_path, toString(static_cast<uint8_t>(current_keeper_api_version))}
};
}
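// Sketch of the intended client-side check (an assumption, not part of this commit):
// read keeper_api_version_path with a plain ZooKeeper "get" and fall back to
// ZOOKEEPER_COMPATIBLE when the node does not exist, i.e. when talking to an older
// server:
//
//     auto response = client.get(DB::keeper_api_version_path); // hypothetical client
//     auto version = response.error == Coordination::Error::ZOK
//         ? static_cast<DB::KeeperApiVersion>(DB::parse<uint8_t>(response.data))
//         : DB::KeeperApiVersion::ZOOKEEPER_COMPATIBLE;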

View File

@ -0,0 +1,22 @@
#pragma once
namespace DB
{
struct KeeperContext
{
enum class Phase : uint8_t
{
INIT,
RUNNING
};
Phase server_state{Phase::INIT};
bool ignore_system_path_on_startup{false};
bool digest_enabled{true};
};
using KeeperContextPtr = std::shared_ptr<KeeperContext>;
}
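// Note (derived from the changes below): a single KeeperContext is created per
// KeeperServer, shared by reference with the state machine, snapshot manager and
// storage, and flipped to Phase::RUNNING once the Raft server has started, so all
// components agree on the digest and system-path policy at any point in time.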

View File

@ -106,20 +106,31 @@ KeeperServer::KeeperServer(
SnapshotsQueue & snapshots_queue_)
: server_id(configuration_and_settings_->server_id)
, coordination_settings(configuration_and_settings_->coordination_settings)
, state_machine(nuraft::cs_new<KeeperStateMachine>(
responses_queue_,
snapshots_queue_,
configuration_and_settings_->snapshot_storage_path,
coordination_settings,
checkAndGetSuperdigest(configuration_and_settings_->super_digest),
config.getBool("keeper_server.digest_enabled", false)))
, state_manager(nuraft::cs_new<KeeperStateManager>(
server_id, "keeper_server", configuration_and_settings_->log_storage_path, configuration_and_settings_->state_file_path, config, coordination_settings))
, log(&Poco::Logger::get("KeeperServer"))
, is_recovering(config.has("keeper_server.force_recovery") && config.getBool("keeper_server.force_recovery"))
, keeper_context{std::make_shared<KeeperContext>()}
{
if (coordination_settings->quorum_reads)
LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower.");
keeper_context->digest_enabled = config.getBool("keeper_server.digest_enabled", false);
keeper_context->ignore_system_path_on_startup = config.getBool("keeper_server.ignore_system_path_on_startup", false);
state_machine = nuraft::cs_new<KeeperStateMachine>(
responses_queue_,
snapshots_queue_,
configuration_and_settings_->snapshot_storage_path,
coordination_settings,
keeper_context,
checkAndGetSuperdigest(configuration_and_settings_->super_digest));
state_manager = nuraft::cs_new<KeeperStateManager>(
server_id,
"keeper_server",
configuration_and_settings_->log_storage_path,
configuration_and_settings_->state_file_path,
config,
coordination_settings);
}
/**
@ -341,6 +352,8 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo
last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config;
launchRaftServer(enable_ipv6);
keeper_context->server_state = KeeperContext::Phase::RUNNING;
}
void KeeperServer::shutdownRaftServer()

View File

@ -9,6 +9,7 @@
#include <libnuraft/raft_server.hxx>
#include <Poco/Util/AbstractConfiguration.h>
#include <Coordination/Keeper4LWInfo.h>
#include <Coordination/KeeperContext.h>
namespace DB
{
@ -61,6 +62,8 @@ private:
std::atomic_bool is_recovering = false;
std::shared_ptr<KeeperContext> keeper_context;
public:
KeeperServer(
const KeeperConfigurationAndSettingsPtr & settings_,

View File

@ -13,6 +13,8 @@
#include <filesystem>
#include <memory>
#include <Common/logger_useful.h>
#include "Coordination/KeeperContext.h"
#include <Coordination/KeeperConstants.h>
namespace DB
{
@ -144,8 +146,34 @@ namespace
}
}
namespace
{
void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out)
enum class PathMatchResult
{
NOT_MATCH,
EXACT,
IS_CHILD
};
PathMatchResult matchPath(const std::string_view path, const std::string_view match_to)
{
using enum PathMatchResult;
auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end());
if (second_it != match_to.end())
return NOT_MATCH;
if (first_it == path.end())
return EXACT;
return *first_it == '/' ? IS_CHILD : NOT_MATCH;
}
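// Worked examples (these follow directly from the std::mismatch logic above):
//   matchPath("/keeper", "/keeper")             == EXACT
//   matchPath("/keeper/api_version", "/keeper") == IS_CHILD   (mismatch lands on '/')
//   matchPath("/keeper2", "/keeper")            == NOT_MATCH  (prefix not on a '/' boundary)
//   matchPath("/kee", "/keeper")                == NOT_MATCH  (match_to not fully consumed)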
}
void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context)
{
writeBinary(static_cast<uint8_t>(snapshot.version), out);
serializeSnapshotMetadata(snapshot.snapshot_meta, out);
@ -153,7 +181,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
if (snapshot.version >= SnapshotVersion::V5)
{
writeBinary(snapshot.zxid, out);
if (snapshot.storage->digest_enabled)
if (keeper_context->digest_enabled)
{
writeBinary(static_cast<uint8_t>(KeeperStorage::CURRENT_DIGEST_VERSION), out);
writeBinary(snapshot.nodes_digest, out);
@ -182,12 +210,21 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
}
/// Serialize data tree
writeBinary(snapshot.snapshot_container_size, out);
writeBinary(snapshot.snapshot_container_size - child_system_paths_with_data.size(), out);
size_t counter = 0;
for (auto it = snapshot.begin; counter < snapshot.snapshot_container_size; ++counter)
{
const auto & path = it->key;
// write only the root system node; its children are skipped (they are recreated on startup) so the digest stays consistent
if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD)
{
++it;
continue;
}
const auto & node = it->value;
/// Benign race condition possible while taking snapshot: NuRaft decides to create a snapshot at some log id
/// and only after some time do we lock the storage and enable snapshot mode. So snapshot_container_size can be
/// slightly bigger than required.
@ -241,7 +278,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
}
}
void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in)
void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context)
{
uint8_t version;
readBinary(version, in);
@ -252,7 +289,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
deserialization_result.snapshot_meta = deserializeSnapshotMetadata(in);
KeeperStorage & storage = *deserialization_result.storage;
bool recalculate_digest = storage.digest_enabled;
bool recalculate_digest = keeper_context->digest_enabled;
if (version >= SnapshotVersion::V5)
{
readBinary(storage.zxid, in);
@ -316,19 +353,55 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
if (recalculate_digest)
storage.nodes_digest = 0;
size_t current_size = 0;
while (current_size < snapshot_container_size)
const auto is_node_empty = [](const auto & node)
{
return node.getData().empty() && node.stat == Coordination::Stat{};
};
for (size_t nodes_read = 0; nodes_read < snapshot_container_size; ++nodes_read)
{
std::string path;
readBinary(path, in);
KeeperStorage::Node node{};
readNode(node, in, current_version, storage.acl_map);
using enum PathMatchResult;
auto match_result = matchPath(path, keeper_system_path);
const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path);
if (match_result == IS_CHILD)
{
if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT)
{
LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg);
continue;
}
else
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"{}. Ignoring it can lead to data loss. "
"If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true",
error_msg);
}
else if (match_result == EXACT && !is_node_empty(node))
{
if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT)
{
LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg);
node = KeeperStorage::Node{};
}
else
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"{}. Ignoring it can lead to data loss. "
"If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true",
error_msg);
}
storage.container.insertOrReplace(path, node);
if (node.stat.ephemeralOwner != 0)
storage.ephemerals[node.stat.ephemeralOwner].insert(path);
current_size++;
if (recalculate_digest)
storage.nodes_digest += node.getDigest(path);
}
@ -451,16 +524,16 @@ KeeperStorageSnapshot::~KeeperStorageSnapshot()
KeeperSnapshotManager::KeeperSnapshotManager(
const std::string & snapshots_path_,
size_t snapshots_to_keep_,
const KeeperContextPtr & keeper_context_,
bool compress_snapshots_zstd_,
const std::string & superdigest_,
size_t storage_tick_time_,
const bool digest_enabled_)
size_t storage_tick_time_)
: snapshots_path(snapshots_path_)
, snapshots_to_keep(snapshots_to_keep_)
, compress_snapshots_zstd(compress_snapshots_zstd_)
, superdigest(superdigest_)
, storage_tick_time(storage_tick_time_)
, digest_enabled(digest_enabled_)
, keeper_context(keeper_context_)
{
namespace fs = std::filesystem;
@ -554,7 +627,7 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::serializeSnapshotToBuffer(con
else
compressed_writer = std::make_unique<CompressedWriteBuffer>(*writer);
KeeperStorageSnapshot::serialize(snapshot, *compressed_writer);
KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context);
compressed_writer->finalize();
return buffer_raw_ptr->getBuffer();
}
@ -583,8 +656,9 @@ SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuff
compressed_reader = std::make_unique<CompressedReadBuffer>(*reader);
SnapshotDeserializationResult result;
result.storage = std::make_unique<KeeperStorage>(storage_tick_time, superdigest, digest_enabled);
KeeperStorageSnapshot::deserialize(result, *compressed_reader);
result.storage = std::make_unique<KeeperStorage>(storage_tick_time, superdigest, keeper_context, /* initialize_system_nodes */ false);
KeeperStorageSnapshot::deserialize(result, *compressed_reader, keeper_context);
result.storage->initializeSystemNodes();
return result;
}
@ -629,7 +703,7 @@ std::pair<std::string, std::error_code> KeeperSnapshotManager::serializeSnapshot
else
compressed_writer = std::make_unique<CompressedWriteBuffer>(*writer);
KeeperStorageSnapshot::serialize(snapshot, *compressed_writer);
KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context);
compressed_writer->finalize();
compressed_writer->sync();

View File

@ -5,6 +5,7 @@
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
#include <libnuraft/nuraft.hxx>
#include <Coordination/KeeperContext.h>
namespace DB
{
@ -55,9 +56,9 @@ public:
~KeeperStorageSnapshot();
static void serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out);
static void serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context);
static void deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in);
static void deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context);
KeeperStorage * storage;
@ -99,10 +100,10 @@ public:
KeeperSnapshotManager(
const std::string & snapshots_path_,
size_t snapshots_to_keep_,
const KeeperContextPtr & keeper_context_,
bool compress_snapshots_zstd_ = true,
const std::string & superdigest_ = "",
size_t storage_tick_time_ = 500,
bool digest_enabled_ = true);
size_t storage_tick_time_ = 500);
/// Restore storage from latest available snapshot
SnapshotDeserializationResult restoreFromLatestSnapshot();
@ -168,7 +169,8 @@ private:
const std::string superdigest;
/// Storage sessions timeout check interval (also used for deserialization)
size_t storage_tick_time;
const bool digest_enabled;
KeeperContextPtr keeper_context;
};
/// Keeper create snapshots in background thread. KeeperStateMachine just create

View File

@ -41,22 +41,22 @@ KeeperStateMachine::KeeperStateMachine(
SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_,
const CoordinationSettingsPtr & coordination_settings_,
const std::string & superdigest_,
const bool digest_enabled_)
const KeeperContextPtr & keeper_context_,
const std::string & superdigest_)
: coordination_settings(coordination_settings_)
, snapshot_manager(
snapshots_path_,
coordination_settings->snapshots_to_keep,
keeper_context_,
coordination_settings->compress_snapshots_with_zstd_format,
superdigest_,
coordination_settings->dead_session_check_period_ms.totalMilliseconds(),
digest_enabled_)
coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, last_committed_idx(0)
, log(&Poco::Logger::get("KeeperStateMachine"))
, superdigest(superdigest_)
, digest_enabled(digest_enabled_)
, keeper_context(keeper_context_)
{
}
@ -109,7 +109,7 @@ void KeeperStateMachine::init()
if (!storage)
storage = std::make_unique<KeeperStorage>(
coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, digest_enabled);
coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context);
}
namespace
@ -204,7 +204,7 @@ void KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
true /* check_acl */,
request_for_session.digest);
if (digest_enabled && request_for_session.digest)
if (keeper_context->digest_enabled && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, false);
}
@ -253,10 +253,8 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
response_for_session.session_id);
}
if (digest_enabled && request_for_session.digest)
{
if (keeper_context->digest_enabled && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true);
}
}
ProfileEvents::increment(ProfileEvents::KeeperCommits);

View File

@ -6,6 +6,7 @@
#include <libnuraft/nuraft.hxx>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/logger_useful.h>
#include <Coordination/KeeperContext.h>
namespace DB
@ -24,8 +25,8 @@ public:
SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_,
const CoordinationSettingsPtr & coordination_settings_,
const std::string & superdigest_ = "",
bool digest_enabled_ = true);
const KeeperContextPtr & keeper_context_,
const std::string & superdigest_ = "");
/// Read state from the latest snapshot
void init();
@ -140,7 +141,7 @@ private:
/// Special part of ACL system -- superdigest specified in server config.
const std::string superdigest;
const bool digest_enabled;
KeeperContextPtr keeper_context;
};
}

View File

@ -15,6 +15,7 @@
#include <Common/logger_useful.h>
#include <Common/setThreadName.h>
#include <Coordination/pathUtils.h>
#include <Coordination/KeeperConstants.h>
#include <sstream>
#include <iomanip>
#include <mutex>
@ -226,12 +227,67 @@ void KeeperStorage::Node::shallowCopy(const KeeperStorage::Node & other)
cached_digest = other.cached_digest;
}
KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const bool digest_enabled_)
: session_expiry_queue(tick_time_ms), digest_enabled(digest_enabled_), superdigest(superdigest_)
KeeperStorage::KeeperStorage(
int64_t tick_time_ms, const String & superdigest_, const KeeperContextPtr & keeper_context_, const bool initialize_system_nodes)
: session_expiry_queue(tick_time_ms), keeper_context(keeper_context_), superdigest(superdigest_)
{
Node root_node;
container.insert("/", root_node);
nodes_digest += root_node.getDigest("/");
addDigest(root_node, "/");
if (initialize_system_nodes)
initializeSystemNodes();
}
void KeeperStorage::initializeSystemNodes()
{
if (initialized)
throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes initialized twice");
// insert root system path if it isn't already inserted
if (container.find(keeper_system_path) == container.end())
{
Node system_node;
container.insert(keeper_system_path, system_node);
// store digest for the empty node because we won't update
// its stats
addDigest(system_node, keeper_system_path);
// update root and the digest based on it
auto current_root_it = container.find("/");
assert(current_root_it != container.end());
removeDigest(current_root_it->value, "/");
auto updated_root_it = container.updateValue(
"/",
[](auto & node)
{
++node.stat.numChildren;
node.addChild(getBaseName(keeper_system_path));
}
);
addDigest(updated_root_it->value, "/");
}
// insert child system nodes
for (const auto & [path, data] : child_system_paths_with_data)
{
assert(keeper_api_version_path.starts_with(keeper_system_path));
Node child_system_node;
child_system_node.setData(data);
auto [map_key, _] = container.insert(std::string{path}, child_system_node);
/// Take child path from key owned by map.
auto child_path = getBaseName(map_key->getKey());
container.updateValue(
parentPath(StringRef(path)),
[child_path](auto & parent)
{
// don't update stats so digest is okay
parent.addChild(child_path);
}
);
}
initialized = true;
}
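// Resulting tree after initialization (illustration, derived from the code above):
//
//   /                     root; numChildren now counts "keeper"
//   /keeper               empty system node, included in the digest
//   /keeper/api_version   data = toString(current_keeper_api_version), e.g. "1"
//
// Child system nodes are inserted without touching parent stats or the digest,
// matching the snapshot serialization code, which skips them.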
template <class... Ts>
@ -610,7 +666,7 @@ struct KeeperStorageRequestProcessor
explicit KeeperStorageRequestProcessor(const Coordination::ZooKeeperRequestPtr & zk_request_) : zk_request(zk_request_) { }
virtual Coordination::ZooKeeperResponsePtr process(KeeperStorage & storage, int64_t zxid) const = 0;
virtual std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & /*storage*/, int64_t /*zxid*/, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const
preprocess(KeeperStorage & /*storage*/, int64_t /*zxid*/, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const
{
return {};
}
@ -658,21 +714,34 @@ struct KeeperStorageSyncRequestProcessor final : public KeeperStorageRequestProc
namespace
{
Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_local)
Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_local)
{
if (is_local)
{
if (is_local)
{
auto node_it = storage.container.find(path);
if (node_it == storage.container.end())
return {};
auto node_it = storage.container.find(path);
if (node_it == storage.container.end())
return {};
return storage.acl_map.convertNumber(node_it->value.acl_id);
}
return storage.uncommitted_state.getACLs(path);
return storage.acl_map.convertNumber(node_it->value.acl_id);
}
return storage.uncommitted_state.getACLs(path);
}
void handleSystemNodeModification(const KeeperContext & keeper_context, std::string_view error_msg)
{
if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"{}. Ignoring it can lead to data loss. "
"If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.",
error_msg);
LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg));
}
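// Net effect for the request processors below (illustration): while replaying logs or
// snapshots at startup (Phase::INIT), a modification under the system path aborts the
// server unless keeper_server.ignore_system_path_on_startup is set; once RUNNING, the
// error is only logged and the caller answers the client with ZBADARGUMENTS.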
}
bool KeeperStorage::checkACL(StringRef path, int32_t permission, int64_t session_id, bool is_local)
{
const auto node_acls = getNodeACLs(*this, path, is_local);
@ -726,7 +795,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
}
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override
{
Coordination::ZooKeeperCreateRequest & request = dynamic_cast<Coordination::ZooKeeperCreateRequest &>(*zk_request);
@ -752,6 +821,14 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
path_created += seq_num_str.str();
}
if (path_created.starts_with(keeper_system_path))
{
auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created);
handleSystemNodeModification(keeper_context, error_msg);
return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
}
if (storage.uncommitted_state.getNode(path_created))
return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}};
@ -839,10 +916,13 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
{
Coordination::ZooKeeperGetRequest & request = dynamic_cast<Coordination::ZooKeeperGetRequest &>(*zk_request);
if (request.path == Coordination::keeper_api_version_path)
return {};
if (!storage.uncommitted_state.getNode(request.path))
return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}};
@ -905,12 +985,20 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & digest) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & digest, const KeeperContext & keeper_context) const override
{
Coordination::ZooKeeperRemoveRequest & request = dynamic_cast<Coordination::ZooKeeperRemoveRequest &>(*zk_request);
std::vector<KeeperStorage::Delta> new_deltas;
if (request.path.starts_with(keeper_system_path))
{
auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path);
handleSystemNodeModification(keeper_context, error_msg);
return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
}
const auto update_parent_pzxid = [&]()
{
auto parent_path = parentPath(request.path);
@ -987,7 +1075,7 @@ struct KeeperStorageExistsRequestProcessor final : public KeeperStorageRequestPr
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
{
Coordination::ZooKeeperExistsRequest & request = dynamic_cast<Coordination::ZooKeeperExistsRequest &>(*zk_request);
@ -1051,12 +1139,20 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t time, uint64_t & digest) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override
{
Coordination::ZooKeeperSetRequest & request = dynamic_cast<Coordination::ZooKeeperSetRequest &>(*zk_request);
std::vector<KeeperStorage::Delta> new_deltas;
if (request.path.starts_with(keeper_system_path))
{
auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);
handleSystemNodeModification(keeper_context, error_msg);
return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
}
if (!storage.uncommitted_state.getNode(request.path))
return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}};
@ -1135,7 +1231,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
{
Coordination::ZooKeeperListRequest & request = dynamic_cast<Coordination::ZooKeeperListRequest &>(*zk_request);
@ -1187,7 +1283,9 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc
auto list_request_type = ALL;
if (auto * filtered_list = dynamic_cast<Coordination::ZooKeeperFilteredListRequest *>(&request))
{
list_request_type = filtered_list->list_request_type;
}
if (list_request_type == ALL)
return true;
@ -1234,7 +1332,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
{
Coordination::ZooKeeperCheckRequest & request = dynamic_cast<Coordination::ZooKeeperCheckRequest &>(*zk_request);
@ -1312,10 +1410,18 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & digest) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & digest, const KeeperContext & keeper_context) const override
{
Coordination::ZooKeeperSetACLRequest & request = dynamic_cast<Coordination::ZooKeeperSetACLRequest &>(*zk_request);
if (request.path.starts_with(keeper_system_path))
{
auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);
handleSystemNodeModification(keeper_context, error_msg);
return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
}
auto & uncommitted_state = storage.uncommitted_state;
if (!uncommitted_state.getNode(request.path))
return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}};
@ -1386,7 +1492,7 @@ struct KeeperStorageGetACLRequestProcessor final : public KeeperStorageRequestPr
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
{
Coordination::ZooKeeperGetACLRequest & request = dynamic_cast<Coordination::ZooKeeperGetACLRequest &>(*zk_request);
@ -1483,14 +1589,14 @@ struct KeeperStorageMultiRequestProcessor final : public KeeperStorageRequestPro
}
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override
{
std::vector<Coordination::Error> response_errors;
response_errors.reserve(concrete_requests.size());
uint64_t current_digest = digest;
for (size_t i = 0; i < concrete_requests.size(); ++i)
{
auto new_deltas = concrete_requests[i]->preprocess(storage, zxid, session_id, time, current_digest);
auto new_deltas = concrete_requests[i]->preprocess(storage, zxid, session_id, time, current_digest, keeper_context);
if (!new_deltas.empty())
{
@ -1609,7 +1715,7 @@ struct KeeperStorageAuthRequestProcessor final : public KeeperStorageRequestProc
{
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
std::vector<KeeperStorage::Delta>
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & /*digest*/) const override
preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
{
Coordination::ZooKeeperAuthRequest & auth_request = dynamic_cast<Coordination::ZooKeeperAuthRequest &>(*zk_request);
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
@ -1725,7 +1831,7 @@ KeeperStorageRequestProcessorsFactory::KeeperStorageRequestProcessorsFactory()
UInt64 KeeperStorage::calculateNodesDigest(UInt64 current_digest, const std::vector<Delta> & new_deltas) const
{
if (!digest_enabled)
if (!keeper_context->digest_enabled)
return current_digest;
std::unordered_map<std::string, std::shared_ptr<Node>> updated_nodes;
@ -1792,6 +1898,9 @@ void KeeperStorage::preprocessRequest(
bool check_acl,
std::optional<Digest> digest)
{
if (!initialized)
throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes are not initialized");
int64_t last_zxid = getNextZXID() - 1;
if (uncommitted_transactions.empty())
@ -1816,7 +1925,7 @@ void KeeperStorage::preprocessRequest(
TransactionInfo transaction{.zxid = new_last_zxid};
uint64_t new_digest = getNodesDigest(false).value;
SCOPE_EXIT({
if (digest_enabled)
if (keeper_context->digest_enabled)
// if the version of the digest we got from the leader is the same as the one this instance has, we can simply copy the value
// and just check the digest on the commit
// a mistake can happen while applying the changes to the uncommitted_state, so for now let's just recalculate the digest here as well
@ -1867,7 +1976,7 @@ void KeeperStorage::preprocessRequest(
return;
}
new_deltas = request_processor->preprocess(*this, transaction.zxid, session_id, time, new_digest);
new_deltas = request_processor->preprocess(*this, transaction.zxid, session_id, time, new_digest, *keeper_context);
}
KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(
@ -1877,6 +1986,9 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(
bool check_acl,
bool is_local)
{
if (!initialized)
throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes are not initialized");
if (new_last_zxid)
{
if (uncommitted_transactions.empty())
@ -1968,8 +2080,10 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(
{
if (response->error == Coordination::Error::ZOK)
{
auto & watches_type
= zk_request->getOpNum() == Coordination::OpNum::List || zk_request->getOpNum() == Coordination::OpNum::SimpleList
static constexpr std::array list_requests{
Coordination::OpNum::List, Coordination::OpNum::SimpleList, Coordination::OpNum::FilteredList};
auto & watches_type = std::find(list_requests.begin(), list_requests.end(), zk_request->getOpNum()) != list_requests.end()
? list_watches
: watches;
@ -2011,7 +2125,7 @@ void KeeperStorage::rollbackRequest(int64_t rollback_zxid)
KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const
{
if (!digest_enabled)
if (!keeper_context->digest_enabled)
return {.version = DigestVersion::NO_DIGEST};
if (committed || uncommitted_transactions.empty())
@ -2022,13 +2136,13 @@ KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const
void KeeperStorage::removeDigest(const Node & node, const std::string_view path)
{
if (digest_enabled)
if (keeper_context->digest_enabled)
nodes_digest -= node.getDigest(path);
}
void KeeperStorage::addDigest(const Node & node, const std::string_view path)
{
if (digest_enabled)
if (keeper_context->digest_enabled)
{
node.invalidateDigestCache();
nodes_digest += node.getDigest(path);

View File

@ -9,6 +9,7 @@
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Coordination/KeeperContext.h>
#include <absl/container/flat_hash_set.h>
@ -336,11 +337,15 @@ public:
Digest getNodesDigest(bool committed) const;
const bool digest_enabled;
KeeperContextPtr keeper_context;
const String superdigest;
KeeperStorage(int64_t tick_time_ms, const String & superdigest_, bool digest_enabled_);
bool initialized{false};
KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const KeeperContextPtr & keeper_context_, bool initialize_system_nodes = true);
void initializeSystemNodes();
/// Allocate new session id with the specified timeouts
int64_t getSessionID(int64_t session_timeout_ms)

View File

@ -2,6 +2,7 @@
#include <gtest/gtest.h>
#include "Common/ZooKeeper/IKeeper.h"
#include "Coordination/KeeperContext.h"
#include "Coordination/KeeperStorage.h"
#include "Core/Defines.h"
#include "IO/WriteHelpers.h"
@ -63,7 +64,10 @@ struct CompressionParam
};
class CoordinationTest : public ::testing::TestWithParam<CompressionParam>
{};
{
protected:
DB::KeeperContextPtr keeper_context = std::make_shared<DB::KeeperContext>();
};
TEST_P(CoordinationTest, BuildTest)
{
@ -1083,9 +1087,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple)
{
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
DB::KeeperStorage storage(500, "", true);
DB::KeeperStorage storage(500, "", keeper_context);
addNode(storage, "/hello", "world", 1);
addNode(storage, "/hello/somepath", "somedata", 3);
storage.session_id_counter = 5;
@ -1099,7 +1103,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple)
EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 2);
EXPECT_EQ(snapshot.session_id, 7);
EXPECT_EQ(snapshot.snapshot_container_size, 3);
EXPECT_EQ(snapshot.snapshot_container_size, 5);
EXPECT_EQ(snapshot.session_and_timeout.size(), 2);
auto buf = manager.serializeSnapshotToBuffer(snapshot);
@ -1111,8 +1115,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple)
auto [restored_storage, snapshot_meta, _] = manager.deserializeSnapshotFromBuffer(debuf);
EXPECT_EQ(restored_storage->container.size(), 3);
EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1);
EXPECT_EQ(restored_storage->container.size(), 5);
EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2);
EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1);
EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0);
@ -1131,9 +1135,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites)
{
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
DB::KeeperStorage storage(500, "", true);
DB::KeeperStorage storage(500, "", keeper_context);
storage.getSessionID(130);
for (size_t i = 0; i < 50; ++i)
@ -1143,14 +1147,14 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites)
DB::KeeperStorageSnapshot snapshot(&storage, 50);
EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 50);
EXPECT_EQ(snapshot.snapshot_container_size, 51);
EXPECT_EQ(snapshot.snapshot_container_size, 53);
for (size_t i = 50; i < 100; ++i)
{
addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
}
EXPECT_EQ(storage.container.size(), 101);
EXPECT_EQ(storage.container.size(), 103);
auto buf = manager.serializeSnapshotToBuffer(snapshot);
manager.serializeSnapshotBufferToDisk(*buf, 50);
@ -1160,7 +1164,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites)
auto debuf = manager.deserializeSnapshotBufferFromDisk(50);
auto [restored_storage, meta, _] = manager.deserializeSnapshotFromBuffer(debuf);
EXPECT_EQ(restored_storage->container.size(), 51);
EXPECT_EQ(restored_storage->container.size(), 53);
for (size_t i = 0; i < 50; ++i)
{
EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i));
@ -1172,9 +1176,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots)
{
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
DB::KeeperStorage storage(500, "", true);
DB::KeeperStorage storage(500, "", keeper_context);
storage.getSessionID(130);
for (size_t j = 1; j <= 5; ++j)
@ -1199,7 +1203,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots)
auto [restored_storage, meta, _] = manager.restoreFromLatestSnapshot();
EXPECT_EQ(restored_storage->container.size(), 251);
EXPECT_EQ(restored_storage->container.size(), 253);
for (size_t i = 0; i < 250; ++i)
{
@ -1211,8 +1215,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode)
{
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "", true);
DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
DB::KeeperStorage storage(500, "", keeper_context);
for (size_t i = 0; i < 50; ++i)
{
addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
@ -1233,16 +1237,16 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode)
if (i % 2 == 0)
storage.container.erase("/hello_" + std::to_string(i));
}
EXPECT_EQ(storage.container.size(), 26);
EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 102);
EXPECT_EQ(storage.container.size(), 28);
EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 104);
EXPECT_EQ(storage.container.snapshotSizeWithVersion().second, 1);
auto buf = manager.serializeSnapshotToBuffer(snapshot);
manager.serializeSnapshotBufferToDisk(*buf, 50);
}
EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin" + params.extension));
EXPECT_EQ(storage.container.size(), 26);
EXPECT_EQ(storage.container.size(), 28);
storage.clearGarbageAfterSnapshot();
EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 26);
EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 28);
for (size_t i = 0; i < 50; ++i)
{
if (i % 2 != 0)
@ -1264,8 +1268,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotBroken)
{
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "", true);
DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
DB::KeeperStorage storage(500, "", keeper_context);
for (size_t i = 0; i < 50; ++i)
{
addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
@ -1304,7 +1308,7 @@ nuraft::ptr<nuraft::log_entry> getLogEntryFromZKRequest(size_t term, int64_t ses
return nuraft::cs_new<nuraft::log_entry>(term, buffer);
}
void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression)
void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression, Coordination::KeeperContextPtr keeper_context)
{
using namespace Coordination;
using namespace DB;
@ -1314,7 +1318,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
ResponsesQueue queue(std::numeric_limits<size_t>::max());
SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings);
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings, keeper_context);
state_machine->init();
DB::KeeperLogStore changelog("./logs", settings->rotate_log_storage_interval, true, enable_compression);
changelog.init(state_machine->last_commit_index() + 1, settings->reserved_log_items);
@ -1355,7 +1359,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
}
SnapshotsQueue snapshots_queue1{1};
auto restore_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue1, "./snapshots", settings);
auto restore_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue1, "./snapshots", settings, keeper_context);
restore_machine->init();
EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance);
@ -1397,63 +1401,63 @@ TEST_P(CoordinationTest, TestStateMachineAndLogStore)
settings->snapshot_distance = 10;
settings->reserved_log_items = 10;
settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 37, params.enable_compression);
testLogAndStateMachine(settings, 37, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10;
settings->reserved_log_items = 10;
settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 11, params.enable_compression);
testLogAndStateMachine(settings, 11, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10;
settings->reserved_log_items = 10;
settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 40, params.enable_compression);
testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10;
settings->reserved_log_items = 20;
settings->rotate_log_storage_interval = 30;
testLogAndStateMachine(settings, 40, params.enable_compression);
testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10;
settings->reserved_log_items = 0;
settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 40, params.enable_compression);
testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 1;
settings->reserved_log_items = 1;
settings->rotate_log_storage_interval = 32;
testLogAndStateMachine(settings, 32, params.enable_compression);
testLogAndStateMachine(settings, 32, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10;
settings->reserved_log_items = 7;
settings->rotate_log_storage_interval = 1;
testLogAndStateMachine(settings, 33, params.enable_compression);
testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 37;
settings->reserved_log_items = 1000;
settings->rotate_log_storage_interval = 5000;
testLogAndStateMachine(settings, 33, params.enable_compression);
testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context);
}
{
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 37;
settings->reserved_log_items = 1000;
settings->rotate_log_storage_interval = 5000;
testLogAndStateMachine(settings, 45, params.enable_compression);
testLogAndStateMachine(settings, 45, params.enable_compression, keeper_context);
}
}
@ -1467,7 +1471,7 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
ResponsesQueue queue(std::numeric_limits<size_t>::max());
SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings);
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings, keeper_context);
state_machine->init();
std::shared_ptr<ZooKeeperCreateRequest> request_c = std::make_shared<ZooKeeperCreateRequest>();
@ -1634,9 +1638,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions)
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
DB::KeeperStorage storage(500, "", true);
DB::KeeperStorage storage(500, "", keeper_context);
addNode(storage, "/hello", "world", 1);
addNode(storage, "/hello/somepath", "somedata", 3);
storage.session_id_counter = 5;
@ -1652,14 +1656,14 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions)
manager.serializeSnapshotBufferToDisk(*buf, 2);
EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin" + params.extension));
DB::KeeperSnapshotManager new_manager("./snapshots", 3, !params.enable_compression);
DB::KeeperSnapshotManager new_manager("./snapshots", 3, keeper_context, !params.enable_compression);
auto debuf = new_manager.deserializeSnapshotBufferFromDisk(2);
auto [restored_storage, snapshot_meta, _] = new_manager.deserializeSnapshotFromBuffer(debuf);
EXPECT_EQ(restored_storage->container.size(), 3);
EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1);
EXPECT_EQ(restored_storage->container.size(), 5);
EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2);
EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1);
EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0);
@ -1786,9 +1790,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotEqual)
std::optional<UInt128> snapshot_hash;
for (size_t i = 0; i < 15; ++i)
{
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
DB::KeeperStorage storage(500, "", true);
DB::KeeperStorage storage(500, "", keeper_context);
addNode(storage, "/hello", "");
for (size_t j = 0; j < 5000; ++j)
{
@ -1859,7 +1863,7 @@ TEST_P(CoordinationTest, TestUncommittedStateBasicCrud)
using namespace DB;
using namespace Coordination;
DB::KeeperStorage storage{500, "", true};
DB::KeeperStorage storage{500, "", keeper_context};
constexpr std::string_view path = "/test";
@ -1976,32 +1980,35 @@ TEST_P(CoordinationTest, TestListRequestTypes)
using namespace DB;
using namespace Coordination;
KeeperStorage storage{500, "", true};
KeeperStorage storage{500, "", keeper_context};
int64_t zxid = 0;
static constexpr std::string_view path = "/test";
static constexpr std::string_view test_path = "/list_request_type/node";
const auto create_path = [&](bool is_ephemeral)
const auto create_path = [&](const auto & path, bool is_ephemeral, bool is_sequential = true)
{
const auto create_request = std::make_shared<ZooKeeperCreateRequest>();
int new_zxid = ++zxid;
create_request->path = path;
create_request->is_sequential = true;
create_request->is_sequential = is_sequential;
create_request->is_ephemeral = is_ephemeral;
storage.preprocessRequest(create_request, 1, 0, new_zxid);
auto responses = storage.processRequest(create_request, 1, new_zxid);
EXPECT_GE(responses.size(), 1);
EXPECT_EQ(responses[0].response->error, Coordination::Error::ZOK) << "Failed to create " << path;
const auto & create_response = dynamic_cast<ZooKeeperCreateResponse &>(*responses[0].response);
return create_response.path_created;
};
create_path(parentPath(StringRef{test_path}).toString(), false, false);
static constexpr size_t persistent_num = 5;
std::unordered_set<std::string> expected_persistent_children;
for (size_t i = 0; i < persistent_num; ++i)
{
expected_persistent_children.insert(getBaseName(create_path(false)).toString());
expected_persistent_children.insert(getBaseName(create_path(test_path, false)).toString());
}
ASSERT_EQ(expected_persistent_children.size(), persistent_num);
@ -2009,7 +2016,7 @@ TEST_P(CoordinationTest, TestListRequestTypes)
std::unordered_set<std::string> expected_ephemeral_children;
for (size_t i = 0; i < ephemeral_num; ++i)
{
expected_ephemeral_children.insert(getBaseName(create_path(true)).toString());
expected_ephemeral_children.insert(getBaseName(create_path(test_path, true)).toString());
}
ASSERT_EQ(expected_ephemeral_children.size(), ephemeral_num);
@ -2017,7 +2024,7 @@ TEST_P(CoordinationTest, TestListRequestTypes)
{
const auto list_request = std::make_shared<ZooKeeperFilteredListRequest>();
int new_zxid = ++zxid;
list_request->path = parentPath(StringRef{path}).toString();
list_request->path = parentPath(StringRef{test_path}).toString();
list_request->list_request_type = list_request_type;
storage.preprocessRequest(list_request, 1, 0, new_zxid);
auto responses = storage.processRequest(list_request, 1, new_zxid);
@ -2120,6 +2127,20 @@ TEST_P(CoordinationTest, TestDurableState)
}
}
TEST_P(CoordinationTest, TestCurrentApiVersion)
{
using namespace Coordination;
KeeperStorage storage{500, "", keeper_context};
auto request = std::make_shared<ZooKeeperGetRequest>();
request->path = DB::keeper_api_version_path;
auto responses = storage.processRequest(request, 0, std::nullopt, true, true);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
uint8_t keeper_version{0};
DB::ReadBufferFromOwnString buf(get_response.data);
DB::readIntText(keeper_version, buf);
EXPECT_EQ(keeper_version, static_cast<uint8_t>(current_keeper_api_version));
}
INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
CoordinationTest,
::testing::ValuesIn(std::initializer_list<CompressionParam>{

79 src/Core/Joins.cpp Normal file
View File

@ -0,0 +1,79 @@
#include <Core/Joins.h>
namespace DB
{
const char * toString(JoinKind kind)
{
switch (kind)
{
case JoinKind::Inner: return "INNER";
case JoinKind::Left: return "LEFT";
case JoinKind::Right: return "RIGHT";
case JoinKind::Full: return "FULL";
case JoinKind::Cross: return "CROSS";
case JoinKind::Comma: return "COMMA";
}
}
const char * toString(JoinStrictness strictness)
{
switch (strictness)
{
case JoinStrictness::Unspecified: return "UNSPECIFIED";
case JoinStrictness::RightAny: return "RIGHT_ANY";
case JoinStrictness::Any: return "ANY";
case JoinStrictness::All: return "ALL";
case JoinStrictness::Asof: return "ASOF";
case JoinStrictness::Semi: return "SEMI";
case JoinStrictness::Anti: return "ANTI";
}
}
const char * toString(JoinLocality locality)
{
switch (locality)
{
case JoinLocality::Unspecified: return "UNSPECIFIED";
case JoinLocality::Local: return "LOCAL";
case JoinLocality::Global: return "GLOBAL";
}
}
const char * toString(ASOFJoinInequality asof_join_inequality)
{
switch (asof_join_inequality)
{
case ASOFJoinInequality::None: return "NONE";
case ASOFJoinInequality::Less: return "LESS";
case ASOFJoinInequality::Greater: return "GREATER";
case ASOFJoinInequality::LessOrEquals: return "LESS_OR_EQUALS";
case ASOFJoinInequality::GreaterOrEquals: return "GREATER_OR_EQUALS";
}
}
const char * toString(JoinAlgorithm join_algorithm)
{
switch (join_algorithm)
{
case JoinAlgorithm::DEFAULT: return "DEFAULT";
case JoinAlgorithm::AUTO: return "AUTO";
case JoinAlgorithm::HASH: return "HASH";
case JoinAlgorithm::PARTIAL_MERGE: return "PARTIAL_MERGE";
case JoinAlgorithm::PREFER_PARTIAL_MERGE: return "PREFER_PARTIAL_MERGE";
case JoinAlgorithm::PARALLEL_HASH: return "PARALLEL_HASH";
case JoinAlgorithm::DIRECT: return "DIRECT";
case JoinAlgorithm::FULL_SORTING_MERGE: return "FULL_SORTING_MERGE";
}
}
const char * toString(JoinTableSide join_table_side)
{
switch (join_table_side)
{
case JoinTableSide::Left: return "LEFT";
case JoinTableSide::Right: return "RIGHT";
}
}
}

119 src/Core/Joins.h Normal file
View File

@ -0,0 +1,119 @@
#pragma once
#include <string_view>
namespace DB
{
/// Join method.
enum class JoinKind
{
Inner, /// Leave only rows that were JOINed.
Left, /// If the "right" table has no corresponding row, use default values instead.
Right,
Full,
Cross, /// Direct product. Strictness and condition don't matter.
Comma /// Same as direct product. Intended to be converted to INNER JOIN with conditions from WHERE.
};
const char * toString(JoinKind kind);
inline constexpr bool isLeft(JoinKind kind) { return kind == JoinKind::Left; }
inline constexpr bool isRight(JoinKind kind) { return kind == JoinKind::Right; }
inline constexpr bool isInner(JoinKind kind) { return kind == JoinKind::Inner; }
inline constexpr bool isFull(JoinKind kind) { return kind == JoinKind::Full; }
inline constexpr bool isCrossOrComma(JoinKind kind) { return kind == JoinKind::Comma || kind == JoinKind::Cross; }
inline constexpr bool isRightOrFull(JoinKind kind) { return kind == JoinKind::Right || kind == JoinKind::Full; }
inline constexpr bool isLeftOrFull(JoinKind kind) { return kind == JoinKind::Left || kind == JoinKind::Full; }
inline constexpr bool isInnerOrRight(JoinKind kind) { return kind == JoinKind::Inner || kind == JoinKind::Right; }
inline constexpr bool isInnerOrLeft(JoinKind kind) { return kind == JoinKind::Inner || kind == JoinKind::Left; }
/// Allows more optimal JOIN for typical cases.
enum class JoinStrictness
{
Unspecified,
RightAny, /// Old ANY JOIN. If there are many suitable rows in the right table, use any one of them for the join.
Any, /// Semi join with any value from the filtering table. For LEFT JOIN, Any and RightAny are the same.
All, /// If there are many suitable rows to join, use all of them and replicate rows of the "left" table (the usual JOIN semantics).
Asof, /// For the last JOIN column, pick the latest value.
Semi, /// LEFT or RIGHT. SEMI LEFT JOIN keeps rows of the left table whose keys exist in the right table; SEMI RIGHT is the reverse.
Anti, /// LEFT or RIGHT. Same as SEMI JOIN, but keeps rows whose keys do NOT exist in the other table.
};
const char * toString(JoinStrictness strictness);
/// Algorithm for distributed query processing.
enum class JoinLocality
{
Unspecified,
Local, /// Perform JOIN using only data available on the same servers (co-located data).
Global /// Collect and merge data from remote servers, and broadcast it to each server.
};
const char * toString(JoinLocality locality);
/// ASOF JOIN inequality type
enum class ASOFJoinInequality
{
None,
Less,
Greater,
LessOrEquals,
GreaterOrEquals,
};
const char * toString(ASOFJoinInequality asof_join_inequality);
inline constexpr ASOFJoinInequality getASOFJoinInequality(std::string_view func_name)
{
ASOFJoinInequality inequality = ASOFJoinInequality::None;
if (func_name == "less")
inequality = ASOFJoinInequality::Less;
else if (func_name == "greater")
inequality = ASOFJoinInequality::Greater;
else if (func_name == "lessOrEquals")
inequality = ASOFJoinInequality::LessOrEquals;
else if (func_name == "greaterOrEquals")
inequality = ASOFJoinInequality::GreaterOrEquals;
return inequality;
}
inline constexpr ASOFJoinInequality reverseASOFJoinInequality(ASOFJoinInequality inequality)
{
if (inequality == ASOFJoinInequality::Less)
return ASOFJoinInequality::Greater;
else if (inequality == ASOFJoinInequality::Greater)
return ASOFJoinInequality::Less;
else if (inequality == ASOFJoinInequality::LessOrEquals)
return ASOFJoinInequality::GreaterOrEquals;
else if (inequality == ASOFJoinInequality::GreaterOrEquals)
return ASOFJoinInequality::LessOrEquals;
return ASOFJoinInequality::None;
}
enum class JoinAlgorithm
{
DEFAULT = 0,
AUTO,
HASH,
PARTIAL_MERGE,
PREFER_PARTIAL_MERGE,
PARALLEL_HASH,
DIRECT,
FULL_SORTING_MERGE,
};
const char * toString(JoinAlgorithm join_algorithm);
enum class JoinTableSide
{
Left,
Right
};
const char * toString(JoinTableSide join_table_side);
}
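
The helpers above are plain functions and constexpr predicates, so they can be exercised outside the server. A minimal standalone sketch (not part of this commit; the main() and printed values are illustrative only), assuming nothing beyond the declarations above:

#include <Core/Joins.h>
#include <iostream>

int main()
{
    using namespace DB;

    /// toString maps each enum value to its SQL spelling.
    std::cout << toString(JoinKind::Left) << '\n'; /// prints "LEFT"

    /// The predicates are constexpr, so they can feed static_asserts.
    static_assert(isLeftOrFull(JoinKind::Full));
    static_assert(!isCrossOrComma(JoinKind::Inner));

    /// ASOF inequalities are derived from comparison function names
    /// and can be flipped when the join sides are swapped.
    constexpr auto inequality = getASOFJoinInequality("lessOrEquals");
    static_assert(reverseASOFJoinInequality(inequality) == ASOFJoinInequality::GreaterOrEquals);
    return 0;
}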

View File

@ -247,7 +247,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
\
M(Bool, join_use_nulls, false, "Use NULLs for non-joined rows of outer JOINs for types that can be inside Nullable. If false, use default value of corresponding columns data type.", IMPORTANT) \
\
M(JoinStrictness, join_default_strictness, JoinStrictness::ALL, "Set default strictness in JOIN query. Possible values: empty string, 'ANY', 'ALL'. If empty, query without strictness will throw exception.", 0) \
M(JoinStrictness, join_default_strictness, JoinStrictness::All, "Set default strictness in JOIN query. Possible values: empty string, 'ANY', 'ALL'. If empty, query without strictness will throw exception.", 0) \
M(Bool, any_join_distinct_right_table_keys, false, "Enable old ANY JOIN logic with many-to-one left-to-right table keys mapping for all ANY JOINs. It leads to confusing not equal results for 't1 ANY LEFT JOIN t2' and 't2 ANY RIGHT JOIN t1'. ANY RIGHT JOIN needs one-to-many keys mapping to be consistent with LEFT one.", IMPORTANT) \
\
M(UInt64, preferred_block_size_bytes, 1000000, "", 0) \
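
For context, the settings list above is an X-macro: each M(...) row expands into a member (plus related plumbing) wherever the list is instantiated, which is why the join_default_strictness row only has to change its default value token. A hedged, heavily simplified sketch of the pattern — APPLY_FOR_SETTINGS and DECLARE_MEMBER are hypothetical names, and the real machinery also generates parsing, change tracking, and documentation:

#include <iostream>

/// Hypothetical, cut-down version of the pattern; not ClickHouse's real macros.
#define APPLY_FOR_SETTINGS(M) \
    M(bool, join_use_nulls, false) \
    M(unsigned long long, preferred_block_size_bytes, 1000000)

#define DECLARE_MEMBER(TYPE, NAME, DEFAULT) TYPE NAME = DEFAULT;

struct Settings
{
    APPLY_FOR_SETTINGS(DECLARE_MEMBER)
};

int main()
{
    Settings settings;
    std::cout << settings.preferred_block_size_bytes << '\n'; /// prints 1000000
}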

View File

@ -26,8 +26,8 @@ IMPLEMENT_SETTING_ENUM(LoadBalancing, ErrorCodes::UNKNOWN_LOAD_BALANCING,
IMPLEMENT_SETTING_ENUM(JoinStrictness, ErrorCodes::UNKNOWN_JOIN,
{{"", JoinStrictness::Unspecified},
{"ALL", JoinStrictness::ALL},
{"ANY", JoinStrictness::ANY}})
{"ALL", JoinStrictness::All},
{"ANY", JoinStrictness::Any}})
IMPLEMENT_SETTING_MULTI_ENUM(JoinAlgorithm, ErrorCodes::UNKNOWN_JOIN,
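
The IMPLEMENT_SETTING_ENUM call above wires the SQL spellings to the renamed enum values. A hedged sketch of the essential mapping it establishes, using standard C++ in place of the real macro (which, among other things, reports ErrorCodes::UNKNOWN_JOIN on unrecognized input rather than falling back):

#include <array>
#include <string_view>
#include <utility>

/// Local stand-in for the real enum, so the sketch is self-contained.
enum class JoinStrictness { Unspecified, RightAny, Any, All, Asof, Semi, Anti };

constexpr std::array<std::pair<std::string_view, JoinStrictness>, 3> join_strictness_names{{
    {"", JoinStrictness::Unspecified},
    {"ALL", JoinStrictness::All},
    {"ANY", JoinStrictness::Any},
}};

constexpr JoinStrictness parseJoinStrictness(std::string_view name)
{
    for (const auto & [spelling, value] : join_strictness_names)
        if (spelling == name)
            return value;
    return JoinStrictness::Unspecified; /// the real macro throws UNKNOWN_JOIN here
}

static_assert(parseJoinStrictness("ALL") == JoinStrictness::All);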

View File

@ -1,12 +1,14 @@
#pragma once
#include <Core/SettingsFields.h>
#include <Core/Joins.h>
#include <QueryPipeline/SizeLimits.h>
#include <Formats/FormatSettings.h>
namespace DB
{
enum class LoadBalancing
{
/// among replicas with a minimum number of errors selected randomly
@ -26,28 +28,8 @@ enum class LoadBalancing
DECLARE_SETTING_ENUM(LoadBalancing)
enum class JoinStrictness
{
Unspecified = 0, /// Query JOIN without strictness will throw Exception.
ALL, /// Query JOIN without strictness -> ALL JOIN ...
ANY, /// Query JOIN without strictness -> ANY JOIN ...
};
DECLARE_SETTING_ENUM(JoinStrictness)
enum class JoinAlgorithm
{
DEFAULT = 0,
AUTO,
HASH,
PARTIAL_MERGE,
PREFER_PARTIAL_MERGE,
PARALLEL_HASH,
DIRECT,
FULL_SORTING_MERGE,
};
DECLARE_SETTING_MULTI_ENUM(JoinAlgorithm)
@ -200,4 +182,5 @@ DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparin
DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule)
DECLARE_SETTING_ENUM_WITH_RENAME(MsgPackUUIDRepresentation, FormatSettings::MsgPackUUIDRepresentation)
}

View File

@ -28,7 +28,9 @@ struct FillColumnDescription
/// All missed values in range [FROM, TO) will be filled
/// Range [FROM, TO) respects sorting direction
Field fill_from; /// Fill value >= FILL_FROM
DataTypePtr fill_from_type;
Field fill_to; /// Fill value + STEP < FILL_TO
DataTypePtr fill_to_type;
Field fill_step; /// Default = +1 or -1 according to direction
std::optional<IntervalKind> step_kind;
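
A hedged sketch of the fill semantics those comments describe, simplified to a half-open range [FROM, TO) with a plain integer type standing in for the Field/DataTypePtr machinery (fillRange is a hypothetical helper, not in this commit):

#include <iostream>
#include <vector>

/// Generate the values of [fill_from, fill_to) in fill_step increments,
/// as ORDER BY ... WITH FILL does for the missing rows of a column.
std::vector<long> fillRange(long fill_from, long fill_to, long fill_step)
{
    std::vector<long> values;
    for (long value = fill_from; value < fill_to; value += fill_step)
        values.push_back(value);
    return values;
}

int main()
{
    for (long value : fillRange(0, 5, 1))
        std::cout << value << ' '; /// prints 0 1 2 3 4
}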

View File

@ -12,7 +12,7 @@
#if defined(__clang__)
#include <experimental/coroutine>
namespace std
namespace std // NOLINT(cert-dcl58-cpp)
{
using namespace experimental::coroutines_v1;
}
@ -97,14 +97,14 @@ struct Task
static bool resumeImpl(Task *r)
{
if (r->value)
return false;
return false;
auto & next = r->my.promise().next;
if (next)
{
if (resumeImpl(next.promise().r))
return true;
return true;
next = {};
}
@ -183,7 +183,7 @@ int main()
auto t = foo("foo");
std::cout << ".. started" << std::endl;
while (t.resume())
std::cout << ".. yielded" << std::endl;
std::cout << ".. yielded" << std::endl;
std::cout << ".. done: " << t.res() << std::endl;
}
catch (DB::Exception & e)

View File

@ -171,13 +171,13 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(
auto requested_keys = requested_keys_extractor.extractAllKeys();
size_t requested_keys_size = requested_keys.size();
HashMap<KeyType, size_t> requested_key_to_index;
HashMap<KeyType, PaddedPODArray<size_t>> requested_key_to_index;
requested_key_to_index.reserve(requested_keys_size);
for (size_t i = 0; i < requested_keys.size(); ++i)
{
auto requested_key = requested_keys[i];
requested_key_to_index[requested_key] = i;
requested_key_to_index[requested_key].push_back(i);
}
auto result = ColumnUInt8::create(requested_keys_size, false);
@ -208,10 +208,13 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(
const auto * it = requested_key_to_index.find(block_key);
assert(it);
size_t result_data_found_index = it->getMapped();
/// block_keys_size cannot be used, due to duplicates.
keys_found += !result_data[result_data_found_index];
result_data[result_data_found_index] = true;
auto & result_data_found_indexes = it->getMapped();
for (size_t result_data_found_index : result_data_found_indexes)
{
/// block_keys_size cannot be used, due to duplicates.
keys_found += !result_data[result_data_found_index];
result_data[result_data_found_index] = true;
}
block_keys_extractor.rollbackCurrentKey();
}
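
The fix above boils down to letting one key map to every position where it was requested, instead of only the last one. A self-contained sketch of the same idea with standard containers in place of HashMap and PaddedPODArray (names and signature are illustrative, not from this commit):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

/// For each key found in a fetched block, mark *every* requested position
/// that asked for it; storing a single index would silently drop duplicates.
void markFoundKeys(const std::vector<uint64_t> & requested_keys,
                   const std::vector<uint64_t> & block_keys,
                   std::vector<uint8_t> & result_data,
                   size_t & keys_found)
{
    std::unordered_map<uint64_t, std::vector<size_t>> key_to_indexes;
    for (size_t i = 0; i < requested_keys.size(); ++i)
        key_to_indexes[requested_keys[i]].push_back(i);

    for (uint64_t key : block_keys)
    {
        auto it = key_to_indexes.find(key);
        if (it == key_to_indexes.end())
            continue;
        for (size_t index : it->second)
        {
            /// Count each requested position only once, like keys_found above.
            keys_found += !result_data[index];
            result_data[index] = 1;
        }
    }
}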

View File

@ -68,6 +68,8 @@ static void loadDiskLocalConfig(const String & name,
throw Exception("Disk path can not be empty. Disk " + name, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
if (path.back() != '/')
throw Exception("Disk path must end with /. Disk " + name, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
if (path == context->getPath())
throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "Disk path ('{}') cannot be equal to <path>. Use <default> disk instead.", path);
}
bool has_space_ratio = config.has(config_prefix + ".keep_free_space_ratio");

View File

@ -6,13 +6,14 @@ add_subdirectory(divide)
include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
add_headers_and_sources(clickhouse_functions .)
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp extractTimeZoneFromFunctionArguments.cpp FunctionsLogical.cpp)
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h extractTimeZoneFromFunctionArguments.h FunctionsLogical.h)
add_library(clickhouse_functions ${clickhouse_functions_sources})
add_library(clickhouse_functions_obj OBJECT ${clickhouse_functions_sources})
target_link_libraries(clickhouse_functions
PUBLIC
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_obj>)
list (APPEND PUBLIC_LIBS
ch_contrib::wyhash
ch_contrib::cityhash
ch_contrib::farmhash
@ -24,27 +25,28 @@ target_link_libraries(clickhouse_functions
ch_contrib::metrohash
ch_contrib::murmurhash
ch_contrib::hashidsxx
)
PRIVATE
list (APPEND PRIVATE_LIBS
ch_contrib::zlib
boost::filesystem
divide_impl
)
if (TARGET OpenSSL::Crypto)
target_link_libraries(clickhouse_functions PUBLIC OpenSSL::Crypto)
list (APPEND PUBLIC_LIBS OpenSSL::Crypto)
endif()
if (OMIT_HEAVY_DEBUG_SYMBOLS)
target_compile_options(clickhouse_functions PRIVATE "-g0")
target_compile_options(clickhouse_functions_obj PRIVATE "-g0")
endif()
if (TARGET ch_contrib::icu)
target_link_libraries (clickhouse_functions PRIVATE ch_contrib::icu)
list (APPEND PRIVATE_LIBS ch_contrib::icu)
endif ()
if (TARGET ch_contrib::fastops)
target_link_libraries (clickhouse_functions PRIVATE ch_contrib::fastops)
list (APPEND PRIVATE_LIBS ch_contrib::fastops)
endif ()
if (ENABLE_EXAMPLES)
@ -52,45 +54,46 @@ if (ENABLE_EXAMPLES)
endif ()
if (TARGET ch_contrib::llvm)
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::llvm)
list (APPEND PRIVATE_LIBS ch_contrib::llvm)
endif ()
if (TARGET ch_contrib::base64)
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::base64)
list (APPEND PRIVATE_LIBS ch_contrib::base64)
endif()
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::lz4)
list (APPEND PRIVATE_LIBS ch_contrib::lz4)
if (ENABLE_NLP)
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::cld2)
list (APPEND PRIVATE_LIBS ch_contrib::cld2)
endif()
if (TARGET ch_contrib::h3)
target_link_libraries (clickhouse_functions PRIVATE ch_contrib::h3)
list (APPEND PRIVATE_LIBS ch_contrib::h3)
endif()
if (TARGET ch_contrib::vectorscan)
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::vectorscan)
list (APPEND PRIVATE_LIBS ch_contrib::vectorscan)
endif()
if (TARGET ch_contrib::simdjson)
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::simdjson)
list (APPEND PRIVATE_LIBS ch_contrib::simdjson)
endif()
if (TARGET ch_contrib::rapidjson)
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::rapidjson)
list (APPEND PRIVATE_LIBS ch_contrib::rapidjson)
endif()
add_subdirectory(GatherUtils)
target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_gatherutils)
list (APPEND PRIVATE_LIBS clickhouse_functions_gatherutils)
add_subdirectory(URL)
target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_url)
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_url>)
add_subdirectory(array)
target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_array)
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_array>)
add_subdirectory(JSONPath)
list (APPEND PRIVATE_LIBS clickhouse_functions_jsonpath)
# Signed integer overflow on user-provided data inside boost::geometry - ignore.
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)
@ -98,3 +101,15 @@ set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-s
if (ENABLE_FUZZING)
add_compile_definitions(FUZZING_MODE=1)
endif ()
target_link_libraries(clickhouse_functions_obj PUBLIC ${PUBLIC_LIBS} PRIVATE ${PRIVATE_LIBS})
if (USE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES)
# Used to forward the linking information to the final binaries such as clickhouse / unit_tests_dbms,
# since such information is lost after we convert to an OBJECT target
add_library(clickhouse_functions INTERFACE)
target_link_libraries(clickhouse_functions INTERFACE ${OBJECT_LIBS} ${PUBLIC_LIBS} ${PRIVATE_LIBS})
else()
add_library(clickhouse_functions SHARED ${OBJECT_LIBS})
target_link_libraries(clickhouse_functions PUBLIC ${PUBLIC_LIBS} PRIVATE ${PRIVATE_LIBS})
endif ()

View File

@ -143,7 +143,7 @@ void registerFunctionCRCImpl(FunctionFactory & factory)
factory.registerFunction<T>(T::name, FunctionFactory::CaseInsensitive);
}
void registerFunctionCRC(FunctionFactory & factory)
REGISTER_FUNCTION(CRC)
{
registerFunctionCRCImpl<FunctionCRC32ZLIB>(factory);
registerFunctionCRCImpl<FunctionCRC32IEEE>(factory);

View File

@ -5,7 +5,7 @@
namespace DB
{
void registerCastOverloadResolvers(FunctionFactory & factory)
REGISTER_FUNCTION(CastOverloadResolvers)
{
factory.registerFunction<CastInternalOverloadResolver<CastType::nonAccurate>>(FunctionFactory::CaseInsensitive);
factory.registerFunction<CastInternalOverloadResolver<CastType::accurate>>();

View File

@ -113,7 +113,7 @@ private:
}
};
void registerFunctionChar(FunctionFactory & factory)
REGISTER_FUNCTION(Char)
{
factory.registerFunction<FunctionChar>(FunctionFactory::CaseInsensitive);
}

View File

@ -44,7 +44,7 @@ public:
};
void registerFunctionFQDN(FunctionFactory & factory)
REGISTER_FUNCTION(FQDN)
{
factory.registerFunction<FunctionFQDN>(FunctionFactory::CaseInsensitive);
factory.registerFunction<FunctionFQDN>("fullHostName");

View File

@ -1,6 +1,7 @@
#pragma once
#include <Interpreters/Context_fwd.h>
#include <Common/register_objects.h>
#include <Common/IFactoryWithAliases.h>
#include <Functions/IFunction.h>
#include <Functions/IFunctionAdaptors.h>
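
The newly included Common/register_objects.h is what makes the REGISTER_FUNCTION(...) spellings in the surrounding diffs work. A hedged sketch of how such a macro can be built — this shows the general self-registration pattern, not the actual contents of that header, and all names below are hypothetical:

#include <functional>
#include <utility>
#include <vector>

struct FunctionFactory; /// stand-in for the real factory type

using FunctionRegisterer = std::function<void(FunctionFactory &)>;

/// Function-local static avoids the static initialization order fiasco.
inline std::vector<FunctionRegisterer> & functionRegistry()
{
    static std::vector<FunctionRegisterer> registry;
    return registry;
}

struct FunctionRegisterRecord
{
    explicit FunctionRegisterRecord(FunctionRegisterer registerer)
    {
        functionRegistry().push_back(std::move(registerer));
    }
};

/// Declares the registrar, records it at static-init time, then opens its
/// definition, so `REGISTER_FUNCTION(CRC) { ... }` reads like a function body.
/// Startup code would later iterate functionRegistry() and invoke each entry.
#define REGISTER_FUNCTION(name) \
    void registerFunction##name(FunctionFactory & factory); \
    static FunctionRegisterRecord register_record_##name{registerFunction##name}; \
    void registerFunction##name(FunctionFactory & factory)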

View File

@ -94,7 +94,7 @@ public:
};
void registerFunctionFile(FunctionFactory & factory)
REGISTER_FUNCTION(File)
{
factory.registerFunction<FunctionFile>();
}

View File

@ -4,7 +4,7 @@
namespace DB
{
void registerFunctionHashID(FunctionFactory & factory)
REGISTER_FUNCTION(HashID)
{
factory.registerFunction<FunctionHashID>();
}

View File

@ -96,7 +96,7 @@ FunctionBasePtr JoinGetOverloadResolver<or_null>::buildImpl(const ColumnsWithTyp
return std::make_unique<FunctionJoinGet<or_null>>(getContext(), table_lock, storage_join, attr_name, argument_types, return_type);
}
void registerFunctionJoinGet(FunctionFactory & factory)
REGISTER_FUNCTION(JoinGet)
{
// joinGet
factory.registerFunction<JoinGetOverloadResolver<false>>();

View File

@ -5,7 +5,7 @@
namespace DB
{
void registerFunctionsSQLJSON(FunctionFactory & factory)
REGISTER_FUNCTION(SQLJSON)
{
factory.registerFunction<FunctionSQLJSON<NameJSONExists, JSONExistsImpl>>();
factory.registerFunction<FunctionSQLJSON<NameJSONQuery, JSONQueryImpl>>();

View File

@ -4,7 +4,7 @@
namespace DB
{
void registerFunctionShowCertificate(FunctionFactory & factory)
REGISTER_FUNCTION(ShowCertificate)
{
factory.registerFunction<FunctionShowCertificate>();
}

View File

@ -3,12 +3,12 @@
namespace DB
{
void registerFunctionBase58Encode(FunctionFactory & factory)
REGISTER_FUNCTION(Base58Encode)
{
factory.registerFunction<FunctionBase58Conversion<Base58Encode>>();
}
void registerFunctionBase58Decode(FunctionFactory & factory)
REGISTER_FUNCTION(Base58Decode)
{
factory.registerFunction<FunctionBase58Conversion<Base58Decode>>();
}

View File

@ -621,7 +621,7 @@ public:
}
};
void registerFunctionsBinaryRepr(FunctionFactory & factory)
REGISTER_FUNCTION(BinaryRepr)
{
factory.registerFunction<EncodeToBinaryRepresentation<HexImpl>>(FunctionFactory::CaseInsensitive);
factory.registerFunction<DecodeFromBinaryRepresentation<UnhexImpl>>(FunctionFactory::CaseInsensitive);

View File

@ -329,7 +329,7 @@ public:
}
void registerFunctionsBitToArray(FunctionFactory & factory)
REGISTER_FUNCTION(BitToArray)
{
factory.registerFunction<FunctionBitPositionsToArray>();
factory.registerFunction<FunctionBitmaskToArray>();

View File

@ -7,7 +7,7 @@
namespace DB
{
void registerFunctionsBitmap(FunctionFactory & factory)
REGISTER_FUNCTION(Bitmap)
{
factory.registerFunction<FunctionBitmapBuild>();
factory.registerFunction<FunctionBitmapToArray>();

View File

@ -143,7 +143,7 @@ struct NameDetectLanguageUnknown
using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
void registerFunctionDetectCharset(FunctionFactory & factory)
REGISTER_FUNCTION(DetectCharset)
{
factory.registerFunction<FunctionDetectCharset>();
factory.registerFunction<FunctionDetectLanguageUnknown>();

Some files were not shown because too many files have changed in this diff.