Merge branch 'master' into mvcc_prototype

This commit is contained in:
Alexander Tokmakov 2021-12-20 22:06:22 +03:00
commit 9cd49bc0ec
432 changed files with 7801 additions and 1548 deletions

View File

@ -14,6 +14,7 @@ jobs:
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
# https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/cherry_pick

View File

@ -40,6 +40,17 @@ jobs:
needs: DockerHubPush
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
# https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/docs_release
REPO_COPY=${{runner.temp}}/docs_release/ClickHouse
CLOUDFLARE_TOKEN=${{secrets.CLOUDFLARE}}
ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
RCSK
EOF
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
@ -51,11 +62,6 @@ jobs:
name: changed_images
path: ${{ env.TEMP_PATH }}
- name: Docs Release
env:
TEMP_PATH: ${{runner.temp}}/docs_release
REPO_COPY: ${{runner.temp}}/docs_release/ClickHouse
CLOUDFLARE_TOKEN: ${{secrets.CLOUDFLARE}}
ROBOT_CLICKHOUSE_SSH_KEY: ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH

View File

@ -407,7 +407,7 @@ jobs:
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestAsan:
FunctionalStatelessTestAsan0:
needs: [BuilderDebAsan]
runs-on: [self-hosted, func-tester]
steps:
@ -419,6 +419,8 @@ jobs:
CHECK_NAME=Stateless tests (address, actions)
REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=2
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -442,7 +444,44 @@ jobs:
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestTsan:
FunctionalStatelessTestAsan1:
needs: [BuilderDebAsan]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_debug
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (address, actions)
REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=2
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestTsan0:
needs: [BuilderDebTsan]
runs-on: [self-hosted, func-tester]
steps:
@ -454,6 +493,82 @@ jobs:
CHECK_NAME=Stateless tests (thread, actions)
REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestTsan1:
needs: [BuilderDebTsan]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_tsan
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (thread, actions)
REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestTsan2:
needs: [BuilderDebTsan]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_tsan
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (thread, actions)
REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=2
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -512,7 +627,7 @@ jobs:
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestMsan:
FunctionalStatelessTestMsan0:
needs: [BuilderDebMsan]
runs-on: [self-hosted, func-tester]
steps:
@ -524,6 +639,8 @@ jobs:
CHECK_NAME=Stateless tests (memory, actions)
REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -547,7 +664,81 @@ jobs:
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestDebug:
FunctionalStatelessTestMsan1:
needs: [BuilderDebMsan]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_memory
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (memory, actions)
REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestMsan2:
needs: [BuilderDebMsan]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_memory
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (memory, actions)
REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=2
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestDebug0:
needs: [BuilderDebDebug]
runs-on: [self-hosted, func-tester]
steps:
@ -559,6 +750,82 @@ jobs:
CHECK_NAME=Stateless tests (debug, actions)
REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestDebug1:
needs: [BuilderDebDebug]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_debug
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (debug, actions)
REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestDebug2:
needs: [BuilderDebDebug]
runs-on: [self-hosted, func-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/stateless_debug
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Stateless tests (debug, actions)
REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
KILL_TIMEOUT=10800
RUN_BY_HASH_NUM=2
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -975,8 +1242,8 @@ jobs:
#############################################################################################
############################# INTEGRATION TESTS #############################################
#############################################################################################
IntegrationTestsAsan:
needs: [BuilderDebAsan, FunctionalStatelessTestAsan]
IntegrationTestsAsan0:
needs: [BuilderDebAsan]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
@ -986,6 +1253,8 @@ jobs:
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (asan, actions)
REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -1009,8 +1278,80 @@ jobs:
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsTsan:
needs: [BuilderDebTsan, FunctionalStatelessTestTsan]
IntegrationTestsAsan1:
needs: [BuilderDebAsan]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/integration_tests_asan
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (asan, actions)
REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Integration test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 integration_test_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsAsan2:
needs: [BuilderDebAsan]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/integration_tests_asan
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (asan, actions)
REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse
RUN_BY_HASH_NUM=2
RUN_BY_HASH_TOTAL=3
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Integration test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 integration_test_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsTsan0:
needs: [BuilderDebTsan]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
@ -1020,6 +1361,8 @@ jobs:
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (thread, actions)
REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=4
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -1043,8 +1386,116 @@ jobs:
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsRelease:
needs: [BuilderDebRelease, FunctionalStatelessTestRelease]
IntegrationTestsTsan1:
needs: [BuilderDebTsan]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/integration_tests_tsan
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (thread, actions)
REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=4
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Integration test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 integration_test_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsTsan2:
needs: [BuilderDebTsan]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/integration_tests_tsan
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (thread, actions)
REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse
RUN_BY_HASH_NUM=2
RUN_BY_HASH_TOTAL=4
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Integration test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 integration_test_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsTsan3:
needs: [BuilderDebTsan]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/integration_tests_tsan
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (thread, actions)
REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse
RUN_BY_HASH_NUM=3
RUN_BY_HASH_TOTAL=4
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Integration test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 integration_test_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsRelease0:
needs: [BuilderDebRelease]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
@ -1054,6 +1505,44 @@ jobs:
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (release, actions)
REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse
RUN_BY_HASH_NUM=0
RUN_BY_HASH_TOTAL=2
EOF
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{ env.REPORTS_PATH }}
- name: Clear repository
run: |
sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
- name: Check out repository code
uses: actions/checkout@v2
- name: Integration test
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 integration_test_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
IntegrationTestsRelease1:
needs: [BuilderDebRelease]
runs-on: [self-hosted, stress-tester]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/integration_tests_release
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Integration tests (release, actions)
REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse
RUN_BY_HASH_NUM=1
RUN_BY_HASH_TOTAL=2
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -1081,11 +1570,18 @@ jobs:
needs:
- DockerHubPush
- BuilderReport
- FunctionalStatelessTestDebug
- FunctionalStatelessTestDebug0
- FunctionalStatelessTestDebug1
- FunctionalStatelessTestDebug2
- FunctionalStatelessTestRelease
- FunctionalStatelessTestAsan
- FunctionalStatelessTestTsan
- FunctionalStatelessTestMsan
- FunctionalStatelessTestAsan0
- FunctionalStatelessTestAsan1
- FunctionalStatelessTestTsan0
- FunctionalStatelessTestTsan1
- FunctionalStatelessTestTsan2
- FunctionalStatelessTestMsan0
- FunctionalStatelessTestMsan1
- FunctionalStatelessTestMsan2
- FunctionalStatelessTestUBsan
- FunctionalStatefulTestDebug
- FunctionalStatefulTestRelease
@ -1098,9 +1594,15 @@ jobs:
- StressTestTsan
- StressTestMsan
- StressTestUBsan
- IntegrationTestsAsan
- IntegrationTestsRelease
- IntegrationTestsTsan
- IntegrationTestsAsan0
- IntegrationTestsAsan1
- IntegrationTestsAsan2
- IntegrationTestsRelease0
- IntegrationTestsRelease1
- IntegrationTestsTsan0
- IntegrationTestsTsan1
- IntegrationTestsTsan2
- IntegrationTestsTsan3
- CompatibilityCheck
runs-on: [self-hosted, style-checker]
steps:

View File

@ -1,8 +1,10 @@
option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES})
option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY
"Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)"
${ENABLE_LIBRARIES})
ON)
if (USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY)
if (ENABLE_AZURE_BLOB_STORAGE)
set(USE_AZURE_BLOB_STORAGE 1)
set(AZURE_BLOB_STORAGE_LIBRARY azure_sdk)
endif()

View File

@ -46,14 +46,17 @@ include("${AZURE_DIR}/cmake-modules/AzureTransportAdapters.cmake")
add_library(azure_sdk ${AZURE_SDK_UNIFIED_SRC})
if (COMPILER_CLANG)
target_compile_options(azure_sdk PUBLIC
target_compile_options(azure_sdk PRIVATE
-Wno-deprecated-copy-dtor
-Wno-extra-semi
-Wno-suggest-destructor-override
-Wno-inconsistent-missing-destructor-override
-Wno-error=unknown-warning-option
-Wno-reserved-identifier
)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
target_compile_options(azure_sdk PRIVATE -Wno-reserved-identifier)
endif()
endif()
# Originally, on Windows azure-core is built with bcrypt and crypt32 by default
@ -68,4 +71,4 @@ endif()
target_link_libraries(azure_sdk PRIVATE ${LIBXML2_LIBRARIES})
target_include_directories(azure_sdk PUBLIC ${AZURE_SDK_INCLUDES})
target_include_directories(azure_sdk SYSTEM PUBLIC ${AZURE_SDK_INCLUDES})

View File

@ -173,6 +173,8 @@ function clone_submodules
contrib/dragonbox
contrib/fast_float
contrib/NuRaft
contrib/jemalloc
contrib/replxx
)
git submodule sync
@ -193,6 +195,8 @@ function run_cmake
"-DENABLE_THINLTO=0"
"-DUSE_UNWIND=1"
"-DENABLE_NURAFT=1"
"-DENABLE_JEMALLOC=1"
"-DENABLE_REPLXX=1"
)
# TODO remove this? we don't use ccache anyway. An option would be to download it

View File

@ -175,6 +175,15 @@ info signals
continue
backtrace full
info locals
info registers
disassemble /s
up
info locals
disassemble /s
up
info locals
disassemble /s
p \"done\"
detach
quit
" > script.gdb

View File

@ -72,11 +72,13 @@ RUN python3 -m pip install \
grpcio-tools \
kafka-python \
kazoo \
lz4 \
minio \
protobuf \
psycopg2-binary==2.8.6 \
pymongo==3.11.0 \
pytest \
pytest-order==1.0.0 \
pytest-timeout \
pytest-xdist \
pytest-repeat \

View File

@ -8,8 +8,8 @@ echo '{
"ip-forward": true,
"log-level": "debug",
"storage-driver": "overlay2",
"insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"]
"insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
}' | dd of=/etc/docker/daemon.json 2>/dev/null
dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &

View File

@ -11,6 +11,20 @@ if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then
COMMON_BUILD_PREFIX=""
fi
# Sometimes AWS responds with a DNS error, and it is impossible to retry it with
# the options available in the current curl version.
# curl_with_retry URL
# Issues a HEAD request against URL, retrying up to 4 times with a 0.5s
# pause after each failed attempt. Returns 0 on the first success, 1 if
# every attempt fails.
function curl_with_retry
{
    local attempt
    for attempt in 1 2 3 4; do
        curl --fail --head "$1" && return 0
        sleep 0.5
    done
    return 1
}
# Use the packaged repository to find the revision we will compare to.
function find_reference_sha
{
@ -55,7 +69,7 @@ function find_reference_sha
)
for path in "${urls_to_try[@]}"
do
if curl --fail --retry 5 --retry-delay 1 --retry-max-time 15 --head "$path"
if curl_with_retry "$path"
then
found="$path"
break
@ -76,7 +90,7 @@ chmod 777 workspace output
cd workspace
# Download the package for the version we are going to test.
if curl --fail --retry 5 --retry-delay 1 --retry-max-time 15 --head "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
if curl_with_retry "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
then
right_path="$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
fi

View File

@ -148,6 +148,15 @@ info signals
continue
backtrace full
info locals
info registers
disassemble /s
up
info locals
disassemble /s
up
info locals
disassemble /s
p \"done\"
detach
quit
" > script.gdb

View File

@ -5,8 +5,8 @@ echo "Configure to use Yandex dockerhub-proxy"
mkdir -p /etc/docker/
cat > /etc/docker/daemon.json << EOF
{
"insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"]
"insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
"registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
}
EOF

View File

@ -106,20 +106,20 @@ Build ClickHouse. Run ClickHouse from the terminal: change directory to `program
Note that all clickhouse tools (server, client, etc) are just symlinks to a single binary named `clickhouse`. You can find this binary at `programs/clickhouse`. All tools can also be invoked as `clickhouse tool` instead of `clickhouse-tool`.
Alternatively you can install ClickHouse package: either stable release from Yandex repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo service clickhouse-server start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`.
Alternatively you can install ClickHouse package: either stable release from ClickHouse repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo clickhouse start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`.
When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary:
``` bash
$ sudo service clickhouse-server stop
$ sudo clickhouse stop
$ sudo cp ./clickhouse /usr/bin/
$ sudo service clickhouse-server start
$ sudo clickhouse start
```
Also you can stop system clickhouse-server and run your own with the same configuration but with logging to terminal:
``` bash
$ sudo service clickhouse-server stop
$ sudo clickhouse stop
$ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml
```
@ -257,9 +257,9 @@ There are five variants (Debug, ASan, TSan, MSan, UBSan).
Thread Fuzzer (please don't mix up with Thread Sanitizer) is another kind of fuzzing that allows randomizing the thread order of execution. It helps to find even more special cases.
## Security Audit {#security-audit}
## Security Audit
People from Yandex Security Team do some basic overview of ClickHouse capabilities from the security standpoint.
People from Yandex Security Team did some basic overview of ClickHouse capabilities from the security standpoint.
## Static Analyzers {#static-analyzers}
@ -326,15 +326,11 @@ There is automated check for flaky tests. It runs all new tests 100 times (for f
## Testflows
[Testflows](https://testflows.com/) is an enterprise-grade testing framework. It is used by Altinity for some of the tests and we run these tests in our CI.
## Yandex Checks (only for Yandex employees)
These checks are importing ClickHouse code into Yandex internal monorepository, so ClickHouse codebase can be used as a library by other products at Yandex (YT and YDB). Note that clickhouse-server itself is not being build from internal repo and unmodified open-source build is used for Yandex applications.
[Testflows](https://testflows.com/) is an enterprise-grade open-source testing framework, which is used to test a subset of ClickHouse.
## Test Automation {#test-automation}
We run tests with Yandex internal CI and job automation system named “Sandbox”.
We run tests with [GitHub Actions](https://github.com/features/actions).
Build jobs and tests are run in Sandbox on per commit basis. Resulting packages and test results are published in GitHub and can be downloaded by direct links. Artifacts are stored for several months. When you send a pull request on GitHub, we tag it as “can be tested” and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you.

View File

@ -37,6 +37,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[rabbitmq_skip_broken_messages = N,]
[rabbitmq_max_block_size = N,]
[rabbitmq_flush_interval_ms = N]
[rabbitmq_queue_settings_list = 'x-dead-letter-exchange=my-dlx,x-max-length=10,x-overflow=reject-publish']
```
Required parameters:
@ -59,6 +60,7 @@ Optional parameters:
- `rabbitmq_skip_broken_messages` RabbitMQ message parser tolerance to schema-incompatible messages per block. Default: `0`. If `rabbitmq_skip_broken_messages = N` then the engine skips *N* RabbitMQ messages that cannot be parsed (a message equals a row of data).
- `rabbitmq_max_block_size`
- `rabbitmq_flush_interval_ms`
- `rabbitmq_queue_settings_list` - allows setting RabbitMQ settings when creating a queue. Available settings: `x-max-length`, `x-max-length-bytes`, `x-message-ttl`, `x-expires`, `x-priority`, `x-max-priority`, `x-overflow`, `x-dead-letter-exchange`, `x-queue-type`. The `durable` setting is enabled automatically for the queue.
SSL connection:

View File

@ -178,5 +178,9 @@ toc_title: Adopters
| <a href="https://promo.croc.ru/digitalworker" class="favicon">Цифровой Рабочий</a> | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) |
| <a href="https://shop.okraina.ru/" class="favicon">ООО «МПЗ Богородский»</a> | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) |
| <a href="https://domclick.ru/" class="favicon">ДомКлик</a> | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) |
| <a href="https://futurragroup.com/" class="favicon">Futurra Group</a> | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) |
| <a href="https://usetech.com/" class="favicon">UseTech</a> | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) |
| <a href="https://lookforsale.ru/" class="favicon">Lookforsale</a> | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) |
| <a href="https://rvision.pro/en/" class="favicon">R-Vision</a> | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) |
[Original article](https://clickhouse.com/docs/en/introduction/adopters/) <!--hide-->

View File

@ -16,6 +16,11 @@ ZooKeeper is one of the first well-known open-source coordination systems. It's
By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but the `clickhouse-keeper-converter` tool allows converting ZooKeeper data to a ClickHouse Keeper snapshot. The interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper, so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.
ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64.
!!! info "Note"
External integrations are not supported.
## Configuration
ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Keeper configuration has the following parameters:
@ -118,13 +123,13 @@ echo mntr | nc localhost 9181
Below are the detailed 4lw commands:
- ruok : Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.
- `ruok`: Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.
```
imok
```
- mntr : Outputs a list of variables that could be used for monitoring the health of the cluster.
- `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster.
```
zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
@ -146,12 +151,11 @@ zk_followers 0
zk_synced_followers 0
```
- srvr : Lists full details for the server.
- `srvr`: Lists full details for the server.
```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Latency min/avg/max: 0/0/0
Received: 2
Sent : 2
Connections: 1
@ -161,16 +165,14 @@ Mode: leader
Node count: 4
```
- stat : Lists brief details for the server and connected clients.
- `stat`: Lists brief details for the server and connected clients.
```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Clients:
192.168.1.1:52852(recved=0,sent=0)
192.168.1.1:52042(recved=24,sent=48)
Latency min/avg/max: 0/0/0
Received: 4
Sent : 4
Connections: 1
@ -178,16 +180,15 @@ Outstanding: 0
Zxid: 36
Mode: leader
Node count: 4
```
- srst : Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`.
- `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`.
```
Server stats reset.
```
- conf : Print details about serving configuration.
- `conf`: Print details about serving configuration.
```
server_id=1
@ -220,20 +221,20 @@ compress_snapshots_with_zstd_format=true
configuration_change_tries_count=20
```
- cons : List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc...
- `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc...
```
192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0)
192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0)
```
- crst : Reset connection/session statistics for all connections.
- `crst`: Reset connection/session statistics for all connections.
```
Connection stats reset.
```
- envi : Print details about serving environment
- `envi`: Print details about serving environment
```
Environment:
@ -250,41 +251,41 @@ user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/
```
- dirs : Shows the total size of snapshot and log files in bytes
- `dirs`: Shows the total size of snapshot and log files in bytes
```
snapshot_dir_size: 0
log_dir_size: 3875
```
- isro: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode.
- `isro`: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode.
```
rw
```
- wchs : Lists brief information on watches for the server.
- `wchs`: Lists brief information on watches for the server.
```
1 connections watching 1 paths
Total watches:1
```
- wchc : Lists detailed information on watches for the server, by session. This outputs a list of sessions(connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully.
- `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (i.e. impact server performance), use it carefully.
```
0x0000000000000001
/clickhouse/task_queue/ddl
```
- wchp : Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully.
- `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (i.e. impact server performance), use it carefully.
```
/clickhouse/task_queue/ddl
0x0000000000000001
```
- dump : Lists the outstanding sessions and ephemeral nodes. This only works on the leader.
- `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader.
```
Sessions dump (2):

View File

@ -41,7 +41,7 @@ Example of a polygon dictionary configuration:
</dictionary>
```
Tne corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query):
The corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query):
``` sql
CREATE DICTIONARY polygon_dict_name (
key Array(Array(Array(Array(Float64)))),

View File

@ -129,6 +129,9 @@ world
Каждый элемент структуры типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) представляется как отдельный массив.
Входящие параметры типа "перечисление" (`ENUM`) могут передаваться в виде значений или порядковых номеров. Сначала переданное значение будет сопоставляться с элементами перечисления. Если совпадение не будет найдено и при этом переданное значение является числом, оно будет трактоваться как порядковый номер в перечислении.
Если входящие параметры типа `ENUM` содержат только порядковые номера, рекомендуется включить настройку [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) для ускорения парсинга.
Например:
``` sql
@ -362,6 +365,9 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
Если установлена настройка [input_format_defaults_for_omitted_fields = 1](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) и тип столбца не `Nullable(T)`, то пустые значения без кавычек заменяются значениями по умолчанию для типа данных столбца.
Входящие параметры типа "перечисление" (`ENUM`) могут передаваться в виде значений или порядковых номеров. Сначала переданное значение будет сопоставляться с элементами перечисления. Если совпадение не будет найдено и при этом переданное значение является числом, оно будет трактоваться как порядковый номер в перечислении.
Если входящие параметры типа `ENUM` содержат только порядковые номера, рекомендуется включить настройку [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) для ускорения парсинга.
Формат CSV поддерживает вывод totals и extremes аналогично `TabSeparated`.
## CSVWithNames {#csvwithnames}
@ -693,7 +699,7 @@ CREATE TABLE IF NOT EXISTS example_table
- Если `input_format_defaults_for_omitted_fields = 1`, то значение по умолчанию для `x` равно `0`, а значение по умолчанию `a` равно `x * 2`.
!!! note "Предупреждение"
Если `input_format_defaults_for_omitted_fields = 1`, то при обработке запросов ClickHouse потребляет больше вычислительных ресурсов, чем если `input_format_defaults_for_omitted_fields = 0`.
При добавлении данных с помощью `input_format_defaults_for_omitted_fields = 1`, ClickHouse потребляет больше вычислительных ресурсов по сравнению с `input_format_defaults_for_omitted_fields = 0`.
### Выборка данных {#vyborka-dannykh}

View File

@ -16,12 +16,17 @@ ZooKeeper — один из первых широко известных сер
По умолчанию ClickHouse Keeper предоставляет те же гарантии, что и ZooKeeper (линеаризуемость записей, последовательная согласованность чтений). У него есть совместимый клиент-серверный протокол, поэтому любой стандартный клиент ZooKeeper может использоваться для взаимодействия с ClickHouse Keeper. Снэпшоты и журналы имеют несовместимый с ZooKeeper формат, однако можно конвертировать данные Zookeeper в снэпшот ClickHouse Keeper с помощью `clickhouse-keeper-converter`. Межсерверный протокол ClickHouse Keeper также несовместим с ZooKeeper, поэтому создание смешанного кластера ZooKeeper / ClickHouse Keeper невозможно.
Система управления доступом (ACL) ClickHouse Keeper реализована так же, как в [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper поддерживает тот же набор разрешений и идентичные схемы: `world`, `auth`, `digest`, `host` и `ip`. Digest для аутентификации использует пару значений `username:password`. Пароль кодируется в Base64.
!!! info "Примечание"
Внешние интеграции не поддерживаются.
## Конфигурация
ClickHouse Keeper может использоваться как равноценная замена ZooKeeper или как внутренняя часть сервера ClickHouse, но в обоих случаях конфигурация представлена файлом `.xml`. Главный тег конфигурации ClickHouse Keeper — это `<keeper_server>`. Параметры конфигурации:
- `tcp_port` — порт для подключения клиента (по умолчанию для ZooKeeper: `2181`).
- `tcp_port_secure` — зашифрованный порт для подключения клиента.
- `tcp_port_secure` — зашифрованный порт для SSL-соединения между клиентом и сервером сервиса.
- `server_id` — уникальный идентификатор сервера, каждый участник кластера должен иметь уникальный номер&nbsp;(1,&nbsp;2,&nbsp;3&nbsp;и&nbsp;т.&nbsp;д.).
- `log_storage_path` — путь к журналам координации, лучше хранить их на незанятом устройстве (актуально и для ZooKeeper).
- `snapshot_storage_path` — путь к снэпшотам координации.
@ -50,7 +55,11 @@ ClickHouse Keeper может использоваться как равноце
- `shutdown_timeout` — время ожидания завершения внутренних подключений и выключения, в миллисекундах (по умолчанию: 5000).
- `startup_timeout` — время отключения сервера, если он не подключается к другим участникам кворума, в миллисекундах (по умолчанию: 30000).
Конфигурация кворума находится в `<keeper_server>.<raft_configuration>` и содержит описание серверов. Единственный параметр для всего кворума — `secure`, который включает зашифрованное соединение для связи между участниками кворума. Параметры для каждого `<server>`:
Конфигурация кворума находится в `<keeper_server>.<raft_configuration>` и содержит описание серверов.
Единственный параметр для всего кворума — `secure`, который включает зашифрованное соединение для связи между участниками кворума. Параметру можно задать значение `true`, если для внутренней коммуникации между узлами требуется SSL-соединение, в ином случае не указывайте ничего.
Параметры для каждого `<server>`:
- `id` — идентификатор сервера в кворуме.
- `hostname` — имя хоста, на котором размещен сервер.

View File

@ -391,12 +391,14 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (
## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number}
Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата TSV.
Включает или отключает парсинг значений перечислений как порядковых номеров.
Если режим включен, то во входящих данных в формате `TSV` значения перечисления (тип `ENUM`) всегда трактуются как порядковые номера, а не как элементы перечисления. Эту настройку рекомендуется включать для оптимизации парсинга, если данные типа `ENUM` содержат только порядковые номера, а не сами элементы перечисления.
Возможные значения:
- 0 — парсинг значений перечисления как значений.
- 1 — парсинг значений перечисления как идентификаторов перечисления.
- 0 — входящие значения типа `ENUM` сначала сопоставляются с элементами перечисления, а если совпадений не найдено, то трактуются как порядковые номера.
- 1 — входящие значения типа `ENUM` сразу трактуются как порядковые номера.
Значение по умолчанию: 0.
@ -410,10 +412,39 @@ CREATE TABLE table_with_enum_column_for_tsv_insert (Id Int32,Value Enum('first'
При включенной настройке `input_format_tsv_enum_as_number`:
Запрос:
```sql
SET input_format_tsv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 1;
SELECT * FROM table_with_enum_column_for_tsv_insert;
```
Результат:
```text
┌──Id─┬─Value──┐
│ 102 │ second │
└─────┴────────┘
```
Запрос:
```sql
SET input_format_tsv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first';
```
сгенерирует исключение.
При отключенной настройке `input_format_tsv_enum_as_number`:
Запрос:
```sql
SET input_format_tsv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first';
SELECT * FROM table_with_enum_column_for_tsv_insert;
```
@ -428,15 +459,6 @@ SELECT * FROM table_with_enum_column_for_tsv_insert;
└─────┴────────┘
```
При отключенной настройке `input_format_tsv_enum_as_number` запрос `INSERT`:
```sql
SET input_format_tsv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2;
```
сгенерирует исключение.
## input_format_null_as_default {#settings-input-format-null-as-default}
Включает или отключает инициализацию [значениями по умолчанию](../../sql-reference/statements/create/table.md#create-default-values) ячеек с [NULL](../../sql-reference/syntax.md#null-literal), если тип данных столбца не позволяет [хранить NULL](../../sql-reference/data-types/nullable.md#data_type-nullable).
@ -1511,12 +1533,13 @@ SELECT area/period FROM account_orders FORMAT JSON;
## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number}
Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата CSV.
Включает или отключает парсинг значений перечислений как порядковых номеров.
Если режим включен, то во входящих данных в формате `CSV` значения перечисления (тип `ENUM`) всегда трактуются как порядковые номера, а не как элементы перечисления. Эту настройку рекомендуется включать для оптимизации парсинга, если данные типа `ENUM` содержат только порядковые номера, а не сами элементы перечисления.
Возможные значения:
- 0 — парсинг значений перечисления как значений.
- 1 — парсинг значений перечисления как идентификаторов перечисления.
- 0 — входящие значения типа `ENUM` сначала сопоставляются с элементами перечисления, а если совпадений не найдено, то трактуются как порядковые номера.
- 1 — входящие значения типа `ENUM` сразу трактуются как порядковые номера.
Значение по умолчанию: 0.
@ -1530,10 +1553,11 @@ CREATE TABLE table_with_enum_column_for_csv_insert (Id Int32,Value Enum('first'
При включенной настройке `input_format_csv_enum_as_number`:
Запрос:
```sql
SET input_format_csv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2;
SELECT * FROM table_with_enum_column_for_csv_insert;
```
Результат:
@ -1544,15 +1568,37 @@ SELECT * FROM table_with_enum_column_for_csv_insert;
└─────┴────────┘
```
При отключенной настройке `input_format_csv_enum_as_number` запрос `INSERT`:
Запрос:
```sql
SET input_format_csv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2;
SET input_format_csv_enum_as_number = 1;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first'
```
сгенерирует исключение.
При отключенной настройке `input_format_csv_enum_as_number`:
Запрос:
```sql
SET input_format_csv_enum_as_number = 0;
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2
INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first'
SELECT * FROM table_with_enum_column_for_csv_insert;
```
Результат:
```text
┌──Id─┬─Value──┐
│ 102 │ second │
└─────┴────────┘
┌──Id─┬─Value─┐
│ 103 │ first │
└─────┴───────┘
```
## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line}
Использовать в качестве разделителя строк для CSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль).

View File

@ -1,16 +1,55 @@
---
machine_translated: true
machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3
---
# system.merge_tree_settings {#system-merge_tree_settings}
# 系统。merge_tree_settings {#system-merge_tree_settings}
包含有关以下设置的信息 `MergeTree` 桌子
包含 `MergeTree` 表的设置 (Setting) 信息。
列:
- `name` (String) — Setting name.
- `value` (String) — Setting value.
- `description` (String) — Setting description.
- `type` (String) — Setting type (implementation specific string value).
- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed.
- `name` (String) — 设置名称。
- `value` (String) — 设置的值。
- `description` (String) — 设置描述。
- `type` (String) — 设置类型 (执行特定的字符串值)。
- `changed` (UInt8) — 该设置是否在配置中明确定义或是明确改变。
**示例**
```sql
:) SELECT * FROM system.merge_tree_settings LIMIT 4 FORMAT Vertical;
```
```text
Row 1:
──────
name: index_granularity
value: 8192
changed: 0
description: How many rows correspond to one primary key value.
type: SettingUInt64
Row 2:
──────
name: min_bytes_for_wide_part
value: 0
changed: 0
description: Minimal uncompressed size in bytes to create part in wide format instead of compact
type: SettingUInt64
Row 3:
──────
name: min_rows_for_wide_part
value: 0
changed: 0
description: Minimal number of rows to create part in wide format instead of compact
type: SettingUInt64
Row 4:
──────
name: merge_max_block_size
value: 8192
changed: 0
description: How many rows in blocks should be formed for merge operations.
type: SettingUInt64
4 rows in set. Elapsed: 0.001 sec.
```
[原文](https://clickhouse.com/docs/zh/operations/system-tables/merge_tree_settings) <!--hide-->

View File

@ -1,58 +1,128 @@
---
machine_translated: true
machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3
---
# system.tables {#system-tables}
# 系统。表 {#system-tables}
包含服务器知道的每个表的元数据。 [分离的](../../sql-reference/statements/detach.md)表不在 `system.tables` 显示。
包含服务器知道的每个表的元数据。 分离的表不显示在 `system.tables`
[临时表](../../sql-reference/statements/create/table.md#temporary-tables)只在创建它们的会话中的 `system.tables` 中才可见。它们的数据库字段显示为空,并且 `is_temporary` 标志显示为开启
此表包含以下列列类型显示在括号中):
此表包含以下列 (列类型显示在括号中):
- `database` (String) — 表所在的数据库名。
- `database` ([String](../../sql-reference/data-types/string.md)) — 表所在的数据库名。
- `name` (String) — 表名。
- `name` ([String](../../sql-reference/data-types/string.md)) — 表名。
- `engine` (String) — 表引擎名 (不包含参数)。
- `engine` ([String](../../sql-reference/data-types/string.md)) — 表引擎名 (不包含参数)。
- `is_temporary` (UInt8)-指示表是否是临时的标志。
- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) - 指示表是否是临时的标志。
- `data_path` (String)-文件系统中表数据的路径。
- `data_path` ([String](../../sql-reference/data-types/string.md)) - 表数据在文件系统中的路径。
- `metadata_path` (String)-文件系统中表元数据的路径。
- `metadata_path` ([String](../../sql-reference/data-types/string.md)) - 表元数据在文件系统中的路径。
- `metadata_modification_time` (DateTime)-表元数据的最新修改时间。
- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - 表元数据的最新修改时间。
- `dependencies_database` (数组(字符串))-数据库依赖关系。
- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - 数据库依赖关系。
- `dependencies_table` (数组(字符串))-表依赖关系 ([MaterializedView](../../engines/table-engines/special/materializedview.md) 基于当前表的表)
- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - 表依赖关系 (基于当前表的 [物化视图](../../engines/table-engines/special/materializedview.md) 表)
- `create_table_query` (String)-用于创建表的SQL语句。
- `create_table_query` ([String](../../sql-reference/data-types/string.md)) - 用于创建表的 SQL 语句。
- `engine_full` (String)-表引擎的参数。
- `engine_full` ([String](../../sql-reference/data-types/string.md)) - 表引擎的参数。
- `partition_key` (String)-表中指定的分区键表达式
- `as_select` ([String](../../sql-reference/data-types/string.md)) - 视图的 `SELECT` 语句
- `sorting_key` (String)-表中指定的排序键表达式。
- `partition_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的分区键表达式。
- `primary_key` (String)-表中指定的主键表达式。
- `sorting_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的排序键表达式。
- `sampling_key` (String)-表中指定的采样键表达式。
- `primary_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的主键表达式。
- `storage_policy` (字符串)-存储策略:
- `sampling_key` ([String](../../sql-reference/data-types/string.md)) - 表中指定的采样键表达式。
- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - 存储策略:
- [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)
- [分布](../../engines/table-engines/special/distributed.md#distributed)
- [Distributed](../../engines/table-engines/special/distributed.md#distributed)
- `total_rows` (Nullable(UInt64))-总行数,如果可以快速确定表中的确切行数,否则行数为`Null`(包括底层 `Buffer` 表)
- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 总行数,如果无法快速确定表中的确切行数,则行数返回为 `NULL` (包括底层 `Buffer` 表)
- `total_bytes` (Nullable(UInt64))-总字节数,如果可以快速确定存储表的确切字节数,否则字节数为`Null` (即**不** 包括任何底层存储)
- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 总字节数,如果无法快速确定存储表的确切字节数,则字节数返回为 `NULL` ( **不** 包括任何底层存储)
- 如果表将数据存在磁盘上,返回实际使用的磁盘空间(压缩后)
- 如果表将数据存在磁盘上,返回实际使用的磁盘空间 (压缩后)
- 如果表在内存中存储数据,返回在内存中使用的近似字节数。
- `lifetime_rows` (Nullbale(UInt64))-服务启动后插入的总行数(只针对`Buffer`表)。
- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 服务启动后插入的总行数(只针对 `Buffer` 表) 。
- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - 服务启动后插入的总字节数(只针对 `Buffer` 表) 。
- `comment` ([String](../../sql-reference/data-types/string.md)) - 表的注释。
- `has_own_data` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 标志,表示表本身是否在磁盘上存储数据,或者访问其他来源。
`system.tables` 表被用于 `SHOW TABLES` 的查询实现中。
**示例**
```sql
SELECT * FROM system.tables LIMIT 2 FORMAT Vertical;
```
```text
Row 1:
──────
database: base
name: t1
uuid: 81b1c20a-b7c6-4116-a2ce-7583fb6b6736
engine: MergeTree
is_temporary: 0
data_paths: ['/var/lib/clickhouse/store/81b/81b1c20a-b7c6-4116-a2ce-7583fb6b6736/']
metadata_path: /var/lib/clickhouse/store/461/461cf698-fd0b-406d-8c01-5d8fd5748a91/t1.sql
metadata_modification_time: 2021-01-25 19:14:32
dependencies_database: []
dependencies_table: []
create_table_query: CREATE TABLE base.t1 (`n` UInt64) ENGINE = MergeTree ORDER BY n SETTINGS index_granularity = 8192
engine_full: MergeTree ORDER BY n SETTINGS index_granularity = 8192
as_select: SELECT database AS table_catalog
partition_key:
sorting_key: n
primary_key: n
sampling_key:
storage_policy: default
total_rows: 1
total_bytes: 99
lifetime_rows: ᴺᵁᴸᴸ
lifetime_bytes: ᴺᵁᴸᴸ
comment:
has_own_data: 0
Row 2:
──────
database: default
name: 53r93yleapyears
uuid: 00000000-0000-0000-0000-000000000000
engine: MergeTree
is_temporary: 0
data_paths: ['/var/lib/clickhouse/data/default/53r93yleapyears/']
metadata_path: /var/lib/clickhouse/metadata/default/53r93yleapyears.sql
metadata_modification_time: 2020-09-23 09:05:36
dependencies_database: []
dependencies_table: []
create_table_query: CREATE TABLE default.`53r93yleapyears` (`id` Int8, `febdays` Int8) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192
engine_full: MergeTree ORDER BY id SETTINGS index_granularity = 8192
as_select: SELECT name AS catalog_name
partition_key:
sorting_key: id
primary_key: id
sampling_key:
storage_policy: default
total_rows: 2
total_bytes: 155
lifetime_rows: ᴺᵁᴸᴸ
lifetime_bytes: ᴺᵁᴸᴸ
comment:
has_own_data: 0
```
[原文](https://clickhouse.com/docs/zh/operations/system-tables/tables) <!--hide-->

View File

@ -152,6 +152,7 @@
This setting could be used to switch replication to another network interface
(the server may be connected to multiple networks via multiple addresses)
-->
<!--
<interserver_http_host>example.yandex.ru</interserver_http_host>
-->
@ -177,6 +178,7 @@
-->
<!-- <listen_host>::</listen_host> -->
<!-- Same for hosts without support for IPv6: -->
<!-- <listen_host>0.0.0.0</listen_host> -->

View File

@ -54,7 +54,7 @@ namespace
const Poco::SHA1Engine::Digest & digest = engine.digest();
Poco::SHA1Engine::Digest calculated_password_sha1(sha1_size);
for (size_t i = 0; i < sha1_size; i++)
for (size_t i = 0; i < sha1_size; ++i)
calculated_password_sha1[i] = scrambled_password[i] ^ digest[i];
auto calculated_password_double_sha1 = Util::encodeSHA1(calculated_password_sha1);

View File

@ -448,7 +448,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
vals = nullptr;
});
for (std::size_t i = 0; vals[i]; i++)
for (size_t i = 0; vals[i]; ++i)
{
if (vals[i]->bv_val && vals[i]->bv_len > 0)
result.emplace(vals[i]->bv_val, vals[i]->bv_len);
@ -473,7 +473,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
referrals = nullptr;
});
for (std::size_t i = 0; referrals[i]; i++)
for (size_t i = 0; referrals[i]; ++i)
{
LOG_WARNING(&Poco::Logger::get("LDAPClient"), "Received reference during LDAP search but not following it: {}", referrals[i]);
}

View File

@ -90,7 +90,7 @@ private:
throw;
}
for (i = 0; i < old_size; i++)
for (i = 0; i < old_size; ++i)
{
nested_func->merge(&new_state[i * nested_size_of_data],
&old_state[i * nested_size_of_data],

View File

@ -54,6 +54,8 @@ public:
template <typename T, typename Data, typename Policy>
class AggregateFunctionBitmapL2 final : public IAggregateFunctionDataHelper<Data, AggregateFunctionBitmapL2<T, Data, Policy>>
{
private:
static constexpr auto STATE_VERSION_1_MIN_REVISION = 54455;
public:
AggregateFunctionBitmapL2(const DataTypePtr & type)
: IAggregateFunctionDataHelper<Data, AggregateFunctionBitmapL2<T, Data, Policy>>({type}, {})
@ -105,9 +107,38 @@ public:
}
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override { this->data(place).rbs.write(buf); }
bool isVersioned() const override { return true; }
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override { this->data(place).rbs.read(buf); }
size_t getDefaultVersion() const override { return 1; }
size_t getVersionFromRevision(size_t revision) const override
{
if (revision >= STATE_VERSION_1_MIN_REVISION)
return 1;
else
return 0;
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> version) const override
{
if (!version)
version = getDefaultVersion();
if (*version >= 1)
DB::writeBoolText(this->data(place).init, buf);
this->data(place).rbs.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> version, Arena *) const override
{
if (!version)
version = getDefaultVersion();
if (*version >= 1)
DB::readBoolText(this->data(place).init, buf);
this->data(place).rbs.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{

View File

@ -271,7 +271,7 @@ public:
{
lower_bound = std::min(lower_bound, other.lower_bound);
upper_bound = std::max(upper_bound, other.upper_bound);
for (size_t i = 0; i < other.size; i++)
for (size_t i = 0; i < other.size; ++i)
add(other.points[i].mean, other.points[i].weight, max_bins);
}

View File

@ -56,7 +56,7 @@ static bool ALWAYS_INLINE inline is_all_zeros(const UInt8 * flags, size_t size)
i += 8;
}
for (; i < size; i++)
for (; i < size; ++i)
if (flags[i])
return false;

View File

@ -7,18 +7,20 @@
#include <DataTypes/DataTypeDateTime.h>
#define TOP_K_MAX_SIZE 0xFFFFFF
static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF;
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int LOGICAL_ERROR;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
@ -42,19 +44,22 @@ class AggregateFunctionTopKDateTime : public AggregateFunctionTopK<DataTypeDateT
template <bool is_weighted>
static IAggregateFunction * createWithExtraTypes(const DataTypePtr & argument_type, UInt64 threshold, UInt64 load_factor, const Array & params)
static IAggregateFunction * createWithExtraTypes(const DataTypes & argument_types, UInt64 threshold, UInt64 load_factor, const Array & params)
{
WhichDataType which(argument_type);
if (argument_types.empty())
throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Got empty arguments list");
WhichDataType which(argument_types[0]);
if (which.idx == TypeIndex::Date)
return new AggregateFunctionTopKDate<is_weighted>(threshold, load_factor, {argument_type}, params);
return new AggregateFunctionTopKDate<is_weighted>(threshold, load_factor, argument_types, params);
if (which.idx == TypeIndex::DateTime)
return new AggregateFunctionTopKDateTime<is_weighted>(threshold, load_factor, {argument_type}, params);
return new AggregateFunctionTopKDateTime<is_weighted>(threshold, load_factor, argument_types, params);
/// Check that we can use plain version of AggregateFunctionTopKGeneric
if (argument_type->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
return new AggregateFunctionTopKGeneric<true, is_weighted>(threshold, load_factor, argument_type, params);
if (argument_types[0]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
return new AggregateFunctionTopKGeneric<true, is_weighted>(threshold, load_factor, argument_types, params);
else
return new AggregateFunctionTopKGeneric<false, is_weighted>(threshold, load_factor, argument_type, params);
return new AggregateFunctionTopKGeneric<false, is_weighted>(threshold, load_factor, argument_types, params);
}
@ -78,40 +83,37 @@ AggregateFunctionPtr createAggregateFunctionTopK(const std::string & name, const
if (!params.empty())
{
if (params.size() > 2)
throw Exception("Aggregate function " + name + " requires two parameters or less.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Aggregate function '{}' requires two parameters or less", name);
UInt64 k = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
if (params.size() == 2)
{
load_factor = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[1]);
if (load_factor < 1)
throw Exception("Too small parameter 'load_factor' for aggregate function " + name + ". Minimum: 1",
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"Too small parameter 'load_factor' for aggregate function '{}' (got {}, minimum is 1)", name, load_factor);
}
if (k > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || k * load_factor > TOP_K_MAX_SIZE)
throw Exception("Too large parameter(s) for aggregate function " + name + ". Maximum: " + toString(TOP_K_MAX_SIZE),
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
threshold = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
if (k == 0)
throw Exception("Parameter 0 is illegal for aggregate function " + name,
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
if (threshold > TOP_K_MAX_SIZE || load_factor > TOP_K_MAX_SIZE || threshold * load_factor > TOP_K_MAX_SIZE)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"Too large parameter(s) for aggregate function '{}' (maximum is {})", name, toString(TOP_K_MAX_SIZE));
threshold = k;
if (threshold == 0)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Parameter 0 is illegal for aggregate function '{}'", name);
}
AggregateFunctionPtr res(createWithNumericType<AggregateFunctionTopK, is_weighted>(
*argument_types[0], threshold, load_factor, argument_types, params));
if (!res)
res = AggregateFunctionPtr(createWithExtraTypes<is_weighted>(argument_types[0], threshold, load_factor, params));
res = AggregateFunctionPtr(createWithExtraTypes<is_weighted>(argument_types, threshold, load_factor, params));
if (!res)
throw Exception("Illegal type " + argument_types[0]->getName() +
" of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument for aggregate function '{}'", argument_types[0]->getName(), name);
return res;
}

View File

@ -132,8 +132,8 @@ private:
public:
AggregateFunctionTopKGeneric(
UInt64 threshold_, UInt64 load_factor, const DataTypePtr & input_data_type_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>({input_data_type_}, params)
UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>(argument_types_, params)
, threshold(threshold_), reserved(load_factor * threshold), input_data_type(this->argument_types[0]) {}
String getName() const override { return is_weighted ? "topKWeighted" : "topK"; }

View File

@ -2,6 +2,7 @@
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnSparse.h>
#include <Core/Block.h>
#include <Core/ColumnNumbers.h>
#include <Core/Field.h>
@ -181,6 +182,13 @@ public:
Arena * arena,
ssize_t if_argument_pos = -1) const = 0;
/// The version of "addBatch", that handle sparse columns as arguments.
virtual void addBatchSparse(
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena) const = 0;
virtual void mergeBatch(
size_t batch_size,
AggregateDataPtr * places,
@ -193,6 +201,10 @@ public:
virtual void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const = 0;
/// The version of "addBatchSinglePlace", that handle sparse columns as arguments.
virtual void addBatchSparseSinglePlace(
AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0;
/** The same for single place when need to aggregate only filtered data.
* Instead of using an if-column, the condition is combined inside the null_map
*/
@ -367,6 +379,22 @@ public:
}
}
void addBatchSparse(
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena) const override
{
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
size_t batch_size = column_sparse.size();
auto offset_it = column_sparse.begin();
for (size_t i = 0; i < batch_size; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(places[offset_it.getCurrentRow()] + place_offset,
&values, offset_it.getValueIndex(), arena);
}
void mergeBatch(
size_t batch_size,
AggregateDataPtr * places,
@ -398,6 +426,19 @@ public:
}
}
void addBatchSparseSinglePlace(
AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
{
/// TODO: add values and defaults separately if order of adding isn't important.
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
size_t batch_size = column_sparse.size();
auto offset_it = column_sparse.begin();
for (size_t i = 0; i < batch_size; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(place, &values, offset_it.getValueIndex(), arena);
}
void addBatchSinglePlaceNotNull(
size_t batch_size,
AggregateDataPtr place,

View File

@ -107,7 +107,7 @@ if (USE_AWS_S3)
endif()
if (USE_AZURE_BLOB_STORAGE)
add_headers_and_sources(dbms Disks/BlobStorage)
add_headers_and_sources(dbms Disks/AzureBlobStorage)
endif()
if (USE_HDFS)

View File

@ -4,6 +4,8 @@
#include <iomanip>
#include <string_view>
#include <filesystem>
#include <map>
#include <unordered_map>
#include <base/argsToConfig.h>
#include <base/DateLUT.h>
@ -52,6 +54,7 @@
#include <Processors/Executors/PullingAsyncPipelineExecutor.h>
#include <Processors/Transforms/AddingDefaultsTransform.h>
#include <Interpreters/ReplaceQueryParameterVisitor.h>
#include <Interpreters/ProfileEventsExt.h>
#include <IO/WriteBufferFromOStream.h>
#include <IO/CompressionMethod.h>
#include <Client/InternalTextLogs.h>
@ -105,6 +108,99 @@ namespace ProfileEvents
namespace DB
{
static void incrementProfileEventsBlock(Block & dst, const Block & src)
{
if (!dst)
{
dst = src;
return;
}
assertBlocksHaveEqualStructure(src, dst, "ProfileEvents");
std::unordered_map<String, size_t> name_pos;
for (size_t i = 0; i < dst.columns(); ++i)
name_pos[dst.getByPosition(i).name] = i;
size_t dst_rows = dst.rows();
MutableColumns mutable_columns = dst.mutateColumns();
auto & dst_column_host_name = typeid_cast<ColumnString &>(*mutable_columns[name_pos["host_name"]]);
auto & dst_array_current_time = typeid_cast<ColumnUInt32 &>(*mutable_columns[name_pos["current_time"]]).getData();
auto & dst_array_thread_id = typeid_cast<ColumnUInt64 &>(*mutable_columns[name_pos["thread_id"]]).getData();
auto & dst_array_type = typeid_cast<ColumnInt8 &>(*mutable_columns[name_pos["type"]]).getData();
auto & dst_column_name = typeid_cast<ColumnString &>(*mutable_columns[name_pos["name"]]);
auto & dst_array_value = typeid_cast<ColumnInt64 &>(*mutable_columns[name_pos["value"]]).getData();
const auto & src_column_host_name = typeid_cast<const ColumnString &>(*src.getByName("host_name").column);
const auto & src_array_current_time = typeid_cast<const ColumnUInt32 &>(*src.getByName("current_time").column).getData();
const auto & src_array_thread_id = typeid_cast<const ColumnUInt64 &>(*src.getByName("thread_id").column).getData();
const auto & src_column_name = typeid_cast<const ColumnString &>(*src.getByName("name").column);
const auto & src_array_value = typeid_cast<const ColumnInt64 &>(*src.getByName("value").column).getData();
struct Id
{
StringRef name;
StringRef host_name;
UInt64 thread_id;
bool operator<(const Id & rhs) const
{
return std::tie(name, host_name, thread_id)
< std::tie(rhs.name, rhs.host_name, rhs.thread_id);
}
};
std::map<Id, UInt64> rows_by_name;
for (size_t src_row = 0; src_row < src.rows(); ++src_row)
{
Id id{
src_column_name.getDataAt(src_row),
src_column_host_name.getDataAt(src_row),
src_array_thread_id[src_row],
};
rows_by_name[id] = src_row;
}
/// Merge src into dst.
for (size_t dst_row = 0; dst_row < dst_rows; ++dst_row)
{
Id id{
dst_column_name.getDataAt(dst_row),
dst_column_host_name.getDataAt(dst_row),
dst_array_thread_id[dst_row],
};
if (auto it = rows_by_name.find(id); it != rows_by_name.end())
{
size_t src_row = it->second;
dst_array_current_time[dst_row] = src_array_current_time[src_row];
switch (dst_array_type[dst_row])
{
case ProfileEvents::Type::INCREMENT:
dst_array_value[dst_row] += src_array_value[src_row];
break;
case ProfileEvents::Type::GAUGE:
dst_array_value[dst_row] = src_array_value[src_row];
break;
}
rows_by_name.erase(it);
}
}
/// Copy rows from src that dst does not contains.
for (const auto & [id, pos] : rows_by_name)
{
for (size_t col = 0; col < src.columns(); ++col)
{
mutable_columns[col]->insert((*src.getByPosition(col).column)[pos]);
}
}
dst.setColumns(std::move(mutable_columns));
}
std::atomic_flag exit_on_signal = ATOMIC_FLAG_INIT;
@ -753,7 +849,7 @@ void ClientBase::onProfileEvents(Block & block)
}
else
{
profile_events.last_block = block;
incrementProfileEventsBlock(profile_events.last_block, block);
}
}
profile_events.watch.restart();

View File

@ -133,6 +133,11 @@ public:
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t) const override
{
throw Exception("Method isDefaultAt is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
@ -208,6 +213,16 @@ public:
throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
double getRatioOfDefaultRows(double) const override
{
throw Exception("Method getRatioOfDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
throw Exception("Method getIndicesOfNonDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;

View File

@ -182,6 +182,13 @@ StringRef ColumnArray::getDataAt(size_t n) const
}
bool ColumnArray::isDefaultAt(size_t n) const
{
const auto & offsets_data = getOffsets();
return offsets_data[n] == offsets_data[static_cast<ssize_t>(n) - 1];
}
void ColumnArray::insertData(const char * pos, size_t length)
{
/** Similarly - only for arrays of fixed length values.
@ -576,7 +583,8 @@ void ColumnArray::expand(const IColumn::Filter & mask, bool inverted)
}
if (from != -1)
throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);}
throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
}
template <typename T>
ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const
@ -868,6 +876,16 @@ ColumnPtr ColumnArray::compress() const
});
}
double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const
{
return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
}
void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
}
ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
{

View File

@ -60,6 +60,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
StringRef getDataAt(size_t n) const override;
bool isDefaultAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
@ -143,6 +144,10 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
bool isCollationSupported() const override { return getData().isCollationSupported(); }
private:

View File

@ -82,6 +82,7 @@ public:
Field operator[](size_t) const override { throwMustBeDecompressed(); }
void get(size_t, Field &) const override { throwMustBeDecompressed(); }
StringRef getDataAt(size_t) const override { throwMustBeDecompressed(); }
bool isDefaultAt(size_t) const override { throwMustBeDecompressed(); }
void insert(const Field &) override { throwMustBeDecompressed(); }
void insertRangeFrom(const IColumn &, size_t, size_t) override { throwMustBeDecompressed(); }
void insertData(const char *, size_t) override { throwMustBeDecompressed(); }
@ -113,6 +114,8 @@ public:
void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); }
void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); }
size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); }
double getRatioOfDefaultRows(double) const override { throwMustBeDecompressed(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); }
protected:
size_t rows;

View File

@ -5,6 +5,7 @@
#include <Columns/IColumn.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/PODArray.h>
namespace DB
@ -115,6 +116,11 @@ public:
return data->getFloat32(0);
}
bool isDefaultAt(size_t) const override
{
return data->isDefaultAt(0);
}
bool isNullAt(size_t) const override
{
return data->isNullAt(0);
@ -239,6 +245,27 @@ public:
return false;
}
double getRatioOfDefaultRows(double) const override
{
return data->isDefaultAt(0) ? 1.0 : 0.0;
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
if (!data->isDefaultAt(0))
{
size_t to = limit && from + limit < size() ? from + limit : size();
indices.reserve(indices.size() + to - from);
for (size_t i = from; i < to; ++i)
indices.push_back(i);
}
}
SerializationInfoPtr getSerializationInfo() const override
{
return data->getSerializationInfo();
}
bool isNullable() const override { return isColumnNullable(*data); }
bool onlyNull() const override { return data->isNullAt(0); }
bool isNumeric() const override { return data->isNumeric(); }

View File

@ -331,7 +331,8 @@ void ColumnDecimal<T>::gather(ColumnGathererStream & gatherer)
template <is_decimal T>
ColumnPtr ColumnDecimal<T>::compress() const
{
size_t source_size = data.size() * sizeof(T);
const size_t data_size = data.size();
const size_t source_size = data_size * sizeof(T);
/// Don't compress small blocks.
if (source_size < 4096) /// A wild guess.
@ -342,8 +343,9 @@ ColumnPtr ColumnDecimal<T>::compress() const
if (!compressed)
return ColumnCompressed::wrap(this->getPtr());
return ColumnCompressed::create(data.size(), compressed->size(),
[compressed = std::move(compressed), column_size = data.size(), scale = this->scale]
const size_t compressed_size = compressed->size();
return ColumnCompressed::create(data_size, compressed_size,
[compressed = std::move(compressed), column_size = data_size, scale = this->scale]
{
auto res = ColumnDecimal<T>::create(column_size, scale);
ColumnCompressed::decompressBuffer(

View File

@ -177,8 +177,17 @@ public:
return false;
}
ColumnPtr compress() const override;
double getRatioOfDefaultRows(double sample_ratio) const override
{
return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
}
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
{
return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
}
ColumnPtr compress() const override;
void insertValue(const T value) { data.push_back(value); }
Container & getData() { return data; }

View File

@ -51,6 +51,12 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const
return new_col_holder;
}
bool ColumnFixedString::isDefaultAt(size_t index) const
{
assert(index < size());
return memoryIsZero(chars.data() + index * n, n);
}
void ColumnFixedString::insert(const Field & x)
{
const String & s = DB::get<const String &>(x);
@ -409,9 +415,9 @@ ColumnPtr ColumnFixedString::compress() const
if (!compressed)
return ColumnCompressed::wrap(this->getPtr());
size_t column_size = size();
return ColumnCompressed::create(column_size, compressed->size(),
const size_t column_size = size();
const size_t compressed_size = compressed->size();
return ColumnCompressed::create(column_size, compressed_size,
[compressed = std::move(compressed), column_size, n = n]
{
size_t chars_size = n * column_size;

View File

@ -88,6 +88,8 @@ public:
return StringRef(&chars[n * index], n);
}
bool isDefaultAt(size_t index) const override;
void insert(const Field & x) override;
void insertFrom(const IColumn & src_, size_t index) override;
@ -182,6 +184,16 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getRatioOfDefaultRowsImpl<ColumnFixedString>(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndicesOfNonDefaultRowsImpl<ColumnFixedString>(indices, from, limit);
}
bool canBeInsideNullable() const override { return true; }
bool isFixedAndContiguous() const override { return true; }

View File

@ -24,7 +24,12 @@ class ColumnFunction final : public COWHelper<IColumn, ColumnFunction>
private:
friend class COWHelper<IColumn, ColumnFunction>;
ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_ = false, bool is_function_compiled_ = false);
ColumnFunction(
size_t size,
FunctionBasePtr function_,
const ColumnsWithTypeAndName & columns_to_capture,
bool is_short_circuit_argument_ = false,
bool is_function_compiled_ = false);
public:
const char * getFamilyName() const override { return "Function"; }
@ -68,6 +73,11 @@ public:
throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
bool isDefaultAt(size_t) const override
{
throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void insert(const Field &) override
{
throw Exception("Cannot insert into " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -153,6 +163,16 @@ public:
throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
double getRatioOfDefaultRows(double) const override
{
throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
bool isShortCircuitArgument() const { return is_short_circuit_argument; }
DataTypePtr getResultType() const;

View File

@ -64,6 +64,7 @@ public:
return getDictionary().getDataAtWithTerminatingZero(getIndexes().getUInt(n));
}
bool isDefaultAt(size_t n) const override { return getDictionary().isDefaultAt(getIndexes().getUInt(n)); }
UInt64 get64(size_t n) const override { return getDictionary().get64(getIndexes().getUInt(n)); }
UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); }
Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); }
@ -180,6 +181,16 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getIndexes().getRatioOfDefaultRows(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit);
}
bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); }
bool isFixedAndContiguous() const override { return false; }
size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); }

View File

@ -81,6 +81,11 @@ void ColumnMap::get(size_t n, Field & res) const
getNestedData().get(offset + i, map[i]);
}
bool ColumnMap::isDefaultAt(size_t n) const
{
return nested->isDefaultAt(n);
}
StringRef ColumnMap::getDataAt(size_t) const
{
throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -273,6 +278,16 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const
return false;
}
double ColumnMap::getRatioOfDefaultRows(double sample_ratio) const
{
return getRatioOfDefaultRowsImpl<ColumnMap>(sample_ratio);
}
void ColumnMap::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
return getIndicesOfNonDefaultRowsImpl<ColumnMap>(indices, from, limit);
}
ColumnPtr ColumnMap::compress() const
{
auto compressed = nested->compress();

View File

@ -51,6 +51,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
@ -85,6 +86,8 @@ public:
void protect() override;
void forEachSubcolumn(ColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
const ColumnArray & getNestedColumn() const { return assert_cast<const ColumnArray &>(*nested); }
ColumnArray & getNestedColumn() { return assert_cast<ColumnArray &>(*nested); }

View File

@ -648,6 +648,29 @@ void ColumnNullable::checkConsistency() const
ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT);
}
ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
ColumnPtr new_values;
ColumnPtr new_null_map;
if (default_field.getType() == Field::Types::Null)
{
auto default_column = nested_column->cloneEmpty();
default_column->insertDefault();
/// Value in main column, when null map is 1 is implementation defined. So, take any value.
new_values = nested_column->createWithOffsets(offsets, (*default_column)[0], total_rows, shift);
new_null_map = null_map->createWithOffsets(offsets, Field(1u), total_rows, shift);
}
else
{
new_values = nested_column->createWithOffsets(offsets, default_field, total_rows, shift);
new_null_map = null_map->createWithOffsets(offsets, Field(0u), total_rows, shift);
}
return ColumnNullable::create(new_values, new_null_map);
}
ColumnPtr makeNullable(const ColumnPtr & column)
{
if (isColumnNullable(*column))

View File

@ -54,6 +54,7 @@ public:
void get(size_t n, Field & res) const override;
bool getBool(size_t n) const override { return isNullAt(n) ? false : nested_column->getBool(n); }
UInt64 get64(size_t n) const override { return nested_column->get64(n); }
bool isDefaultAt(size_t n) const override { return isNullAt(n); }
/**
* If isNullAt(n) returns false, returns the nested column's getDataAt(n), otherwise returns a special value
@ -137,6 +138,18 @@ public:
return false;
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return null_map->getRatioOfDefaultRows(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
null_map->getIndicesOfNonDefaultRows(indices, from, limit);
}
ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
bool isNullable() const override { return true; }
bool isFixedAndContiguous() const override { return false; }
bool valuesHaveFixedSize() const override { return nested_column->valuesHaveFixedSize(); }

View File

@ -0,0 +1,779 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/ColumnTuple.h>
#include <Common/WeakHash.h>
#include <Common/SipHash.h>
#include <Common/HashTable/Hash.h>
#include <Processors/Transforms/ColumnGathererTransform.h>
#include <algorithm>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
ColumnSparse::ColumnSparse(MutableColumnPtr && values_)
: values(std::move(values_)), _size(0)
{
if (!values->empty())
throw Exception("Not empty values passed to ColumnSparse, but no offsets passed", ErrorCodes::LOGICAL_ERROR);
values->insertDefault();
offsets = ColumnUInt64::create();
}
ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_)
: values(std::move(values_)), offsets(std::move(offsets_)), _size(size_)
{
const ColumnUInt64 * offsets_concrete = typeid_cast<const ColumnUInt64 *>(offsets.get());
if (!offsets_concrete)
throw Exception(ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName());
/// 'values' should contain one extra element: default value at 0 position.
if (offsets->size() + 1 != values->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size());
if (_size < offsets->size())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());
if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Size of sparse column ({}) should be greater than last position of non-default value ({})",
_size, offsets_concrete->getData().back());
#ifndef NDEBUG
const auto & offsets_data = getOffsetsData();
const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal<UInt64>());
if (it != offsets_data.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Offsets of ColumnSparse must be strictly sorted");
#endif
}
MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const
{
if (new_size == 0)
return ColumnSparse::create(values->cloneEmpty());
if (new_size >= _size)
return ColumnSparse::create(IColumn::mutate(values), IColumn::mutate(offsets), new_size);
auto res = ColumnSparse::create(values->cloneEmpty());
res->insertRangeFrom(*this, 0, new_size);
return res;
}
bool ColumnSparse::isDefaultAt(size_t n) const
{
return getValueIndex(n) == 0;
}
bool ColumnSparse::isNullAt(size_t n) const
{
return values->isNullAt(getValueIndex(n));
}
Field ColumnSparse::operator[](size_t n) const
{
return (*values)[getValueIndex(n)];
}
void ColumnSparse::get(size_t n, Field & res) const
{
values->get(getValueIndex(n), res);
}
bool ColumnSparse::getBool(size_t n) const
{
return values->getBool(getValueIndex(n));
}
Float64 ColumnSparse::getFloat64(size_t n) const
{
return values->getFloat64(getValueIndex(n));
}
Float32 ColumnSparse::getFloat32(size_t n) const
{
return values->getFloat32(getValueIndex(n));
}
UInt64 ColumnSparse::getUInt(size_t n) const
{
return values->getUInt(getValueIndex(n));
}
Int64 ColumnSparse::getInt(size_t n) const
{
return values->getInt(getValueIndex(n));
}
UInt64 ColumnSparse::get64(size_t n) const
{
return values->get64(getValueIndex(n));
}
StringRef ColumnSparse::getDataAt(size_t n) const
{
return values->getDataAt(getValueIndex(n));
}
ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
{
return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1);
}
void ColumnSparse::insertSingleValue(const Inserter & inserter)
{
inserter(*values);
size_t last_idx = values->size() - 1;
if (values->isDefaultAt(last_idx))
values->popBack(1);
else
getOffsetsData().push_back(_size);
++_size;
}
void ColumnSparse::insertData(const char * pos, size_t length)
{
insertSingleValue([&](IColumn & column) { column.insertData(pos, length); });
}
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
return values->serializeValueIntoArena(getValueIndex(n), arena, begin);
}
const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos)
{
const char * res = nullptr;
insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); });
return res;
}
const char * ColumnSparse::skipSerializedInArena(const char * pos) const
{
return values->skipSerializedInArena(pos);
}
void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
if (length == 0)
return;
if (start + length > src.size())
throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.",
ErrorCodes::LOGICAL_ERROR);
auto & offsets_data = getOffsetsData();
size_t end = start + length;
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
{
const auto & src_offsets = src_sparse->getOffsetsData();
const auto & src_values = src_sparse->getValuesColumn();
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
assert(offset_start <= offset_end);
if (offset_start != offset_end)
{
offsets_data.reserve(offsets_data.size() + offset_end - offset_start);
insertManyDefaults(src_offsets[offset_start] - start);
offsets_data.push_back(_size);
++_size;
for (size_t i = offset_start + 1; i < offset_end; ++i)
{
size_t current_diff = src_offsets[i] - src_offsets[i - 1];
insertManyDefaults(current_diff - 1);
offsets_data.push_back(_size);
++_size;
}
/// 'end' <= 'src_offsets[offsets_end]', but end is excluded, so index is 'offsets_end' - 1.
/// Since 'end' is excluded, need to subtract one more row from result.
insertManyDefaults(end - src_offsets[offset_end - 1] - 1);
values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start);
}
else
{
insertManyDefaults(length);
}
}
else
{
for (size_t i = start; i < end; ++i)
{
if (!src.isDefaultAt(i))
{
values->insertFrom(src, i);
offsets_data.push_back(_size);
}
++_size;
}
}
}
void ColumnSparse::insert(const Field & x)
{
insertSingleValue([&](IColumn & column) { column.insert(x); });
}
void ColumnSparse::insertFrom(const IColumn & src, size_t n)
{
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
{
if (size_t value_index = src_sparse->getValueIndex(n))
{
getOffsetsData().push_back(_size);
values->insertFrom(src_sparse->getValuesColumn(), value_index);
}
}
else
{
if (!src.isDefaultAt(n))
{
values->insertFrom(src, n);
getOffsetsData().push_back(_size);
}
}
++_size;
}
void ColumnSparse::insertDefault()
{
++_size;
}
void ColumnSparse::insertManyDefaults(size_t length)
{
_size += length;
}
void ColumnSparse::popBack(size_t n)
{
assert(n < _size);
auto & offsets_data = getOffsetsData();
size_t new_size = _size - n;
size_t removed_values = 0;
while (!offsets_data.empty() && offsets_data.back() >= new_size)
{
offsets_data.pop_back();
++removed_values;
}
if (removed_values)
values->popBack(removed_values);
_size = new_size;
}
ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const
{
if (_size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
if (offsets->empty())
{
auto res = cloneEmpty();
res->insertManyDefaults(countBytesInFilter(filt));
return res;
}
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
Filter values_filter;
values_filter.reserve(values->size());
values_filter.push_back(1);
size_t values_result_size_hint = 1;
size_t res_offset = 0;
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
if (!offset_it.isDefault())
{
if (filt[i])
{
res_offsets_data.push_back(res_offset);
values_filter.push_back(1);
++res_offset;
++values_result_size_hint;
}
else
{
values_filter.push_back(0);
}
}
else
{
res_offset += filt[i] != 0;
}
}
auto res_values = values->filter(values_filter, values_result_size_hint);
return this->create(std::move(res_values), std::move(res_offsets), res_offset);
}
void ColumnSparse::expand(const Filter & mask, bool inverted)
{
if (mask.size() < _size)
throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
auto it = begin();
for (size_t i = 0; i < mask.size(); ++i)
{
if (!!mask[i] ^ inverted)
{
if (it.getCurrentRow() == _size)
throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
if (!it.isDefault())
res_offsets_data[it.getCurrentOffset()] = i;
++it;
}
}
_size = mask.size();
}
ColumnPtr ColumnSparse::permute(const Permutation & perm, size_t limit) const
{
return permuteImpl(*this, perm, limit);
}
ColumnPtr ColumnSparse::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename Type>
ColumnPtr ColumnSparse::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
assert(limit <= indexes.size());
if (limit == 0)
return ColumnSparse::create(values->cloneEmpty());
if (offsets->empty())
{
auto res = cloneEmpty();
res->insertManyDefaults(limit);
return res;
}
auto res_offsets = offsets->cloneEmpty();
auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
auto res_values = values->cloneEmpty();
res_values->insertDefault();
/// If we need to permute full column, or if limit is large enough,
/// it's better to save indexes of values in O(size)
/// and avoid binary search for obtaining every index.
/// 3 is just a guess for overhead on copying indexes.
bool execute_linear =
limit == _size || limit * std::bit_width(offsets->size()) > _size * 3;
if (execute_linear)
{
PaddedPODArray<UInt64> values_index(_size);
auto offset_it = begin();
for (size_t i = 0; i < _size; ++i, ++offset_it)
values_index[i] = offset_it.getValueIndex();
for (size_t i = 0; i < limit; ++i)
{
size_t index = values_index[indexes[i]];
if (index != 0)
{
res_values->insertFrom(*values, index);
res_offsets_data.push_back(i);
}
}
}
else
{
for (size_t i = 0; i < limit; ++i)
{
size_t index = getValueIndex(indexes[i]);
if (index != 0)
{
res_values->insertFrom(*values, index);
res_offsets_data.push_back(i);
}
}
}
return ColumnSparse::create(std::move(res_values), std::move(res_offsets), limit);
}
/// Three-way comparison of the n-th row of this column with the m-th row of 'rhs_'.
/// Both rows are resolved through the values dictionary; for a sparse rhs the
/// comparison is delegated to its own values column.
int ColumnSparse::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
{
    size_t lhs_index = getValueIndex(n);

    const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs_);
    if (rhs_sparse)
        return values->compareAt(lhs_index, rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint);

    return values->compareAt(lhs_index, m, rhs_, null_direction_hint);
}
/// Compares every row of this column with the rhs_row_num-th row of 'rhs',
/// writing -1/0/1 (scaled by 'direction') into 'compare_results'.
void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num,
    PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
    int direction, int nan_direction_hint) const
{
    if (row_indexes)
    {
        /// TODO: implement without conversion to full column.
        auto this_full = convertToFullColumnIfSparse();
        auto rhs_full = rhs.convertToFullColumnIfSparse();
        this_full->compareColumn(*rhs_full, rhs_row_num, row_indexes, compare_results, direction, nan_direction_hint);
    }
    else
    {
        /// NOTE(review): this branch assumes 'rhs' is a ColumnSparse (assert_cast) — confirm at call sites.
        const auto & rhs_sparse = assert_cast<const ColumnSparse &>(rhs);
        PaddedPODArray<Int8> nested_result;
        /// Compare the whole values dictionary (values[0] is the default) against the single rhs value.
        values->compareColumn(rhs_sparse.getValuesColumn(), rhs_sparse.getValueIndex(rhs_row_num),
            nullptr, nested_result, direction, nan_direction_hint);

        const auto & offsets_data = getOffsetsData();
        /// Fill every row with the default's comparison result...
        compare_results.resize_fill(_size, nested_result[0]);
        /// ...then overwrite the positions of non-default values.
        for (size_t i = 0; i < offsets_data.size(); ++i)
            compare_results[offsets_data[i]] = nested_result[i + 1];
    }
}
/// Same as compareAt, but uses 'collator' for the underlying comparison.
int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const
{
    if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
        return values->compareAtWithCollation(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint, collator);

    return values->compareAtWithCollation(getValueIndex(n), m, rhs, null_direction_hint, collator);
}
/// Returns true if (probably) all rows of the column hold the same value.
bool ColumnSparse::hasEqualValues() const
{
    size_t num_defaults = getNumberOfDefaults();
    /// Everything is default -> all equal.
    if (num_defaults == _size)
        return true;

    /// Have at least 1 default and 1 non-default values.
    if (num_defaults != 0)
        return false;

    /// Check that probably all non-default values are equal.
    /// It's suboptimal, but it's a rare case.
    /// Non-default values start at index 1; values[0] is the default placeholder.
    for (size_t i = 2; i < values->size(); ++i)
        if (values->compareAt(1, i, *values, 1) != 0)
            return false;

    return true;
}
/// Fills 'res' with a permutation that sorts the column.
/// Shared implementation for the variants with and without collation.
void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
{
    if (_size == 0)
        return;

    res.resize(_size);
    /// All rows are default, hence equal: identity permutation.
    if (offsets->empty())
    {
        for (size_t i = 0; i < _size; ++i)
            res[i] = i;
        return;
    }

    if (limit == 0 || limit > _size)
        limit = _size;

    Permutation perm;
    /// Firstly we sort all values.
    /// limit + 1 for case when there are 0 default values.
    if (collator)
        values->getPermutationWithCollation(*collator, reverse, limit + 1, null_direction_hint, perm);
    else
        values->getPermutation(reverse, limit + 1, null_direction_hint, perm);

    size_t num_of_defaults = getNumberOfDefaults();
    size_t row = 0;
    const auto & offsets_data = getOffsetsData();

    /// Fill the permutation.
    for (size_t i = 0; i < perm.size() && row < limit; ++i)
    {
        if (perm[i] == 0)
        {
            /// perm[i] == 0 refers to the default-value placeholder in 'values'.
            if (!num_of_defaults)
                continue;

            /// Fill the positions of default values in the required quantity.
            auto offset_it = begin();
            while (row < limit)
            {
                while (offset_it.getCurrentRow() < _size && !offset_it.isDefault())
                    ++offset_it;

                if (offset_it.getCurrentRow() == _size)
                    break;

                res[row++] = offset_it.getCurrentRow();
                ++offset_it;
            }
        }
        else
        {
            /// Map the value's dictionary index back to its row in the full column.
            res[row++] = offsets_data[perm[i] - 1];
        }
    }

    assert(row == limit);
}
/// Sorting permutation without collation.
void ColumnSparse::getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
    return getPermutationImpl(reverse, limit, null_direction_hint, res, nullptr);
}
/// Falls back to the full column representation to update the permutation.
void ColumnSparse::updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
    auto this_full = convertToFullColumnIfSparse();
    this_full->updatePermutation(reverse, limit, null_direction_hint, res, equal_range);
}
/// Sorting permutation with collation.
void ColumnSparse::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
    return getPermutationImpl(reverse, limit, null_direction_hint, res, &collator);
}
/// Falls back to the full column representation to update the permutation with collation.
void ColumnSparse::updatePermutationWithCollation(
    const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const
{
    auto this_full = convertToFullColumnIfSparse();
    this_full->updatePermutationWithCollation(collator, reverse, limit, null_direction_hint, res, equal_range);
}
/// In-memory size of the data: nested values + offsets + the size counter itself.
size_t ColumnSparse::byteSize() const
{
    return values->byteSize() + offsets->byteSize() + sizeof(_size);
}
/// Approximate memory footprint of the n-th row.
/// A non-default row additionally stores its position as a UInt64 offset.
size_t ColumnSparse::byteSizeAt(size_t n) const
{
    const size_t value_index = getValueIndex(n);
    const size_t offset_overhead = (value_index != 0) ? sizeof(UInt64) : 0;
    return values->byteSizeAt(value_index) + offset_overhead;
}
/// Memory actually allocated by the nested columns plus the size field.
size_t ColumnSparse::allocatedBytes() const
{
    return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size);
}
/// Delegates protection to both nested columns (values and offsets).
void ColumnSparse::protect()
{
    values->protect();
    offsets->protect();
}
/// Replicates the i-th row (replicate_offsets[i] - replicate_offsets[i - 1]) times.
/// 'replicate_offsets' are cumulative, so replicate_offsets.back() is the result size.
ColumnPtr ColumnSparse::replicate(const Offsets & replicate_offsets) const
{
    /// TODO: implement specializations.
    if (_size != replicate_offsets.size())
        throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);

    if (_size == 0)
        return ColumnSparse::create(values->cloneEmpty());

    auto res_offsets = offsets->cloneEmpty();
    auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
    auto res_values = values->cloneEmpty();
    /// Keep the invariant: result values[0] is the default value placeholder.
    res_values->insertDefault();

    /// Replicated default rows need no explicit storage, so only non-default rows are copied.
    auto offset_it = begin();
    for (size_t i = 0; i < _size; ++i, ++offset_it)
    {
        if (!offset_it.isDefault())
        {
            /// NOTE(review): for i == 0 this reads replicate_offsets[-1]; presumably
            /// relies on PaddedPODArray's left padding reading as 0 — confirm.
            size_t replicate_size = replicate_offsets[i] - replicate_offsets[i - 1];
            res_offsets_data.reserve(res_offsets_data.size() + replicate_size);
            for (size_t row = replicate_offsets[i - 1]; row < replicate_offsets[i]; ++row)
            {
                res_offsets_data.push_back(row);
                res_values->insertFrom(*values, offset_it.getValueIndex());
            }
        }
    }

    return ColumnSparse::create(std::move(res_values), std::move(res_offsets), replicate_offsets.back());
}
/// Mixes the n-th row's value into 'hash' (resolved through the values dictionary).
void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const
{
    values->updateHashWithValue(getValueIndex(n), hash);
}
/// Updates the per-row weak hash with this column's values.
/// 'hash' must already have exactly one entry per row.
void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const
{
    if (hash.getData().size() != _size)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
            "column size is {}, hash size is {}", _size, hash.getData().size());

    /// Walk every row; the iterator resolves each row to its values index
    /// (0 for defaults), so raw bytes of the stored value are hashed.
    auto offset_it = begin();
    auto & hash_data = hash.getData();
    for (size_t i = 0; i < _size; ++i, ++offset_it)
    {
        size_t value_index = offset_it.getValueIndex();
        auto data_ref = values->getDataAt(value_index);
        hash_data[i] = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(data_ref.data), data_ref.size, hash_data[i]);
    }
}
/// Fast whole-column hash: hashes the two nested columns and the size, not row-by-row.
void ColumnSparse::updateHashFast(SipHash & hash) const
{
    values->updateHashFast(hash);
    offsets->updateHashFast(hash);
    hash.update(_size);
}
/// Computes the minimum and maximum values of the column.
void ColumnSparse::getExtremes(Field & min, Field & max) const
{
    if (_size == 0)
    {
        /// 'values' always holds the default value at position 0.
        values->get(0, min);
        values->get(0, max);
        return;
    }

    if (getNumberOfDefaults() == 0)
    {
        /// No default rows: scan only non-default values (they start at index 1),
        /// so the placeholder at index 0 does not distort the extremes.
        size_t min_idx = 1;
        size_t max_idx = 1;

        for (size_t i = 2; i < values->size(); ++i)
        {
            if (values->compareAt(i, min_idx, *values, 1) < 0)
                min_idx = i;
            else if (values->compareAt(i, max_idx, *values, 1) > 0)
                max_idx = i;
        }

        values->get(min_idx, min);
        values->get(max_idx, max);
        return;
    }

    /// There is at least one default row, so the default at index 0
    /// legitimately participates via values->getExtremes.
    values->getExtremes(min, max);
}
/// Appends to 'indices' the row numbers of non-default rows in [from, from + limit)
/// (the whole tail when limit == 0). Offsets are sorted, so the range is found by binary search.
void ColumnSparse::getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const
{
    const auto & offsets_data = getOffsetsData();

    const auto * range_begin = offsets_data.begin();
    if (from)
        range_begin = std::lower_bound(offsets_data.begin(), offsets_data.end(), from);

    const auto * range_end = offsets_data.end();
    if (limit)
        range_end = std::lower_bound(offsets_data.begin(), offsets_data.end(), from + limit);

    indices.insert(range_begin, range_end);
}
/// Returns the fraction of rows holding the default value.
/// The sample_ratio hint is ignored: the exact ratio is known in O(1)
/// from the column size and the number of stored offsets.
double ColumnSparse::getRatioOfDefaultRows(double) const
{
    /// Guard against 0.0 / 0 -> NaN for an empty column; vacuously, all of its
    /// (zero) rows are default.
    if (_size == 0)
        return 1.0;

    return static_cast<double>(getNumberOfDefaults()) / _size;
}
/// Distributes rows into 'num_columns' new columns according to 'selector'.
MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const
{
    return scatterImpl<ColumnSparse>(num_columns, selector);
}
/// Appends rows supplied by a ColumnGathererStream into this column.
void ColumnSparse::gather(ColumnGathererStream & gatherer_stream)
{
    gatherer_stream.gather(*this);
}
/// Compresses the column in memory. Returns a ColumnCompressed wrapper whose
/// lambda owns the compressed parts and rebuilds the sparse column on demand.
ColumnPtr ColumnSparse::compress() const
{
    auto values_compressed = values->compress();
    auto offsets_compressed = offsets->compress();

    size_t byte_size = values_compressed->byteSize() + offsets_compressed->byteSize();

    /// 'size' is captured by value because the lambda may outlive this column.
    return ColumnCompressed::create(size(), byte_size,
        [values_compressed = std::move(values_compressed), offsets_compressed = std::move(offsets_compressed), size = size()]
        {
            return ColumnSparse::create(values_compressed->decompress(), offsets_compressed->decompress(), size);
        });
}
/// Two sparse columns have equal structure iff their values columns do.
/// Any non-sparse rhs compares unequal.
bool ColumnSparse::structureEquals(const IColumn & rhs) const
{
    const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs);
    return rhs_sparse != nullptr && values->structureEquals(*rhs_sparse->values);
}
/// Applies 'callback' to both nested columns (values and offsets).
void ColumnSparse::forEachSubcolumn(ColumnCallback callback)
{
    callback(values);
    callback(offsets);
}
/// Read-only access to the offsets as a typed UInt64 array.
const IColumn::Offsets & ColumnSparse::getOffsetsData() const
{
    return assert_cast<const ColumnUInt64 &>(*offsets).getData();
}
/// Mutable access to the offsets as a typed UInt64 array.
IColumn::Offsets & ColumnSparse::getOffsetsData()
{
    return assert_cast<ColumnUInt64 &>(*offsets).getData();
}
/// Returns the position in 'values' corresponding to the n-th row of the full column:
/// 0 (the default placeholder) for a default row, otherwise (index in offsets) + 1.
size_t ColumnSparse::getValueIndex(size_t n) const
{
    assert(n < _size);
    const auto & offsets_data = getOffsetsData();
    /// Offsets are the sorted positions of non-default rows, so binary search applies.
    const auto * it = std::lower_bound(offsets_data.begin(), offsets_data.end(), n);
    if (it == offsets_data.end() || *it != n)
        return 0;

    return it - offsets_data.begin() + 1;
}
/// Recursively converts sparse columns (including those nested inside tuples)
/// to their full representation. Null columns are passed through unchanged.
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
{
    if (!column)
        return column;

    if (const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get()))
    {
        auto columns = column_tuple->getColumns();
        for (auto & element : columns)
            element = recursiveRemoveSparse(element);
        return ColumnTuple::create(columns);
    }

    return column->convertToFullColumnIfSparse();
}
}

231
src/Columns/ColumnSparse.h Normal file
View File

@ -0,0 +1,231 @@
#pragma once
#include <Columns/IColumn.h>
#include <Columns/IColumnImpl.h>
#include <Columns/ColumnsNumber.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
class Collator;
namespace DB
{
/** Column for sparse representation.
  * It stores a column with non-default values and a column
  * with their sorted positions in the original column. The column with
  * values also contains one default value at position 0 to make
  * implementation of execution of functions and sorting more convenient.
  */
class ColumnSparse final : public COWHelper<IColumn, ColumnSparse>
{
private:
    friend class COWHelper<IColumn, ColumnSparse>;

    /// Creates an empty sparse column over the given values column.
    explicit ColumnSparse(MutableColumnPtr && values_);
    /// 'values_' must hold the default value at position 0; 'offsets_' holds
    /// the sorted positions of non-default rows; 'size_' is the full row count.
    ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_);
    ColumnSparse(const ColumnSparse &) = default;

public:
    /// Presumably the default share of rows sampled when estimating the
    /// default-rows ratio — confirm at call sites.
    static constexpr auto DEFAULT_ROWS_SEARCH_SAMPLE_RATIO = 0.1;
    /// Presumably the minimal default-rows ratio to choose sparse serialization —
    /// confirm at call sites.
    static constexpr auto DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION = 0.95;

    using Base = COWHelper<IColumn, ColumnSparse>;

    static Ptr create(const ColumnPtr & values_, const ColumnPtr & offsets_, size_t size_)
    {
        return Base::create(values_->assumeMutable(), offsets_->assumeMutable(), size_);
    }

    template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
    static MutablePtr create(TColumnPtr && values_, TColumnPtr && offsets_, size_t size_)
    {
        return Base::create(std::move(values_), std::move(offsets_), size_);
    }

    static Ptr create(const ColumnPtr & values_)
    {
        return Base::create(values_->assumeMutable());
    }

    template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
    static MutablePtr create(TColumnPtr && values_)
    {
        return Base::create(std::forward<TColumnPtr>(values_));
    }

    bool isSparse() const override { return true; }
    const char * getFamilyName() const override { return "Sparse"; }
    std::string getName() const override { return "Sparse(" + values->getName() + ")"; }
    TypeIndex getDataType() const override { return values->getDataType(); }
    MutableColumnPtr cloneResized(size_t new_size) const override;
    size_t size() const override { return _size; }
    bool isDefaultAt(size_t n) const override;
    bool isNullAt(size_t n) const override;
    Field operator[](size_t n) const override;
    void get(size_t n, Field & res) const override;
    bool getBool(size_t n) const override;
    Float64 getFloat64(size_t n) const override;
    Float32 getFloat32(size_t n) const override;
    UInt64 getUInt(size_t n) const override;
    Int64 getInt(size_t n) const override;
    UInt64 get64(size_t n) const override;
    StringRef getDataAt(size_t n) const override;

    ColumnPtr convertToFullColumnIfSparse() const override;

    /// Will insert null value if pos=nullptr
    void insertData(const char * pos, size_t length) override;
    StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
    const char * deserializeAndInsertFromArena(const char * pos) override;
    const char * skipSerializedInArena(const char *) const override;
    void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    void insert(const Field & x) override;
    void insertFrom(const IColumn & src, size_t n) override;
    void insertDefault() override;
    void insertManyDefaults(size_t length) override;

    void popBack(size_t n) override;
    ColumnPtr filter(const Filter & filt, ssize_t) const override;
    void expand(const Filter & mask, bool inverted) override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;

    template <typename Type>
    ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;

    int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override;
    void compareColumn(const IColumn & rhs, size_t rhs_row_num,
        PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
        int direction, int nan_direction_hint) const override;

    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const override;
    bool hasEqualValues() const override;

    /// Common implementation for getPermutation / getPermutationWithCollation.
    void getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const;

    void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
    void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
    void updatePermutationWithCollation(
        const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_range) const override;

    size_t byteSize() const override;
    size_t byteSizeAt(size_t n) const override;
    size_t allocatedBytes() const override;
    void protect() override;
    ColumnPtr replicate(const Offsets & replicate_offsets) const override;
    void updateHashWithValue(size_t n, SipHash & hash) const override;
    void updateWeakHash32(WeakHash32 & hash) const override;
    void updateHashFast(SipHash & hash) const override;
    void getExtremes(Field & min, Field & max) const override;

    void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override;
    double getRatioOfDefaultRows(double sample_ratio) const override;

    MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;

    void gather(ColumnGathererStream & gatherer_stream) override;

    ColumnPtr compress() const override;

    void forEachSubcolumn(ColumnCallback callback) override;

    bool structureEquals(const IColumn & rhs) const override;

    bool isNullable() const override { return values->isNullable(); }
    bool isFixedAndContiguous() const override { return false; }
    bool valuesHaveFixedSize() const override { return values->valuesHaveFixedSize(); }
    /// NOTE(review): this adds values' fixed size to itself; the second term was
    /// probably meant to be sizeof(UInt64) for the offset — confirm intent.
    size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + values->sizeOfValueIfFixed(); }
    bool isCollationSupported() const override { return values->isCollationSupported(); }

    /// Rows not present in 'offsets' are default.
    size_t getNumberOfDefaults() const { return _size - offsets->size(); }
    /// Number of default rows after the last non-default row.
    size_t getNumberOfTrailingDefaults() const
    {
        return offsets->empty() ? _size : _size - getOffsetsData().back() - 1;
    }

    /// Return position of element in 'values' columns,
    /// that corresponds to n-th element of full column.
    /// O(log(offsets.size())) complexity.
    size_t getValueIndex(size_t n) const;

    const IColumn & getValuesColumn() const { return *values; }
    IColumn & getValuesColumn() { return *values; }

    const ColumnPtr & getValuesPtr() const { return values; }
    ColumnPtr & getValuesPtr() { return values; }

    const IColumn::Offsets & getOffsetsData() const;
    IColumn::Offsets & getOffsetsData();

    const ColumnPtr & getOffsetsPtr() const { return offsets; }
    ColumnPtr & getOffsetsPtr() { return offsets; }

    const IColumn & getOffsetsColumn() const { return *offsets; }
    IColumn & getOffsetsColumn() { return *offsets; }

    /// This class helps to iterate over all values in ColumnSparse.
    class Iterator
    {
    public:
        Iterator(const PaddedPODArray<UInt64> & offsets_, size_t size_, size_t current_offset_, size_t current_row_)
            : offsets(offsets_), size(size_), current_offset(current_offset_), current_row(current_row_)
        {
        }

        /// A row is default when it has no entry in 'offsets'.
        bool ALWAYS_INLINE isDefault() const { return current_offset == offsets.size() || current_row != offsets[current_offset]; }
        /// 0 for default rows, otherwise position of the value in 'values'.
        size_t ALWAYS_INLINE getValueIndex() const { return isDefault() ? 0 : current_offset + 1; }
        size_t ALWAYS_INLINE getCurrentRow() const { return current_row; }
        size_t ALWAYS_INLINE getCurrentOffset() const { return current_offset; }

        bool operator==(const Iterator & other) const
        {
            return size == other.size
                && current_offset == other.current_offset
                && current_row == other.current_row;
        }

        bool operator!=(const Iterator & other) const { return !(*this == other); }

        Iterator operator++()
        {
            if (!isDefault())
                ++current_offset;
            ++current_row;
            return *this;
        }

    private:
        const PaddedPODArray<UInt64> & offsets;
        const size_t size;
        size_t current_offset;
        size_t current_row;
    };

    Iterator begin() const { return Iterator(getOffsetsData(), _size, 0, 0); }
    Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }

private:
    using Inserter = std::function<void(IColumn &)>;

    /// Inserts value to 'values' column via callback.
    /// Properly handles cases, when inserted value is default.
    /// Used, when it's unknown in advance if inserted value is default.
    void insertSingleValue(const Inserter & inserter);

    /// Contains default value at 0 position.
    /// It's convenient, because it allows to execute, e.g functions or sorting,
    /// for this column without handling different cases.
    WrappedPtr values;

    /// Sorted offsets of non-default values in the full column.
    /// 'offsets[i]' corresponds to 'values[i + 1]'.
    WrappedPtr offsets;
    size_t _size;
};
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column);
}

View File

@ -474,8 +474,9 @@ void ColumnString::getExtremes(Field & min, Field & max) const
ColumnPtr ColumnString::compress() const
{
size_t source_chars_size = chars.size();
size_t source_offsets_size = offsets.size() * sizeof(Offset);
const size_t source_chars_size = chars.size();
const size_t source_offsets_elements = offsets.size();
const size_t source_offsets_size = source_offsets_elements * sizeof(Offset);
/// Don't compress small blocks.
if (source_chars_size < 4096) /// A wild guess.
@ -489,12 +490,14 @@ ColumnPtr ColumnString::compress() const
auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true);
return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(),
const size_t chars_compressed_size = chars_compressed->size();
const size_t offsets_compressed_size = offsets_compressed->size();
return ColumnCompressed::create(source_offsets_elements, chars_compressed_size + offsets_compressed_size,
[
chars_compressed = std::move(chars_compressed),
offsets_compressed = std::move(offsets_compressed),
source_chars_size,
source_offsets_elements = offsets.size()
source_offsets_elements
]
{
auto res = ColumnString::create();

View File

@ -107,6 +107,12 @@ public:
return StringRef(&chars[offsetAt(n)], sizeAt(n));
}
bool isDefaultAt(size_t n) const override
{
assert(n < size());
return sizeAt(n) == 1;
}
/// Suppress gcc 7.3.1 warning: '*((void*)&<anonymous> +8)' may be used uninitialized in this function
#if !defined(__clang__)
#pragma GCC diagnostic push
@ -278,6 +284,16 @@ public:
return typeid(rhs) == typeid(ColumnString);
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return getRatioOfDefaultRowsImpl<ColumnString>(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
return getIndicesOfNonDefaultRowsImpl<ColumnString>(indices, from, limit);
}
Chars & getChars() { return chars; }
const Chars & getChars() const { return chars; }

View File

@ -12,6 +12,7 @@
#include <base/sort.h>
#include <base/map.h>
#include <base/range.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
namespace DB
@ -113,6 +114,15 @@ void ColumnTuple::get(size_t n, Field & res) const
res = tuple;
}
/// A tuple row is default iff every one of its elements is default at that row.
bool ColumnTuple::isDefaultAt(size_t n) const
{
    for (const auto & column : columns)
    {
        if (!column->isDefaultAt(n))
            return false;
    }
    return true;
}
StringRef ColumnTuple::getDataAt(size_t) const
{
throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -536,4 +546,25 @@ ColumnPtr ColumnTuple::compress() const
});
}
/// Ratio of rows where all tuple elements are default; delegates to the shared impl.
double ColumnTuple::getRatioOfDefaultRows(double sample_ratio) const
{
    return getRatioOfDefaultRowsImpl<ColumnTuple>(sample_ratio);
}
/// Indices of rows with at least one non-default element; delegates to the shared impl.
void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
    return getIndicesOfNonDefaultRowsImpl<ColumnTuple>(indices, from, limit);
}
/// Builds per-element serialization infos and wraps them into a tuple-level info.
SerializationInfoPtr ColumnTuple::getSerializationInfo() const
{
    MutableSerializationInfos infos;
    infos.reserve(columns.size());

    for (const auto & column : columns)
        infos.push_back(const_pointer_cast<SerializationInfo>(column->getSerializationInfo()));

    return std::make_shared<SerializationInfoTuple>(std::move(infos), SerializationInfo::Settings{});
}
}

View File

@ -53,6 +53,7 @@ public:
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
bool isDefaultAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
@ -93,6 +94,9 @@ public:
bool structureEquals(const IColumn & rhs) const override;
bool isCollationSupported() const override;
ColumnPtr compress() const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
SerializationInfoPtr getSerializationInfo() const override;
size_t tupleSize() const { return columns.size(); }

View File

@ -68,6 +68,7 @@ public:
Field operator[](size_t n) const override { return (*getNestedColumn())[n]; }
void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); }
bool isDefaultAt(size_t n) const override { return n == 0; }
StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); }
StringRef getDataAtWithTerminatingZero(size_t n) const override
{
@ -122,6 +123,16 @@ public:
return false;
}
double getRatioOfDefaultRows(double) const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemented for ColumnUnique");
}
void getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getIndicesOfNonDefaultRows' not implemented for ColumnUnique");
}
const UInt64 * tryGetSavedHash() const override { return reverse_index.tryGetSavedHash(); }
UInt128 getHash() const override { return hash.getHash(*getRawColumnPtr()); }

View File

@ -481,7 +481,8 @@ void ColumnVector<T>::getExtremes(Field & min, Field & max) const
template <typename T>
ColumnPtr ColumnVector<T>::compress() const
{
size_t source_size = data.size() * sizeof(T);
const size_t data_size = data.size();
const size_t source_size = data_size * sizeof(T);
/// Don't compress small blocks.
if (source_size < 4096) /// A wild guess.
@ -492,8 +493,9 @@ ColumnPtr ColumnVector<T>::compress() const
if (!compressed)
return ColumnCompressed::wrap(this->getPtr());
return ColumnCompressed::create(data.size(), compressed->size(),
[compressed = std::move(compressed), column_size = data.size()]
const size_t compressed_size = compressed->size();
return ColumnCompressed::create(data_size, compressed_size,
[compressed = std::move(compressed), column_size = data_size]
{
auto res = ColumnVector<T>::create(column_size);
ColumnCompressed::decompressBuffer(
@ -502,6 +504,24 @@ ColumnPtr ColumnVector<T>::compress() const
});
}
/// Numeric specialization of IColumn::createWithOffsets: fills the result with
/// 'default_field' and overwrites values of this column at positions from 'offsets'
/// (skipping the first 'shift' values).
template <typename T>
ColumnPtr ColumnVector<T>::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
    if (offsets.size() + shift != size())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size());

    auto res = this->create();
    auto & res_data = res->getData();

    T default_value = safeGet<T>(default_field);
    res_data.resize_fill(total_rows, default_value);
    /// Scatter the stored values to their target positions.
    for (size_t i = 0; i < offsets.size(); ++i)
        res_data[offsets[i]] = data[i + shift];

    return res;
}
/// Explicit template instantiations - to avoid code bloat in headers.
template class ColumnVector<UInt8>;
template class ColumnVector<UInt16>;

View File

@ -328,11 +328,25 @@ public:
return StringRef(reinterpret_cast<const char *>(&data[n]), sizeof(data[n]));
}
bool isDefaultAt(size_t n) const override { return data[n] == T{}; }
bool structureEquals(const IColumn & rhs) const override
{
return typeid(rhs) == typeid(ColumnVector<T>);
}
double getRatioOfDefaultRows(double sample_ratio) const override
{
return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
}
void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
{
return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
}
ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
ColumnPtr compress() const override;
/// Replace elements that match the filter with zeroes. If inverted replaces not matched elements.

View File

@ -4,6 +4,7 @@
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnSparse.h>
#include <Core/ColumnWithTypeAndName.h>
@ -50,6 +51,9 @@ ConstantFilterDescription::ConstantFilterDescription(const IColumn & column)
FilterDescription::FilterDescription(const IColumn & column_)
{
if (column_.isSparse())
data_holder = recursiveRemoveSparse(column_.getPtr());
if (column_.lowCardinality())
data_holder = column_.convertToFullColumnIfLowCardinality();

View File

@ -4,11 +4,17 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
String IColumn::dumpStructure() const
{
WriteBufferFromOwnString res;
@ -30,6 +36,39 @@ void IColumn::insertFrom(const IColumn & src, size_t n)
insert(src[n]);
}
/// Generic fallback: builds a column of 'total_rows' rows where the values of this
/// column (skipping the first 'shift' of them) land at positions from 'offsets',
/// and all gaps are filled with 'default_field'. Used to expand sparse columns.
ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
    if (offsets.size() + shift != size())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size());

    auto res = cloneEmpty();
    res->reserve(total_rows);

    /// -1 so that the distance to offset 0 is 1 (i.e. no padding before position 0).
    ssize_t current_offset = -1;
    for (size_t i = 0; i < offsets.size(); ++i)
    {
        ssize_t offsets_diff = static_cast<ssize_t>(offsets[i]) - current_offset;
        current_offset = offsets[i];
        /// Pad the gap before the next non-default value with defaults.
        if (offsets_diff > 1)
            res->insertMany(default_field, offsets_diff - 1);

        res->insertFrom(*this, i + shift);
    }

    /// Pad the tail after the last non-default value.
    ssize_t offsets_diff = static_cast<ssize_t>(total_rows) - current_offset;
    if (offsets_diff > 1)
        res->insertMany(default_field, offsets_diff - 1);

    return res;
}
/// Default serialization info: the kind is derived from the column itself.
SerializationInfoPtr IColumn::getSerializationInfo() const
{
    return std::make_shared<SerializationInfo>(ISerialization::getKind(*this), SerializationInfo::Settings{});
}
bool isColumnNullable(const IColumn & column)
{
return checkColumn<ColumnNullable>(column);

View File

@ -26,9 +26,8 @@ class ColumnGathererStream;
class Field;
class WeakHash32;
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;
class SerializationInfo;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
/*
* Represents a set of equal ranges in previous column to perform sorting in current column.
@ -64,9 +63,18 @@ public:
virtual Ptr convertToFullColumnIfConst() const { return getPtr(); }
/// If column isn't ColumnLowCardinality, return itself.
/// If column is ColumnLowCardinality, transforms is to full column.
/// If column is ColumnLowCardinality, transforms it to full column.
virtual Ptr convertToFullColumnIfLowCardinality() const { return getPtr(); }
/// If column isn't ColumnSparse, return itself.
/// If column is ColumnSparse, transforms it to full column.
virtual Ptr convertToFullColumnIfSparse() const { return getPtr(); }
Ptr convertToFullIfNeeded() const
{
return convertToFullColumnIfSparse()->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality();
}
/// Creates empty column with the same type.
virtual MutablePtr cloneEmpty() const { return cloneResized(0); }
@ -133,7 +141,7 @@ public:
throw Exception("Method getInt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
virtual bool isDefaultAt(size_t n) const { return get64(n) == 0; }
virtual bool isDefaultAt(size_t n) const = 0;
virtual bool isNullAt(size_t /*n*/) const { return false; }
/** If column is numeric, return value of n-th element, casted to bool.
@ -173,6 +181,13 @@ public:
insertFrom(src, position);
}
/// Appends one field multiple times. Can be optimized in inherited classes.
virtual void insertMany(const Field & field, size_t length)
{
for (size_t i = 0; i < length; ++i)
insert(field);
}
/// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented).
/// Is used to optimize some computations (in aggregation, for example).
/// Parameter length could be ignored if column values have fixed size.
@ -375,6 +390,22 @@ public:
throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
/// Returns ratio of values in column that are equal to the default value of the column.
/// Checks only @sample_ratio ratio of rows.
virtual double getRatioOfDefaultRows(double sample_ratio = 1.0) const = 0;
/// Returns indices of values in column, that not equal to default value of column.
virtual void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const = 0;
/// Returns column with @total_size elements.
/// In result column values from current column are at positions from @offsets.
/// Other values are filled by @default_value.
/// @shift means how much rows to skip from the beginning of current column.
/// Used to create full column from sparse.
virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const;
virtual SerializationInfoPtr getSerializationInfo() const;
/// Compress column in memory to some representation that allows to decompress it back.
/// Return itself if compression is not applicable for this column type.
virtual Ptr compress() const
@ -457,6 +488,8 @@ public:
virtual bool lowCardinality() const { return false; }
virtual bool isSparse() const { return false; }
virtual bool isCollationSupported() const { return false; }
virtual ~IColumn() = default;
@ -468,7 +501,6 @@ public:
String dumpStructure() const;
protected:
/// Template is to devirtualize calls to insertFrom method.
/// In derived classes (that use final keyword), implement scatter method as call to scatterImpl.
template <typename Derived>
@ -489,6 +521,13 @@ protected:
template <typename Derived>
bool hasEqualValuesImpl() const;
/// Template is to devirtualize calls to 'isDefaultAt' method.
template <typename Derived>
double getRatioOfDefaultRowsImpl(double sample_ratio) const;
template <typename Derived>
void getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const;
/// Uses std::sort and partial_sort as default algorithms.
/// Implements 'less' and 'equals' via comparator.
/// If 'less' and 'equals' can be implemented more optimal

View File

@ -46,6 +46,7 @@ public:
Field operator[](size_t) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
void get(size_t, Field &) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
void insert(const Field &) override { throw Exception("Cannot insert element into " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
bool isDefaultAt(size_t) const override { throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
StringRef getDataAt(size_t) const override
{
@ -161,6 +162,16 @@ public:
return res;
}
double getRatioOfDefaultRows(double) const override
{
throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
{
throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void gather(ColumnGathererStream &) override
{
throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);

View File

@ -16,6 +16,7 @@ namespace DB
namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int LOGICAL_ERROR;
}
template <typename Derived>
@ -141,6 +142,56 @@ bool IColumn::hasEqualValuesImpl() const
return true;
}
/// Estimates the fraction of rows in the column that are equal to the
/// column's default value, by checking approximately
/// num_rows * sample_ratio rows spread evenly over the column.
/// 'sample_ratio' must lie in (0.0; 1.0]; throws LOGICAL_ERROR otherwise.
template <typename Derived>
double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const
{
    if (sample_ratio <= 0.0 || sample_ratio > 1.0)
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio);

    size_t num_rows = size();

    /// Bug fix: without this guard an empty column took the
    /// 'num_sampled_rows == num_rows' branch with both equal to zero,
    /// leaving num_checked_rows == 0 and dividing 0.0 by 0 below (NaN).
    if (num_rows == 0)
        return 0.0;

    /// Randomize a little to avoid boundary effects.
    std::uniform_int_distribution<size_t> dist(1, static_cast<size_t>(1.0 / sample_ratio));

    size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);
    /// Serves both as the random starting row of the scan and, afterwards,
    /// as the count of rows that were actually examined.
    size_t num_checked_rows = dist(thread_local_rng);
    num_sampled_rows = std::min(num_sampled_rows + dist(thread_local_rng), num_rows);
    size_t res = 0;

    if (num_sampled_rows == num_rows)
    {
        /// Sampling covers the whole column: check every row.
        for (size_t i = 0; i < num_rows; ++i)
            res += static_cast<const Derived &>(*this).isDefaultAt(i);
        num_checked_rows = num_rows;
    }
    else if (num_sampled_rows != 0)
    {
        /// Check rows spread evenly: row i is examined when the proportion of
        /// already-checked rows lags behind i * (num_sampled_rows / num_rows).
        for (size_t i = num_checked_rows; i < num_rows; ++i)
        {
            if (num_checked_rows * num_rows <= i * num_sampled_rows)
            {
                res += static_cast<const Derived &>(*this).isDefaultAt(i);
                ++num_checked_rows;
            }
        }
    }

    return static_cast<double>(res) / num_checked_rows;
}
/// Appends to 'indices' the positions of the rows in [from, from + limit)
/// (clipped to the column size; limit == 0 means "to the end") whose value
/// is not the default value of the column.
template <typename Derived>
void IColumn::getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const
{
    const auto & self = static_cast<const Derived &>(*this);

    size_t to = size();
    if (limit != 0 && from + limit < to)
        to = from + limit;

    indices.reserve(indices.size() + to - from);
    for (size_t row = from; row != to; ++row)
    {
        if (!self.isDefaultAt(row))
            indices.push_back(row);
    }
}
template <typename Comparator>
void IColumn::updatePermutationImpl(
size_t limit,

View File

@ -293,7 +293,7 @@ void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty)
column.column = column_function->getResultType()->createColumn();
}
int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments)
int checkShortCircuitArguments(const ColumnsWithTypeAndName & arguments)
{
int last_short_circuit_argument_index = -1;
for (size_t i = 0; i != arguments.size(); ++i)

View File

@ -66,7 +66,7 @@ void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty = false);
/// Check if arguments contain lazy executed argument. If contain, return index of the last one,
/// otherwise return -1.
int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments);
int checkShortCircuitArguments(const ColumnsWithTypeAndName & arguments);
void copyMask(const PaddedPODArray<UInt8> & from, PaddedPODArray<UInt8> & to);

View File

@ -0,0 +1,327 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsNumber.h>
#include <Common/randomSeed.h>
#include <pcg_random.hpp>
#include <gtest/gtest.h>
#include <algorithm>
#include <numeric>
#include <Common/FieldVisitors.h>
using namespace DB;
pcg64 rng(randomSeed());
std::pair<MutableColumnPtr, MutableColumnPtr> createColumns(size_t n, size_t k)
{
auto values = ColumnVector<UInt64>::create();
auto offsets = ColumnVector<UInt64>::create();
auto full = ColumnVector<UInt64>::create();
auto & values_data = values->getData();
auto & offsets_data = offsets->getData();
auto & full_data = full->getData();
values_data.push_back(0);
for (size_t i = 0; i < n; ++i)
{
bool not_zero = rng() % k == 0;
size_t value = not_zero ? rng() % 1000000 : 0;
full_data.push_back(value);
if (not_zero)
{
values_data.push_back(value);
offsets_data.push_back(i);
}
}
auto sparse = ColumnSparse::create(std::move(values), std::move(offsets), n);
return std::make_pair(std::move(sparse), std::move(full));
}
/// Row-by-row equality of two columns via compareAt.
/// Note: compares row i of 'lhs' with row i of 'rhs' (the last argument of
/// compareAt is only the NaN direction hint).
bool checkEquals(const IColumn & lhs, const IColumn & rhs)
{
    const size_t num_rows = lhs.size();
    if (num_rows != rhs.size())
        return false;

    for (size_t row = 0; row != num_rows; ++row)
    {
        if (lhs.compareAt(row, row, rhs, 0) != 0)
            return false;
    }

    return true;
}
// Can't use ErrorCodes, because of 'using namespace DB'.
constexpr int error_code = 12345;
constexpr size_t T = 5000;
constexpr size_t MAX_ROWS = 10000;
constexpr size_t sparse_ratios[] = {1, 2, 5, 10, 32, 50, 64, 100, 256, 500, 1000, 5000, 10000};
constexpr size_t K = sizeof(sparse_ratios) / sizeof(sparse_ratios[0]);
#define DUMP_COLUMN(column) std::cerr << #column << ": " << (column)->dumpStructure() << "\n"
/// Randomized check: insertRangeFrom on a ColumnSparse must yield the same
/// rows as the same call on the equivalent dense column.
TEST(ColumnSparse, InsertRangeFrom)
{
/// n1/k1 describe the destination pair, n2/k2 the source pair;
/// rows [from, from + len) of the source are appended to the destination.
auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t from, size_t len)
{
auto [sparse_dst, full_dst] = createColumns(n1, k1);
auto [sparse_src, full_src] = createColumns(n2, k2);
/// Apply the same operation to both representations.
sparse_dst->insertRangeFrom(*sparse_src, from, len);
full_dst->insertRangeFrom(*full_src, from, len);
/// Expand the sparse result and compare row by row.
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
/// T iterations with random sizes (1..MAX_ROWS), sparseness ratios drawn
/// from sparse_ratios, and a random valid [from, to) range of the source.
for (size_t i = 0; i < T; ++i)
{
size_t n1 = rng() % MAX_ROWS + 1;
size_t k1 = sparse_ratios[rng() % K];
size_t n2 = rng() % MAX_ROWS + 1;
size_t k2 = sparse_ratios[rng() % K];
size_t from = rng() % n2;
size_t to = rng() % n2;
if (from > to)
std::swap(from, to);
test_case(n1, k1, n2, k2, from, to - from);
}
}
catch (const Exception & e)
{
/// Any mismatch (or exception thrown by the columns) fails the test.
FAIL() << e.displayText();
}
}
/// Randomized check: popBack on a ColumnSparse must yield the same rows as
/// popBack on the equivalent dense column.
TEST(ColumnSparse, PopBack)
{
/// n - column size, k - sparseness ratio, m - number of rows to remove.
auto test_case = [&](size_t n, size_t k, size_t m)
{
auto [sparse_dst, full_dst] = createColumns(n, k)
;
sparse_dst->popBack(m);
full_dst->popBack(m);
/// Expand the sparse result and compare row by row.
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
/// T iterations; m < n, so popBack never removes more rows than exist.
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t m = rng() % n;
test_case(n, k, m);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
/// Randomized check: filter on a ColumnSparse must yield the same rows as
/// the same filter applied to the equivalent dense column.
TEST(ColumnSparse, Filter)
{
/// n - column size, k - sparseness ratio of the data,
/// m - sparseness of the filter mask (a row passes with probability 1/m).
auto test_case = [&](size_t n, size_t k, size_t m)
{
auto [sparse_src, full_src] = createColumns(n, k);
PaddedPODArray<UInt8> filt(n);
for (size_t i = 0; i < n; ++i)
filt[i] = rng() % m == 0;
/// -1 means the result size hint is unknown.
auto sparse_dst = sparse_src->filter(filt, -1);
auto full_dst = full_src->filter(filt, -1);
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
/// Both the data sparseness and the mask sparseness are drawn from
/// sparse_ratios to cover combinations from dense to very sparse.
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t m = sparse_ratios[rng() % K];
test_case(n, k, m);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
/// Randomized check: permute on a ColumnSparse must yield the same rows as
/// the same permutation applied to the equivalent dense column.
TEST(ColumnSparse, Permute)
{
/// n - column size, k - sparseness ratio,
/// limit - 0 for a full permutation, otherwise only the first 'limit' rows.
auto test_case = [&](size_t n, size_t k, size_t limit)
{
auto [sparse_src, full_src] = createColumns(n, k);
/// Random shuffle of row positions.
IColumn::Permutation perm(n);
std::iota(perm.begin(), perm.end(), 0);
std::shuffle(perm.begin(), perm.end(), rng);
auto sparse_dst = sparse_src->permute(perm, limit);
auto full_dst = full_src->permute(perm, limit);
/// With a limit, only the first 'limit' rows are comparable.
if (limit)
{
sparse_dst = sparse_dst->cut(0, limit);
full_dst = full_dst->cut(0, limit);
}
if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_dst);
DUMP_COLUMN(full_dst);
throw Exception(error_code, "Columns are unequal");
}
};
try
{
/// Half the iterations test the unlimited case (limit == 0).
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t limit = rng() % 2 ? 0 : rng() % n;
test_case(n, k, limit);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
/// Randomized check: compareColumn on ColumnSparse must produce the same
/// comparison results as on the equivalent dense columns.
TEST(ColumnSparse, CompareColumn)
{
/// Compares every row of column 1 against row 'row_num' of column 2.
auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t row_num)
{
auto [sparse_src1, full_src1] = createColumns(n1, k1);
auto [sparse_src2, full_src2] = createColumns(n2, k2);
PaddedPODArray<Int8> comp_sparse;
PaddedPODArray<Int8> comp_full;
/// Same arguments for both representations: no row indexes filter,
/// direction = 1, NaN direction hint = 1.
sparse_src1->compareColumn(*sparse_src2, row_num, nullptr, comp_sparse, 1, 1);
full_src1->compareColumn(*full_src2, row_num, nullptr, comp_full, 1, 1);
if (comp_sparse != comp_full)
{
DUMP_COLUMN(sparse_src1);
DUMP_COLUMN(full_src1);
DUMP_COLUMN(sparse_src2);
DUMP_COLUMN(full_src2);
throw Exception(error_code, "Compare results are unequal");
}
};
try
{
/// row_num is drawn from [0, n2), so it is always a valid row of column 2.
for (size_t i = 0; i < T; ++i)
{
size_t n1 = rng() % MAX_ROWS + 1;
size_t k1 = sparse_ratios[rng() % K];
size_t n2 = rng() % MAX_ROWS + 1;
size_t k2 = sparse_ratios[rng() % K];
size_t row_num = rng() % n2;
test_case(n1, k1, n2, k2, row_num);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
/// Randomized check: sorting through getPermutation + permute on a
/// ColumnSparse must order rows the same way as on the dense column.
TEST(ColumnSparse, GetPermutation)
{
/// n - column size, k - sparseness ratio, limit - partial-sort limit
/// (0 means sort everything), reverse - descending order if true.
auto test_case = [&](size_t n, size_t k, size_t limit, bool reverse)
{
auto [sparse_src, full_src] = createColumns(n, k);
IColumn::Permutation perm_sparse;
IColumn::Permutation perm_full;
/// NaN direction hint = 1 for both representations.
sparse_src->getPermutation(reverse, limit, 1, perm_sparse);
full_src->getPermutation(reverse, limit, 1, perm_full);
/// The permutations themselves may legitimately differ for equal values,
/// so compare the sorted data instead of the permutations.
auto sparse_sorted = sparse_src->permute(perm_sparse, limit);
auto full_sorted = full_src->permute(perm_full, limit);
if (limit)
{
sparse_sorted = sparse_sorted->cut(0, limit);
full_sorted = full_sorted->cut(0, limit);
}
if (!checkEquals(*sparse_sorted->convertToFullColumnIfSparse(), *full_sorted))
{
DUMP_COLUMN(sparse_src);
DUMP_COLUMN(full_src);
DUMP_COLUMN(sparse_sorted);
DUMP_COLUMN(full_sorted);
throw Exception(error_code, "Sorted columns are unequal");
}
};
try
{
/// Covers ascending/descending and full/partial sorts.
for (size_t i = 0; i < T; ++i)
{
size_t n = rng() % MAX_ROWS + 1;
size_t k = sparse_ratios[rng() % K];
size_t limit = rng() % 2 ? 0 : rng() % n;
bool reverse = rng() % 2;
test_case(n, k, limit, reverse);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
#undef DUMP_COLUMN
#undef DUMP_NON_DEFAULTS

View File

@ -838,7 +838,7 @@ bool Dwarf::findLocation(
// The next inlined subroutine's call file and call line is the current
// caller's location.
for (size_t i = 0; i < num_found - 1; i++)
for (size_t i = 0; i < num_found - 1; ++i)
{
call_locations[i].file = call_locations[i + 1].file;
call_locations[i].line = call_locations[i + 1].line;

View File

@ -470,7 +470,7 @@
M(497, ACCESS_DENIED) \
M(498, LIMIT_BY_WITH_TIES_IS_NOT_SUPPORTED) \
M(499, S3_ERROR) \
M(500, BLOB_STORAGE_ERROR) \
M(500, AZURE_BLOB_STORAGE_ERROR) \
M(501, CANNOT_CREATE_DATABASE) \
M(502, CANNOT_SIGQUEUE) \
M(503, AGGREGATE_FUNCTION_THROW) \

View File

@ -72,6 +72,24 @@ static thread_local bool has_alt_stack = false;
#endif
/// Returns profile-event counter snapshots and memory-tracker readings for
/// all threads of the group: the accumulated entries of already-finished
/// threads plus a fresh snapshot for each thread still registered in
/// 'threads'. Holds the group mutex for the whole collection.
std::vector<ThreadGroupStatus::ProfileEventsCountersAndMemory> ThreadGroupStatus::getProfileEventsCountersAndMemoryForThreads()
{
std::lock_guard guard(mutex);
/// It is OK to move it, since it is enough to report statistics for the thread at least once.
auto stats = std::move(finished_threads_counters_memory);
for (auto * thread : threads)
{
stats.emplace_back(ProfileEventsCountersAndMemory{
thread->performance_counters.getPartiallyAtomicSnapshot(),
thread->memory_tracker.get(),
thread->thread_id,
});
}
return stats;
}
ThreadStatus::ThreadStatus()
: thread_id{getThreadId()}
{
@ -139,11 +157,17 @@ ThreadStatus::~ThreadStatus()
{
/// It's a minor tracked memory leak here (not the memory itself but it's counter).
/// We've already allocated a little bit more than the limit and cannot track it in the thread memory tracker or its parent.
tryLogCurrentException(log);
}
if (thread_group)
{
std::lock_guard guard(thread_group->mutex);
thread_group->finished_threads_counters_memory.emplace_back(ThreadGroupStatus::ProfileEventsCountersAndMemory{
performance_counters.getPartiallyAtomicSnapshot(),
memory_tracker.get(),
thread_id,
});
thread_group->threads.erase(this);
}

View File

@ -61,6 +61,13 @@ using ThreadStatusPtr = ThreadStatus *;
class ThreadGroupStatus
{
public:
struct ProfileEventsCountersAndMemory
{
ProfileEvents::Counters::Snapshot counters;
Int64 memory_usage;
UInt64 thread_id;
};
mutable std::mutex mutex;
ProfileEvents::Counters performance_counters{VariableContext::Process};
@ -83,6 +90,10 @@ public:
String query;
UInt64 normalized_query_hash = 0;
std::vector<ProfileEventsCountersAndMemory> finished_threads_counters_memory;
std::vector<ProfileEventsCountersAndMemory> getProfileEventsCountersAndMemoryForThreads();
};
using ThreadGroupStatusPtr = std::shared_ptr<ThreadGroupStatus>;

View File

@ -153,7 +153,7 @@ void TraceCollector::run()
Array trace;
trace.reserve(trace_size);
for (size_t i = 0; i < trace_size; i++)
for (size_t i = 0; i < trace_size; ++i)
{
uintptr_t addr = 0;
readPODBinary(addr, in);

View File

@ -41,7 +41,7 @@ static void append(std::vector<String> & to, const std::vector<String> & what, s
static bool parseNumber(const String & description, size_t l, size_t r, size_t & res)
{
res = 0;
for (size_t pos = l; pos < r; pos ++)
for (size_t pos = l; pos < r; ++pos)
{
if (!isNumericASCII(description[pos]))
return false;

View File

@ -117,7 +117,7 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(
};
ISerialization::SubstreamPath path;
column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type, nullptr);
column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type);
if (!result_codec)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName());

View File

@ -55,7 +55,7 @@ namespace
return "/";
}
void writeNode(const KeeperStorage::Node & node, WriteBuffer & out)
void writeNode(const KeeperStorage::Node & node, SnapshotVersion version, WriteBuffer & out)
{
writeBinary(node.data, out);
@ -76,6 +76,11 @@ namespace
writeBinary(node.stat.pzxid, out);
writeBinary(node.seq_num, out);
if (version >= SnapshotVersion::V4)
{
writeBinary(node.size_bytes, out);
}
}
void readNode(KeeperStorage::Node & node, ReadBuffer & in, SnapshotVersion version, ACLMap & acl_map)
@ -124,6 +129,11 @@ namespace
readBinary(node.stat.numChildren, in);
readBinary(node.stat.pzxid, in);
readBinary(node.seq_num, in);
if (version >= SnapshotVersion::V4)
{
readBinary(node.size_bytes, in);
}
}
void serializeSnapshotMetadata(const SnapshotMetadataPtr & snapshot_meta, WriteBuffer & out)
@ -176,7 +186,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to serialize node with mzxid {}, but last snapshot index {}", node.stat.mzxid, snapshot.snapshot_meta->get_last_log_idx());
writeBinary(path, out);
writeNode(node, out);
writeNode(node, snapshot.version, out);
/// Last iteration: check and exit here without iterator increment. Otherwise
/// false positive race condition on list end is possible.

View File

@ -18,9 +18,10 @@ enum SnapshotVersion : uint8_t
V1 = 1, /// with ACL map
V2 = 2, /// with 64 bit buffer header
V3 = 3, /// compress snapshots with ZSTD codec
V4 = 4, /// add Node size to snapshots
};
static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V3;
static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V4;
/// What is stored in binary shapsnot
struct SnapshotDeserializationResult

View File

@ -133,21 +133,6 @@ static bool fixupACL(
return valid_found;
}
uint64_t KeeperStorage::Node::sizeInBytes() const
{
uint64_t total_size{0};
for (const auto & child : children)
total_size += child.size();
total_size += data.size();
total_size += sizeof(acl_id);
total_size += sizeof(is_sequental);
total_size += sizeof(stat);
total_size += sizeof(seq_num);
return total_size;
}
static KeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches, Coordination::Event event_type)
{
KeeperStorage::ResponsesForSessions result;
@ -354,6 +339,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
{
parent.children.insert(child_path);
parent.size_bytes += child_path.size();
prev_parent_cversion = parent.stat.cversion;
prev_parent_zxid = parent.stat.pzxid;
@ -391,6 +377,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
undo_parent.stat.cversion = prev_parent_cversion;
undo_parent.stat.pzxid = prev_parent_zxid;
undo_parent.children.erase(child_path);
undo_parent.size_bytes -= child_path.size();
});
};
@ -524,6 +511,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
--parent.stat.numChildren;
++parent.stat.cversion;
parent.children.erase(child_basename);
parent.size_bytes -= child_basename.size();
});
response.error = Coordination::Error::ZOK;
@ -543,6 +531,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
++parent.stat.numChildren;
--parent.stat.cversion;
parent.children.insert(child_basename);
parent.size_bytes += child_basename.size();
});
};
}
@ -621,11 +610,11 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce
auto itr = container.updateValue(request.path, [zxid, request] (KeeperStorage::Node & value)
{
value.data = request.data;
value.stat.version++;
value.stat.mzxid = zxid;
value.stat.mtime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1);
value.stat.dataLength = request.data.length();
value.size_bytes = value.size_bytes + request.data.size() - value.data.size();
value.data = request.data;
});
@ -1110,6 +1099,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina
--parent.stat.numChildren;
++parent.stat.cversion;
parent.children.erase(getBaseName(ephemeral_path));
parent.size_bytes -= getBaseName(ephemeral_path).size();
});
auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED);

View File

@ -35,9 +35,22 @@ public:
Coordination::Stat stat{};
int32_t seq_num = 0;
ChildrenSet children{};
uint64_t size_bytes; // save size to avoid calculate every time
/// Initializes the cached size estimate with the fixed-size members plus the
/// current (at construction) length of 'data'. The cache is kept up to date
/// incrementally by the code that mutates the node, so sizeInBytes() does not
/// have to recompute it on every call.
Node()
{
size_bytes = sizeof(size_bytes);
size_bytes += data.size();
size_bytes += sizeof(acl_id);
size_bytes += sizeof(is_sequental);
size_bytes += sizeof(stat);
size_bytes += sizeof(seq_num);
}
/// Object memory size
uint64_t sizeInBytes() const;
/// Returns the cached approximate memory footprint of the node
/// (maintained incrementally; see the constructor and mutation sites).
uint64_t sizeInBytes() const
{
return size_bytes;
}
};
struct ResponseForSession

View File

@ -977,24 +977,24 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize)
world.disableSnapshotMode();
world.insert("world", n1);
EXPECT_EQ(world.getApproximateDataSize(), 94);
EXPECT_EQ(world.getApproximateDataSize(), 98);
world.updateValue("world", [&](Node & value) { value = n2; });
EXPECT_EQ(world.getApproximateDataSize(), 96);
EXPECT_EQ(world.getApproximateDataSize(), 98);
world.erase("world");
EXPECT_EQ(world.getApproximateDataSize(), 0);
world.enableSnapshotMode();
world.insert("world", n1);
EXPECT_EQ(world.getApproximateDataSize(), 94);
EXPECT_EQ(world.getApproximateDataSize(), 98);
world.updateValue("world", [&](Node & value) { value = n2; });
EXPECT_EQ(world.getApproximateDataSize(), 190);
EXPECT_EQ(world.getApproximateDataSize(), 196);
world.clearOutdatedNodes();
EXPECT_EQ(world.getApproximateDataSize(), 96);
EXPECT_EQ(world.getApproximateDataSize(), 98);
world.erase("world");
EXPECT_EQ(world.getApproximateDataSize(), 96);
EXPECT_EQ(world.getApproximateDataSize(), 98);
world.clear();
EXPECT_EQ(world.getApproximateDataSize(), 0);

View File

@ -9,6 +9,7 @@
#include <Common/assert_cast.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnSparse.h>
#include <iterator>
@ -37,7 +38,7 @@ static ReturnType onError(const std::string & message [[maybe_unused]], int code
template <typename ReturnType>
static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, const ColumnWithTypeAndName & expected,
const std::string & context_description, bool allow_remove_constants, int code)
const std::string & context_description, bool allow_materialize, int code)
{
if (actual.name != expected.name)
return onError<ReturnType>("Block structure mismatch in " + context_description + " stream: different names of columns:\n"
@ -52,11 +53,16 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con
const IColumn * actual_column = actual.column.get();
/// If we allow to remove constants, and expected column is not const, then unwrap actual constant column.
if (allow_remove_constants && !isColumnConst(*expected.column))
/// If we allow to materialize, and expected column is not const or sparse, then unwrap actual column.
if (allow_materialize)
{
if (const auto * column_const = typeid_cast<const ColumnConst *>(actual_column))
actual_column = &column_const->getDataColumn();
if (!isColumnConst(*expected.column))
if (const auto * column_const = typeid_cast<const ColumnConst *>(actual_column))
actual_column = &column_const->getDataColumn();
if (!expected.column->isSparse())
if (const auto * column_sparse = typeid_cast<const ColumnSparse *>(actual_column))
actual_column = &column_sparse->getValuesColumn();
}
if (actual_column->getName() != expected.column->getName())
@ -79,7 +85,7 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con
template <typename ReturnType>
static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool allow_remove_constants)
static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool allow_materialize)
{
size_t columns = rhs.columns();
if (lhs.columns() != columns)
@ -93,11 +99,11 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons
if constexpr (std::is_same_v<ReturnType, bool>)
{
if (!checkColumnStructure<ReturnType>(actual, expected, context_description, allow_remove_constants, ErrorCodes::LOGICAL_ERROR))
if (!checkColumnStructure<ReturnType>(actual, expected, context_description, allow_materialize, ErrorCodes::LOGICAL_ERROR))
return false;
}
else
checkColumnStructure<ReturnType>(actual, expected, context_description, allow_remove_constants, ErrorCodes::LOGICAL_ERROR);
checkColumnStructure<ReturnType>(actual, expected, context_description, allow_materialize, ErrorCodes::LOGICAL_ERROR);
}
return ReturnType(true);
@ -203,7 +209,7 @@ void Block::eraseImpl(size_t position)
for (auto it = index_by_name.begin(); it != index_by_name.end();)
{
if (it->second == position)
index_by_name.erase(it++);
it = index_by_name.erase(it);
else
{
if (it->second > position)
@ -706,6 +712,11 @@ void Block::updateHash(SipHash & hash) const
col.column->updateHashWithValue(row_no, hash);
}
void convertToFullIfSparse(Block & block)
{
for (auto & column : block)
column.column = recursiveRemoveSparse(column.column);
}
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column)
{
@ -729,7 +740,7 @@ Block materializeBlock(const Block & block)
for (size_t i = 0; i < columns; ++i)
{
auto & element = res.getByPosition(i);
element.column = element.column->convertToFullColumnIfConst();
element.column = recursiveRemoveSparse(element.column->convertToFullColumnIfConst());
}
return res;
@ -738,7 +749,7 @@ Block materializeBlock(const Block & block)
void materializeBlockInplace(Block & block)
{
for (size_t i = 0; i < block.columns(); ++i)
block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst();
block.getByPosition(i).column = recursiveRemoveSparse(block.getByPosition(i).column->convertToFullColumnIfConst());
}
}

View File

@ -193,6 +193,8 @@ void assertCompatibleHeader(const Block & actual, const Block & desired, const s
/// Calculate difference in structure of blocks and write description into output strings. NOTE It doesn't compare values of constant columns.
void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out_lhs_diff, std::string & out_rhs_diff);
void convertToFullIfSparse(Block & block);
/// Helps in-memory storages to extract columns from block.
/// Properly handles cases, when column is a subcolumn and when it is compressed.
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column);

View File

@ -8,5 +8,6 @@ namespace DB
{
using ColumnNumbers = std::vector<size_t>;
using ColumnNumbersList = std::vector<ColumnNumbers>;
}

View File

@ -71,7 +71,7 @@ Native41::Native41(const String & password_, const String & scramble_)
const Poco::SHA1Engine::Digest & digest = engine3.digest();
scramble.resize(SCRAMBLE_LENGTH);
for (size_t i = 0; i < SCRAMBLE_LENGTH; i++)
for (size_t i = 0; i < SCRAMBLE_LENGTH; ++i)
scramble[i] = static_cast<unsigned char>(password_sha1[i] ^ digest[i]);
}
@ -191,7 +191,7 @@ void Sha256Password::authenticate(
}
password.resize(plaintext_size);
for (int i = 0; i < plaintext_size; i++)
for (int i = 0; i < plaintext_size; ++i)
{
password[i] = plaintext[i] ^ static_cast<unsigned char>(scramble[i % SCRAMBLE_LENGTH]);
}

View File

@ -41,7 +41,7 @@ void GTIDSets::parse(const String gtid_format)
GTIDSet set;
set.uuid = DB::parse<UUID>(server_ids[0]);
for (size_t k = 1; k < server_ids.size(); k++)
for (size_t k = 1; k < server_ids.size(); ++k)
{
std::vector<String> inters;
boost::split(inters, server_ids[k], [](char c) { return c == '-'; });
@ -74,7 +74,7 @@ void GTIDSets::update(const GTID & other)
{
if (set.uuid == other.uuid)
{
for (auto i = 0U; i < set.intervals.size(); i++)
for (auto i = 0U; i < set.intervals.size(); ++i)
{
auto & current = set.intervals[i];
@ -134,7 +134,7 @@ String GTIDSets::toString() const
{
WriteBufferFromOwnString buffer;
for (size_t i = 0; i < sets.size(); i++)
for (size_t i = 0; i < sets.size(); ++i)
{
GTIDSet set = sets[i];
writeUUIDText(set.uuid, buffer);

View File

@ -159,7 +159,7 @@ namespace MySQLReplication
payload.ignore(1);
column_count = readLengthEncodedNumber(payload);
for (auto i = 0U; i < column_count; i++)
for (auto i = 0U; i < column_count; ++i)
{
UInt8 v = 0x00;
payload.readStrict(reinterpret_cast<char *>(&v), 1);
@ -188,7 +188,7 @@ namespace MySQLReplication
{
auto pos = 0;
column_meta.reserve(column_count);
for (auto i = 0U; i < column_count; i++)
for (auto i = 0U; i < column_count; ++i)
{
UInt16 typ = column_type[i];
switch (typ)
@ -255,7 +255,7 @@ namespace MySQLReplication
out << "Table Len: " << std::to_string(this->table_len) << '\n';
out << "Table: " << this->table << '\n';
out << "Column Count: " << this->column_count << '\n';
for (auto i = 0U; i < column_count; i++)
for (UInt32 i = 0; i < column_count; ++i)
{
out << "Column Type [" << i << "]: " << std::to_string(column_type[i]) << ", Meta: " << column_meta[i] << '\n';
}
@ -312,7 +312,7 @@ namespace MySQLReplication
UInt32 null_index = 0;
UInt32 re_count = 0;
for (auto i = 0U; i < number_columns; i++)
for (UInt32 i = 0; i < number_columns; ++i)
{
if (bitmap[i])
re_count++;
@ -321,7 +321,7 @@ namespace MySQLReplication
boost::dynamic_bitset<> columns_null_set;
readBitmap(payload, columns_null_set, re_count);
for (auto i = 0U; i < number_columns; i++)
for (UInt32 i = 0; i < number_columns; ++i)
{
UInt32 field_len = 0;
@ -523,7 +523,7 @@ namespace MySQLReplication
res += (val ^ (mask & compressed_integer_align_numbers[compressed_integers]));
}
for (auto k = 0U; k < uncompressed_integers; k++)
for (size_t k = 0; k < uncompressed_integers; ++k)
{
UInt32 val = 0;
readBigEndianStrict(payload, reinterpret_cast<char *>(&val), 4);
@ -536,7 +536,7 @@ namespace MySQLReplication
size_t uncompressed_decimals = scale / digits_per_integer;
size_t compressed_decimals = scale - (uncompressed_decimals * digits_per_integer);
for (auto k = 0U; k < uncompressed_decimals; k++)
for (size_t k = 0; k < uncompressed_decimals; ++k)
{
UInt32 val = 0;
readBigEndianStrict(payload, reinterpret_cast<char *>(&val), 4);
@ -669,7 +669,7 @@ namespace MySQLReplication
header.dump(out);
out << "Schema: " << this->schema << '\n';
out << "Table: " << this->table << '\n';
for (auto i = 0U; i < rows.size(); i++)
for (size_t i = 0; i < rows.size(); ++i)
{
out << "Row[" << i << "]: " << applyVisitor(to_string, rows[i]) << '\n';
}

View File

@ -15,7 +15,7 @@ namespace ProtocolText
ResultSetRow::ResultSetRow(const Serializations & serializations, const Columns & columns_, int row_num_)
: columns(columns_), row_num(row_num_)
{
for (size_t i = 0; i < columns.size(); i++)
for (size_t i = 0; i < columns.size(); ++i)
{
if (columns[i]->isNullAt(row_num))
{
@ -39,7 +39,7 @@ size_t ResultSetRow::getPayloadSize() const
void ResultSetRow::writePayloadImpl(WriteBuffer & buffer) const
{
for (size_t i = 0; i < columns.size(); i++)
for (size_t i = 0; i < columns.size(); ++i)
{
if (columns[i]->isNullAt(row_num))
buffer.write(serialized[i].data(), 1);

View File

@ -7,6 +7,7 @@
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
namespace DB
@ -43,6 +44,17 @@ String NameAndTypePair::getSubcolumnName() const
return name.substr(*subcolumn_delimiter_position + 1, name.size() - *subcolumn_delimiter_position);
}
String NameAndTypePair::dump() const
{
WriteBufferFromOwnString out;
out << "name: " << name << "\n"
<< "type: " << type->getName() << "\n"
<< "name in storage: " << getNameInStorage() << "\n"
<< "type in storage: " << getTypeInStorage()->getName();
return out.str();
}
void NamesAndTypesList::readText(ReadBuffer & buf)
{
const DataTypeFactory & data_type_factory = DataTypeFactory::instance();

View File

@ -40,6 +40,8 @@ public:
return name == rhs.name && type->equals(*rhs.type);
}
String dump() const;
String name;
DataTypePtr type;
@ -107,6 +109,8 @@ public:
std::optional<NameAndTypePair> tryGetByName(const std::string & name) const;
};
using NamesAndTypesLists = std::vector<NamesAndTypesList>;
}
namespace std

View File

@ -85,7 +85,7 @@ void insertPostgreSQLValue(
assert_cast<ColumnString &>(column).insertData(value.data(), value.size());
break;
case ExternalResultDescription::ValueType::vtUUID:
assert_cast<ColumnUInt128 &>(column).insert(parse<UUID>(value.data(), value.size()));
assert_cast<ColumnUUID &>(column).insertValue(parse<UUID>(value.data(), value.size()));
break;
case ExternalResultDescription::ValueType::vtDate:
assert_cast<ColumnUInt16 &>(column).insertValue(UInt16{LocalDate{std::string(value)}.getDayNum()});

View File

@ -44,6 +44,8 @@
#define DBMS_MIN_PROTOCOL_VERSION_WITH_INCREMENTAL_PROFILE_EVENTS 54451
#define DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION 54454
/// Version of ClickHouse TCP protocol.
///
/// Should be incremented manually on protocol changes.
@ -51,7 +53,6 @@
/// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing common with VERSION_REVISION,
/// later is just a number for server version (one number instead of commit SHA)
/// for simplicity (sometimes it may be more convenient in some use cases).
#define DBMS_TCP_PROTOCOL_VERSION 54453
#define DBMS_TCP_PROTOCOL_VERSION 54455
#define DBMS_MIN_PROTOCOL_VERSION_WITH_INITIAL_QUERY_START_TIME 54449

View File

@ -593,6 +593,7 @@ class IColumn;
M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \
M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \
M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \
M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \
M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
\

View File

@ -66,6 +66,7 @@ public:
bool shouldAlignRightInPrettyFormats() const override { return false; }
SerializationPtr doGetDefaultSerialization() const override;
bool supportsSparseSerialization() const override { return false; }
bool isVersioned() const { return function->isVersioned(); }

View File

@ -51,6 +51,7 @@ public:
bool isNullable() const override { return false; }
bool onlyNull() const override { return false; }
bool lowCardinality() const override { return true; }
bool supportsSparseSerialization() const override { return false; }
bool isLowCardinalityNullable() const override { return dictionary_type->isNullable(); }
static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type);

View File

@ -6,8 +6,10 @@
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/SerializationInfo.h>
#include <DataTypes/Serializations/SerializationTuple.h>
#include <DataTypes/Serializations/SerializationNamed.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/NestedUtils.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTNameTypePair.h>
@ -152,6 +154,20 @@ MutableColumnPtr DataTypeTuple::createColumn() const
return ColumnTuple::create(std::move(tuple_columns));
}
MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const
{
const auto & element_serializations =
assert_cast<const SerializationTuple &>(serialization).getElementsSerializations();
size_t size = elems.size();
assert(element_serializations.size() == size);
MutableColumns tuple_columns(size);
for (size_t i = 0; i < size; ++i)
tuple_columns[i] = elems[i]->createColumn(*element_serializations[i]->getNested());
return ColumnTuple::create(std::move(tuple_columns));
}
Field DataTypeTuple::getDefault() const
{
return Tuple(collections::map<Tuple>(elems, [] (const DataTypePtr & elem) { return elem->getDefault(); }));
@ -248,21 +264,33 @@ SerializationPtr DataTypeTuple::doGetDefaultSerialization() const
return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
}
SerializationPtr DataTypeTuple::getSerialization(const String & column_name, const StreamExistenceCallback & callback) const
SerializationPtr DataTypeTuple::getSerialization(const SerializationInfo & info) const
{
SerializationTuple::ElementSerializations serializations(elems.size());
const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(info);
bool use_explicit_names = have_explicit_names && serialize_names;
for (size_t i = 0; i < elems.size(); ++i)
{
String elem_name = use_explicit_names ? names[i] : toString(i + 1);
auto subcolumn_name = Nested::concatenateName(column_name, elem_name);
auto serializaion = elems[i]->getSerialization(subcolumn_name, callback);
serializations[i] = std::make_shared<SerializationNamed>(serializaion, elem_name);
auto serialization = elems[i]->getSerialization(*info_tuple.getElementInfo(i));
serializations[i] = std::make_shared<SerializationNamed>(serialization, elem_name);
}
return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
}
MutableSerializationInfoPtr DataTypeTuple::createSerializationInfo(const SerializationInfo::Settings & settings) const
{
MutableSerializationInfos infos;
infos.reserve(elems.size());
for (const auto & elem : elems)
infos.push_back(elem->createSerializationInfo(settings));
return std::make_shared<SerializationInfoTuple>(std::move(infos), settings);
}
static DataTypePtr create(const ASTPtr & arguments)
{
if (!arguments || arguments->children.empty())

View File

@ -36,8 +36,10 @@ public:
const char * getFamilyName() const override { return "Tuple"; }
bool canBeInsideNullable() const override { return false; }
bool supportsSparseSerialization() const override { return true; }
MutableColumnPtr createColumn() const override;
MutableColumnPtr createColumn(const ISerialization & serialization) const override;
Field getDefault() const override;
void insertDefaultInto(IColumn & column) const override;
@ -52,9 +54,9 @@ public:
size_t getMaximumSizeOfValueInMemory() const override;
size_t getSizeOfValueInMemory() const override;
SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const override;
SerializationPtr doGetDefaultSerialization() const override;
SerializationPtr getSerialization(const SerializationInfo & info) const override;
MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const override;
const DataTypePtr & getElement(size_t i) const { return elems[i]; }
const DataTypes & getElements() const { return elems; }

View File

@ -1,5 +1,6 @@
#include <Columns/IColumn.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnSparse.h>
#include <Common/Exception.h>
#include <Common/SipHash.h>
@ -10,6 +11,8 @@
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypeCustom.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationSparse.h>
#include <DataTypes/Serializations/SerializationInfo.h>
namespace DB
@ -40,6 +43,15 @@ void IDataType::updateAvgValueSizeHint(const IColumn & column, double & avg_valu
}
}
MutableColumnPtr IDataType::createColumn(const ISerialization & serialization) const
{
auto column = createColumn();
if (serialization.getKind() == ISerialization::Kind::SPARSE)
return ColumnSparse::create(std::move(column));
return column;
}
ColumnPtr IDataType::createColumnConst(size_t size, const Field & field) const
{
auto column = createColumn();
@ -65,9 +77,7 @@ size_t IDataType::getSizeOfValueInMemory() const
void IDataType::forEachSubcolumn(
const SubcolumnCallback & callback,
const SerializationPtr & serialization,
const DataTypePtr & type,
const ColumnPtr & column)
const SubstreamData & data)
{
ISerialization::StreamCallback callback_with_data = [&](const auto & subpath)
{
@ -76,66 +86,59 @@ void IDataType::forEachSubcolumn(
if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1))
{
auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1);
auto data = ISerialization::createFromPath(subpath, i);
callback(subpath, name, data);
auto subdata = ISerialization::createFromPath(subpath, i);
callback(subpath, name, subdata);
}
subpath[i].visited = true;
}
};
ISerialization::SubstreamPath path;
serialization->enumerateStreams(path, callback_with_data, type, column);
SubstreamPath path;
data.serialization->enumerateStreams(path, callback_with_data, data);
}
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
template <typename Ptr>
Ptr IDataType::getForSubcolumn(
const String & subcolumn_name,
const SubstreamData & data,
Ptr SubstreamData::*member,
bool throw_if_null) const
{
DataTypePtr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
Ptr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata)
{
if (name == subcolumn_name)
res = data.type;
}, getDefaultSerialization(), getPtr(), nullptr);
res = subdata.*member;
}, data);
if (!res && throw_if_null)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
return res;
}
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
{
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, false);
}
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
{
auto subcolumn_type = tryGetSubcolumnType(subcolumn_name);
if (subcolumn_type)
return subcolumn_type;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type);
}
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
{
SerializationPtr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
{
if (name == subcolumn_name)
res = data.serialization;
}, serialization, nullptr, nullptr);
if (res)
return res;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization);
}
ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
{
ColumnPtr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
{
if (name == subcolumn_name)
res = data.column;
}, getDefaultSerialization(), nullptr, column);
if (res)
return res;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column);
}
Names IDataType::getSubcolumnNames() const
@ -144,7 +147,7 @@ Names IDataType::getSubcolumnNames() const
forEachSubcolumn([&](const auto &, const auto & name, const auto &)
{
res.push_back(name);
}, getDefaultSerialization(), nullptr, nullptr);
}, { getDefaultSerialization(), nullptr, nullptr, nullptr });
return res;
}
@ -163,6 +166,12 @@ void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const
custom_serialization = std::move(custom_desc_->serialization);
}
MutableSerializationInfoPtr IDataType::createSerializationInfo(
const SerializationInfo::Settings & settings) const
{
return std::make_shared<SerializationInfo>(ISerialization::Kind::DEFAULT, settings);
}
SerializationPtr IDataType::getDefaultSerialization() const
{
if (custom_serialization)
@ -171,22 +180,48 @@ SerializationPtr IDataType::getDefaultSerialization() const
return doGetDefaultSerialization();
}
SerializationPtr IDataType::getSparseSerialization() const
{
return std::make_shared<SerializationSparse>(getDefaultSerialization());
}
SerializationPtr IDataType::getSerialization(ISerialization::Kind kind) const
{
if (supportsSparseSerialization() && kind == ISerialization::Kind::SPARSE)
return getSparseSerialization();
return getDefaultSerialization();
}
SerializationPtr IDataType::getSerialization(const SerializationInfo & info) const
{
return getSerialization(info.getKind());
}
// static
SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const IDataType::StreamExistenceCallback & callback)
SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const SerializationInfo & info)
{
if (column.isSubcolumn())
{
const auto & type_in_storage = column.getTypeInStorage();
auto default_serialization = type_in_storage->getDefaultSerialization();
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), default_serialization);
auto serialization = type_in_storage->getSerialization(info);
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization);
}
return column.type->getSerialization(column.name, callback);
return column.type->getSerialization(info);
}
SerializationPtr IDataType::getSerialization(const String &, const StreamExistenceCallback &) const
// static
SerializationPtr IDataType::getSerialization(const NameAndTypePair & column)
{
return getDefaultSerialization();
if (column.isSubcolumn())
{
const auto & type_in_storage = column.getTypeInStorage();
auto serialization = type_in_storage->getDefaultSerialization();
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization);
}
return column.type->getDefaultSerialization();
}
}

View File

@ -6,7 +6,8 @@
#include <Core/Names.h>
#include <Core/TypeId.h>
#include <DataTypes/DataTypeCustom.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/Serializations/SerializationInfo.h>
namespace DB
{
@ -27,7 +28,6 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
using DataTypes = std::vector<DataTypePtr>;
struct NameAndTypePair;
class SerializationInfo;
struct DataTypeWithConstInfo
{
@ -84,45 +84,54 @@ public:
SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;
using SubstreamData = ISerialization::SubstreamData;
using SubstreamPath = ISerialization::SubstreamPath;
using SubcolumnCallback = std::function<void(
const ISerialization::SubstreamPath &,
const SubstreamPath &,
const String &,
const ISerialization::SubstreamData &)>;
const SubstreamData &)>;
static void forEachSubcolumn(
const SubcolumnCallback & callback,
const SerializationPtr & serialization,
const DataTypePtr & type,
const ColumnPtr & column);
const SubstreamData & data);
Names getSubcolumnNames() const;
/// Returns default serialization of data type.
virtual MutableSerializationInfoPtr createSerializationInfo(
const SerializationInfo::Settings & settings) const;
/// TODO: support more types.
virtual bool supportsSparseSerialization() const { return !haveSubtypes(); }
SerializationPtr getDefaultSerialization() const;
SerializationPtr getSparseSerialization() const;
/// Asks whether the stream with given name exists in table.
/// If callback returned true for all streams, which are required for
/// one of serialization types, that serialization will be chosen for reading.
/// If callback always returned false, the default serialization will be chosen.
using StreamExistenceCallback = std::function<bool(const String &)>;
/// Chooses serialization according to serialization kind.
SerializationPtr getSerialization(ISerialization::Kind kind) const;
/// Chooses serialization for reading of one column or subcolumns by
/// checking existence of substreams using callback.
static SerializationPtr getSerialization(
const NameAndTypePair & column,
const StreamExistenceCallback & callback = [](const String &) { return false; });
/// Chooses serialization according to collected information about content of column.
virtual SerializationPtr getSerialization(const SerializationInfo & info) const;
virtual SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const;
/// Chooses between subcolumn serialization and regular serialization according to @column.
/// This method typically should be used to get serialization for reading column or subcolumn.
static SerializationPtr getSerialization(const NameAndTypePair & column, const SerializationInfo & info);
static SerializationPtr getSerialization(const NameAndTypePair & column);
protected:
virtual String doGetName() const { return getFamilyName(); }
virtual SerializationPtr doGetDefaultSerialization() const = 0;
public:
/** Create empty column for corresponding type.
/** Create empty column for corresponding type and default serialization.
*/
virtual MutableColumnPtr createColumn() const = 0;
/** Create empty column for corresponding type and serialization.
*/
virtual MutableColumnPtr createColumn(const ISerialization & serialization) const;
/** Create ColumnConst for corresponding type, with specified size and value.
*/
ColumnPtr createColumnConst(size_t size, const Field & field) const;
@ -292,6 +301,14 @@ protected:
public:
const IDataTypeCustomName * getCustomName() const { return custom_name.get(); }
const ISerialization * getCustomSerialization() const { return custom_serialization.get(); }
private:
template <typename Ptr>
Ptr getForSubcolumn(
const String & subcolumn_name,
const SubstreamData & data,
Ptr SubstreamData::*member,
bool throw_if_null = true) const;
};

View File

@ -36,18 +36,18 @@ std::string concatenateName(const std::string & nested_table_name, const std::st
/** Name can be treated as compound if it contains dot (.) in the middle.
*/
std::pair<std::string, std::string> splitName(const std::string & name)
std::pair<std::string, std::string> splitName(const std::string & name, bool reverse)
{
auto idx = name.find_first_of('.');
auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.'));
if (idx == std::string::npos || idx == 0 || idx + 1 == name.size())
return {name, {}};
return {name.substr(0, idx), name.substr(idx + 1)};
}
std::pair<std::string_view, std::string_view> splitName(const std::string_view & name)
std::pair<std::string_view, std::string_view> splitName(const std::string_view & name, bool reverse)
{
auto idx = name.find_first_of('.');
auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.'));
if (idx == std::string::npos || idx == 0 || idx + 1 == name.size())
return {name, {}};

View File

@ -11,8 +11,9 @@ namespace Nested
{
std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name);
std::pair<std::string, std::string> splitName(const std::string & name);
std::pair<std::string_view, std::string_view> splitName(const std::string_view & name);
/// Splits name of compound identifier by first/last dot (depending on 'reverse' parameter).
std::pair<std::string, std::string> splitName(const std::string & name, bool reverse = false);
std::pair<std::string_view, std::string_view> splitName(const std::string_view & name, bool reverse = false);
/// Returns the prefix of the name to the first '.'. Or the name is unchanged if there is no dot.
std::string extractTableName(const std::string & nested_name);

View File

@ -16,12 +16,43 @@ namespace ErrorCodes
{
extern const int MULTIPLE_STREAMS_REQUIRED;
extern const int UNEXPECTED_DATA_AFTER_PARSED_VALUE;
extern const int LOGICAL_ERROR;
}
ISerialization::Kind ISerialization::getKind(const IColumn & column)
{
if (column.isSparse())
return Kind::SPARSE;
return Kind::DEFAULT;
}
String ISerialization::kindToString(Kind kind)
{
switch (kind)
{
case Kind::DEFAULT:
return "Default";
case Kind::SPARSE:
return "Sparse";
}
__builtin_unreachable();
}
ISerialization::Kind ISerialization::stringToKind(const String & str)
{
if (str == "Default")
return Kind::DEFAULT;
else if (str == "Sparse")
return Kind::SPARSE;
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind '{}'", str);
}
String ISerialization::Substream::toString() const
{
if (type == TupleElement)
return fmt::format("TupleElement({}, escape_tuple_delimiter={})",
return fmt::format("TupleElement({}, escape_tuple_delimiter = {})",
tuple_element_name, escape_tuple_delimiter ? "true" : "false");
return String(magic_enum::enum_name(type));
@ -44,18 +75,22 @@ String ISerialization::SubstreamPath::toString() const
void ISerialization::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
path.push_back(Substream::Regular);
path.back().data = {type, column, getPtr(), nullptr};
path.back().data = data;
callback(path);
path.pop_back();
}
void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
enumerateStreams(path, callback, nullptr, nullptr);
enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr});
}
void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const
{
enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr});
}
void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const
@ -147,11 +182,23 @@ String ISerialization::getFileNameForStream(const NameAndTypePair & column, cons
return getFileNameForStream(column.getNameInStorage(), path);
}
static size_t isOffsetsOfNested(const ISerialization::SubstreamPath & path)
{
if (path.empty())
return false;
for (const auto & elem : path)
if (elem.type == ISerialization::Substream::ArrayElements)
return false;
return path.back().type == ISerialization::Substream::ArraySizes;
}
String ISerialization::getFileNameForStream(const String & name_in_storage, const SubstreamPath & path)
{
String stream_name;
auto nested_storage_name = Nested::extractTableName(name_in_storage);
if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == ISerialization::Substream::ArraySizes))
if (name_in_storage != nested_storage_name && isOffsetsOfNested(path))
stream_name = escapeForFileName(nested_storage_name);
else
stream_name = escapeForFileName(name_in_storage);
@ -242,10 +289,9 @@ ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath
assert(prefix_len < path.size());
SubstreamData res = path[prefix_len].data;
res.creator.reset();
for (ssize_t i = static_cast<ssize_t>(prefix_len) - 1; i >= 0; --i)
{
const auto & creator = path[i].data.creator;
const auto & creator = path[i].creator;
if (creator)
{
res.type = res.type ? creator->create(res.type) : res.type;

View File

@ -2,15 +2,25 @@
#include <Common/COW.h>
#include <Core/Types.h>
#include <base/demangle.h>
#include <Common/typeid_cast.h>
#include <Columns/IColumn.h>
#include <boost/noncopyable.hpp>
#include <unordered_map>
#include <memory>
#include <variant>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class IDataType;
class ReadBuffer;
class WriteBuffer;
class ProtobufReader;
@ -22,19 +32,40 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;
class SerializationInfo;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
class Field;
struct FormatSettings;
struct NameAndTypePair;
/** Represents serialization of data type.
* Has methods to serialize/deserialize column in binary and several text formats.
* Every data type has default serialization, but can be serialized in different representations.
* Default serialization can be wrapped to one of the special kind of serializations.
* Currently there is only one special serialization: Sparse.
* Each serialization has its own implementation of IColumn as its in-memory representation.
*/
class ISerialization : private boost::noncopyable, public std::enable_shared_from_this<ISerialization>
{
public:
ISerialization() = default;
virtual ~ISerialization() = default;
enum class Kind : UInt8
{
DEFAULT = 0,
SPARSE = 1,
};
virtual Kind getKind() const { return Kind::DEFAULT; }
SerializationPtr getPtr() const { return shared_from_this(); }
static Kind getKind(const IColumn & column);
static String kindToString(Kind kind);
static Kind stringToKind(const String & str);
/** Binary serialization for range of values in column - for writing to disk/network, etc.
*
* Some data types are represented in multiple streams while being serialized.
@ -70,10 +101,10 @@ public:
struct SubstreamData
{
SerializationPtr serialization;
DataTypePtr type;
ColumnPtr column;
SerializationPtr serialization;
SubcolumnCreatorPtr creator;
SerializationInfoPtr serialization_info;
};
struct Substream
@ -108,6 +139,9 @@ public:
/// Data for current substream.
SubstreamData data;
/// Creator of subcolumn for current substream.
SubcolumnCreatorPtr creator = nullptr;
/// Flag, that may help to traverse substream paths.
mutable bool visited = false;
@ -130,13 +164,14 @@ public:
virtual void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const;
const SubstreamData & data) const;
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const;
using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
@ -300,16 +335,41 @@ public:
static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path);
static bool isSpecialCompressionAllowed(const SubstreamPath & path);
static size_t getArrayLevel(const SubstreamPath & path);
static size_t getArrayLevel(const SubstreamPath & path);
static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len);
static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len);
protected:
template <typename State, typename StatePtr>
State * checkAndGetState(const StatePtr & state) const;
[[noreturn]] void throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings &, const String & type_name) const;
};
using SerializationPtr = std::shared_ptr<const ISerialization>;
using Serializations = std::vector<SerializationPtr>;
using SerializationByName = std::unordered_map<String, SerializationPtr>;
template <typename State, typename StatePtr>
State * ISerialization::checkAndGetState(const StatePtr & state) const
{
if (!state)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Got empty state for {}", demangle(typeid(*this).name()));
auto * state_concrete = typeid_cast<State *>(state.get());
if (!state_concrete)
{
auto & state_ref = *state;
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Invalid State for {}. Expected: {}, got {}",
demangle(typeid(*this).name()),
demangle(typeid(State).name()),
demangle(typeid(state_ref).name()));
}
return state_concrete;
}
}

Some files were not shown because too many files have changed in this diff Show More