diff --git a/.clang-tidy b/.clang-tidy
index 5da1d309f62..e9451272681 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -16,6 +16,7 @@ Checks: '*,
     -android-*,
+    -bugprone-assignment-in-if-condition,
     -bugprone-branch-clone,
     -bugprone-easily-swappable-parameters,
     -bugprone-exception-escape,
@@ -23,7 +24,6 @@ Checks: '*,
     -bugprone-narrowing-conversions,
     -bugprone-not-null-terminated-result,
     -bugprone-unchecked-optional-access,
-    -bugprone-assignment-in-if-condition,

     -cert-dcl16-c,
     -cert-err58-cpp,
@@ -34,7 +34,6 @@ Checks: '*,
     -clang-analyzer-optin.performance.Padding,
     -clang-analyzer-optin.portability.UnixAPI,
-
     -clang-analyzer-security.insecureAPI.bzero,
     -clang-analyzer-security.insecureAPI.strcpy,
@@ -103,12 +102,13 @@ Checks: '*,
     -openmp-*,

+    -misc-const-correctness,
     -misc-no-recursion,
     -misc-non-private-member-variables-in-classes,
-    -misc-const-correctness,

     -modernize-avoid-c-arrays,
     -modernize-concat-nested-namespaces,
+    -modernize-macro-to-enum,
     -modernize-pass-by-value,
     -modernize-return-braced-init-list,
     -modernize-use-auto,
@@ -117,7 +117,6 @@ Checks: '*,
     -modernize-use-nodiscard,
     -modernize-use-override,
     -modernize-use-trailing-return-type,
-    -modernize-macro-to-enum,

     -performance-inefficient-string-concatenation,
     -performance-no-int-to-ptr,
@@ -135,17 +134,35 @@ Checks: '*,
     -readability-magic-numbers,
     -readability-named-parameter,
     -readability-redundant-declaration,
+    -readability-simplify-boolean-expr,
     -readability-static-accessed-through-instance,
     -readability-suspicious-call-argument,
     -readability-uppercase-literal-suffix,
     -readability-use-anyofallof,
-    -readability-simplify-boolean-expr,

     -zirkon-*,
+
+    -misc-*, # temporarily disabled due to being too slow
+    # also disable checks in other categories which are aliases of checks in misc-*:
+    # https://releases.llvm.org/15.0.0/tools/clang/tools/extra/docs/clang-tidy/checks/list.html
+    -cert-dcl54-cpp, # alias of misc-new-delete-overloads
+    -hicpp-new-delete-operators, # alias of misc-new-delete-overloads
+    -cert-fio38-c, # alias of misc-non-copyable-objects
+    -cert-dcl03-c, # alias of misc-static-assert
+    -hicpp-static-assert, # alias of misc-static-assert
+    -cert-err09-cpp, # alias of misc-throw-by-value-catch-by-reference
+    -cert-err61-cpp, # alias of misc-throw-by-value-catch-by-reference
+    -cppcoreguidelines-c-copy-assignment-signature, # alias of misc-unconventional-assign-operator
+    -cppcoreguidelines-non-private-member-variables-in-classes, # alias of misc-non-private-member-variables-in-classes
 '

 WarningsAsErrors: '*'

+# TODO: use dictionary syntax for CheckOptions when the minimum clang-tidy level is raised to 15
+# some-check.SomeOption: 'some value'
+# instead of
+# - key: some-check.SomeOption
+#   value: 'some value'
 CheckOptions:
   - key: readability-identifier-naming.ClassCase
    value: CamelCase
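The alias bookkeeping in the `.clang-tidy` change above is easy to let drift out of sync by hand. Below is a hypothetical helper (not part of this PR; the alias table is transcribed from the comments above) that derives the extra entries to disable whenever a whole check category is turned off:

```python
# Hypothetical helper: given a disabled category such as "misc-*", print the
# aliases in other categories that must be disabled as well.
# The alias table is copied from the .clang-tidy comments above.
ALIASES = {
    "misc-new-delete-overloads": ["cert-dcl54-cpp", "hicpp-new-delete-operators"],
    "misc-non-copyable-objects": ["cert-fio38-c"],
    "misc-static-assert": ["cert-dcl03-c", "hicpp-static-assert"],
    "misc-throw-by-value-catch-by-reference": ["cert-err09-cpp", "cert-err61-cpp"],
    "misc-unconventional-assign-operator": ["cppcoreguidelines-c-copy-assignment-signature"],
    "misc-non-private-member-variables-in-classes": ["cppcoreguidelines-non-private-member-variables-in-classes"],
}

def entries_for(category: str) -> list:
    """Return '-alias' list entries for every check in the disabled category."""
    prefix = category.rstrip("*")
    return [f"-{alias}, # alias of {check}"
            for check, aliases in ALIASES.items() if check.startswith(prefix)
            for alias in aliases]

if __name__ == "__main__":
    print("\n".join(entries_for("misc-*")))
```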
diff --git a/.exrc b/.exrc
new file mode 100644
index 00000000000..162bd41ce4f
--- /dev/null
+++ b/.exrc
@@ -0,0 +1 @@
+au BufRead,BufNewFile * set tabstop=4 softtabstop=0 expandtab shiftwidth=4 smarttab tags=tags,../tags
diff --git a/.github/ISSUE_TEMPLATE/85_bug-report.md b/.github/ISSUE_TEMPLATE/85_bug-report.md
index 3d2ed6148e3..08d03c284ca 100644
--- a/.github/ISSUE_TEMPLATE/85_bug-report.md
+++ b/.github/ISSUE_TEMPLATE/85_bug-report.md
@@ -13,6 +13,8 @@ assignees: ''

 > A clear and concise description of what works not as it is supposed to.

+> A link to a reproducer in [https://fiddle.clickhouse.com/](https://fiddle.clickhouse.com/).
+
 **Does it reproduce on recent release?**

 [The list of releases](https://github.com/ClickHouse/ClickHouse/blob/master/utils/list-versions/version_date.tsv)
diff --git a/.github/ISSUE_TEMPLATE/96_installation-issues.md b/.github/ISSUE_TEMPLATE/96_installation-issues.md
index c322ccc92ce..e4be8af86b6 100644
--- a/.github/ISSUE_TEMPLATE/96_installation-issues.md
+++ b/.github/ISSUE_TEMPLATE/96_installation-issues.md
@@ -7,6 +7,8 @@ assignees: ''

 ---

+**I have tried the following solutions**: https://clickhouse.com/docs/en/faq/troubleshooting/#troubleshooting-installation-errors
+
 **Installation type**

 Packages, docker, single binary, curl?
diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml
index 30a77a9b27f..c90df6e57b7 100644
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@@ -145,8 +145,8 @@ jobs:
           fetch-depth: 0 # For a proper version and performance artifacts
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -190,8 +190,8 @@ jobs:
           fetch-depth: 0 # For a proper version and performance artifacts
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -233,8 +233,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -276,8 +276,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -319,8 +319,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -364,8 +364,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -409,8 +409,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
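Every Build step in this and the following workflow files makes the same change: the submodule update is no longer recursive, and it fetches shallowly from a single branch. A minimal sketch of the equivalent step driven from Python (an illustration only, not a script this CI uses; `workspace` is a placeholder path):

```python
import subprocess

def update_submodules(workspace: str) -> None:
    """Fetch first-level submodules only, shallowly, from a single branch.

    Compared to the old commands, --recursive is dropped (presumably nested
    submodules are not needed for the build) and --single-branch avoids
    fetching refs for branches other than the recorded one, shrinking the
    amount of data each CI runner downloads.
    """
    git = ["git", "-C", workspace]
    subprocess.check_call(git + ["submodule", "sync"])
    subprocess.check_call(git + ["submodule", "update",
                                 "--single-branch", "--depth=1",
                                 "--init", "--jobs=10"])
```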
"$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -409,8 +409,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" diff --git a/.github/workflows/debug.yml b/.github/workflows/debug.yml index fa980a95a39..993fa8c0d07 100644 --- a/.github/workflows/debug.yml +++ b/.github/workflows/debug.yml @@ -2,7 +2,7 @@ name: Debug 'on': - [push, pull_request, release, workflow_dispatch] + [push, pull_request, release, workflow_dispatch, workflow_call] jobs: DebugInfo: diff --git a/.github/workflows/jepsen.yml b/.github/workflows/jepsen.yml index a8b04af5773..5afc066065e 100644 --- a/.github/workflows/jepsen.yml +++ b/.github/workflows/jepsen.yml @@ -32,10 +32,41 @@ jobs: mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" cd "$REPO_COPY/tests/ci" - python3 keeper_jepsen_check.py + python3 jepsen_check.py keeper - name: Cleanup if: always() run: | docker ps --quiet | xargs --no-run-if-empty docker kill ||: docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: sudo rm -fr "$TEMP_PATH" + # ServerJepsenRelease: + # runs-on: [self-hosted, style-checker] + # if: ${{ always() }} + # needs: [KeeperJepsenRelease] + # steps: + # - name: Set envs + # run: | + # cat >> "$GITHUB_ENV" << 'EOF' + # TEMP_PATH=${{runner.temp}}/server_jepsen + # REPO_COPY=${{runner.temp}}/server_jepsen/ClickHouse + # EOF + # - name: Clear repository + # run: | + # sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + # - name: Check out repository code + # uses: actions/checkout@v2 + # with: + # fetch-depth: 0 + # - name: Jepsen Test + # run: | + # sudo rm -fr "$TEMP_PATH" + # mkdir -p "$TEMP_PATH" + # cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + # cd "$REPO_COPY/tests/ci" + # python3 jepsen_check.py server + # - name: Cleanup + # if: always() + # run: | + # docker ps --quiet | xargs --no-run-if-empty docker kill ||: + # docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + # sudo rm -fr "$TEMP_PATH" diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index fba8a975ca6..f3d672136ef 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -209,8 +209,8 @@ jobs: fetch-depth: 0 # For a proper version and performance artifacts - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -251,8 +251,8 @@ jobs: fetch-depth: 0 # For a proper version and performance artifacts - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -295,8 +295,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: 
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index fba8a975ca6..f3d672136ef 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -209,8 +209,8 @@ jobs:
           fetch-depth: 0 # For a proper version and performance artifacts
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -251,8 +251,8 @@ jobs:
           fetch-depth: 0 # For a proper version and performance artifacts
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -295,8 +295,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -338,8 +338,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -381,8 +381,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -424,8 +424,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -467,8 +467,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -510,8 +510,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -556,8 +556,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -599,8 +599,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -644,8 +644,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -689,8 +689,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -734,8 +734,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -779,8 +779,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -824,8 +824,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -869,8 +869,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -914,8 +914,8 @@ jobs:
           fetch-depth: 0 # otherwise we will have no info about contributors
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -1056,6 +1056,23 @@ jobs:
           docker ps --quiet | xargs --no-run-if-empty docker kill ||:
           docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
           sudo rm -fr "$TEMP_PATH"
+  MarkReleaseReady:
+    needs:
+      - BuilderBinDarwin
+      - BuilderBinDarwinAarch64
+      - BuilderDebRelease
+      - BuilderDebAarch64
+    runs-on: [self-hosted, style-checker]
+    steps:
+      - name: Clear repository
+        run: |
rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Mark Commit Release Ready + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 mark_release_ready.py ############################################################################################## ########################### FUNCTIONAl STATELESS TESTS ####################################### ############################################################################################## @@ -2994,10 +3011,227 @@ jobs: docker ps --quiet | xargs --no-run-if-empty docker kill ||: docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: sudo rm -fr "$TEMP_PATH" + PerformanceComparisonAarch-0: + needs: [BuilderDebAarch64] + runs-on: [self-hosted, func-tester-aarch64] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison Aarch64 + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + PerformanceComparisonAarch-1: + needs: [BuilderDebAarch64] + runs-on: [self-hosted, func-tester-aarch64] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison Aarch64 + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + PerformanceComparisonAarch-2: + needs: [BuilderDebAarch64] + runs-on: [self-hosted, func-tester-aarch64] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison Aarch64 + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir 
"$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + PerformanceComparisonAarch-3: + needs: [BuilderDebAarch64] + runs-on: [self-hosted, func-tester-aarch64] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/performance_comparison + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Performance Comparison Aarch64 + REPO_COPY=${{runner.temp}}/performance_comparison/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Performance Comparison + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 performance_comparison_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" +############################################################################################## +###################################### SQLANCER FUZZERS ###################################### +############################################################################################## + SQLancerTestRelease: + needs: [BuilderDebRelease] + runs-on: [self-hosted, fuzzer-unit-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/sqlancer_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=SQLancer (release) + REPO_COPY=${{runner.temp}}/sqlancer_release/ClickHouse + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: SQLancer + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 sqlancer_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + SQLancerTestDebug: + needs: [BuilderDebDebug] + runs-on: [self-hosted, fuzzer-unit-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/sqlancer_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=SQLancer (debug) + REPO_COPY=${{runner.temp}}/sqlancer_debug/ClickHouse + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: SQLancer + 
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 sqlancer_check.py "$CHECK_NAME"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
   FinishCheck:
     needs:
       - DockerHubPush
       - BuilderReport
+      - BuilderSpecialReport
+      - MarkReleaseReady
       - FunctionalStatelessTestDebug0
       - FunctionalStatelessTestDebug1
       - FunctionalStatelessTestDebug2
@@ -3053,6 +3287,8 @@ jobs:
       - UnitTestsUBsan
       - UnitTestsReleaseClang
       - SharedBuildSmokeTest
+      - SQLancerTestRelease
+      - SQLancerTestDebug
     runs-on: [self-hosted, style-checker]
     steps:
       - name: Clear repository
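The recurring pattern in master.yml above, and in pull_request.yml below, is hash-based test sharding: a suite's RUN_BY_HASH_TOTAL is raised and matching RUN_BY_HASH_NUM=N jobs are added, so each runner executes a disjoint slice of the tests. A minimal sketch of what the test scripts plausibly do with those two variables (the stable-hash choice here is an assumption, not the scripts' actual code):

```python
import os
import zlib

def is_in_shard(test_name: str) -> bool:
    """Keep a test iff its hash falls into this job's shard.

    RUN_BY_HASH_NUM selects the shard and RUN_BY_HASH_TOTAL is the shard
    count; a stable hash (crc32 here) makes the split identical on every
    runner, so the shards are disjoint and together cover all tests.
    """
    num = int(os.environ.get("RUN_BY_HASH_NUM", 0))
    total = int(os.environ.get("RUN_BY_HASH_TOTAL", 1))
    return zlib.crc32(test_name.encode()) % total == num

# Example: each of the TOTAL jobs prints only its own subset.
tests = ["00001_select", "00002_insert", "00003_alter"]
print([t for t in tests if is_in_shard(t)])
```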
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 612bb1f8f9b..9ebbe4e090d 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -10,6 +10,9 @@ env:
   workflow_dispatch:

 jobs:
+  Debug:
+    # This job preserves the ENV and event.json for later investigation
+    uses: ./.github/workflows/debug.yml
   DockerHubPushAarch64:
     runs-on: [self-hosted, style-checker-aarch64]
     steps:
@@ -102,7 +105,7 @@ jobs:
       - name: Build
         run: |
           git -C "$GITHUB_WORKSPACE" submodule sync
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 23245c16374..857e2c7f604 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -22,6 +22,8 @@ on:  # yamllint disable-line rule:truthy
 jobs:
   CheckLabels:
     runs-on: [self-hosted, style-checker]
+    # Run the first check always, even if the CI is cancelled
+    if: ${{ always() }}
     steps:
       - name: Clear repository
         run: |
@@ -112,7 +114,8 @@ jobs:
   StyleCheck:
     needs: DockerHubPush
     runs-on: [self-hosted, style-checker]
-    if: ${{ success() || failure() || always() }}
+    # We need the additional `&& ! cancelled()` so that the job can be cancelled
+    if: ${{ success() || failure() || ( always() && ! cancelled() ) }}
     steps:
       - name: Set envs
         run: |
@@ -272,8 +275,8 @@ jobs:
           fetch-depth: 0 # for performance artifact
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -315,8 +318,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -360,8 +363,8 @@ jobs:
           fetch-depth: 0 # for performance artifact
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -403,8 +406,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -446,8 +449,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -489,8 +492,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -532,8 +535,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -575,8 +578,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -621,8 +624,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -664,8 +667,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -707,8 +710,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -750,8 +753,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -793,8 +796,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -836,8 +839,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -879,8 +882,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -922,8 +925,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
           sudo rm -fr "$TEMP_PATH"
           mkdir -p "$TEMP_PATH"
           cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@@ -965,8 +968,8 @@ jobs:
         uses: actions/checkout@v2
       - name: Build
         run: |
"$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -1157,7 +1160,7 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 - RUN_BY_HASH_TOTAL=2 + RUN_BY_HASH_TOTAL=4 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1194,7 +1197,81 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 - RUN_BY_HASH_TOTAL=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestReleaseDatabaseReplicated2: + needs: [BuilderDebRelease] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_database_replicated + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, DatabaseReplicated) + REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestReleaseDatabaseReplicated3: + needs: [BuilderDebRelease] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_database_replicated + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, DatabaseReplicated) + REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1253,7 +1330,7 @@ jobs: docker ps --quiet | xargs --no-run-if-empty docker kill ||: docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: sudo rm -fr "$TEMP_PATH" - FunctionalStatelessTestReleaseS3: + FunctionalStatelessTestReleaseS3_0: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] steps: @@ -1265,6 +1342,45 @@ 
@@ -1265,6 +1342,45 @@ jobs:
           CHECK_NAME=Stateless tests (release, s3 storage)
           REPO_COPY=${{runner.temp}}/stateless_s3_storage/ClickHouse
           KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=0
+          RUN_BY_HASH_TOTAL=2
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestReleaseS3_1:
+    needs: [BuilderDebRelease]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (release, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=1
+          RUN_BY_HASH_TOTAL=2
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1301,7 +1417,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=0
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=6
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1338,7 +1454,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=1
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=6
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1375,7 +1491,118 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=2
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=6
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Debug3:
+    needs: [BuilderDebDebug]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (debug, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=3
+          RUN_BY_HASH_TOTAL=6
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+ cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Debug4: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=4 + RUN_BY_HASH_TOTAL=6 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Debug5: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=5 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1412,7 +1639,7 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=5 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1449,7 +1676,7 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=5 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1486,7 +1713,81 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=5 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Tsan3: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + 
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (tsan, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=3
+          RUN_BY_HASH_TOTAL=5
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Tsan4:
+    needs: [BuilderDebTsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (tsan, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=4
+          RUN_BY_HASH_TOTAL=5
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1558,7 +1859,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=0
-          RUN_BY_HASH_TOTAL=2
+          RUN_BY_HASH_TOTAL=4
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1595,7 +1896,81 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=1
-          RUN_BY_HASH_TOTAL=2
+          RUN_BY_HASH_TOTAL=4
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestAsan2:
+    needs: [BuilderDebAsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (asan)
+          REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=2
+          RUN_BY_HASH_TOTAL=4
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestAsan3:
+    needs: [BuilderDebAsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (asan)
+          REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=3
+          RUN_BY_HASH_TOTAL=4
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1632,7 +2007,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=0
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=5
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1669,7 +2044,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=1
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=5
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1706,7 +2081,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=2
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=5
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1730,7 +2105,81 @@ jobs:
           docker ps --quiet | xargs --no-run-if-empty docker kill ||:
           docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
           sudo rm -fr "$TEMP_PATH"
-  FunctionalStatelessTestUBsan:
+  FunctionalStatelessTestTsan3:
+    needs: [BuilderDebTsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_tsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (tsan)
+          REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=3
+          RUN_BY_HASH_TOTAL=5
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestTsan4:
+    needs: [BuilderDebTsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_tsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (tsan)
+          REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=4
+          RUN_BY_HASH_TOTAL=5
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestUBsan0:
     needs: [BuilderDebUBsan]
     runs-on: [self-hosted, func-tester]
     steps:
@@ -1742,6 +2191,45 @@ jobs:
           CHECK_NAME=Stateless tests (ubsan)
           REPO_COPY=${{runner.temp}}/stateless_ubsan/ClickHouse
           KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=0
+          RUN_BY_HASH_TOTAL=2
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestUBsan1:
+    needs: [BuilderDebUBsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_ubsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (ubsan)
+          REPO_COPY=${{runner.temp}}/stateless_ubsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=1
+          RUN_BY_HASH_TOTAL=2
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1778,7 +2266,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=0
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=6
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1815,7 +2303,7 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=1
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=6
           EOF
       - name: Download json reports
         uses: actions/download-artifact@v2
@@ -1852,7 +2340,118 @@ jobs:
           REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse
           KILL_TIMEOUT=10800
           RUN_BY_HASH_NUM=2
-          RUN_BY_HASH_TOTAL=3
+          RUN_BY_HASH_TOTAL=6
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestMsan3:
+    needs: [BuilderDebMsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_memory
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (msan)
+          REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=3
+          RUN_BY_HASH_TOTAL=6
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestMsan4: + needs: [BuilderDebMsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (msan) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=4 + RUN_BY_HASH_TOTAL=6 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestMsan5: + needs: [BuilderDebMsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_memory + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (msan) + REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=5 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1889,7 +2488,7 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=5 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1926,7 +2525,7 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=5 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1963,7 +2562,81 @@ jobs: REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=5 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestDebug3: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: 
| + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=5 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestDebug4: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug) + REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=4 + RUN_BY_HASH_TOTAL=5 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2681,7 +3354,7 @@ jobs: CHECK_NAME=Integration tests (asan) REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse RUN_BY_HASH_NUM=0 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2717,7 +3390,7 @@ jobs: CHECK_NAME=Integration tests (asan) REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse RUN_BY_HASH_NUM=1 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2753,7 +3426,115 @@ jobs: CHECK_NAME=Integration tests (asan) REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse RUN_BY_HASH_NUM=2 - RUN_BY_HASH_TOTAL=3 + RUN_BY_HASH_TOTAL=6 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + IntegrationTestsAsan3: + needs: [BuilderDebAsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=6 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | 
+ sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + IntegrationTestsAsan4: + needs: [BuilderDebAsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=4 + RUN_BY_HASH_TOTAL=6 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + IntegrationTestsAsan5: + needs: [BuilderDebAsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_asan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (asan) + REPO_COPY=${{runner.temp}}/integration_tests_asan/ClickHouse + RUN_BY_HASH_NUM=5 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2789,7 +3570,7 @@ jobs: CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=0 - RUN_BY_HASH_TOTAL=4 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2825,7 +3606,7 @@ jobs: CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=1 - RUN_BY_HASH_TOTAL=4 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2861,7 +3642,7 @@ jobs: CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=2 - RUN_BY_HASH_TOTAL=4 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2897,7 +3678,79 @@ jobs: CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=3 - RUN_BY_HASH_TOTAL=4 + RUN_BY_HASH_TOTAL=6 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr 
"$TEMP_PATH" + IntegrationTestsTsan4: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (tsan) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=4 + RUN_BY_HASH_TOTAL=6 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + IntegrationTestsTsan5: + needs: [BuilderDebTsan] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (tsan) + REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse + RUN_BY_HASH_NUM=5 + RUN_BY_HASH_TOTAL=6 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2933,7 +3786,7 @@ jobs: CHECK_NAME=Integration tests (release) REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse RUN_BY_HASH_NUM=0 - RUN_BY_HASH_TOTAL=2 + RUN_BY_HASH_TOTAL=4 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -2969,7 +3822,79 @@ jobs: CHECK_NAME=Integration tests (release) REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse RUN_BY_HASH_NUM=1 - RUN_BY_HASH_TOTAL=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + IntegrationTestsRelease2: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=4 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Integration test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + 
python3 integration_test_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + IntegrationTestsRelease3: + needs: [BuilderDebRelease] + runs-on: [self-hosted, stress-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/integration_tests_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Integration tests (release) + REPO_COPY=${{runner.temp}}/integration_tests_release/ClickHouse + RUN_BY_HASH_NUM=3 + RUN_BY_HASH_TOTAL=4 EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -3491,6 +4416,77 @@ jobs: docker ps --quiet | xargs --no-run-if-empty docker kill ||: docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: sudo rm -fr "$TEMP_PATH" +############################################################################################## +###################################### SQLANCER FUZZERS ###################################### +############################################################################################## + SQLancerTestRelease: + needs: [BuilderDebRelease] + runs-on: [self-hosted, fuzzer-unit-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/sqlancer_release + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=SQLancer (release) + REPO_COPY=${{runner.temp}}/sqlancer_release/ClickHouse + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: SQLancer + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 sqlancer_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" + SQLancerTestDebug: + needs: [BuilderDebDebug] + runs-on: [self-hosted, fuzzer-unit-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/sqlancer_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=SQLancer (debug) + REPO_COPY=${{runner.temp}}/sqlancer_debug/ClickHouse + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: SQLancer + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 sqlancer_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" ############################################################################################# ###################################### JEPSEN TESTS ######################################### ############################################################################################# @@ -3501,7 +4497,6 @@ jobs: if: 
contains(github.event.pull_request.labels.*.name, 'jepsen-test') needs: [BuilderBinRelease] uses: ./.github/workflows/jepsen.yml - FinishCheck: needs: - StyleCheck @@ -3509,24 +4504,37 @@ jobs: - DockerServerImages - CheckLabels - BuilderReport + - BuilderSpecialReport - FastTest - FunctionalStatelessTestDebug0 - FunctionalStatelessTestDebug1 - FunctionalStatelessTestDebug2 + - FunctionalStatelessTestDebug3 + - FunctionalStatelessTestDebug4 - FunctionalStatelessTestRelease - FunctionalStatelessTestReleaseDatabaseReplicated0 - FunctionalStatelessTestReleaseDatabaseReplicated1 + - FunctionalStatelessTestReleaseDatabaseReplicated2 + - FunctionalStatelessTestReleaseDatabaseReplicated3 - FunctionalStatelessTestReleaseWideParts - FunctionalStatelessTestAarch64 - FunctionalStatelessTestAsan0 - FunctionalStatelessTestAsan1 + - FunctionalStatelessTestAsan2 + - FunctionalStatelessTestAsan3 - FunctionalStatelessTestTsan0 - FunctionalStatelessTestTsan1 - FunctionalStatelessTestTsan2 + - FunctionalStatelessTestTsan3 + - FunctionalStatelessTestTsan4 - FunctionalStatelessTestMsan0 - FunctionalStatelessTestMsan1 - FunctionalStatelessTestMsan2 - - FunctionalStatelessTestUBsan + - FunctionalStatelessTestMsan3 + - FunctionalStatelessTestMsan4 + - FunctionalStatelessTestMsan5 + - FunctionalStatelessTestUBsan0 + - FunctionalStatelessTestUBsan1 - FunctionalStatefulTestDebug - FunctionalStatefulTestRelease - FunctionalStatefulTestAarch64 @@ -3534,13 +4542,17 @@ jobs: - FunctionalStatefulTestTsan - FunctionalStatefulTestMsan - FunctionalStatefulTestUBsan - - FunctionalStatelessTestReleaseS3 + - FunctionalStatelessTestReleaseS3_0 + - FunctionalStatelessTestReleaseS3_1 - FunctionalStatelessTestS3Debug0 - FunctionalStatelessTestS3Debug1 - FunctionalStatelessTestS3Debug2 + - FunctionalStatelessTestS3Debug4 + - FunctionalStatelessTestS3Debug5 - FunctionalStatelessTestS3Tsan0 - FunctionalStatelessTestS3Tsan1 - FunctionalStatelessTestS3Tsan2 + - FunctionalStatelessTestS3Tsan4 - StressTestDebug - StressTestAsan - StressTestTsan @@ -3554,12 +4566,19 @@ jobs: - IntegrationTestsAsan0 - IntegrationTestsAsan1 - IntegrationTestsAsan2 + - IntegrationTestsAsan3 + - IntegrationTestsAsan4 + - IntegrationTestsAsan5 - IntegrationTestsRelease0 - IntegrationTestsRelease1 + - IntegrationTestsRelease2 + - IntegrationTestsRelease3 - IntegrationTestsTsan0 - IntegrationTestsTsan1 - IntegrationTestsTsan2 - IntegrationTestsTsan3 + - IntegrationTestsTsan4 + - IntegrationTestsTsan5 - PerformanceComparisonX86-0 - PerformanceComparisonX86-1 - PerformanceComparisonX86-2 @@ -3576,6 +4595,8 @@ jobs: - SharedBuildSmokeTest - CompatibilityCheck - IntegrationTestsFlakyCheck + - SQLancerTestRelease + - SQLancerTestDebug runs-on: [self-hosted, style-checker] steps: - name: Clear repository diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index abe85d3e72d..bf35ca76fc6 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -136,8 +136,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -178,8 +178,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about 
contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -220,8 +220,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -263,8 +263,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -306,8 +306,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -349,8 +349,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -392,8 +392,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -437,8 +437,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -482,8 +482,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -615,6 +615,23 @@ jobs: docker ps --quiet | 
xargs --no-run-if-empty docker kill ||: docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: sudo rm -fr "$TEMP_PATH" + MarkReleaseReady: + needs: + - BuilderBinDarwin + - BuilderBinDarwinAarch64 + - BuilderDebRelease + - BuilderDebAarch64 + runs-on: [self-hosted, style-checker] + steps: + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Mark Commit Release Ready + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 mark_release_ready.py ############################################################################################## ########################### FUNCTIONAl STATELESS TESTS ####################################### ############################################################################################## @@ -1888,6 +1905,7 @@ jobs: - DockerServerImages - BuilderReport - BuilderSpecialReport + - MarkReleaseReady - FunctionalStatelessTestDebug0 - FunctionalStatelessTestDebug1 - FunctionalStatelessTestDebug2 diff --git a/.github/workflows/tags_stable.yml b/.github/workflows/tags_stable.yml index a9172a8a2e2..f8cfa1137cc 100644 --- a/.github/workflows/tags_stable.yml +++ b/.github/workflows/tags_stable.yml @@ -38,7 +38,7 @@ jobs: with: ref: master fetch-depth: 0 - - name: Generate versions + - name: Update versions, docker version, changelog, security env: GITHUB_TOKEN: ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }} run: | @@ -51,6 +51,7 @@ jobs: --gh-user-or-token="$GITHUB_TOKEN" --jobs=5 \ --output="/ClickHouse/docs/changelogs/${GITHUB_TAG}.md" "${GITHUB_TAG}" git add "./docs/changelogs/${GITHUB_TAG}.md" + python3 ./utils/security-generator/generate_security.py > SECURITY.md git diff HEAD - name: Create Pull Request uses: peter-evans/create-pull-request@v3 @@ -60,6 +61,7 @@ jobs: committer: "robot-clickhouse " commit-message: Update version_date.tsv and changelogs after ${{ env.GITHUB_TAG }} branch: auto/${{ env.GITHUB_TAG }} + assignees: ${{ github.event.sender.login }} # assign the PR to the tag pusher delete-branch: true title: Update version_date.tsv and changelogs after ${{ env.GITHUB_TAG }} labels: do not test diff --git a/.gitignore b/.gitignore index 5b8f2ca452d..6d94cade384 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ # logs *.log +*.debuglog *.stderr *.stdout @@ -154,3 +155,6 @@ website/package-lock.json /programs/server/metadata /programs/server/store +# temporary test files +tests/queries/0_stateless/test_* +tests/queries/0_stateless/*.binary diff --git a/.gitmodules b/.gitmodules index 293029ad171..a4cfcc91485 100644 --- a/.gitmodules +++ b/.gitmodules @@ -65,12 +65,6 @@ [submodule "contrib/libgsasl"] path = contrib/libgsasl url = https://github.com/ClickHouse/libgsasl.git -[submodule "contrib/libcxx"] - path = contrib/libcxx - url = https://github.com/ClickHouse/libcxx.git -[submodule "contrib/libcxxabi"] - path = contrib/libcxxabi - url = https://github.com/ClickHouse/libcxxabi.git [submodule "contrib/snappy"] path = contrib/snappy url = https://github.com/ClickHouse/snappy.git @@ -290,3 +284,9 @@ [submodule "contrib/morton-nd"] path = contrib/morton-nd url = https://github.com/morton-nd/morton-nd +[submodule "contrib/xxHash"] + path = contrib/xxHash + url = https://github.com/Cyan4973/xxHash.git +[submodule "contrib/google-benchmark"] + path = contrib/google-benchmark + url = https://github.com/google/benchmark.git diff --git a/.vimrc b/.vimrc deleted file mode 100644 index ba996eb8a42..00000000000 --- a/.vimrc +++ 
/dev/null @@ -1,2 +0,0 @@ -au BufRead,BufNewFile ./* set tabstop=4 softtabstop=0 expandtab shiftwidth=4 smarttab tags=tags,../tags - diff --git a/CHANGELOG.md b/CHANGELOG.md index 68767612892..0e41894b8bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ ### Table of Contents +**[ClickHouse release v22.11, 2022-11-17](#2211)**
**[ClickHouse release v22.10, 2022-10-25](#2210)**
**[ClickHouse release v22.9, 2022-09-22](#229)**
**[ClickHouse release v22.8-lts, 2022-08-18](#228)**
@@ -11,6 +12,109 @@ **[ClickHouse release v22.1, 2022-01-18](#221)**
**[Changelog for 2021](https://clickhouse.com/docs/en/whats-new/changelog/2021/)**
+### ClickHouse release 22.11, 2022-11-17 + +#### Backward Incompatible Change +* `JSONExtract` family of functions will now attempt to coerce to the requested type. [#41502](https://github.com/ClickHouse/ClickHouse/pull/41502) ([Márcio Martins](https://github.com/marcioapm)). + +#### New Feature +* Add support for retries during INSERTs into ReplicatedMergeTree when a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide a better user experience: avoid returning an error to the user during an insert if Keeper is restarted (for example, due to an upgrade). This is controlled by the `insert_keeper_max_retries` setting, which is disabled by default. [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). +* Add `Hudi` and `DeltaLake` table engines, read-only, only for tables on S3. [#41054](https://github.com/ClickHouse/ClickHouse/pull/41054) ([Daniil Rubin](https://github.com/rubin-do), [Kseniia Sumarokova](https://github.com/kssenii)). +* Add table functions `hudi` and `deltaLake`. [#43080](https://github.com/ClickHouse/ClickHouse/pull/43080) ([flynn](https://github.com/ucasfl)). +* Support for composite time intervals. 1. Add, subtract and negate operations are now available on Intervals. In the case where the types of Intervals are different, they will be transformed into the Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added `**` glob support for recursive directory traversal of the filesystem and S3. Resolves [#36316](https://github.com/ClickHouse/ClickHouse/issues/36316). [#42376](https://github.com/ClickHouse/ClickHouse/pull/42376) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Introduce `s3_plain` disk type for write-once-read-many operations. Implement `ATTACH` of `MergeTree` table for `s3_plain` disk. [#42628](https://github.com/ClickHouse/ClickHouse/pull/42628) ([Azat Khuzhin](https://github.com/azat)). +* Added applied row-level policies to `system.query_log`. [#39819](https://github.com/ClickHouse/ClickHouse/pull/39819) ([Vladimir Chebotaryov](https://github.com/quickhouse)). +* Add four-letter command `csnp` for manually creating snapshots in ClickHouse Keeper. Additionally, `lgif` was added to get Raft information for a specific node (e.g. index of last created snapshot, last committed log index). [#41766](https://github.com/ClickHouse/ClickHouse/pull/41766) ([JackyWoo](https://github.com/JackyWoo)). +* Add function `ascii` like in Apache Spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). +* Add function `positive_modulo` (`pmod`), which returns a non-negative result of the modulo operation (an illustrative sketch of the semantics follows after the changelog hunks below). [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). +* Add function `formatReadableDecimalSize`. [#42774](https://github.com/ClickHouse/ClickHouse/pull/42774) ([Alejandro](https://github.com/alexon1234)). +* Add function `randCanonical`, which is similar to the `rand` function in Apache Spark or Impala. The function generates pseudo-random results: independent and identically distributed values, uniformly distributed in [0, 1).
[#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)). +* Add function `displayName`, closes [#36770](https://github.com/ClickHouse/ClickHouse/issues/36770). [#37681](https://github.com/ClickHouse/ClickHouse/pull/37681) ([hongbin](https://github.com/xlwh)). +* Add `min_age_to_force_merge_on_partition_only` setting to optimize old parts for the entire partition only. [#42659](https://github.com/ClickHouse/ClickHouse/pull/42659) ([Antonio Andelic](https://github.com/antonio2368)). +* Add generic implementation for arbitrary structured named collections, access type and `system.named_collections`. [#43147](https://github.com/ClickHouse/ClickHouse/pull/43147) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Performance Improvement +* Parallelized merging of `uniqExact` states for aggregation without key, i.e. queries like `SELECT uniqExact(number) FROM table`. The improvement becomes noticeable when the number of unique keys approaches 10^6. Also `uniq` performance is slightly optimized. [#43072](https://github.com/ClickHouse/ClickHouse/pull/43072) ([Nikita Taranov](https://github.com/nickitat)). +* `match` function can use the index if it's a condition on string prefix. This closes [#37333](https://github.com/ClickHouse/ClickHouse/issues/37333). [#42458](https://github.com/ClickHouse/ClickHouse/pull/42458) ([clarkcaoliu](https://github.com/Clark0)). +* Speed up AND and OR operators when they are sequenced. [#42214](https://github.com/ClickHouse/ClickHouse/pull/42214) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Support parallel parsing for `LineAsString` input format. This improves performance just slightly. This closes [#42502](https://github.com/ClickHouse/ClickHouse/issues/42502). [#42780](https://github.com/ClickHouse/ClickHouse/pull/42780) ([Kruglov Pavel](https://github.com/Avogar)). +* ClickHouse Keeper performance improvement: improve commit performance for cases when many different nodes have uncommitted states. This should help with cases when a follower node can't sync fast enough. [#42926](https://github.com/ClickHouse/ClickHouse/pull/42926) ([Antonio Andelic](https://github.com/antonio2368)). +* A condition like `NOT LIKE 'prefix%'` can use the primary index. [#42209](https://github.com/ClickHouse/ClickHouse/pull/42209) ([Duc Canh Le](https://github.com/canhld94)). + +#### Experimental Feature +* Support type `Object` inside other types, e.g. `Array(JSON)`. [#36969](https://github.com/ClickHouse/ClickHouse/pull/36969) ([Anton Popov](https://github.com/CurtizJ)). +* Ignore MySQL binlog SAVEPOINT event for MaterializedMySQL. [#42931](https://github.com/ClickHouse/ClickHouse/pull/42931) ([zzsmdfj](https://github.com/zzsmdfj)). Handle (ignore) SAVEPOINT queries in MaterializedMySQL. [#43086](https://github.com/ClickHouse/ClickHouse/pull/43086) ([Stig Bakken](https://github.com/stigsb)). + +#### Improvement +* Trivial queries with small LIMIT will properly determine the number of estimated rows to read, so that the threshold will be checked properly. Closes [#7071](https://github.com/ClickHouse/ClickHouse/issues/7071). [#42580](https://github.com/ClickHouse/ClickHouse/pull/42580) ([Han Fei](https://github.com/hanfei1991)). +* Add support for interactive parameters in INSERT VALUES queries. [#43077](https://github.com/ClickHouse/ClickHouse/pull/43077) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added new field `allow_readonly` in `system.table_functions` to allow using table functions in readonly mode. 
Resolves [#42414](https://github.com/ClickHouse/ClickHouse/issues/42414). Implementation: * Added a new field allow_readonly to the table system.table_functions. * Updated to use the new field allow_readonly to allow using table functions in readonly mode. Testing: * Added a test for filesystem tests/queries/0_stateless/02473_functions_in_readonly_mode.sh. Documentation: * Updated the English documentation for Table Functions. [#42708](https://github.com/ClickHouse/ClickHouse/pull/42708) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* The `system.asynchronous_metrics` table gets embedded documentation. This documentation is also exported to Prometheus. Fixed an error with the metrics about `cache` disks - they were calculated only for one arbitrary cache disk instead of all of them. This closes [#7644](https://github.com/ClickHouse/ClickHouse/issues/7644). [#43194](https://github.com/ClickHouse/ClickHouse/pull/43194) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The throttling algorithm was changed to token bucket. [#42665](https://github.com/ClickHouse/ClickHouse/pull/42665) ([Sergei Trifonov](https://github.com/serxa)). +* Mask passwords and secret keys both in `system.query_log` and `/var/log/clickhouse-server/*.log` and also in error messages. [#42484](https://github.com/ClickHouse/ClickHouse/pull/42484) ([Vitaly Baranov](https://github.com/vitlibar)). +* Remove covered parts for a fetched part (to avoid possible replication delay growth). [#39737](https://github.com/ClickHouse/ClickHouse/pull/39737) ([Azat Khuzhin](https://github.com/azat)). +* If `/dev/tty` is available, the progress in clickhouse-client and clickhouse-local will be rendered directly to the terminal, without writing to STDERR. It allows getting progress even if STDERR is redirected to a file, and the file will not be polluted by terminal escape sequences. The progress can be disabled by `--progress false`. This closes [#32238](https://github.com/ClickHouse/ClickHouse/issues/32238). [#42003](https://github.com/ClickHouse/ClickHouse/pull/42003) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add support for `FixedString` input to base64 coding functions. [#42285](https://github.com/ClickHouse/ClickHouse/pull/42285) ([ltrk2](https://github.com/ltrk2)). +* Add columns `bytes_on_disk` and `path` to `system.detached_parts`. Closes [#42264](https://github.com/ClickHouse/ClickHouse/issues/42264). [#42303](https://github.com/ClickHouse/ClickHouse/pull/42303) ([chen](https://github.com/xiedeyantu)). +* Improve using the structure from the insertion table in table functions: the setting `use_structure_from_insertion_table_in_table_functions` has a new possible value `2`, which means that ClickHouse will automatically try to determine whether the structure from the insertion table can be used. Closes [#40028](https://github.com/ClickHouse/ClickHouse/issues/40028). [#42320](https://github.com/ClickHouse/ClickHouse/pull/42320) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix the lack of progress indication on INSERT FROM INFILE. Closes [#42548](https://github.com/ClickHouse/ClickHouse/issues/42548). [#42634](https://github.com/ClickHouse/ClickHouse/pull/42634) ([chen](https://github.com/xiedeyantu)). +* Refactor the function `tokens` to enable limiting the maximum number of tokens returned by related functions (disabled by default). [#42673](https://github.com/ClickHouse/ClickHouse/pull/42673) ([李扬](https://github.com/taiyang-li)). +* Allow using `Date32` arguments for `formatDateTime` and `FROM_UNIXTIME` functions.
[#42737](https://github.com/ClickHouse/ClickHouse/pull/42737) ([Roman Vasin](https://github.com/rvasin)). +* Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. [#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add `FailedAsyncInsertQuery` event metric for async inserts. [#42814](https://github.com/ClickHouse/ClickHouse/pull/42814) ([Krzysztof Góralski](https://github.com/kgoralski)). +* Implement `read-in-order` optimization on top of the query plan. It is enabled by default. Set `query_plan_read_in_order = 0` to use the previous AST-based version. [#42829](https://github.com/ClickHouse/ClickHouse/pull/42829) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Increase the upload part size exponentially for backups to S3 to avoid hitting the 10,000-part limit of S3 multipart uploads. [#42833](https://github.com/ClickHouse/ClickHouse/pull/42833) ([Vitaly Baranov](https://github.com/vitlibar)). +* Drop completely expired parts without requiring extra free disk space: previously, when the merge task was continuously busy and disk space was insufficient, completely expired parts could not be selected and dropped, resulting in insufficient disk space. Now, when an entire part has expired, no additional disk space is required to guarantee the normal execution of TTL. [#42869](https://github.com/ClickHouse/ClickHouse/pull/42869) ([zhongyuankai](https://github.com/zhongyuankai)). +* Add the `oss` function and `OSS` table engine (this is convenient for users); OSS is fully compatible with S3. [#43155](https://github.com/ClickHouse/ClickHouse/pull/43155) ([zzsmdfj](https://github.com/zzsmdfj)). +* Improve error reporting in the collection of OS-related info for the `system.asynchronous_metrics` table. [#43192](https://github.com/ClickHouse/ClickHouse/pull/43192) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Modify the `INFORMATION_SCHEMA` tables so that ClickHouse can connect to itself using the MySQL compatibility protocol. Add columns instead of aliases (related to [#9769](https://github.com/ClickHouse/ClickHouse/issues/9769)). It will improve the compatibility with various MySQL clients. [#43198](https://github.com/ClickHouse/ClickHouse/pull/43198) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Add some functions for compatibility with Power BI when it connects using the MySQL protocol. [#42612](https://github.com/ClickHouse/ClickHouse/pull/42612) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Better usability for the Dashboard on changes. [#42872](https://github.com/ClickHouse/ClickHouse/pull/42872) ([Vladimir C](https://github.com/vdimir)). + +#### Build/Testing/Packaging Improvement +* Run SQLancer for each pull request and commit to master. [SQLancer](https://github.com/sqlancer/sqlancer) is an open-source fuzzer that focuses on automatic detection of logical bugs. [#42397](https://github.com/ClickHouse/ClickHouse/pull/42397) ([Ilya Yatsishin](https://github.com/qoega)). +* Update to the latest zlib-ng. [#42463](https://github.com/ClickHouse/ClickHouse/pull/42463) ([Boris Kuschel](https://github.com/bkuschel)). +* Add support for testing ClickHouse server with Jepsen. By the way, we already have support for testing ClickHouse Keeper with Jepsen.
This pull request extends it to Replicated tables. [#42619](https://github.com/ClickHouse/ClickHouse/pull/42619) ([Antonio Andelic](https://github.com/antonio2368)). +* Use https://github.com/matus-chochlik/ctcache for clang-tidy results caching. [#42913](https://github.com/ClickHouse/ClickHouse/pull/42913) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. The PR fixes it and won't replace the user's files from packages. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Remove some libraries from the Ubuntu Docker image. [#42622](https://github.com/ClickHouse/ClickHouse/pull/42622) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Updated the normalizer to clone the alias AST. Resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452). Implementation: * Updated QueryNormalizer to clone the alias AST when it is replaced. Previously, just assigning the same AST led to an exception in LogicalExpressionsOptimizer, as it would be the same parent being inserted again. * This bug is not seen with the new analyzer (allow_experimental_analyzer), so no changes for it. A test was added for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix a race for backups of tables in `Lazy` databases. [#43104](https://github.com/ClickHouse/ClickHouse/pull/43104) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix for `skip_unavailable_shards`: it did not work with the `s3Cluster` table function. [#43131](https://github.com/ClickHouse/ClickHouse/pull/43131) ([chen](https://github.com/xiedeyantu)). +* Fix schema inference in `s3Cluster` and improve it in `hdfsCluster`. [#41979](https://github.com/ClickHouse/ClickHouse/pull/41979) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix retries while reading from URL table engines / table function (retriable errors could be retried more times than needed, non-retriable errors resulted in a failed assertion in the code). [#42224](https://github.com/ClickHouse/ClickHouse/pull/42224) ([Kseniia Sumarokova](https://github.com/kssenii)). +* A segmentation fault related to DNS & c-ares has been reported and fixed. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). +* Fix `LOGICAL_ERROR` `Arguments of 'plus' have incorrect data types` which may happen in PK analysis (monotonicity check). Fix invalid PK analysis for monotonic binary functions with a constant first argument. [#42410](https://github.com/ClickHouse/ClickHouse/pull/42410) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix incorrect key analysis when key types cannot be inside Nullable. This fixes [#42456](https://github.com/ClickHouse/ClickHouse/issues/42456). [#42469](https://github.com/ClickHouse/ClickHouse/pull/42469) ([Amos Bird](https://github.com/amosbird)). +* Fix a typo in a setting name that led to bad usage of the schema inference cache while using the setting `input_format_csv_use_best_effort_in_schema_inference`. Closes [#41735](https://github.com/ClickHouse/ClickHouse/issues/41735). [#42536](https://github.com/ClickHouse/ClickHouse/pull/42536) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix creating a Set with a wrong header when the data type is LowCardinality.
Closes [#42460](https://github.com/ClickHouse/ClickHouse/issues/42460). [#42579](https://github.com/ClickHouse/ClickHouse/pull/42579) ([flynn](https://github.com/ucasfl)). +* `(U)Int128` and `(U)Int256` values are now correctly checked in `PREWHERE`. [#42605](https://github.com/ClickHouse/ClickHouse/pull/42605) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix a bug in the functions parser that could have led to a segmentation fault. [#42724](https://github.com/ClickHouse/ClickHouse/pull/42724) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix the locking in `truncate table`. [#42728](https://github.com/ClickHouse/ClickHouse/pull/42728) ([flynn](https://github.com/ucasfl)). +* Fix a possible crash in `web` disks when a file does not exist (or during `OPTIMIZE TABLE FINAL`, which could eventually hit the same error). [#42767](https://github.com/ClickHouse/ClickHouse/pull/42767) ([Azat Khuzhin](https://github.com/azat)). +* Fix `auth_type` mapping in `system.session_log`, by including `SSL_CERTIFICATE` for the enum values. [#42782](https://github.com/ClickHouse/ClickHouse/pull/42782) ([Miel Donkers](https://github.com/mdonkers)). +* Fix stack-use-after-return under ASAN build in the Create User query parser. [#42804](https://github.com/ClickHouse/ClickHouse/pull/42804) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix `lowerUTF8`/`upperUTF8` in the case when a symbol crosses a 16-byte boundary (a very frequent case if you have strings longer than 16 bytes). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)). +* An additional bound check was added to the LZ4 decompression routine to fix misbehaviour in case of malformed input. [#42868](https://github.com/ClickHouse/ClickHouse/pull/42868) ([Nikita Taranov](https://github.com/nickitat)). +* Fix a rare possible hang on query cancellation. [#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). +* Fix incorrect behavior with multiple disjuncts in hash join, close [#42832](https://github.com/ClickHouse/ClickHouse/issues/42832). [#42876](https://github.com/ClickHouse/ClickHouse/pull/42876) ([Vladimir C](https://github.com/vdimir)). +* Fix a null pointer dereference when using `if ... as` while selecting from a three-table join. [#42883](https://github.com/ClickHouse/ClickHouse/pull/42883) ([zzsmdfj](https://github.com/zzsmdfj)). +* Fix a memory sanitizer report in Cluster Discovery, close [#42763](https://github.com/ClickHouse/ClickHouse/issues/42763). [#42905](https://github.com/ClickHouse/ClickHouse/pull/42905) ([Vladimir C](https://github.com/vdimir)). +* Improve DateTime schema inference in case of an empty string. [#42911](https://github.com/ClickHouse/ClickHouse/pull/42911) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix a rare NOT_FOUND_COLUMN_IN_BLOCK error when a projection could be used but no projection is available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771). The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). +* Fix ATTACH TABLE in the `PostgreSQL` database engine if the table contains the DATETIME data type. Closes [#42817](https://github.com/ClickHouse/ClickHouse/issues/42817). [#42960](https://github.com/ClickHouse/ClickHouse/pull/42960) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix lambda parsing.
Closes [#41848](https://github.com/ClickHouse/ClickHouse/issues/41848). [#42979](https://github.com/ClickHouse/ClickHouse/pull/42979) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix incorrect key analysis when nullable keys appear in the middle of a hyperrectangle. This fixes [#43111](https://github.com/ClickHouse/ClickHouse/issues/43111) . [#43133](https://github.com/ClickHouse/ClickHouse/pull/43133) ([Amos Bird](https://github.com/amosbird)). +* Fix several buffer over-reads in deserialization of carefully crafted aggregate function states. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). +* Fix function `if` in case of NULL and const Nullable arguments. Closes [#43069](https://github.com/ClickHouse/ClickHouse/issues/43069). [#43178](https://github.com/ClickHouse/ClickHouse/pull/43178) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix decimal math overflow in parsing DateTime with the 'best effort' algorithm. Closes [#43061](https://github.com/ClickHouse/ClickHouse/issues/43061). [#43180](https://github.com/ClickHouse/ClickHouse/pull/43180) ([Kruglov Pavel](https://github.com/Avogar)). +* The `indent` field produced by the `git-import` tool was miscalculated. See https://clickhouse.com/docs/en/getting-started/example-datasets/github/. [#43191](https://github.com/ClickHouse/ClickHouse/pull/43191) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fixed unexpected behaviour of `Interval` types with subquery and casting. [#43193](https://github.com/ClickHouse/ClickHouse/pull/43193) ([jh0x](https://github.com/jh0x)). + ### ClickHouse release 22.10, 2022-10-26 #### Backward Incompatible Change @@ -570,7 +674,7 @@ * Support SQL standard CREATE INDEX and DROP INDEX syntax. [#35166](https://github.com/ClickHouse/ClickHouse/pull/35166) ([Jianmei Zhang](https://github.com/zhangjmruc)). * Send profile events for INSERT queries (previously only SELECT was supported). [#37391](https://github.com/ClickHouse/ClickHouse/pull/37391) ([Azat Khuzhin](https://github.com/azat)). * Implement in order aggregation (`optimize_aggregation_in_order`) for fully materialized projections. [#37469](https://github.com/ClickHouse/ClickHouse/pull/37469) ([Azat Khuzhin](https://github.com/azat)). -* Remove subprocess run for kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). +* Remove subprocess run for Kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). * * Add setting `multiple_joins_try_to_keep_original_names` to not rewrite identifier name on multiple JOINs rewrite, close [#34697](https://github.com/ClickHouse/ClickHouse/issues/34697). [#38149](https://github.com/ClickHouse/ClickHouse/pull/38149) ([Vladimir C](https://github.com/vdimir)). * Improved trace-visualizer UX. [#38169](https://github.com/ClickHouse/ClickHouse/pull/38169) ([Sergei Trifonov](https://github.com/serxa)). * Enable stack trace collection and query profiler for AArch64. [#38181](https://github.com/ClickHouse/ClickHouse/pull/38181) ([Maksim Kita](https://github.com/kitaisreal)). 
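The `positive_modulo` (`pmod`) function mentioned in the New Feature list above returns a non-negative result regardless of the sign of the dividend, unlike the C-style `%` operator. A minimal C++ sketch of those semantics (the function name here is illustrative, not ClickHouse's actual implementation):

```cpp
#include <cstdint>
#include <iostream>

/// Illustrative sketch of the semantics described for positive_modulo/pmod:
/// for a positive divisor b, the result is always wrapped into [0, b),
/// unlike the built-in % operator, which keeps the sign of the dividend.
int64_t positiveModulo(int64_t a, int64_t b)
{
    return ((a % b) + b) % b;
}

int main()
{
    std::cout << (-7 % 3) << '\n';              /// prints -1: built-in % keeps the dividend's sign
    std::cout << positiveModulo(-7, 3) << '\n'; /// prints 2: the non-negative result
}
```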
@@ -850,8 +954,8 @@ #### Upgrade Notes -* Now, background merges, mutations and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values, and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes of metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. +* Now, background merges, mutations, and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes to the metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant in this area. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. * `max_memory_usage` setting is removed from the default user profile in `users.xml`. This enables flexible memory limits for queries instead of the old rigid limit of 10 GB. * Disable `log_query_threads` setting by default. It controls the logging of statistics about every thread participating in query execution. After supporting asynchronous reads, the total number of distinct thread ids became too large, and logging into the `query_thread_log` has become too heavy. [#37077](https://github.com/ClickHouse/ClickHouse/pull/37077) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Remove function `groupArraySorted` which has a bug. [#36822](https://github.com/ClickHouse/ClickHouse/pull/36822) ([Alexey Milovidov](https://github.com/alexey-milovidov)). diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a04f347b2d..e121559d4e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,6 +111,7 @@ if (ENABLE_FUZZING) set (ENABLE_JEMALLOC 0) set (ENABLE_CHECK_HEAVY_BUILDS 1) set (GLIBC_COMPATIBILITY OFF) + set (ENABLE_BENCHMARKS 0) # For codegen_select_fuzzer set (ENABLE_PROTOBUF 1) @@ -168,6 +169,7 @@ endif () option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" ON) option(ENABLE_EXAMPLES "Build all example programs in 'examples' subdirectories" OFF) +option(ENABLE_BENCHMARKS "Build all benchmark programs in 'benchmarks' subdirectories" OFF) if (OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64) AND USE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND NOT USE_MUSL) # Only for Linux, x86_64 or aarch64. 
@@ -202,7 +204,7 @@ option(ADD_GDB_INDEX_FOR_GOLD "Add .gdb-index to resulting binaries for gold lin if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") # Can be lld or ld-lld or lld-13 or /path/to/lld. - if (LINKER_NAME MATCHES "lld") + if (LINKER_NAME MATCHES "lld" AND OS_LINUX) set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gdb-index") set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gdb-index") message (STATUS "Adding .gdb-index via --gdb-index linker option.") @@ -248,7 +250,7 @@ endif () # Create BuildID when using lld. For other linkers it is created by default. # (NOTE: LINKER_NAME can be either path or name, and in different variants) -if (LINKER_NAME MATCHES "lld") +if (LINKER_NAME MATCHES "lld" AND OS_LINUX) # SHA1 is not cryptographically secure but it is the best what lld is offering. set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--build-id=sha1") endif () @@ -442,8 +444,9 @@ elseif (OS_DARWIN) include(cmake/darwin/default_libs.cmake) elseif (OS_FREEBSD) include(cmake/freebsd/default_libs.cmake) +else() + link_libraries(global-group) endif () -link_libraries(global-group) if (NOT (OS_LINUX OR OS_DARWIN)) # Using system libs can cause a lot of warnings in includes (on macro expansion). @@ -592,7 +595,7 @@ add_subdirectory (programs) add_subdirectory (tests) add_subdirectory (utils) -include (cmake/sanitize_target_link_libraries.cmake) +include (cmake/sanitize_targets.cmake) # Build native targets if necessary get_property(NATIVE_BUILD_TARGETS GLOBAL PROPERTY NATIVE_BUILD_TARGETS) diff --git a/README.md b/README.md index f90df9686c2..59c9c180c90 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,6 @@ ClickHouse® is an open-source column-oriented database management system that a * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any. ## Upcoming events -* [**v22.11 Release Webinar**](https://clickhouse.com/company/events/v22-11-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap. -* [**ClickHouse Meetup at the Deutsche Bank office in Berlin**](https://www.meetup.com/clickhouse-berlin-user-group/events/289311596/) Hear from Deutsche Bank on why they chose ClickHouse for big sensitive data in a regulated environment. The ClickHouse team will then present how ClickHouse is used for real time financial data analytics, including tick data, trade analytics and risk management. -* [**AWS re:Invent**](https://clickhouse.com/company/events/aws-reinvent) Core members of the ClickHouse team -- including 2 of our founders -- will be at re:Invent from November 29 to December 3. We are available on the show floor, but are also determining interest in holding an event during the time there. +* [**v22.12 Release Webinar**](https://clickhouse.com/company/events/v22-12-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap. +* [**ClickHouse Meetup at the CHEQ office in Tel Aviv**](https://www.meetup.com/clickhouse-tel-aviv-user-group/events/289599423/) - Jan 16 - We are very excited to be holding our next in-person ClickHouse meetup at the CHEQ office in Tel Aviv! Hear from CHEQ, ServiceNow and Contentsquare, as well as a deep dive presentation from ClickHouse CTO Alexey Milovidov. 
Join us for a fun evening of talks, food and discussion! +* **ClickHouse Meetup in Seattle** - Keep an eye on this space as we will be announcing a January meetup in Seattle soon! diff --git a/SECURITY.md b/SECURITY.md index 0fb333c8ea3..a4f431d7552 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,3 +1,6 @@ + # Security Policy @@ -10,6 +13,7 @@ The following versions of ClickHouse server are currently being supported with s | Version | Supported | |:-|:-| +| 22.11 | ✔️ | | 22.10 | ✔️ | | 22.9 | ✔️ | | 22.8 | ✔️ | @@ -61,5 +65,5 @@ As the security issue moves from triage, to identified fix, to release planning ## Public Disclosure Timing -A public disclosure date is negotiated by the ClickHouse maintainers and the bug submitter. We prefer to fully disclose the bug as soon as possible once a user mitigation is available. It is reasonable to delay disclosure when the bug or the fix is not yet fully understood, the solution is not well-tested, or for vendor coordination. The timeframe for disclosure is from immediate (especially if it's already publicly known) to 90 days. For a vulnerability with a straightforward mitigation, we expect the report date to disclosure date to be on the order of 7 days. +A public disclosure date is negotiated by the ClickHouse maintainers and the bug submitter. We prefer to fully disclose the bug as soon as possible once a user mitigation is available. It is reasonable to delay disclosure when the bug or the fix is not yet fully understood, the solution is not well-tested, or for vendor coordination. The timeframe for disclosure is from immediate (especially if it's already publicly known) to 90 days. For a vulnerability with a straightforward mitigation, we expect the report date to disclosure date to be on the order of 7 days.
diff --git a/base/base/ReplxxLineReader.cpp b/base/base/ReplxxLineReader.cpp index e0dc81af5b0..b86746365b7 100644 --- a/base/base/ReplxxLineReader.cpp +++ b/base/base/ReplxxLineReader.cpp @@ -17,6 +17,7 @@ #include #include #include +#include <boost/algorithm/string/replace.hpp> #include <boost/algorithm/string/classification.hpp> /// is_any_of namespace @@ -38,7 +39,7 @@ std::string getEditor() return editor; } -std::string getFuzzyFinder() +std::pair<std::string, FuzzyFinderType> getFuzzyFinder() { const char * env_path = std::getenv("PATH"); // NOLINT(concurrency-mt-unsafe) @@ -52,14 +53,20 @@ std::string getFuzzyFinder() std::filesystem::path path(path_str); std::filesystem::path sk_bin_path = path / "sk"; if (!access(sk_bin_path.c_str(), X_OK)) - return sk_bin_path; + return {sk_bin_path, FUZZY_FINDER_SKIM}; std::filesystem::path fzf_bin_path = path / "fzf"; if (!access(fzf_bin_path.c_str(), X_OK)) - return fzf_bin_path; + return {fzf_bin_path, FUZZY_FINDER_FZF}; } - return {}; + return {"", FUZZY_FINDER_NONE}; +} + +String escapeShellArgument(std::string arg) +{ + boost::replace_all(arg, "'", "'\\''"); + return fmt::format("'{}'", arg); } /// See comments in ShellCommand::executeImpl() @@ -305,11 +312,12 @@ ReplxxLineReader::ReplxxLineReader( replxx::Replxx::highlighter_callback_t highlighter_) : LineReader(history_file_path_, multiline_, std::move(extenders_), std::move(delimiters_)), highlighter(std::move(highlighter_)) , editor(getEditor()) - , fuzzy_finder(getFuzzyFinder()) { using namespace std::placeholders; using Replxx = replxx::Replxx; + std::tie(fuzzy_finder, fuzzy_finder_type) = getFuzzyFinder(); + if (!history_file_path.empty()) { history_file_fd = open(history_file_path.c_str(), O_RDWR); @@ -415,11 +423,12 @@ ReplxxLineReader::ReplxxLineReader( rx.bind_key(Replxx::KEY::meta('#'), insert_comment_action); /// interactive search in history (requires fzf/sk) - if (!fuzzy_finder.empty()) + if (fuzzy_finder_type != FUZZY_FINDER_NONE) { auto interactive_history_search = [this](char32_t code) { openInteractiveHistorySearch(); + rx.invoke(Replxx::ACTION::CLEAR_SELF, code); return rx.invoke(Replxx::ACTION::REPAINT, code); }; rx.bind_key(Replxx::KEY::control('R'), interactive_history_search); @@ -515,9 +524,22 @@ void ReplxxLineReader::openInteractiveHistorySearch() /// /// And also note, that fzf and skim is 95% compatible (at least option /// that is used here) - std::string fuzzy_finder_command = fmt::format( - "{} --read0 --tac --no-sort --tiebreak=index --bind=ctrl-r:toggle-sort --height=30% < {} > {}", - fuzzy_finder, history_file.getPath(), output_file.getPath()); + std::string fuzzy_finder_command = fmt::format("{} --read0 --height=30%", fuzzy_finder); + switch (fuzzy_finder_type) + { + case FUZZY_FINDER_SKIM: + fuzzy_finder_command += " --tac --tiebreak=-score"; + break; + case FUZZY_FINDER_FZF: + fuzzy_finder_command += " --tac --tiebreak=index"; + break; + case FUZZY_FINDER_NONE: + /// assertion for !fuzzy_finder.empty() is enough + break; + } + fuzzy_finder_command += fmt::format(" < {} > {}", + escapeShellArgument(history_file.getPath()), + escapeShellArgument(output_file.getPath())); char * const argv[] = {sh, sh_c, fuzzy_finder_command.data(), nullptr}; try
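The `escapeShellArgument` helper introduced above uses the classic single-quote escaping scheme: wrap the whole argument in single quotes and rewrite every embedded `'` as `'\''` (close the quote, emit an escaped quote, reopen it). A self-contained sketch, assuming only Boost and fmt as already used in the patch:

```cpp
// Standalone demonstration of the quoting used by escapeShellArgument.
#include <boost/algorithm/string/replace.hpp>
#include <fmt/format.h>
#include <iostream>
#include <string>

std::string escapeShellArgument(std::string arg)
{
    boost::replace_all(arg, "'", "'\\''"); /// ' -> '\'' (close, escape, reopen)
    return fmt::format("'{}'", arg);
}

int main()
{
    /// Safe to splice into the "sh -c <command>" invocation built above:
    std::cout << escapeShellArgument("it's a file; rm -rf ~") << '\n';
}
```

This matters here because the history and output file paths are interpolated into a command executed via `sh -c`; without quoting, a path containing spaces or shell metacharacters would be re-parsed by the shell.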
diff --git a/base/base/ReplxxLineReader.h b/base/base/ReplxxLineReader.h index fea1405a208..9be3b3aa993 100644 --- a/base/base/ReplxxLineReader.h +++ b/base/base/ReplxxLineReader.h @@ -4,6 +4,14 @@ #include +enum FuzzyFinderType +{ + FUZZY_FINDER_NONE, + /// Use https://github.com/junegunn/fzf + FUZZY_FINDER_FZF, + /// Use https://github.com/lotabout/skim + FUZZY_FINDER_SKIM, +}; class ReplxxLineReader : public LineReader { @@ -38,4 +46,5 @@ private: std::string editor; std::string fuzzy_finder; + FuzzyFinderType fuzzy_finder_type = FUZZY_FINDER_NONE; }; diff --git a/base/base/bit_cast.h b/base/base/bit_cast.h index b2b6915764d..8198991e98e 100644 --- a/base/base/bit_cast.h +++ b/base/base/bit_cast.h @@ -12,7 +12,21 @@ template <typename To, typename From> std::decay_t<To> bit_cast(const From & from) { + /** + * Assume the source value is 0xAABBCCDD (i.e. sizeof(from) == 4). + * Its BE representation is 0xAABBCCDD, the LE representation is 0xDDCCBBAA. + * Further assume, sizeof(res) == 8 and that res is initially zeroed out. + * With LE, the result after bit_cast will be 0xDDCCBBAA00000000 --> input value == output value. + * With BE, the result after bit_cast will be 0x00000000AABBCCDD --> input value == output value. + */ To res {}; - memcpy(static_cast<void *>(&res), &from, std::min(sizeof(res), sizeof(from))); + if constexpr (std::endian::native == std::endian::little) + memcpy(static_cast<void *>(&res), &from, std::min(sizeof(res), sizeof(from))); + else + { + uint32_t offset_to = (sizeof(res) > sizeof(from)) ? (sizeof(res) - sizeof(from)) : 0; + uint32_t offset_from = (sizeof(from) > sizeof(res)) ? (sizeof(from) - sizeof(res)) : 0; + memcpy(reinterpret_cast<char *>(&res) + offset_to, reinterpret_cast<const char *>(&from) + offset_from, std::min(sizeof(res), sizeof(from))); + } return res; }
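To see why the big-endian branch of `bit_cast` above needs byte offsets: when widening, the value bytes sit at the front of the result buffer on little-endian hosts but at the tail on big-endian ones. A small standalone check (a sketch, not part of the patch):

```cpp
// Widening u32 -> u64 while preserving the numeric value on either endianness.
#include <bit>
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    uint32_t from = 0xAABBCCDD;
    uint64_t res = 0;
    if constexpr (std::endian::native == std::endian::little)
        std::memcpy(&res, &from, sizeof(from)); /// low-order bytes come first
    else
        std::memcpy(reinterpret_cast<char *>(&res) + (sizeof(res) - sizeof(from)),
                    &from, sizeof(from)); /// value bytes are at the tail on BE
    assert(res == 0xAABBCCDD); /// holds on both layouts
}
```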
diff --git a/base/base/safeExit.cpp b/base/base/safeExit.cpp index ddb93dac65b..12ad9dc12ee 100644 --- a/base/base/safeExit.cpp +++ b/base/base/safeExit.cpp @@ -1,8 +1,10 @@ #if defined(OS_LINUX) # include <sys/syscall.h> #endif +#include <unistd.h> #include #include +#include <base/defines.h> /// for THREAD_SANITIZER [[noreturn]] void safeExit(int code) { diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index 1b5f502722c..f5b30cbab55 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -187,8 +187,20 @@ struct integer<Bits, Signed>::_impl static_assert(Bits % base_bits == 0); /// Simple iteration in both directions - static constexpr unsigned little(unsigned idx) { return idx; } - static constexpr unsigned big(unsigned idx) { return item_count - 1 - idx; } + static constexpr unsigned little(unsigned idx) + { + if constexpr (std::endian::native == std::endian::little) + return idx; + else + return item_count - 1 - idx; + } + static constexpr unsigned big(unsigned idx) + { + if constexpr (std::endian::native == std::endian::little) + return item_count - 1 - idx; + else + return idx; + } static constexpr unsigned any(unsigned idx) { return idx; } template @@ -240,20 +252,20 @@ struct integer<Bits, Signed>::_impl { static_assert(sizeof(Integral) <= sizeof(base_type)); - self.items[0] = _impl::to_Integral(rhs); + self.items[little(0)] = _impl::to_Integral(rhs); if constexpr (std::is_signed_v<T>) { if (rhs < 0) { - for (size_t i = 1; i < item_count; ++i) - self.items[i] = -1; + for (unsigned i = 1; i < item_count; ++i) + self.items[little(i)] = -1; return; } } - for (size_t i = 1; i < item_count; ++i) - self.items[i] = 0; + for (unsigned i = 1; i < item_count; ++i) + self.items[little(i)] = 0; } template @@ -348,7 +360,7 @@ struct integer<Bits, Signed>::_impl constexpr const unsigned to_copy = min_bits / base_bits; for (unsigned i = 0; i < to_copy; ++i) - self.items[i] = rhs.items[i]; + self.items[little(i)] = rhs.items[little(i)]; if constexpr (Bits > Bits2) { @@ -357,13 +369,13 @@ struct integer<Bits, Signed>::_impl if (rhs < 0) { for (unsigned i = to_copy; i < item_count; ++i) - self.items[i] = -1; + self.items[little(i)] = -1; return; } } for (unsigned i = to_copy; i < item_count; ++i) - self.items[i] = 0; + self.items[little(i)] = 0; } } @@ -454,7 +466,7 @@ private: { if constexpr (sizeof(T) <= sizeof(base_type)) { - if (0 == idx) + if (little(0) == idx) return static_cast<base_type>(x); } else if (idx * sizeof(base_type) < sizeof(T)) @@ -475,7 +487,7 @@ for (unsigned i = 0; i < op_items; ++i) { - base_type rhs_item = get_item(rhs, i); + base_type rhs_item = get_item(rhs, little(i)); base_type & res_item = res.items[little(i)]; underflows[i] = res_item < rhs_item; @@ -508,7 +520,7 @@ for (unsigned i = 0; i < op_items; ++i) { - base_type rhs_item = get_item(rhs, i); + base_type rhs_item = get_item(rhs, little(i)); base_type & res_item = res.items[little(i)]; res_item += rhs_item; @@ -580,12 +592,12 @@ else if constexpr (Bits == 128 && sizeof(base_type) == 8) { using CompilerUInt128 = unsigned __int128; - CompilerUInt128 a = (CompilerUInt128(lhs.items[1]) << 64) + lhs.items[0]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) - CompilerUInt128 b = (CompilerUInt128(rhs.items[1]) << 64) + rhs.items[0]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) + CompilerUInt128 a = (CompilerUInt128(lhs.items[little(1)]) << 64) + lhs.items[little(0)]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) + CompilerUInt128 b = (CompilerUInt128(rhs.items[little(1)]) << 64) + rhs.items[little(0)]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) CompilerUInt128 c = a * b; integer<Bits, Signed> res; - res.items[0] = c; - res.items[1] = c >> 64; + res.items[little(0)] = c; + res.items[little(1)] = c >> 64; return res; } else @@ -597,7 +609,7 @@ private: #endif for (unsigned i = 0; i < item_count; ++i) { - base_type rhs_item = get_item(rhs, i); + base_type rhs_item = get_item(rhs, little(i)); unsigned pos = i * base_bits; while (rhs_item) @@ -792,7 +804,7 @@ public: integer<Bits, Signed> res; for (unsigned i = 0; i < item_count; ++i) - res.items[little(i)] = lhs.items[little(i)] | get_item(rhs, i); + res.items[little(i)] = lhs.items[little(i)] | get_item(rhs, little(i)); return res; } else @@ -810,7 +822,7 @@ public: integer<Bits, Signed> res; for (unsigned i = 0; i < item_count; ++i) - res.items[little(i)] = lhs.items[little(i)] & get_item(rhs, i); + res.items[little(i)] = lhs.items[little(i)] & get_item(rhs, little(i)); return res; } else @@ -845,17 +857,17 @@ public: { using CompilerUInt128 = unsigned __int128; - CompilerUInt128 a = (CompilerUInt128(numerator.items[1]) << 64) + numerator.items[0]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) - CompilerUInt128 b = (CompilerUInt128(denominator.items[1]) << 64) + denominator.items[0]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) + CompilerUInt128 a = (CompilerUInt128(numerator.items[little(1)]) << 64) + numerator.items[little(0)]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) + CompilerUInt128 b = (CompilerUInt128(denominator.items[little(1)]) << 64) + denominator.items[little(0)]; // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) CompilerUInt128 c = a / b; // NOLINT integer<Bits, Signed> res; - res.items[0] = c; - res.items[1] = c >> 64; + res.items[little(0)] = c; + res.items[little(1)] = c >> 64; CompilerUInt128 remainder = a - b * c; - numerator.items[0] = remainder; - numerator.items[1] = remainder >> 64; + numerator.items[little(0)] = remainder; + numerator.items[little(1)] = remainder >> 64; return res; }
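The `little()`/`big()` change above makes limb indexing endianness-aware: `little(i)` must always address the i-th least significant limb of the wide integer, wherever that limb physically lives. A sketch of the mapping for a hypothetical four-limb (256-bit) integer:

```cpp
// Index mapping sketch for item_count == 4 (a made-up example size).
#include <bit>
#include <cstdio>

constexpr unsigned item_count = 4;

constexpr unsigned little(unsigned idx)
{
    if constexpr (std::endian::native == std::endian::little)
        return idx;                  /// limb 0 is stored first
    else
        return item_count - 1 - idx; /// limb 0 is stored last
}

int main()
{
    for (unsigned i = 0; i < item_count; ++i)
        std::printf("least significant limb #%u lives at items[%u]\n", i, little(i));
}
```

With this mapping in place, the arithmetic above can uniformly say `items[little(0)]` for the low 64 bits and `items[little(1)]` for the high ones, instead of hard-coding a little-endian layout.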
@@ -1039,15 +1051,15 @@ constexpr integer<Bits, Signed>::integer(std::initializer_list<T> il) noexcept else { auto it = il.begin(); - for (size_t i = 0; i < _impl::item_count; ++i) + for (unsigned i = 0; i < _impl::item_count; ++i) { if (it < il.end()) { - items[i] = *it; + items[_impl::little(i)] = *it; ++it; } else - items[i] = 0; + items[_impl::little(i)] = 0; } } } @@ -1208,7 +1220,7 @@ constexpr integer<Bits, Signed>::operator T() const noexcept UnsignedT res{}; for (unsigned i = 0; i < _impl::item_count && i < (sizeof(T) + sizeof(base_type) - 1) / sizeof(base_type); ++i) - res += UnsignedT(items[i]) << (sizeof(base_type) * 8 * i); // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) + res += UnsignedT(items[_impl::little(i)]) << (sizeof(base_type) * 8 * i); // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) return res; } diff --git a/base/glibc-compatibility/glibc-compatibility.c b/base/glibc-compatibility/glibc-compatibility.c index d10bc6ba723..bae03ad590a 100644 --- a/base/glibc-compatibility/glibc-compatibility.c +++ b/base/glibc-compatibility/glibc-compatibility.c @@ -220,13 +220,13 @@ struct statx { uint32_t stx_dev_minor; uint64_t spare[14]; }; -#endif int statx(int fd, const char *restrict path, int flag, unsigned int mask, struct statx *restrict statxbuf) { return syscall(SYS_statx, fd, path, flag, mask, statxbuf); } +#endif #include diff --git a/base/glibc-compatibility/musl/getauxval.c b/base/glibc-compatibility/musl/getauxval.c index 22886013d07..44a9f979f99 100644 --- a/base/glibc-compatibility/musl/getauxval.c +++ b/base/glibc-compatibility/musl/getauxval.c @@ -8,6 +8,14 @@ #include <link.h> // ElfW #include +#include "syscall.h" + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#include <sanitizer/msan_interface.h> +#endif +#endif + #define ARRAY_SIZE(a) sizeof((a))/sizeof((a[0])) /// Suppress TSan since it is possible for this code to be called from multiple threads, @@ -39,7 +47,9 @@ ssize_t __retry_read(int fd, void * buf, size_t count) { for (;;) { - ssize_t ret = read(fd, buf, count); + // We cannot use the read syscall as it will be intercepted by sanitizers, which aren't + // initialized yet. Emit syscall directly. + ssize_t ret = __syscall_ret(__syscall(SYS_read, fd, buf, count)); if (ret == -1) { if (errno == EINTR) @@ -90,6 +100,11 @@ static unsigned long NO_SANITIZE_THREAD __auxv_init_procfs(unsigned long type) _Static_assert(sizeof(aux) < 4096, "Unexpected sizeof(aux)"); while (__retry_read(fd, &aux, sizeof(aux)) == sizeof(aux)) { +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + __msan_unpoison(&aux, sizeof(aux)); +#endif +#endif if (aux.a_type == AT_NULL) { break;
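The `__retry_read` change above bypasses the libc `read()` wrapper because sanitizer interceptors are not initialized yet at that point of process startup; for the same reason the freshly read `aux` buffer has to be manually unpoisoned for MemorySanitizer. An illustrative sketch using the public `syscall(2)` interface (the patch itself uses musl's internal `__syscall`/`__syscall_ret` helpers):

```cpp
// EINTR-retrying read issued as a raw syscall, so no interceptor sits
// between the caller and the kernel. Sketch only; not the patch's code.
#include <cerrno>
#include <sys/syscall.h>
#include <unistd.h>

ssize_t retry_read(int fd, void * buf, size_t count)
{
    for (;;)
    {
        ssize_t ret = syscall(SYS_read, fd, buf, count); /// glibc's syscall() sets errno
        if (ret == -1 && errno == EINTR)
            continue; /// interrupted by a signal: retry
        return ret;
    }
}
```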
diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 11b37f5a7c8..d06d3918612 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54468) +SET(VERSION_REVISION 54469) SET(VERSION_MAJOR 22) -SET(VERSION_MINOR 11) +SET(VERSION_MINOR 12) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH 98ab5a3c189232ea2a3dddb9d2be7196ae8b3434) -SET(VERSION_DESCRIBE v22.11.1.1-testing) -SET(VERSION_STRING 22.11.1.1) +SET(VERSION_GITHASH 0d211ed19849fe44b0e43fdebe2c15d76d560a77) +SET(VERSION_DESCRIBE v22.12.1.1-testing) +SET(VERSION_STRING 22.12.1.1) # end of autochange diff --git a/cmake/clang_tidy.cmake b/cmake/clang_tidy.cmake index 57295682487..ceaafdaa9aa 100644 --- a/cmake/clang_tidy.cmake +++ b/cmake/clang_tidy.cmake @@ -5,21 +5,21 @@ if (ENABLE_CLANG_TIDY) find_program (CLANG_TIDY_CACHE_PATH NAMES "clang-tidy-cache") if (CLANG_TIDY_CACHE_PATH) - find_program (_CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") + find_program (_CLANG_TIDY_PATH NAMES "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12" "clang-tidy") # Why do we use ';' here? # It's a cmake black magic: https://cmake.org/cmake/help/latest/prop_tgt/LANG_CLANG_TIDY.html#prop_tgt:%3CLANG%3E_CLANG_TIDY # The CLANG_TIDY_PATH is passed to CMAKE_CXX_CLANG_TIDY, which follows CXX_CLANG_TIDY syntax. set (CLANG_TIDY_PATH "${CLANG_TIDY_CACHE_PATH};${_CLANG_TIDY_PATH}" CACHE STRING "A combined command to run clang-tidy with caching wrapper") else () - find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") + find_program (CLANG_TIDY_PATH NAMES "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12" "clang-tidy") endif () if (CLANG_TIDY_PATH) message (STATUS "Using clang-tidy: ${CLANG_TIDY_PATH}. - The checks will be run during build process. - See the .clang-tidy file at the root directory to configure the checks.") + The checks will be run during the build process. + See the .clang-tidy file in the root directory to configure the checks.") set (USE_CLANG_TIDY ON) diff --git a/cmake/darwin/default_libs.cmake b/cmake/darwin/default_libs.cmake index 1f92663a4b9..3e6e4907a71 100644 --- a/cmake/darwin/default_libs.cmake +++ b/cmake/darwin/default_libs.cmake @@ -23,6 +23,7 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) include (cmake/cxx.cmake) +link_libraries(global-group) target_link_libraries(global-group INTERFACE $ diff --git a/cmake/freebsd/default_libs.cmake b/cmake/freebsd/default_libs.cmake index 65d5f0511d9..3e1f22ef2e4 100644 --- a/cmake/freebsd/default_libs.cmake +++ b/cmake/freebsd/default_libs.cmake @@ -24,6 +24,7 @@ find_package(Threads REQUIRED) include (cmake/unwind.cmake) include (cmake/cxx.cmake) +link_libraries(global-group) target_link_libraries(global-group INTERFACE $ diff --git a/cmake/linux/default_libs.cmake b/cmake/linux/default_libs.cmake index 21bead7020c..23c5fc3e14f 100644 --- a/cmake/linux/default_libs.cmake +++ b/cmake/linux/default_libs.cmake @@ -34,6 +34,13 @@ set(CMAKE_C_STANDARD_LIBRARIES ${DEFAULT_LIBS}) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) +include (cmake/unwind.cmake) +include (cmake/cxx.cmake) + +# Delay linking the global interface until after the libc++ libraries are included, to avoid circular dependencies +# which are OK with static libraries but not with dynamic ones +link_libraries(global-group) + if (NOT OS_ANDROID) if (NOT USE_MUSL) # Our compatibility layer doesn't build under Android, many errors in musl.
@@ -42,9 +49,6 @@ if (NOT OS_ANDROID) add_subdirectory(base/harmful) endif () -include (cmake/unwind.cmake) -include (cmake/cxx.cmake) - target_link_libraries(global-group INTERFACE -Wl,--start-group $ diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index f0cef54b0b8..3e3bb7ec2b2 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -16,7 +16,9 @@ endmacro() if (SANITIZE) if (SANITIZE STREQUAL "address") - set (ASAN_FLAGS "-fsanitize=address -fsanitize-address-use-after-scope") + # LLVM-15 has a bug in Address Sanitizer, preventing the usage of 'sanitize-address-use-after-scope', + # see https://github.com/llvm/llvm-project/issues/58633 + set (ASAN_FLAGS "-fsanitize=address -fno-sanitize-address-use-after-scope") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}") diff --git a/cmake/sanitize_target_link_libraries.cmake b/cmake/sanitize_targets.cmake similarity index 65% rename from cmake/sanitize_target_link_libraries.cmake rename to cmake/sanitize_targets.cmake index d66ea338a52..8f61da2009d 100644 --- a/cmake/sanitize_target_link_libraries.cmake +++ b/cmake/sanitize_targets.cmake @@ -1,3 +1,13 @@ +# https://stackoverflow.com/a/62311397/328260 +macro (get_all_targets_recursive targets dir) + get_property (subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES) + foreach (subdir ${subdirectories}) + get_all_targets_recursive (${targets} ${subdir}) + endforeach () + get_property (current_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS) + list (APPEND ${targets} ${current_targets}) +endmacro () + # When you will try to link target with the directory (that exists), cmake will # skip this without an error, only the following warning will be reported: # @@ -18,23 +28,12 @@ # -- but cannot be used with link_libraries() # - use BUILDSYSTEM_TARGETS property to get list of all targets and sanitize # -- this will work. - -# https://stackoverflow.com/a/62311397/328260 function (get_all_targets var) set (targets) get_all_targets_recursive (targets ${CMAKE_CURRENT_SOURCE_DIR}) set (${var} ${targets} PARENT_SCOPE) endfunction() -macro (get_all_targets_recursive targets dir) - get_property (subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES) - foreach (subdir ${subdirectories}) - get_all_targets_recursive (${targets} ${subdir}) - endforeach () - get_property (current_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS) - list (APPEND ${targets} ${current_targets}) -endmacro () - -macro (sanitize_link_libraries target) +function (sanitize_link_libraries target) get_target_property(target_type ${target} TYPE) if (${target_type} STREQUAL "INTERFACE_LIBRARY") get_property(linked_libraries TARGET ${target} PROPERTY INTERFACE_LINK_LIBRARIES) @@ -48,9 +47,35 @@ macro (sanitize_link_libraries target) message(FATAL_ERROR "${target} requested to link with directory: ${linked_library}") endif() endforeach() -endmacro() - +endfunction() get_all_targets (all_targets) foreach (target ${all_targets}) sanitize_link_libraries(${target}) endforeach() + +# +# Do not allow defining -W* from contrib publicly (INTERFACE/PUBLIC).
+# +function (get_contrib_targets var) + set (targets) + get_all_targets_recursive (targets ${CMAKE_CURRENT_SOURCE_DIR}/contrib) + set (${var} ${targets} PARENT_SCOPE) +endfunction() +function (sanitize_interface_flags target) + get_target_property(target_type ${target} TYPE) + get_property(compile_definitions TARGET ${target} PROPERTY INTERFACE_COMPILE_DEFINITIONS) + get_property(compile_options TARGET ${target} PROPERTY INTERFACE_COMPILE_OPTIONS) + if (NOT "${compile_options}" STREQUAL "") + message(FATAL_ERROR "${target} set INTERFACE_COMPILE_OPTIONS to ${compile_options}. This is forbidden.") + endif() + if ("${compile_definitions}" MATCHES "-Wl,") + # linker option - OK + elseif ("${compile_definitions}" MATCHES "-W") + message(FATAL_ERROR "${target} contains ${compile_definitions} flags in INTERFACE_COMPILE_DEFINITIONS. This is forbidden.") + endif() +endfunction() +get_contrib_targets (contrib_targets) +foreach (contrib_target ${contrib_targets}) + sanitize_interface_flags(${contrib_target}) +endforeach() + diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 8a17d97cf13..3ddf8a869be 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -21,12 +21,12 @@ set (APPLE_CLANG_MINIMUM_VERSION 12.0.0) set (GCC_MINIMUM_VERSION 11) if (COMPILER_GCC) + message (FATAL_ERROR "Compilation with GCC is unsupported. Please use Clang instead.") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${GCC_MINIMUM_VERSION}) message (FATAL_ERROR "Compilation with GCC version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${GCC_MINIMUM_VERSION}.") endif () - message (WARNING "Compilation with GCC is unsupported. Please use Clang instead.") - elseif (COMPILER_CLANG) if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") # (Experimental!) Specify "-DALLOW_APPLECLANG=ON" when running CMake configuration step, if you want to experiment with using it. @@ -58,13 +58,19 @@ if (NOT LINKER_NAME) find_program (LLD_PATH NAMES "ld.lld") find_program (GOLD_PATH NAMES "ld.gold") elseif (COMPILER_CLANG) - find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "lld-${COMPILER_VERSION_MAJOR}" "ld.lld" "lld") + # llvm lld is a generic driver. + # Invoke ld.lld (Unix), ld64.lld (macOS), lld-link (Windows), wasm-ld (WebAssembly) instead + if (OS_LINUX) + find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "ld.lld") + elseif (OS_DARWIN) + find_program (LLD_PATH NAMES "ld64.lld-${COMPILER_VERSION_MAJOR}" "ld64.lld") + endif () find_program (GOLD_PATH NAMES "ld.gold" "gold") endif () endif() -if (OS_LINUX AND NOT LINKER_NAME) - # prefer lld linker over gold or ld on linux +if ((OS_LINUX OR OS_DARWIN) AND NOT LINKER_NAME) + # prefer lld linker over gold or ld on linux and macos if (LLD_PATH) if (COMPILER_GCC) # GCC driver requires one of supported linker names like "lld". @@ -77,7 +83,7 @@ if (OS_LINUX AND NOT LINKER_NAME) if (NOT LINKER_NAME) if (GOLD_PATH) - message (WARNING "Linking with gold is not recommended. Please use lld.") + message (FATAL_ERROR "Linking with gold is unsupported. 
Please use lld.") if (COMPILER_GCC) set (LINKER_NAME "gold") else () diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 8ebd4ab55d3..c7419d74aac 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -167,7 +167,11 @@ add_contrib (c-ares-cmake c-ares) add_contrib (qpl-cmake qpl) add_contrib (morton-nd-cmake morton-nd) -add_contrib(annoy-cmake annoy) +add_contrib (annoy-cmake annoy) + +add_contrib (xxHash-cmake xxHash) + +add_contrib (google-benchmark-cmake google-benchmark) # Put all targets defined here and in subdirectories under "contrib/" folders in GUI-based IDEs. # Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear diff --git a/contrib/NuRaft b/contrib/NuRaft index 1be805e7cb2..afc36dfa9b0 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 1be805e7cb2494aa8170015493474379b0362dfc +Subproject commit afc36dfa9b0beb45bc4cd935060631cc80ba04a5 diff --git a/contrib/google-benchmark b/contrib/google-benchmark new file mode 160000 index 00000000000..2257fa4d6af --- /dev/null +++ b/contrib/google-benchmark @@ -0,0 +1 @@ +Subproject commit 2257fa4d6afb8e5a2ccd510a70f38fe7fcdf1edf diff --git a/contrib/google-benchmark-cmake/CMakeLists.txt b/contrib/google-benchmark-cmake/CMakeLists.txt new file mode 100644 index 00000000000..5d8fa7b838b --- /dev/null +++ b/contrib/google-benchmark-cmake/CMakeLists.txt @@ -0,0 +1,34 @@ +set (SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/google-benchmark/src") + +set (SRCS + "${SRC_DIR}/benchmark.cc" + "${SRC_DIR}/benchmark_api_internal.cc" + "${SRC_DIR}/benchmark_name.cc" + "${SRC_DIR}/benchmark_register.cc" + "${SRC_DIR}/benchmark_runner.cc" + "${SRC_DIR}/check.cc" + "${SRC_DIR}/colorprint.cc" + "${SRC_DIR}/commandlineflags.cc" + "${SRC_DIR}/complexity.cc" + "${SRC_DIR}/console_reporter.cc" + "${SRC_DIR}/counter.cc" + "${SRC_DIR}/csv_reporter.cc" + "${SRC_DIR}/json_reporter.cc" + "${SRC_DIR}/perf_counters.cc" + "${SRC_DIR}/reporter.cc" + "${SRC_DIR}/sleep.cc" + "${SRC_DIR}/statistics.cc" + "${SRC_DIR}/string_util.cc" + "${SRC_DIR}/sysinfo.cc" + "${SRC_DIR}/timers.cc") + +add_library(google_benchmark "${SRCS}") +target_include_directories(google_benchmark SYSTEM PUBLIC "${SRC_DIR}/../include") + +add_library(google_benchmark_main "${SRC_DIR}/benchmark_main.cc") +target_link_libraries(google_benchmark_main PUBLIC google_benchmark) + +add_library(google_benchmark_all INTERFACE) +target_link_libraries(google_benchmark_all INTERFACE google_benchmark google_benchmark_main) + +add_library(ch_contrib::gbenchmark_all ALIAS google_benchmark_all) diff --git a/contrib/libcxx b/contrib/libcxx deleted file mode 160000 index 4db7f838afd..00000000000 --- a/contrib/libcxx +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4db7f838afd3139eb3761694b04d31275df45d2d diff --git a/contrib/libcxx-cmake/CMakeLists.txt b/contrib/libcxx-cmake/CMakeLists.txt index 53c6ff58f83..21ed76f8b6f 100644 --- a/contrib/libcxx-cmake/CMakeLists.txt +++ b/contrib/libcxx-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ include(CheckCXXCompilerFlag) -set(LIBCXX_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libcxx") +set(LIBCXX_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/llvm-project/libcxx") set(SRCS "${LIBCXX_SOURCE_DIR}/src/algorithm.cpp" @@ -57,7 +57,7 @@ add_library(cxx ${SRCS}) set_target_properties(cxx PROPERTIES FOLDER "contrib/libcxx-cmake") target_include_directories(cxx SYSTEM BEFORE PRIVATE $) -target_include_directories(cxx SYSTEM BEFORE PUBLIC $) +target_include_directories(cxx SYSTEM 
BEFORE PUBLIC $<$<COMPILE_LANGUAGE:CXX>:$<BUILD_INTERFACE:${LIBCXX_SOURCE_DIR}/include>>) target_compile_definitions(cxx PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DLIBCXX_BUILDING_LIBCXXABI) # Enable capturing stack traces for all exceptions. diff --git a/contrib/libcxxabi b/contrib/libcxxabi deleted file mode 160000 index a736a6b3c6a..00000000000 --- a/contrib/libcxxabi +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a736a6b3c6a7b8aae2ebad629ca21b2c55b4820e diff --git a/contrib/libcxxabi-cmake/CMakeLists.txt b/contrib/libcxxabi-cmake/CMakeLists.txt index a59452eee9a..0473527912e 100644 --- a/contrib/libcxxabi-cmake/CMakeLists.txt +++ b/contrib/libcxxabi-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LIBCXXABI_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libcxxabi") +set(LIBCXXABI_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/llvm-project/libcxxabi") set(SRCS "${LIBCXXABI_SOURCE_DIR}/src/abort_message.cpp" diff --git a/contrib/llvm-project b/contrib/llvm-project index 3a39038345a..e61a81aa6fc 160000 --- a/contrib/llvm-project +++ b/contrib/llvm-project @@ -1 +1 @@ -Subproject commit 3a39038345a400e7e767811b142a94355d511215 +Subproject commit e61a81aa6fc529b469e2a54b7ce788606e138b5d diff --git a/contrib/poco b/contrib/poco index 76746b35d0e..79923422618 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 76746b35d0e254eaaba71dc3b79e46cba8cbb144 +Subproject commit 799234226187c0ae0b8c90f23465b25ed7956e56 diff --git a/contrib/qpl b/contrib/qpl index cdc8442f7a5..becb7a1b15b 160000 --- a/contrib/qpl +++ b/contrib/qpl @@ -1 +1 @@ -Subproject commit cdc8442f7a5e7a6ff6eea39c69665e0c5034d85d +Subproject commit becb7a1b15bdb4845ec3721a550707ffa51d029d diff --git a/contrib/qpl-cmake/CMakeLists.txt b/contrib/qpl-cmake/CMakeLists.txt index dc90f07a9bc..beef8432e7a 100644 --- a/contrib/qpl-cmake/CMakeLists.txt +++ b/contrib/qpl-cmake/CMakeLists.txt @@ -15,7 +15,7 @@ set (QPL_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/qpl/sources") set (QPL_BINARY_DIR "${ClickHouse_BINARY_DIR}/build/contrib/qpl") set (UUID_DIR "${ClickHouse_SOURCE_DIR}/contrib/qpl-cmake") -set (EFFICIENT_WAIT ON) +set (EFFICIENT_WAIT OFF) set (BLOCK_ON_FAULT ON) set (LOG_HW_INIT OFF) set (SANITIZE_MEMORY OFF) @@ -42,7 +42,7 @@ include("${QPL_PROJECT_DIR}/cmake/CompileOptions.cmake") include(CheckLanguage) check_language(ASM_NASM) if(NOT CMAKE_ASM_NASM_COMPILER) - message(FATAL_ERROR "Please install NASM from 'https://www.nasm.us/' because NASM compiler can not be found!") + message(FATAL_ERROR "Please install NASM from 'https://www.nasm.us/' because the NASM compiler cannot be found!") endif() # [SUBDIR]isal @@ -110,18 +110,18 @@ target_compile_options(isal PRIVATE "$<$:>" "$<$:>") -target_compile_options(isal_asm PUBLIC "-I${QPL_SRC_DIR}/isal/include/" - PUBLIC "-I${QPL_SRC_DIR}/isal/igzip/" - PUBLIC "-I${QPL_SRC_DIR}/isal/crc/" - PUBLIC "-DQPL_LIB") +target_compile_options(isal_asm PRIVATE "-I${QPL_SRC_DIR}/isal/include/" + PRIVATE "-I${QPL_SRC_DIR}/isal/igzip/" + PRIVATE "-I${QPL_SRC_DIR}/isal/crc/" + PRIVATE "-DQPL_LIB") # AS_FEATURE_LEVEL=10 means "Check SIMD capabilities of the target system at runtime and use up to AVX512 if available". # AS_FEATURE_LEVEL=5 means "Check SIMD capabilities of the target system at runtime and use up to AVX2 if available". # HAVE_KNOWS_AVX512 means rely on AVX512 being available on the target system.
if (ENABLE_AVX512) - target_compile_options(isal_asm PUBLIC "-DHAVE_AS_KNOWS_AVX512" "-DAS_FEATURE_LEVEL=10") + target_compile_options(isal_asm PRIVATE "-DHAVE_AS_KNOWS_AVX512" "-DAS_FEATURE_LEVEL=10") else() - target_compile_options(isal_asm PUBLIC "-DAS_FEATURE_LEVEL=5") + target_compile_options(isal_asm PRIVATE "-DAS_FEATURE_LEVEL=5") endif() # Here must remove "-fno-sanitize=undefined" from COMPILE_OPTIONS. @@ -315,7 +315,13 @@ target_compile_definitions(_qpl PRIVATE -DQPL_BADARG_CHECK PUBLIC -DENABLE_QPL_COMPRESSION) +find_library(LIBACCEL accel-config) +if(NOT LIBACCEL) + message(FATAL_ERROR "Please install QPL dependency library:libaccel-config from https://github.com/intel/idxd-config") +endif() + target_link_libraries(_qpl + PRIVATE ${LIBACCEL} PRIVATE ${CMAKE_DL_LIBS}) add_library (ch_contrib::qpl ALIAS _qpl) diff --git a/contrib/xxHash b/contrib/xxHash new file mode 160000 index 00000000000..3078dc6039f --- /dev/null +++ b/contrib/xxHash @@ -0,0 +1 @@ +Subproject commit 3078dc6039f8c0bffcb1904f81cfe6b2c3209435 diff --git a/contrib/xxHash-cmake/CMakeLists.txt b/contrib/xxHash-cmake/CMakeLists.txt new file mode 100644 index 00000000000..314094e9523 --- /dev/null +++ b/contrib/xxHash-cmake/CMakeLists.txt @@ -0,0 +1,13 @@ +set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/xxHash") +set (SRCS + "${LIBRARY_DIR}/xxhash.c" +) + +add_library(xxHash ${SRCS}) +target_include_directories(xxHash SYSTEM BEFORE INTERFACE "${LIBRARY_DIR}") + +# XXH_INLINE_ALL - Make all functions inline, with implementations being directly included within xxhash.h. Inlining functions is beneficial for speed on small keys. +# https://github.com/Cyan4973/xxHash/tree/v0.8.1#build-modifiers +target_compile_definitions(xxHash PUBLIC XXH_INLINE_ALL) + +add_library(ch_contrib::xxHash ALIAS xxHash) diff --git a/docker/docs/builder/run.sh b/docker/docs/builder/run.sh index a4f678b2f24..87e6218547f 100755 --- a/docker/docs/builder/run.sh +++ b/docker/docs/builder/run.sh @@ -25,6 +25,7 @@ done sed -i '/onBrokenMarkdownLinks:/ s/ignore/error/g' docusaurus.config.js if [[ $# -lt 1 ]] || [[ "$1" == "--"* ]]; then + export CI=true exec yarn build "$@" fi diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 06c3c0d80f0..b3da09facda 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -6,29 +6,24 @@ FROM clickhouse/test-util:$FROM_TAG # Rust toolchain and libraries ENV RUSTUP_HOME=/rust/rustup ENV CARGO_HOME=/rust/cargo -RUN curl https://sh.rustup.rs -sSf | bash -s -- -y -RUN chmod 777 -R /rust ENV PATH="/rust/cargo/env:${PATH}" ENV PATH="/rust/cargo/bin:${PATH}" -RUN rustup target add aarch64-unknown-linux-gnu && \ - rustup target add x86_64-apple-darwin && \ - rustup target add x86_64-unknown-freebsd && \ - rustup target add aarch64-apple-darwin && \ - rustup target add powerpc64le-unknown-linux-gnu -RUN apt-get install \ +RUN curl https://sh.rustup.rs -sSf | bash -s -- -y && \ + chmod 777 -R /rust && \ + rustup target add aarch64-unknown-linux-gnu && \ + rustup target add x86_64-apple-darwin && \ + rustup target add x86_64-unknown-freebsd && \ + rustup target add aarch64-apple-darwin && \ + rustup target add powerpc64le-unknown-linux-gnu + +RUN apt-get update && \ + apt-get install --yes \ gcc-aarch64-linux-gnu \ build-essential \ libc6 \ libc6-dev \ - libc6-dev-arm64-cross \ - --yes - -# Install CMake 3.20+ for Rust compilation -# Used https://askubuntu.com/a/1157132 as reference -RUN apt purge cmake --yes -RUN wget -O - 
https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null -RUN apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main' -RUN apt update && apt install cmake --yes + libc6-dev-arm64-cross && \ + apt-get clean ENV CC=clang-${LLVM_VERSION} ENV CXX=clang++-${LLVM_VERSION} diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 8f1cf6ee98b..305fc279414 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="22.10.2.11" +ARG VERSION="22.11.2.30" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index d5fc5d8e0d3..f1c4dd097aa 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -21,7 +21,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="22.10.2.11" +ARG VERSION="22.11.2.30" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image @@ -80,6 +80,16 @@ RUN arch=${TARGETARCH:-amd64} \ && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \ && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client +# Remove as much of Ubuntu as possible. +# ClickHouse does not need Ubuntu. It can run on top of the Linux kernel without any OS distribution. +# ClickHouse does not need Docker at all. ClickHouse is above all that. +# It does not care about Ubuntu, Docker, or other cruft, and neither should you. +# The fact that this Docker image is based on Ubuntu is just a misconception. +# Some vulnerability scanners complain about Ubuntu, which is not relevant to ClickHouse at all. +# ClickHouse does not care when you report false vulnerabilities by running some Docker scanners.
+ +RUN apt-get remove --purge -y libksba8 && apt-get autoremove -y + # we need to allow "others" access to clickhouse folder, because docker container # can be started with arbitrary uid (openshift usecase) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index de9125d565b..7359e0a9402 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -117,8 +117,7 @@ function clone_submodules contrib/cctz contrib/libcpuid contrib/double-conversion - contrib/libcxx - contrib/libcxxabi + contrib/llvm-project contrib/lz4 contrib/zstd contrib/fastops @@ -137,6 +136,7 @@ function clone_submodules contrib/hashidsxx contrib/c-ares contrib/morton-nd + contrib/xxHash ) git submodule sync diff --git a/docker/test/fuzzer/Dockerfile b/docker/test/fuzzer/Dockerfile index eb4b09c173f..aa71074c02a 100644 --- a/docker/test/fuzzer/Dockerfile +++ b/docker/test/fuzzer/Dockerfile @@ -38,7 +38,7 @@ COPY * / SHELL ["/bin/bash", "-c"] CMD set -o pipefail \ && cd /workspace \ - && /run-fuzzer.sh 2>&1 | ts "$(printf '%%Y-%%m-%%d %%H:%%M:%%S\t')" | tee main.log + && timeout -s 9 1h /run-fuzzer.sh 2>&1 | ts "$(printf '%%Y-%%m-%%d %%H:%%M:%%S\t')" | tee main.log # docker run --network=host --volume :/workspace -e PR_TO_TEST=<> -e SHA_TO_TEST=<> clickhouse/fuzzer diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 7248728864e..bd539ca978b 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -1,5 +1,5 @@ #!/bin/bash -# shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031 +# shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031,SC2010,SC2015 set -x @@ -10,11 +10,6 @@ set -e set -u set -o pipefail -trap "exit" INT TERM -# The watchdog is in the separate process group, so we have to kill it separately -# if the script terminates earlier. -trap 'kill $(jobs -pr) ${watchdog_pid:-} ||:' EXIT - stage=${stage:-} script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" echo "$script_dir" @@ -110,26 +105,6 @@ function configure EOL } -function watchdog -{ - sleep 1800 - - echo "Fuzzing run has timed out" - for _ in {1..10} - do - # Only kill by pid the particular client that runs the fuzzing, or else - # we can kill some clickhouse-client processes this script starts later, - # e.g. for checking server liveness. - if ! kill $fuzzer_pid - then - break - fi - sleep 1 - done - - kill -9 -- $fuzzer_pid ||: -} - function filter_exists_and_template { local path @@ -175,10 +150,8 @@ function fuzz mkdir -p /var/run/clickhouse-server - # interferes with gdb - export CLICKHOUSE_WATCHDOG_ENABLE=0 # NOTE: we use process substitution here to preserve keep $! as a pid of clickhouse-server - clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db > >(tail -100000 > server.log) 2>&1 & + clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db 2>&1 | pigz > server.log.gz & server_pid=$! 
kill -0 $server_pid @@ -214,7 +187,7 @@ detach quit " > script.gdb - gdb -batch -command script.gdb -p $server_pid & + gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" & sleep 5 # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: @@ -236,7 +209,7 @@ quit # SC2012: Use find instead of ls to better handle non-alphanumeric filenames. They are all alphanumeric. # SC2046: Quote this to prevent word splitting. Actually I need word splitting. # shellcheck disable=SC2012,SC2046 - clickhouse-client \ + timeout -s TERM --preserve-status 30m clickhouse-client \ --receive_timeout=10 \ --receive_data_timeout_ms=10000 \ --stacktrace \ @@ -249,16 +222,6 @@ quit fuzzer_pid=$! echo "Fuzzer pid is $fuzzer_pid" - # Start a watchdog that should kill the fuzzer on timeout. - # The shell won't kill the child sleep when we kill it, so we have to put it - # into a separate process group so that we can kill them all. - set -m - watchdog & - watchdog_pid=$! - set +m - # Check that the watchdog has started. - kill -0 $watchdog_pid - # Wait for the fuzzer to complete. # Note that the 'wait || ...' thing is required so that the script doesn't # exit because of 'set -e' when 'wait' returns nonzero code. @@ -266,8 +229,6 @@ quit wait "$fuzzer_pid" || fuzzer_exit_code=$? echo "Fuzzer exit code is $fuzzer_exit_code" - kill -- -$watchdog_pid ||: - # If the server dies, most often the fuzzer returns code 210: connetion # refused, and sometimes also code 32: attempt to read after eof. For # simplicity, check again whether the server is accepting connections, using @@ -297,7 +258,7 @@ quit # The server has died. task_exit_code=210 echo "failure" > status.txt - if ! grep --text -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*\|.*runtime error: .*\|.*is located.*\|SUMMARY: AddressSanitizer:.*\|SUMMARY: MemorySanitizer:.*\|SUMMARY: ThreadSanitizer:.*\|.*_LIBCPP_ASSERT.*" server.log > description.txt + if ! zgrep --text -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*\|.*runtime error: .*\|.*is located.*\|SUMMARY: AddressSanitizer:.*\|SUMMARY: MemorySanitizer:.*\|SUMMARY: ThreadSanitizer:.*\|.*_LIBCPP_ASSERT.*" server.log.gz > description.txt then echo "Lost connection to server. See the logs." > description.txt fi @@ -333,6 +294,8 @@ quit pigz core.* mv core.*.gz core.gz fi + + dmesg -T | grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' && echo "OOM in dmesg" ||: } case "$stage" in @@ -391,8 +354,9 @@ th { cursor: pointer; }

AST Fuzzer for PR #${PR_TO_TEST} @ ${SHA_TO_TEST}

diff --git a/docker/test/keeper-jepsen/run.sh b/docker/test/keeper-jepsen/run.sh index adf99c029a9..5e321b7c347 100644 --- a/docker/test/keeper-jepsen/run.sh +++ b/docker/test/keeper-jepsen/run.sh @@ -15,8 +15,8 @@ if [ -z "$CLICKHOUSE_REPO_PATH" ]; then ls -lath ||: fi -cd "$CLICKHOUSE_REPO_PATH/tests/jepsen.clickhouse-keeper" +cd "$CLICKHOUSE_REPO_PATH/tests/jepsen.clickhouse" -(lein run test-all --nodes-file "$NODES_FILE_PATH" --username "$NODES_USERNAME" --logging-json --password "$NODES_PASSWORD" --time-limit "$TIME_LIMIT" --concurrency 50 -r 50 --snapshot-distance 100 --stale-log-gap 100 --reserved-log-items 10 --lightweight-run --clickhouse-source "$CLICKHOUSE_PACKAGE" -q --test-count "$TESTS_TO_RUN" || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log" +(lein run keeper test-all --nodes-file "$NODES_FILE_PATH" --username "$NODES_USERNAME" --logging-json --password "$NODES_PASSWORD" --time-limit "$TIME_LIMIT" --concurrency 50 -r 50 --snapshot-distance 100 --stale-log-gap 100 --reserved-log-items 10 --lightweight-run --clickhouse-source "$CLICKHOUSE_PACKAGE" -q --test-count "$TESTS_TO_RUN" || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log" mv store "$TEST_OUTPUT/" diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index 7a034c741eb..cb23372d31f 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -295,6 +295,9 @@ if not args.use_existing_tables: reportStageEnd("create") +# Let's sync the data to avoid writeback affecting performance +os.system("sync") + # By default, test all queries. queries_to_run = range(0, len(test_queries)) diff --git a/docker/test/server-jepsen/Dockerfile b/docker/test/server-jepsen/Dockerfile new file mode 100644 index 00000000000..958dbfa066a --- /dev/null +++ b/docker/test/server-jepsen/Dockerfile @@ -0,0 +1,43 @@ +# rebuild in #33610 +# docker build -t clickhouse/server-jepsen-test .
+ARG FROM_TAG=latest +FROM clickhouse/test-base:$FROM_TAG + +ENV DEBIAN_FRONTEND=noninteractive +ENV CLOJURE_VERSION=1.10.3.814 + +# arguments +ENV PR_TO_TEST="" +ENV SHA_TO_TEST="" + +ENV NODES_USERNAME="root" +ENV NODES_PASSWORD="" +ENV TESTS_TO_RUN="8" +ENV TIME_LIMIT="30" + +ENV KEEPER_NODE="" + + +# volumes +ENV NODES_FILE_PATH="/nodes.txt" +ENV TEST_OUTPUT="/test_output" + +RUN mkdir "/root/.ssh" +RUN touch "/root/.ssh/known_hosts" + +# install java +RUN apt-get update && apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends + +# install clojure +RUN curl -O "https://download.clojure.org/install/linux-install-${CLOJURE_VERSION}.sh" && \ + chmod +x "linux-install-${CLOJURE_VERSION}.sh" && \ + bash "./linux-install-${CLOJURE_VERSION}.sh" + +# install leiningen +RUN curl -O "https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein" && \ + chmod +x ./lein && \ + mv ./lein /usr/bin + +COPY run.sh / + +CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/server-jepsen/run.sh b/docker/test/server-jepsen/run.sh new file mode 100644 index 00000000000..4a966d50f74 --- /dev/null +++ b/docker/test/server-jepsen/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + + +CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-15_relwithdebuginfo_none_unsplitted_disable_False_binary/clickhouse"} +CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""} + + +if [ -z "$CLICKHOUSE_REPO_PATH" ]; then + CLICKHOUSE_REPO_PATH=ch + rm -rf ch ||: + mkdir ch ||: + wget -nv -nd -c "https://clickhouse-test-reports.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz" + tar -C ch --strip-components=1 -xf clickhouse_no_subs.tar.gz + ls -lath ||: +fi + +cd "$CLICKHOUSE_REPO_PATH/tests/jepsen.clickhouse" + +(lein run server test-all --keeper "$KEEPER_NODE" --nodes-file "$NODES_FILE_PATH" --username "$NODES_USERNAME" --logging-json --password "$NODES_PASSWORD" --time-limit "$TIME_LIMIT" --concurrency 50 -r 50 --clickhouse-source "$CLICKHOUSE_PACKAGE" --test-count "$TESTS_TO_RUN" || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log" + +mv store "$TEST_OUTPUT/" diff --git a/docker/test/sqlancer/Dockerfile b/docker/test/sqlancer/Dockerfile index 0821d516e23..2ebc61e35a9 100644 --- a/docker/test/sqlancer/Dockerfile +++ b/docker/test/sqlancer/Dockerfile @@ -1,5 +1,5 @@ # docker build -t clickhouse/sqlancer-test . 
-FROM ubuntu:20.04 +FROM ubuntu:22.04 # ARG for quick switch to a given ubuntu mirror ARG apt_archive="http://archive.ubuntu.com" diff --git a/docker/test/sqlancer/process_sqlancer_result.py b/docker/test/sqlancer/process_sqlancer_result.py index 37b8f465498..3bed4578565 100755 --- a/docker/test/sqlancer/process_sqlancer_result.py +++ b/docker/test/sqlancer/process_sqlancer_result.py @@ -11,13 +11,15 @@ def process_result(result_folder): summary = [] paths = [] tests = [ - "TLPWhere", + "TLPAggregate", + "TLPDistinct", "TLPGroupBy", "TLPHaving", + "TLPWhere", "TLPWhereGroupBy", - "TLPDistinct", - "TLPAggregate", + "NoREC", ] + failed_tests = [] for test in tests: err_path = "{}/{}.err".format(result_folder, test) @@ -33,15 +35,11 @@ def process_result(result_folder): with open(err_path, "r") as f: if "AssertionError" in f.read(): summary.append((test, "FAIL")) + failed_tests.append(test) status = "failure" else: summary.append((test, "OK")) - logs_path = "{}/logs.tar.gz".format(result_folder) - if not os.path.exists(logs_path): - logging.info("No logs tar on path %s", logs_path) - else: - paths.append(logs_path) stdout_path = "{}/stdout.log".format(result_folder) if not os.path.exists(stdout_path): logging.info("No stdout log on path %s", stdout_path) @@ -53,18 +51,23 @@ def process_result(result_folder): else: paths.append(stderr_path) - description = "SQLancer test run. See report" + description = "SQLancer run successfully" + if status == "failure": + description = f"Failed oracles: {failed_tests}" return status, description, summary, paths -def write_results(results_file, status_file, results, status): +def write_results( + results_file, status_file, description_file, results, status, description +): with open(results_file, "w") as f: out = csv.writer(f, delimiter="\t") out.writerows(results) with open(status_file, "w") as f: - out = csv.writer(f, delimiter="\t") - out.writerow(status) + f.write(status + "\n") + with open(description_file, "w") as f: + f.write(description + "\n") if __name__ == "__main__": @@ -72,13 +75,20 @@ if __name__ == "__main__": parser = argparse.ArgumentParser( description="ClickHouse script for parsing results of sqlancer test" ) - parser.add_argument("--in-results-dir", default="/test_output/") - parser.add_argument("--out-results-file", default="/test_output/test_results.tsv") - parser.add_argument("--out-status-file", default="/test_output/check_status.tsv") + parser.add_argument("--in-results-dir", default="/workspace/") + parser.add_argument("--out-results-file", default="/workspace/summary.tsv") + parser.add_argument("--out-description-file", default="/workspace/description.txt") + parser.add_argument("--out-status-file", default="/workspace/status.txt") args = parser.parse_args() - state, description, test_results, logs = process_result(args.in_results_dir) + status, description, summary, logs = process_result(args.in_results_dir) logging.info("Result parsed") - status = (state, description) - write_results(args.out_results_file, args.out_status_file, test_results, status) + write_results( + args.out_results_file, + args.out_status_file, + args.out_description_file, + summary, + status, + description, + ) logging.info("Result written") diff --git a/docker/test/sqlancer/run.sh b/docker/test/sqlancer/run.sh index a1891569d34..4a0f0f6a512 100755 --- a/docker/test/sqlancer/run.sh +++ b/docker/test/sqlancer/run.sh @@ -1,33 +1,62 @@ #!/bin/bash +set -exu +trap "exit" INT TERM -set -e -x +function wget_with_retry +{ + for _ in 1 2 3 4; do + if wget -nv -nd -c 
"$1";then + return 0 + else + sleep 0.5 + fi + done + return 1 +} -dpkg -i package_folder/clickhouse-common-static_*.deb -dpkg -i package_folder/clickhouse-common-static-dbg_*.deb -dpkg -i package_folder/clickhouse-server_*.deb -dpkg -i package_folder/clickhouse-client_*.deb +if [ -z ${BINARY_URL_TO_DOWNLOAD+x} ] +then + echo "No BINARY_URL_TO_DOWNLOAD provided." +else + wget_with_retry "$BINARY_URL_TO_DOWNLOAD" + chmod +x /clickhouse +fi -service clickhouse-server start && sleep 5 +if [[ -f "/clickhouse" ]]; then + echo "/clickhouse exists" +else + exit 1 +fi + +cd /workspace +/clickhouse server -P /workspace/clickhouse-server.pid -L /workspace/clickhouse-server.log -E /workspace/clickhouse-server.log.err --daemon + +for _ in $(seq 1 60); do if [[ $(wget -q 'localhost:8123' -O-) == 'Ok.' ]]; then break ; else sleep 1; fi ; done cd /sqlancer/sqlancer-master -export TIMEOUT=300 -export NUM_QUERIES=1000 +TIMEOUT=300 +NUM_QUERIES=1000 +NUM_THREADS=10 +TESTS=( "TLPGroupBy" "TLPHaving" "TLPWhere" "TLPDistinct" "TLPAggregate" "NoREC" ) +echo "${TESTS[@]}" -( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPWhere | tee /test_output/TLPWhere.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPWhere.err -( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPGroupBy | tee /test_output/TLPGroupBy.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPGroupBy.err -( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPHaving | tee /test_output/TLPHaving.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPHaving.err -( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPWhere --oracle TLPGroupBy | tee /test_output/TLPWhereGroupBy.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPWhereGroupBy.err -( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPDistinct | tee /test_output/TLPDistinct.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPDistinct.err -( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPAggregate | tee /test_output/TLPAggregate.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPAggregate.err +for TEST in "${TESTS[@]}"; do + echo "$TEST" + if [[ $(wget -q 'localhost:8123' -O-) == 'Ok.' ]] + then + echo "Server is OK" + ( java -jar target/sqlancer-*.jar --log-each-select true --print-failed false --num-threads "$NUM_THREADS" --timeout-seconds "$TIMEOUT" --num-queries "$NUM_QUERIES" --username default --password "" clickhouse --oracle "$TEST" | tee "/workspace/$TEST.out" ) 3>&1 1>&2 2>&3 | tee "/workspace/$TEST.err" + else + touch "/workspace/$TEST.err" "/workspace/$TEST.out" + echo "Server is not responding" | tee /workspace/server_crashed.log + fi +done -service clickhouse stop +ls /workspace +pkill -F /workspace/clickhouse-server.pid || true -ls /var/log/clickhouse-server/ -tar czf /test_output/logs.tar.gz -C /var/log/clickhouse-server/ . 
-tail -n 1000 /var/log/clickhouse-server/stderr.log > /test_output/stderr.log -tail -n 1000 /var/log/clickhouse-server/stdout.log > /test_output/stdout.log -tail -n 1000 /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log +for _ in $(seq 1 60); do if [[ $(wget -q 'localhost:8123' -O-) == 'Ok.' ]]; then sleep 1 ; else break; fi ; done -/process_sqlancer_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv -ls /test_output +/process_sqlancer_result.py || echo -e "failure\tCannot parse results" > /workspace/check_status.tsv +ls /workspace diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 7058853b43e..5cb27d90b62 100644 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -131,7 +131,14 @@ function stop() # Preserve the pid, since the server can hung after the PID will be deleted. pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)" - clickhouse stop --do-not-kill && return + # --max-tries is supported only since 22.12 + if dpkg --compare-versions "$(clickhouse local -q 'select version()')" ge "22.12"; then + # Increase default waiting timeout for sanitizers and debug builds + clickhouse stop --max-tries 180 --do-not-kill && return + else + clickhouse stop --do-not-kill && return + fi + # We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces. kill -TERM "$(pidof gdb)" ||: sleep 5 @@ -254,7 +261,7 @@ sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_defau start -./stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" \ +./stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" --global-time-limit 1200 \ && echo -e 'Test script exit code\tOK' >> /test_output/test_results.tsv \ || echo -e 'Test script failed\tFAIL' >> /test_output/test_results.tsv @@ -388,6 +395,11 @@ else rm -f /etc/clickhouse-server/config.d/storage_conf.xml ||: rm -f /etc/clickhouse-server/config.d/azure_storage_conf.xml ||: + # Turn on after 22.12 + rm -f /etc/clickhouse-server/config.d/compressed_marks_and_index.xml ||: + # it uses recently introduced settings which previous versions may not have + rm -f /etc/clickhouse-server/users.d/insert_keeper_retries.xml ||: + start clickhouse-client --query="SELECT 'Server version: ', version()" @@ -448,11 +460,12 @@ else # FIXME https://github.com/ClickHouse/ClickHouse/issues/39197 ("Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'") # NOTE Incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/39263, it's expected # ("This engine is deprecated and is not supported in transactions", "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part") + # FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 - bad mutation does not indicate backward incompatibility echo "Check for Error messages in server log:" zgrep -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \ -e "Code: 236. DB::Exception: Cancelled mutating parts" \ -e "REPLICA_IS_ALREADY_ACTIVE" \ - -e "REPLICA_IS_ALREADY_EXIST" \ + -e "REPLICA_ALREADY_EXISTS" \ -e "ALL_REPLICAS_LOST" \ -e "DDLWorker: Cannot parse DDL task query" \ -e "RaftInstance: failed to accept a rpc connection due to error 125" \ @@ -481,6 +494,9 @@ else -e "The set of parts restored in place of" \ -e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \ -e "Code: 269. 
DB::Exception: Destination table is myself" \ + -e "Coordination::Exception: Connection loss" \ + -e "MutateFromLogEntryTask" \ + -e "No connection to ZooKeeper, cannot get shared table ID" \ /var/log/clickhouse-server/clickhouse-server.backward.clean.log | zgrep -Fa "" > /test_output/bc_check_error_messages.txt \ && echo -e 'Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No Error messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 683124feaa0..e8c5e17024c 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -1,7 +1,7 @@ # docker build -t clickhouse/style-test . FROM ubuntu:20.04 -ARG ACT_VERSION=0.2.25 -ARG ACTIONLINT_VERSION=1.6.8 +ARG ACT_VERSION=0.2.33 +ARG ACTIONLINT_VERSION=1.6.22 # ARG for quick switch to a given ubuntu mirror ARG apt_archive="http://archive.ubuntu.com" @@ -17,7 +17,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ python3-pip \ shellcheck \ yamllint \ - && pip3 install black==22.8.0 boto3 codespell==2.2.1 dohq-artifactory PyGithub unidiff pylint==2.6.2 \ + && pip3 install black==22.8.0 boto3 codespell==2.2.1 dohq-artifactory mypy PyGithub unidiff pylint==2.6.2 \ && apt-get clean \ && rm -rf /root/.cache/pip diff --git a/docker/test/style/process_style_check_result.py b/docker/test/style/process_style_check_result.py index 8c2110d64e5..6dc3d05d051 100755 --- a/docker/test/style/process_style_check_result.py +++ b/docker/test/style/process_style_check_result.py @@ -11,17 +11,19 @@ def process_result(result_folder): description = "" test_results = [] checks = ( - ("header duplicates", "duplicate_output.txt"), - ("shellcheck", "shellcheck_output.txt"), - ("style", "style_output.txt"), - ("black", "black_output.txt"), - ("typos", "typos_output.txt"), - ("whitespaces", "whitespaces_output.txt"), - ("workflows", "workflows_output.txt"), - ("doc typos", "doc_spell_output.txt"), + "duplicate includes", + "shellcheck", + "style", + "black", + "mypy", + "typos", + "whitespaces", + "workflows", + "docs spelling", ) - for name, out_file in checks: + for name in checks: + out_file = name.replace(" ", "_") + "_output.txt" full_path = os.path.join(result_folder, out_file) if not os.path.exists(full_path): logging.info("No %s check log on path %s", name, full_path) diff --git a/docker/test/style/run.sh b/docker/test/style/run.sh index 06ecadbfebf..80911bf8627 100755 --- a/docker/test/style/run.sh +++ b/docker/test/style/run.sh @@ -4,15 +4,17 @@ cd /ClickHouse/utils/check-style || echo -e "failure\tRepo not found" > /test_output/check_status.tsv echo "Check duplicates" | ts -./check-duplicate-includes.sh |& tee /test_output/duplicate_output.txt +./check-duplicate-includes.sh |& tee /test_output/duplicate_includes_output.txt echo "Check style" | ts ./check-style -n |& tee /test_output/style_output.txt echo "Check python formatting with black" | ts ./check-black -n |& tee /test_output/black_output.txt +echo "Check python type hinting with mypy" | ts +./check-mypy -n |& tee /test_output/mypy_output.txt echo "Check typos" | ts ./check-typos |& tee /test_output/typos_output.txt echo "Check docs spelling" | ts -./check-doc-aspell |& tee /test_output/doc_spell_output.txt +./check-doc-aspell |& tee /test_output/docs_spelling_output.txt echo "Check whitespaces" | ts ./check-whitespaces -n |& tee 
/test_output/whitespaces_output.txt echo "Check workflows" | ts diff --git a/docker/test/util/Dockerfile b/docker/test/util/Dockerfile index 57544bdc090..f1cf029e9a2 100644 --- a/docker/test/util/Dockerfile +++ b/docker/test/util/Dockerfile @@ -13,6 +13,7 @@ RUN apt-get update \ apt-transport-https \ apt-utils \ ca-certificates \ + curl \ dnsutils \ gnupg \ iputils-ping \ @@ -24,10 +25,16 @@ RUN apt-get update \ && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ && apt-key add /tmp/llvm-snapshot.gpg.key \ && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ - && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ + && echo "deb https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ /etc/apt/sources.list \ && apt-get clean +# Install cmake 3.20+ for rust support +# Used https://askubuntu.com/a/1157132 as reference +RUN curl -s https://apt.kitware.com/keys/kitware-archive-latest.asc | \ + gpg --dearmor - > /etc/apt/trusted.gpg.d/kitware.gpg && \ + echo "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" >> /etc/apt/sources.list + # initial packages RUN apt-get update \ && apt-get install \ @@ -37,7 +44,6 @@ RUN apt-get update \ clang-${LLVM_VERSION} \ clang-tidy-${LLVM_VERSION} \ cmake \ - curl \ fakeroot \ gdb \ git \ diff --git a/docs/changelogs/v22.10.3.27-stable.md b/docs/changelogs/v22.10.3.27-stable.md new file mode 100644 index 00000000000..db49a042434 --- /dev/null +++ b/docs/changelogs/v22.10.3.27-stable.md @@ -0,0 +1,32 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.10.3.27-stable (6d3b2985724) FIXME as compared to v22.10.2.11-stable (d2bfcaba002) + +#### Improvement +* Backported in [#42842](https://github.com/ClickHouse/ClickHouse/issues/42842): Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. [#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#42959](https://github.com/ClickHouse/ClickHouse/issues/42959): Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. The PR fixes it so that packages no longer replace the user's files. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#43042](https://github.com/ClickHouse/ClickHouse/issues/43042): Add a CI step to mark commits as ready for release; soft-forbid launching a release script from branches other than master. [#43017](https://github.com/ClickHouse/ClickHouse/pull/43017) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#42864](https://github.com/ClickHouse/ClickHouse/issues/42864): Fix lowerUTF8()/upperUTF8() in the case when a symbol crossed a 16-byte boundary (a very frequent case if you have strings > 16 bytes long). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)).
+* Backported in [#43173](https://github.com/ClickHouse/ClickHouse/issues/43173): Fix rare possible hang on query cancellation. [#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#43064](https://github.com/ClickHouse/ClickHouse/issues/43064): Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when a projection could be used but no projection is available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771). The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#43075](https://github.com/ClickHouse/ClickHouse/issues/43075): Fix lambda parsing. Closes [#41848](https://github.com/ClickHouse/ClickHouse/issues/41848). [#42979](https://github.com/ClickHouse/ClickHouse/pull/42979) ([Nikolay Degterinsky](https://github.com/evillique)). +* Backported in [#43444](https://github.com/ClickHouse/ClickHouse/issues/43444): Fix several buffer over-reads. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#43430](https://github.com/ClickHouse/ClickHouse/issues/43430): Fixed queries with `SAMPLE BY` with prewhere optimization on tables using `Merge` engine. [#43315](https://github.com/ClickHouse/ClickHouse/pull/43315) ([Antonio Andelic](https://github.com/antonio2368)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix a bug in CAST function parser [#42980](https://github.com/ClickHouse/ClickHouse/pull/42980) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix TSan errors (correctly ignore _exit interception) [#43009](https://github.com/ClickHouse/ClickHouse/pull/43009) ([Azat Khuzhin](https://github.com/azat)). +* Update SECURITY.md on new stable tags [#43365](https://github.com/ClickHouse/ClickHouse/pull/43365) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use all parameters with prefixes from ssm [#43467](https://github.com/ClickHouse/ClickHouse/pull/43467) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.10.4.23-stable.md b/docs/changelogs/v22.10.4.23-stable.md new file mode 100644 index 00000000000..a2b45cd9dcf --- /dev/null +++ b/docs/changelogs/v22.10.4.23-stable.md @@ -0,0 +1,29 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.10.4.23-stable (352772987f4) FIXME as compared to v22.10.3.27-stable (6d3b2985724) + +#### Backward Incompatible Change +* Backported in [#43487](https://github.com/ClickHouse/ClickHouse/issues/43487): Fixed backward incompatibility in (de)serialization of states of `min`, `max`, `any*`, `argMin`, `argMax` aggregate functions with `String` argument. The incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/41431 and affects 22.9, 22.10 and 22.11 branches (fixed since 22.9.6, 22.10.4 and 22.11.2 correspondingly). Some minor releases of 22.3, 22.7 and 22.8 branches are also affected: 22.3.13...22.3.14 (fixed since 22.3.15), 22.8.6...22.8.9 (fixed since 22.8.10), 22.7.6 and newer (will not be fixed in 22.7, we recommend upgrading from 22.7.* to 22.8.10 or newer). This release note does not concern users that have never used affected versions. Incompatible versions append an extra `'\0'` to strings when reading states of the aggregate functions mentioned above.
For example, if an older version saved the state of `anyState('foobar')` to `state_column`, then an incompatible version will print `'foobar\0'` on `anyMerge(state_column)`. Also, incompatible versions write states of the aggregate functions without the trailing `'\0'`. Newer versions (that have the fix) can correctly read data written by all versions, including incompatible versions, except for one corner case. If an incompatible version saved a state with a string that actually ends with a null character, then a newer version will trim the trailing `'\0'` when reading the state of the affected aggregate function. For example, if an incompatible version saved the state of `anyState('abrac\0dabra\0')` to `state_column`, then a newer version will print `'abrac\0dabra'` on `anyMerge(state_column)`. The issue also affects distributed queries when an incompatible version works in a cluster together with older or newer versions (a minimal illustration follows this changelog's entries). [#43038](https://github.com/ClickHouse/ClickHouse/pull/43038) ([Raúl Marín](https://github.com/Algunenano)). + +#### Build/Testing/Packaging Improvement +* Backported in [#43053](https://github.com/ClickHouse/ClickHouse/issues/43053): Wait for all files to be in sync before archiving them in integration tests. [#42891](https://github.com/ClickHouse/ClickHouse/pull/42891) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#43715](https://github.com/ClickHouse/ClickHouse/issues/43715): Fixed an issue that raised an exception while trying to read a Parquet file from S3 into ClickHouse. [#43297](https://github.com/ClickHouse/ClickHouse/pull/43297) ([Arthur Passos](https://github.com/arthurpassos)). +* Backported in [#43576](https://github.com/ClickHouse/ClickHouse/issues/43576): Fix possible `Cannot create non-empty column with type Nothing` in functions if/multiIf. Closes [#43356](https://github.com/ClickHouse/ClickHouse/issues/43356). [#43368](https://github.com/ClickHouse/ClickHouse/pull/43368) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#43506](https://github.com/ClickHouse/ClickHouse/issues/43506): Fix a bug when row level filter uses default value of column. [#43387](https://github.com/ClickHouse/ClickHouse/pull/43387) ([Alexander Gololobov](https://github.com/davenger)). +* Backported in [#43723](https://github.com/ClickHouse/ClickHouse/issues/43723): Fixed primary key analysis with conditions involving `toString(enum)`. [#43596](https://github.com/ClickHouse/ClickHouse/pull/43596) ([Nikita Taranov](https://github.com/nickitat)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Temporarily disable `test_hive_query` [#43542](https://github.com/ClickHouse/ClickHouse/pull/43542) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Do not checkout submodules recursively [#43637](https://github.com/ClickHouse/ClickHouse/pull/43637) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use docker images cache from merged PRs in master and release branches [#43664](https://github.com/ClickHouse/ClickHouse/pull/43664) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix pagination issue in GITHUB_JOB_ID() [#43681](https://github.com/ClickHouse/ClickHouse/pull/43681) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
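To make the trailing-`'\0'` incompatibility described above concrete, here is a minimal sketch. The table and column names are hypothetical, and the expected output is inferred from the note above rather than from running the affected builds:

```
# Hypothetical repro of the trailing-'\0' issue; names are illustrative.
clickhouse-client --query "CREATE TABLE any_state_demo (s AggregateFunction(any, String)) ENGINE = MergeTree ORDER BY tuple()"
clickhouse-client --query "INSERT INTO any_state_demo SELECT anyState('foobar')"
# Reading the state back: an affected version prints 'foobar\0' (extra byte
# appended on read); a fixed version prints 'foobar'.
clickhouse-client --query "SELECT anyMerge(s) FROM any_state_demo"
```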
+ diff --git a/docs/changelogs/v22.11.1.1360-stable.md b/docs/changelogs/v22.11.1.1360-stable.md new file mode 100644 index 00000000000..77ad54b4fd8 --- /dev/null +++ b/docs/changelogs/v22.11.1.1360-stable.md @@ -0,0 +1,249 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.11.1.1360-stable (0d211ed1984) FIXME as compared to v22.10.1.1877-stable (98ab5a3c189) + +#### Backward Incompatible Change +* JSONExtract family of functions will now attempt to coerce to the requested type. [#41502](https://github.com/ClickHouse/ClickHouse/pull/41502) ([Márcio Martins](https://github.com/marcioapm)). + +#### New Feature +* Add function `displayName`, closes [#36770](https://github.com/ClickHouse/ClickHouse/issues/36770). [#37681](https://github.com/ClickHouse/ClickHouse/pull/37681) ([hongbin](https://github.com/xlwh)). +* Added applied row-level policies to `system.query_log`. [#39819](https://github.com/ClickHouse/ClickHouse/pull/39819) ([Vladimir Chebotaryov](https://github.com/quickhouse)). +* Add Hudi and DeltaLake table engines, read-only, only for tables on S3. [#41054](https://github.com/ClickHouse/ClickHouse/pull/41054) ([Daniil Rubin](https://github.com/rubin-do)). +* Add 4LW command `csnp` for manually creating snapshots. Additionally, `lgif` was added to get Raft information for a specific node (e.g. index of last created snapshot, last committed log index). [#41766](https://github.com/ClickHouse/ClickHouse/pull/41766) ([JackyWoo](https://github.com/JackyWoo)). +* Support for Keeper request retries during insert into replicated merge trees. Apart from fault tolerance, it aims to provide a better user experience: avoid returning an error to the user during insert if Keeper is restarted (for example, due to an upgrade). [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). +* Add function `ascii`, like in Spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). +* Add function `pmod`, which returns a non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). +* Published function `formatReadableDecimalSize`. [#42774](https://github.com/ClickHouse/ClickHouse/pull/42774) ([Alejandro](https://github.com/alexon1234)). +* Added per-second rate throttling of S3 PUT and GET requests. The settings `s3_max_get_rps`, `s3_max_get_burst`, `s3_max_put_rps`, `s3_max_put_burst` are used to configure the token bucket throttler. Can be used with both S3 ObjectStorage and the S3 table function. Different limits can be configured for different S3 disks or endpoints. [#43014](https://github.com/ClickHouse/ClickHouse/pull/43014) ([Sergei Trifonov](https://github.com/serxa)). +* Add table functions `hudi` and `deltaLake`. [#43080](https://github.com/ClickHouse/ClickHouse/pull/43080) ([flynn](https://github.com/ucasfl)). +* Add function `factorial`, as in Impala or Spark. [#43110](https://github.com/ClickHouse/ClickHouse/pull/43110) ([李扬](https://github.com/taiyang-li)). +* Add function `randCanonical`, which is similar to the `rand` function in Spark or Impala. The function generates pseudo-random results that are independently and identically uniformly distributed in [0, 1); a usage sketch of the new functions follows this list. [#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)).
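A quick, hedged illustration of several of the new functions above; the expected results are inferred from the descriptions (and from Spark semantics for `ascii` and `pmod`), not guaranteed output of any particular build:

```
# Minimal sketch of the new SQL functions from this release.
clickhouse-client --query "SELECT pmod(-7, 3)"       # 2: non-negative, unlike modulo(-7, 3) = -1
clickhouse-client --query "SELECT ascii('Abc')"      # 65: code point of the first character, as in Spark
clickhouse-client --query "SELECT factorial(5)"      # 120
clickhouse-client --query "SELECT randCanonical()"   # uniform pseudo-random value in [0, 1)
```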
+ +#### Performance Improvement +* Currently, the only saturable operators are And and Or, and their code paths are affected by this change. [#42214](https://github.com/ClickHouse/ClickHouse/pull/42214) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* The `match` function can use the index if it is a condition on a string prefix. This closes [#37333](https://github.com/ClickHouse/ClickHouse/issues/37333). [#42458](https://github.com/ClickHouse/ClickHouse/pull/42458) ([clarkcaoliu](https://github.com/Clark0)). +* Fixed slowness in JSONExtract with LowCardinality(String) tuples. [#42761](https://github.com/ClickHouse/ClickHouse/pull/42761) ([AlfVII](https://github.com/AlfVII)). +* Support parallel parsing for the LineAsString input format. This improves performance just slightly. This closes [#42502](https://github.com/ClickHouse/ClickHouse/issues/42502). [#42780](https://github.com/ClickHouse/ClickHouse/pull/42780) ([Kruglov Pavel](https://github.com/Avogar)). +* Keeper performance improvement: improve commit performance for cases when many different nodes have uncommitted states. This should help with cases when a follower node can't sync fast enough. [#42926](https://github.com/ClickHouse/ClickHouse/pull/42926) ([Antonio Andelic](https://github.com/antonio2368)). +* Parallelized merging of `uniqExact` states for aggregation without a key, i.e. queries like `SELECT uniqExact(number) FROM table`. The improvement becomes noticeable when the number of unique keys approaches 10^6. Also `uniq` performance is slightly optimized. This closes [#4510](https://github.com/ClickHouse/ClickHouse/issues/4510). [#43072](https://github.com/ClickHouse/ClickHouse/pull/43072) ([Nikita Taranov](https://github.com/nickitat)). + +#### Improvement +* Support type `Object` inside other types, e.g. `Array(JSON)`. [#36969](https://github.com/ClickHouse/ClickHouse/pull/36969) ([Anton Popov](https://github.com/CurtizJ)). +* Remove covered parts for a fetched part (to avoid possible growth of replication delay). [#39737](https://github.com/ClickHouse/ClickHouse/pull/39737) ([Azat Khuzhin](https://github.com/azat)). +* ClickHouse Client and ClickHouse Local will show progress by default even in non-interactive mode. If `/dev/tty` is available, the progress will be rendered directly to the terminal, without writing to stderr. This allows getting progress even if stderr is redirected to a file, and the file will not be polluted by terminal escape sequences. The progress can be disabled by `--progress false`. This closes [#32238](https://github.com/ClickHouse/ClickHouse/issues/32238). [#42003](https://github.com/ClickHouse/ClickHouse/pull/42003) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* 1. Add, subtract and negate operations are now available on Intervals. When the types of the Intervals are different, they will be transformed into a Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). +* Add `notLike` to the key condition atom map, so a condition like `NOT LIKE 'prefix%'` can use the primary index. [#42209](https://github.com/ClickHouse/ClickHouse/pull/42209) ([Duc Canh Le](https://github.com/canhld94)). +* Add support for FixedString input to base64 coding functions.
[#42285](https://github.com/ClickHouse/ClickHouse/pull/42285) ([ltrk2](https://github.com/ltrk2)). +* Add columns `bytes_on_disk` and `path` to `system.detached_parts`. Closes [#42264](https://github.com/ClickHouse/ClickHouse/issues/42264). [#42303](https://github.com/ClickHouse/ClickHouse/pull/42303) ([chen](https://github.com/xiedeyantu)). +* Improve using the structure from the insertion table in table functions: the setting `use_structure_from_insertion_table_in_table_functions` has a new possible value, `2`, which means that ClickHouse will automatically try to determine whether the structure from the insertion table can be used. Closes [#40028](https://github.com/ClickHouse/ClickHouse/issues/40028). [#42320](https://github.com/ClickHouse/ClickHouse/pull/42320) ([Kruglov Pavel](https://github.com/Avogar)). +* Added `**` glob support for recursive directory traversal to filesystem and S3. Resolves [#36316](https://github.com/ClickHouse/ClickHouse/issues/36316). [#42376](https://github.com/ClickHouse/ClickHouse/pull/42376) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Mask passwords and secret keys both in `system.query_log` and `/var/log/clickhouse-server/*.log` and also in error messages. [#42484](https://github.com/ClickHouse/ClickHouse/pull/42484) ([Vitaly Baranov](https://github.com/vitlibar)). +* Add a new variable called `limit` in query_info, indicating whether this query is a limit-trivial query. If so, we will adjust the approximate total rows for later estimation. Closes [#7071](https://github.com/ClickHouse/ClickHouse/issues/7071). [#42580](https://github.com/ClickHouse/ClickHouse/pull/42580) ([Han Fei](https://github.com/hanfei1991)). +* Implement `ATTACH` of a `MergeTree` table for the `s3_plain` disk (plus some fixes for `s3_plain`). [#42628](https://github.com/ClickHouse/ClickHouse/pull/42628) ([Azat Khuzhin](https://github.com/azat)). +* Fix missing progress indication on INSERT FROM INFILE. Closes [#42548](https://github.com/ClickHouse/ClickHouse/issues/42548). [#42634](https://github.com/ClickHouse/ClickHouse/pull/42634) ([chen](https://github.com/xiedeyantu)). +* Add `min_age_to_force_merge_on_partition_only` setting to optimize old parts for the entire partition only. [#42659](https://github.com/ClickHouse/ClickHouse/pull/42659) ([Antonio Andelic](https://github.com/antonio2368)). +* Throttling algorithm changed to token bucket. [#42665](https://github.com/ClickHouse/ClickHouse/pull/42665) ([Sergei Trifonov](https://github.com/serxa)). +* Refactor FunctionTokens to enable a maximum number of tokens returned for related functions (disabled by default). [#42673](https://github.com/ClickHouse/ClickHouse/pull/42673) ([李扬](https://github.com/taiyang-li)). +* Added a new field allow_readonly in system.table_functions to allow using table functions in readonly mode. Resolves [#42414](https://github.com/ClickHouse/ClickHouse/issues/42414). Implementation: * Added a new field allow_readonly to the table system.table_functions. * Updated to use the new field allow_readonly to allow using table functions in readonly mode. Testing: * Added a test for filesystem tests/queries/0_stateless/02473_functions_in_readonly_mode.sh Documentation: * Updated the English documentation for Table Functions. [#42708](https://github.com/ClickHouse/ClickHouse/pull/42708) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Allow using Date32 arguments for formatDateTime and FROM_UNIXTIME functions. [#42737](https://github.com/ClickHouse/ClickHouse/pull/42737) ([Roman Vasin](https://github.com/rvasin)). +* Update tzdata to 2022f.
Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. [#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add `FailedAsyncInsertQuery` event metric for async inserts. [#42814](https://github.com/ClickHouse/ClickHouse/pull/42814) ([Krzysztof Góralski](https://github.com/kgoralski)). +* Implement `read-in-order` optimization on top of the query plan. It is enabled by default. Set `query_plan_read_in_order = 0` to use the previous AST-based version. [#42829](https://github.com/ClickHouse/ClickHouse/pull/42829) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Increase the size of the upload part exponentially for backup to S3. [#42833](https://github.com/ClickHouse/ClickHouse/pull/42833) ([Vitaly Baranov](https://github.com/vitlibar)). +* Previously, when the merge task was continuously busy and disk space was insufficient, completely expired parts could not be selected and dropped, resulting in insufficient disk space. Now, when an entire part has expired, no additional disk space needs to be reserved, which ensures the normal execution of TTL. [#42869](https://github.com/ClickHouse/ClickHouse/pull/42869) ([zhongyuankai](https://github.com/zhongyuankai)). +* Bugfix: ignore the MySQL binlog SAVEPOINT event ([#42856](https://github.com/ClickHouse/ClickHouse/issues/42856)). [#42931](https://github.com/ClickHouse/ClickHouse/pull/42931) ([zzsmdfj](https://github.com/zzsmdfj)). +* Add support for interactive parameters in INSERT VALUES queries. [#43077](https://github.com/ClickHouse/ClickHouse/pull/43077) ([Nikolay Degterinsky](https://github.com/evillique)). +* Add a generic implementation for arbitrary structured named collections, an access type, and system.named_collections. [#43147](https://github.com/ClickHouse/ClickHouse/pull/43147) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add the `oss` function and StorageOSS (this is convenient for users); OSS is fully compatible with S3. [#43155](https://github.com/ClickHouse/ClickHouse/pull/43155) ([zzsmdfj](https://github.com/zzsmdfj)). +* Improve error reporting in the collection of OS-related info for the `system.asynchronous_metrics` table. [#43192](https://github.com/ClickHouse/ClickHouse/pull/43192) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The `system.asynchronous_metrics` table gets embedded documentation. This documentation is also exported to Prometheus. Fixed an error with the metrics about `cache` disks - they were calculated only for one arbitrary cache disk instead of all of them. This closes [#7644](https://github.com/ClickHouse/ClickHouse/issues/7644). [#43194](https://github.com/ClickHouse/ClickHouse/pull/43194) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Modify the `INFORMATION_SCHEMA` tables in a way so that ClickHouse can now connect to itself using the MySQL compatibility protocol. Add columns instead of aliases (related to [#9769](https://github.com/ClickHouse/ClickHouse/issues/9769)). It will improve the compatibility with various MySQL clients. [#43198](https://github.com/ClickHouse/ClickHouse/pull/43198) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Disable `deltaLake` and `hudi` table functions in readonly mode.
[#43316](https://github.com/ClickHouse/ClickHouse/pull/43316) ([Antonio Andelic](https://github.com/antonio2368)). + +#### Bug Fix +* Updated the normalizer to clone the alias AST. Resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452). Implementation: * Updated QueryNormalizer to clone the alias AST when it is replaced. Previously, just assigning the same AST led to an exception in LogicalExpressionsOptimizer, as it would be the same parent being inserted again. * This bug is not seen with the new analyzer (allow_experimental_analyzer), so there are no changes for it. A test was added for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix a race for backup of tables in Lazy databases. [#43104](https://github.com/ClickHouse/ClickHouse/pull/43104) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix `skip_unavailable_shards` not working with the s3Cluster table function. [#43131](https://github.com/ClickHouse/ClickHouse/pull/43131) ([chen](https://github.com/xiedeyantu)). + +#### Build/Testing/Packaging Improvement +* Run SQLancer for each pull request and commit to master. [SQLancer](https://github.com/sqlancer/sqlancer) is an open-source fuzzer that focuses on automatic detection of logical bugs. [#42397](https://github.com/ClickHouse/ClickHouse/pull/42397) ([Ilya Yatsishin](https://github.com/qoega)). +* Update to latest zlib-ng. [#42463](https://github.com/ClickHouse/ClickHouse/pull/42463) ([Boris Kuschel](https://github.com/bkuschel)). +* Use LLVM's `ld64.lld` on macOS to suppress ld warnings, close [#42282](https://github.com/ClickHouse/ClickHouse/issues/42282). [#42470](https://github.com/ClickHouse/ClickHouse/pull/42470) ([Lloyd-Pottiger](https://github.com/Lloyd-Pottiger)). +* Add support for testing the ClickHouse server with Jepsen. By the way, we already have support for testing ClickHouse Keeper with Jepsen. This pull request extends it to Replicated tables. [#42619](https://github.com/ClickHouse/ClickHouse/pull/42619) ([Antonio Andelic](https://github.com/antonio2368)). +* Improve the bugfix validation check: fix a bug with skipping the check, port a separate status in CI, run after the check labels and style check. Close [#40349](https://github.com/ClickHouse/ClickHouse/issues/40349). [#42702](https://github.com/ClickHouse/ClickHouse/pull/42702) ([Vladimir C](https://github.com/vdimir)). +* Wait for all files to be in sync before archiving them in integration tests. [#42891](https://github.com/ClickHouse/ClickHouse/pull/42891) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use https://github.com/matus-chochlik/ctcache for clang-tidy results caching. [#42913](https://github.com/ClickHouse/ClickHouse/pull/42913) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. The PR fixes it so that packages no longer replace the user's files. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add a CI step to mark commits as ready for release; soft-forbid launching a release script from branches other than master. [#43017](https://github.com/ClickHouse/ClickHouse/pull/43017) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Fix schema inference in s3Cluster and improve it in hdfsCluster. [#41979](https://github.com/ClickHouse/ClickHouse/pull/41979) ([Kruglov Pavel](https://github.com/Avogar)).
+* Fix retries while reading from HTTP table engines / table function (retriable errors could be retried more times than needed, non-retriable errors resulted in a failed assertion in the code). [#42224](https://github.com/ClickHouse/ClickHouse/pull/42224) ([Kseniia Sumarokova](https://github.com/kssenii)). +* A segmentation fault related to DNS & c-ares has been reported and fixed. The error below occurred in multiple threads: ``` 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008088 [ 356 ] {} BaseDaemon: ######################################## 2022-09-28 15:41:19.008,"2022.09.28 15:41:19.008147 [ 356 ] {} BaseDaemon: (version 22.8.5.29 (official build), build id: 92504ACA0B8E2267) (from thread 353) (no query) Received signal Segmentation fault (11)" 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008196 [ 356 ] {} BaseDaemon: Address: 0xf Access: write. Address not mapped to object. 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008216 [ 356 ] {} BaseDaemon: Stack trace: 0x188f8212 0x1626851b 0x1626a69e 0x16269b3f 0x16267eab 0x13cf8284 0x13d24afc 0x13c5217e 0x14ec2495 0x15ba440f 0x15b9d13b 0x15bb2699 0x1891ccb3 0x1891e00d 0x18ae0769 0x18ade022 0x7f76aa985609 0x7f76aa8aa133 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008274 [ 356 ] {} BaseDaemon: 2. Poco::Net::IPAddress::family() const @ 0x188f8212 in /usr/bin/clickhouse 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008297 [ 356 ] {} BaseDaemon: 3. ? @ 0x1626851b in /usr/bin/clickhouse 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008309 [ 356 ] {} BaseDaemon: 4. ? @ 0x1626a69e in /usr/bin/clickhouse ```. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). +* Fix `LOGICAL_ERROR` `Arguments of 'plus' have incorrect data types` which may happen in PK analysis (monotonicity check). Fix invalid PK analysis for monotonic binary functions with the first argument constant. [#42410](https://github.com/ClickHouse/ClickHouse/pull/42410) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix incorrect key analysis when key types cannot be inside Nullable. This fixes [#42456](https://github.com/ClickHouse/ClickHouse/issues/42456). [#42469](https://github.com/ClickHouse/ClickHouse/pull/42469) ([Amos Bird](https://github.com/amosbird)). +* Fix a typo in a setting name that led to bad usage of the schema inference cache while using the setting `input_format_csv_use_best_effort_in_schema_inference`. Closes [#41735](https://github.com/ClickHouse/ClickHouse/issues/41735). [#42536](https://github.com/ClickHouse/ClickHouse/pull/42536) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix creating a Set with a wrong header when the data type is LowCardinality. Closes [#42460](https://github.com/ClickHouse/ClickHouse/issues/42460). [#42579](https://github.com/ClickHouse/ClickHouse/pull/42579) ([flynn](https://github.com/ucasfl)). +* `(U)Int128` and `(U)Int256` values are correctly checked in `PREWHERE`. [#42605](https://github.com/ClickHouse/ClickHouse/pull/42605) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix a bug in ParserFunction that could have led to a segmentation fault. [#42724](https://github.com/ClickHouse/ClickHouse/pull/42724) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix TRUNCATE TABLE not holding the lock correctly. [#42728](https://github.com/ClickHouse/ClickHouse/pull/42728) ([flynn](https://github.com/ucasfl)). +* Fix possible SIGSEGV for web disks when a file does not exist (or `OPTIMIZE TABLE FINAL`, which could also eventually get the same error).
[#42767](https://github.com/ClickHouse/ClickHouse/pull/42767) ([Azat Khuzhin](https://github.com/azat)). +* Fix `auth_type` mapping in `system.session_log`, by including `SSL_CERTIFICATE` for the enum values. [#42782](https://github.com/ClickHouse/ClickHouse/pull/42782) ([Miel Donkers](https://github.com/mdonkers)). +* Fix stack-use-after-return under ASAN build in ParserCreateUserQuery. [#42804](https://github.com/ClickHouse/ClickHouse/pull/42804) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix lowerUTF8()/upperUTF8() in the case when a symbol crossed a 16-byte boundary (a very frequent case if you have strings > 16 bytes long). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)). +* An additional bounds check was added to the lz4 decompression routine to fix misbehaviour in case of malformed input. [#42868](https://github.com/ClickHouse/ClickHouse/pull/42868) ([Nikita Taranov](https://github.com/nickitat)). +* Fix rare possible hang on query cancellation. [#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). +* Fix incorrect saved_block_sample with multiple disjuncts in hash join, close [#42832](https://github.com/ClickHouse/ClickHouse/issues/42832). [#42876](https://github.com/ClickHouse/ClickHouse/pull/42876) ([Vladimir C](https://github.com/vdimir)). +* Fix a null pointer generated when using `select if as` in a three-table join. [#42883](https://github.com/ClickHouse/ClickHouse/pull/42883) ([zzsmdfj](https://github.com/zzsmdfj)). +* Fix memory sanitizer report in ClusterDiscovery, close [#42763](https://github.com/ClickHouse/ClickHouse/issues/42763). [#42905](https://github.com/ClickHouse/ClickHouse/pull/42905) ([Vladimir C](https://github.com/vdimir)). +* Fix datetime schema inference in case of an empty string. [#42911](https://github.com/ClickHouse/ClickHouse/pull/42911) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when a projection could be used but no projection is available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771). The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). +* Fixes for the s3_plain disk that allow attaching Wide parts. [#42950](https://github.com/ClickHouse/ClickHouse/pull/42950) ([Azat Khuzhin](https://github.com/azat)). +* Fix ATTACH TABLE in the PostgreSQL database engine if the table contains the DATETIME data type. Closes [#42817](https://github.com/ClickHouse/ClickHouse/issues/42817). [#42960](https://github.com/ClickHouse/ClickHouse/pull/42960) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix lambda parsing. Closes [#41848](https://github.com/ClickHouse/ClickHouse/issues/41848). [#42979](https://github.com/ClickHouse/ClickHouse/pull/42979) ([Nikolay Degterinsky](https://github.com/evillique)). +* Handle (ignore) SAVEPOINT queries in MaterializedMySQL. [#43086](https://github.com/ClickHouse/ClickHouse/pull/43086) ([Stig Bakken](https://github.com/stigsb)). +* Fix incorrect key analysis when nullable keys appear in the middle of a hyperrectangle. This fixes [#43111](https://github.com/ClickHouse/ClickHouse/issues/43111). [#43133](https://github.com/ClickHouse/ClickHouse/pull/43133) ([Amos Bird](https://github.com/amosbird)). +* Fix several buffer over-reads.
[#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). +* Fix function `if` in case of NULL and const Nullable arguments. Closes [#43069](https://github.com/ClickHouse/ClickHouse/issues/43069). [#43178](https://github.com/ClickHouse/ClickHouse/pull/43178) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix decimal math overflow in parsing datetime with the 'best effort' algorithm. Closes [#43061](https://github.com/ClickHouse/ClickHouse/issues/43061). [#43180](https://github.com/ClickHouse/ClickHouse/pull/43180) ([Kruglov Pavel](https://github.com/Avogar)). +* The `indent` field produced by the `git-import` tool was miscalculated. See https://clickhouse.com/docs/en/getting-started/example-datasets/github/. [#43191](https://github.com/ClickHouse/ClickHouse/pull/43191) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fixed unexpected behaviour of Interval types with subquery and casting. [#43193](https://github.com/ClickHouse/ClickHouse/pull/43193) ([jh0x](https://github.com/jh0x)). +* Fix logical error in `sumMap/minMap/maxMap` functions executing `TOTALS/ROLLUP/CUBE` on `NULL` values. Close [#43022](https://github.com/ClickHouse/ClickHouse/issues/43022). [#43232](https://github.com/ClickHouse/ClickHouse/pull/43232) ([Vladimir C](https://github.com/vdimir)). +* Fix UBSan report in AggregateFunctionMinMaxAny::read with high sizes. [#43249](https://github.com/ClickHouse/ClickHouse/pull/43249) ([Raúl Marín](https://github.com/Algunenano)). +* Fix IS (NOT) NULL operator priority with regard to other operators. [#43265](https://github.com/ClickHouse/ClickHouse/pull/43265) ([Nikolay Degterinsky](https://github.com/evillique)). + +#### Build Improvement + +* Add support for format ipv6 on s390x. [#42412](https://github.com/ClickHouse/ClickHouse/pull/42412) ([Suzy Wang](https://github.com/SuzyWangIBMer)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "Sonar Cloud Workflow"'. [#42725](https://github.com/ClickHouse/ClickHouse/pull/42725) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* NO CL ENTRY: 'Revert " Keeper retries during insert (clean)"'. [#43116](https://github.com/ClickHouse/ClickHouse/pull/43116) ([Alexander Tokmakov](https://github.com/tavplubix)). +* NO CL ENTRY: 'Revert "Revert " Keeper retries during insert (clean)""'. [#43122](https://github.com/ClickHouse/ClickHouse/pull/43122) ([Igor Nikonov](https://github.com/devcrafter)). +* NO CL ENTRY: 'Revert "Optimize TTL merge, completely expired parts can be removed in time"'. [#43134](https://github.com/ClickHouse/ClickHouse/pull/43134) ([Alexander Tokmakov](https://github.com/tavplubix)). +* NO CL ENTRY: 'Revert "Randomize keeper fault injection settings in stress tests"'. [#43218](https://github.com/ClickHouse/ClickHouse/pull/43218) ([Alexander Gololobov](https://github.com/davenger)). +* NO CL ENTRY: 'Revert "S3 request per second rate throttling"'. [#43306](https://github.com/ClickHouse/ClickHouse/pull/43306) ([Alexander Tokmakov](https://github.com/tavplubix)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Better logging for docs builder [#41903](https://github.com/ClickHouse/ClickHouse/pull/41903) ([filimonov](https://github.com/filimonov)). +* Save full server log in AST Fuzzer checks [#42316](https://github.com/ClickHouse/ClickHouse/pull/42316) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Build with libcxx(abi) 15 [#42513](https://github.com/ClickHouse/ClickHouse/pull/42513) ([Robert Schulze](https://github.com/rschu1ze)).
+* Sonar Cloud Workflow [#42534](https://github.com/ClickHouse/ClickHouse/pull/42534) ([Julio Jimenez](https://github.com/juliojimenez)). +* Invalid type in where for Merge table (logical error) [#42576](https://github.com/ClickHouse/ClickHouse/pull/42576) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix frequent memory drift message and clarify things in comments [#42582](https://github.com/ClickHouse/ClickHouse/pull/42582) ([Azat Khuzhin](https://github.com/azat)). +* Add functions for PowerBI connect [#42612](https://github.com/ClickHouse/ClickHouse/pull/42612) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Try to save `IDataPartStorage` interface [#42618](https://github.com/ClickHouse/ClickHouse/pull/42618) ([Anton Popov](https://github.com/CurtizJ)). +* Remove Ubuntu cruft [#42622](https://github.com/ClickHouse/ClickHouse/pull/42622) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Analyzer change setting into allow_experimental_analyzer [#42649](https://github.com/ClickHouse/ClickHouse/pull/42649) ([Maksim Kita](https://github.com/kitaisreal)). +* Analyzer IQueryTreeNode remove getName method [#42651](https://github.com/ClickHouse/ClickHouse/pull/42651) ([Maksim Kita](https://github.com/kitaisreal)). +* Minor fix iotest_nonblock build [#42658](https://github.com/ClickHouse/ClickHouse/pull/42658) ([Jordi Villar](https://github.com/jrdi)). +* Add tests and doc for some url-related functions [#42664](https://github.com/ClickHouse/ClickHouse/pull/42664) ([Vladimir C](https://github.com/vdimir)). +* Update version_date.tsv and changelogs after v22.10.1.1875-stable [#42676](https://github.com/ClickHouse/ClickHouse/pull/42676) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Fix error handling in clickhouse_helper.py [#42678](https://github.com/ClickHouse/ClickHouse/pull/42678) ([Ilya Yatsishin](https://github.com/qoega)). +* Fix execution of version_helper.py to use git tweaks [#42679](https://github.com/ClickHouse/ClickHouse/pull/42679) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* MergeTree indexes use RPNBuilderTree [#42681](https://github.com/ClickHouse/ClickHouse/pull/42681) ([Maksim Kita](https://github.com/kitaisreal)). +* Always run `BuilderReport` and `BuilderSpecialReport` in all CI types [#42684](https://github.com/ClickHouse/ClickHouse/pull/42684) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Support optimize_syntax_fuse_functions for sum/count/avg via analyzer [#42689](https://github.com/ClickHouse/ClickHouse/pull/42689) ([Vladimir C](https://github.com/vdimir)). +* Update version after release [#42699](https://github.com/ClickHouse/ClickHouse/pull/42699) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update version_date.tsv and changelogs after v22.10.1.1877-stable [#42700](https://github.com/ClickHouse/ClickHouse/pull/42700) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* OrderByLimitByDuplicateEliminationPass improve performance [#42704](https://github.com/ClickHouse/ClickHouse/pull/42704) ([Maksim Kita](https://github.com/kitaisreal)). +* Analyzer improve subqueries representation [#42705](https://github.com/ClickHouse/ClickHouse/pull/42705) ([Maksim Kita](https://github.com/kitaisreal)). +* Update version_date.tsv and changelogs after v22.9.4.32-stable [#42712](https://github.com/ClickHouse/ClickHouse/pull/42712) ([robot-clickhouse](https://github.com/robot-clickhouse)). 
+* Update version_date.tsv and changelogs after v22.8.7.34-lts [#42713](https://github.com/ClickHouse/ClickHouse/pull/42713) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v22.7.7.24-stable [#42714](https://github.com/ClickHouse/ClickHouse/pull/42714) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Move SonarCloud Job to nightly [#42718](https://github.com/ClickHouse/ClickHouse/pull/42718) ([Julio Jimenez](https://github.com/juliojimenez)). +* Update version_date.tsv and changelogs after v22.8.8.3-lts [#42738](https://github.com/ClickHouse/ClickHouse/pull/42738) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Minor fix implicit cast CaresPTRResolver [#42747](https://github.com/ClickHouse/ClickHouse/pull/42747) ([Jordi Villar](https://github.com/jrdi)). +* Fix build on master [#42752](https://github.com/ClickHouse/ClickHouse/pull/42752) ([Igor Nikonov](https://github.com/devcrafter)). +* Update version_date.tsv and changelogs after v22.3.14.18-lts [#42759](https://github.com/ClickHouse/ClickHouse/pull/42759) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Fix anchor links [#42760](https://github.com/ClickHouse/ClickHouse/pull/42760) ([Sergei Trifonov](https://github.com/serxa)). +* Update version_date.tsv and changelogs after v22.3.14.23-lts [#42764](https://github.com/ClickHouse/ClickHouse/pull/42764) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update README.md [#42783](https://github.com/ClickHouse/ClickHouse/pull/42783) ([Yuko Takagi](https://github.com/yukotakagi)). +* Slightly better code with projections [#42794](https://github.com/ClickHouse/ClickHouse/pull/42794) ([Anton Popov](https://github.com/CurtizJ)). +* Fix some races in MergeTree [#42805](https://github.com/ClickHouse/ClickHouse/pull/42805) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix typo in comments [#42809](https://github.com/ClickHouse/ClickHouse/pull/42809) ([Gabriel](https://github.com/Gabriel39)). +* Fix compilation of LLVM with cmake cache [#42816](https://github.com/ClickHouse/ClickHouse/pull/42816) ([Azat Khuzhin](https://github.com/azat)). +* Fix link in docs [#42821](https://github.com/ClickHouse/ClickHouse/pull/42821) ([Sergei Trifonov](https://github.com/serxa)). +* Link to proper place in docs [#42822](https://github.com/ClickHouse/ClickHouse/pull/42822) ([Sergei Trifonov](https://github.com/serxa)). +* Fix argument type check in AggregateFunctionAnalysisOfVariance [#42823](https://github.com/ClickHouse/ClickHouse/pull/42823) ([Vladimir C](https://github.com/vdimir)). +* Tests/lambda analyzer [#42824](https://github.com/ClickHouse/ClickHouse/pull/42824) ([Denny Crane](https://github.com/den-crane)). +* Fix Missing Quotes - Sonar Nightly [#42831](https://github.com/ClickHouse/ClickHouse/pull/42831) ([Julio Jimenez](https://github.com/juliojimenez)). +* Add exclusions from the Snyk scan [#42834](https://github.com/ClickHouse/ClickHouse/pull/42834) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix Missing Env Vars - Sonar Nightly [#42843](https://github.com/ClickHouse/ClickHouse/pull/42843) ([Julio Jimenez](https://github.com/juliojimenez)). +* Fix typo [#42855](https://github.com/ClickHouse/ClickHouse/pull/42855) ([GoGoWen](https://github.com/GoGoWen)). +* Add timezone to 02458_datediff_date32 [#42857](https://github.com/ClickHouse/ClickHouse/pull/42857) ([Vladimir C](https://github.com/vdimir)). 
+* Adjust cancel and rerun workflow names to the actual [#42862](https://github.com/ClickHouse/ClickHouse/pull/42862) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Analyzer subquery in JOIN TREE with aggregation [#42865](https://github.com/ClickHouse/ClickHouse/pull/42865) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix getauxval for sanitizer builds [#42866](https://github.com/ClickHouse/ClickHouse/pull/42866) ([Amos Bird](https://github.com/amosbird)). +* Update version_date.tsv and changelogs after v22.10.2.11-stable [#42871](https://github.com/ClickHouse/ClickHouse/pull/42871) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Better usability for dashboard.html on changes [#42872](https://github.com/ClickHouse/ClickHouse/pull/42872) ([Vladimir C](https://github.com/vdimir)). +* Some fixes for ReplicatedMergeTree [#42878](https://github.com/ClickHouse/ClickHouse/pull/42878) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Validate Query Tree in debug [#42879](https://github.com/ClickHouse/ClickHouse/pull/42879) ([Dmitry Novik](https://github.com/novikd)). +* changed type name for s3 plain storage [#42890](https://github.com/ClickHouse/ClickHouse/pull/42890) ([Aleksandr](https://github.com/AVMusorin)). +* Cleanup implementation of regexpReplace(All|One) [#42907](https://github.com/ClickHouse/ClickHouse/pull/42907) ([Robert Schulze](https://github.com/rschu1ze)). +* Do not show status for Bugfix validate check in non bugfix PRs [#42932](https://github.com/ClickHouse/ClickHouse/pull/42932) ([Vladimir C](https://github.com/vdimir)). +* fix(typo): Passible -> Possible [#42933](https://github.com/ClickHouse/ClickHouse/pull/42933) ([Yakko Majuri](https://github.com/yakkomajuri)). +* Pin the cryptography version to not break lambdas [#42934](https://github.com/ClickHouse/ClickHouse/pull/42934) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix: bad cast from type DB::ColumnLowCardinality to DB::ColumnString [#42937](https://github.com/ClickHouse/ClickHouse/pull/42937) ([Igor Nikonov](https://github.com/devcrafter)). +* Attach thread pool for loading parts to the query [#42947](https://github.com/ClickHouse/ClickHouse/pull/42947) ([Azat Khuzhin](https://github.com/azat)). +* Fix macOS M1 builds due to sprintf deprecation [#42962](https://github.com/ClickHouse/ClickHouse/pull/42962) ([Jordi Villar](https://github.com/jrdi)). +* Less use of CH-specific bit_cast() [#42968](https://github.com/ClickHouse/ClickHouse/pull/42968) ([Robert Schulze](https://github.com/rschu1ze)). +* Remove some utils [#42972](https://github.com/ClickHouse/ClickHouse/pull/42972) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix a bug in CAST function parser [#42980](https://github.com/ClickHouse/ClickHouse/pull/42980) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix old bug to remove `refs/head` from ref name [#42981](https://github.com/ClickHouse/ClickHouse/pull/42981) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add debug information to nightly builds [#42997](https://github.com/ClickHouse/ClickHouse/pull/42997) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add some guard rails around aggregation memory management [#42999](https://github.com/ClickHouse/ClickHouse/pull/42999) ([Raúl Marín](https://github.com/Algunenano)). +* Add `on: workflow_call` to debug CI [#43000](https://github.com/ClickHouse/ClickHouse/pull/43000) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). 
+* Analyzer added identifier typo corrections [#43002](https://github.com/ClickHouse/ClickHouse/pull/43002) ([Maksim Kita](https://github.com/kitaisreal)). +* Simple fixes for restart replica description [#43004](https://github.com/ClickHouse/ClickHouse/pull/43004) ([Igor Nikonov](https://github.com/devcrafter)). +* Cleanup match code [#43006](https://github.com/ClickHouse/ClickHouse/pull/43006) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix TSan errors (correctly ignore _exit interception) [#43009](https://github.com/ClickHouse/ClickHouse/pull/43009) ([Azat Khuzhin](https://github.com/azat)). +* fix bandwidth throttlers initialization order [#43015](https://github.com/ClickHouse/ClickHouse/pull/43015) ([Sergei Trifonov](https://github.com/serxa)). +* Add test for issue [#42520](https://github.com/ClickHouse/ClickHouse/issues/42520) [#43027](https://github.com/ClickHouse/ClickHouse/pull/43027) ([Robert Schulze](https://github.com/rschu1ze)). +* Analyzer improve ARRAY JOIN with JOIN [#43048](https://github.com/ClickHouse/ClickHouse/pull/43048) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix projection part removal with zero-copy replication [#43060](https://github.com/ClickHouse/ClickHouse/pull/43060) ([alesapin](https://github.com/alesapin)). +* Fix msan warning [#43065](https://github.com/ClickHouse/ClickHouse/pull/43065) ([Raúl Marín](https://github.com/Algunenano)). +* Analyzer AST key condition crash fix [#43070](https://github.com/ClickHouse/ClickHouse/pull/43070) ([Maksim Kita](https://github.com/kitaisreal)). +* Better logging for mark range filtering on projection parts [#43076](https://github.com/ClickHouse/ClickHouse/pull/43076) ([Duc Canh Le](https://github.com/canhld94)). +* Fix ub type punning [#43088](https://github.com/ClickHouse/ClickHouse/pull/43088) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Analyzer improve aliases support for table expressions [#43089](https://github.com/ClickHouse/ClickHouse/pull/43089) ([Maksim Kita](https://github.com/kitaisreal)). +* Throw not implemented for window frame type 'groups' in analyzer [#43090](https://github.com/ClickHouse/ClickHouse/pull/43090) ([Vladimir C](https://github.com/vdimir)). +* Disable clickhouse local and client non-interactive progress by default. [#43092](https://github.com/ClickHouse/ClickHouse/pull/43092) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Make error message after dropping current user more correct. [#43097](https://github.com/ClickHouse/ClickHouse/pull/43097) ([Vitaly Baranov](https://github.com/vitlibar)). +* More stable test [#43102](https://github.com/ClickHouse/ClickHouse/pull/43102) ([alesapin](https://github.com/alesapin)). +* Rewrite tests for memory overcommit [#43105](https://github.com/ClickHouse/ClickHouse/pull/43105) ([Dmitry Novik](https://github.com/novikd)). +* Fix trailing \n from SQLancer status [#43114](https://github.com/ClickHouse/ClickHouse/pull/43114) ([Ilya Yatsishin](https://github.com/qoega)). +* Fix `test_keeper_four_word_command::test_cmd_stat` [#43115](https://github.com/ClickHouse/ClickHouse/pull/43115) ([Antonio Andelic](https://github.com/antonio2368)). +* Enable keeper fault injection for inserts in functional tests [#43117](https://github.com/ClickHouse/ClickHouse/pull/43117) ([Igor Nikonov](https://github.com/devcrafter)). +* Analyzer aggregation crash fix [#43118](https://github.com/ClickHouse/ClickHouse/pull/43118) ([Maksim Kita](https://github.com/kitaisreal)). 
+* Analyzer aggregation totals crash fix [#43119](https://github.com/ClickHouse/ClickHouse/pull/43119) ([Maksim Kita](https://github.com/kitaisreal)). +* Improve commit_status_helper.py [#43121](https://github.com/ClickHouse/ClickHouse/pull/43121) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Skip hash logging on sanitizer builds [#43129](https://github.com/ClickHouse/ClickHouse/pull/43129) ([Raúl Marín](https://github.com/Algunenano)). +* Analyzer improve JOIN with constants [#43141](https://github.com/ClickHouse/ClickHouse/pull/43141) ([Maksim Kita](https://github.com/kitaisreal)). +* Remove POCO_CLICKHOUSE_PATCH [#43146](https://github.com/ClickHouse/ClickHouse/pull/43146) ([Azat Khuzhin](https://github.com/azat)). +* Update CompressionCodecDeflateQpl.cpp [#43150](https://github.com/ClickHouse/ClickHouse/pull/43150) ([Tiaonmmn](https://github.com/Tiaonmmn)). +* Randomize keeper fault injection settings in stress tests [#43187](https://github.com/ClickHouse/ClickHouse/pull/43187) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix for missing columns bug with projections an ALTER UPDATE [#43189](https://github.com/ClickHouse/ClickHouse/pull/43189) ([Alexander Gololobov](https://github.com/davenger)). +* A workaround for LLVM bug, https://github.com/llvm/llvm-project/issues/58633 [#43195](https://github.com/ClickHouse/ClickHouse/pull/43195) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Stop `ConfigReloader` first to avoid data race [#43201](https://github.com/ClickHouse/ClickHouse/pull/43201) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix typo [#43203](https://github.com/ClickHouse/ClickHouse/pull/43203) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Miscellaneous changes [#43206](https://github.com/ClickHouse/ClickHouse/pull/43206) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix flaky 02449_check_dependencies_and_table_shutdown [#43212](https://github.com/ClickHouse/ClickHouse/pull/43212) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Add test to check [#43167](https://github.com/ClickHouse/ClickHouse/issues/43167) for all builds [#43216](https://github.com/ClickHouse/ClickHouse/pull/43216) ([Ilya Yatsishin](https://github.com/qoega)). +* Don't throw if shared ID already created in `StorageReplicatedMergeTree` [#43244](https://github.com/ClickHouse/ClickHouse/pull/43244) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix nullptr dereference in collectScopeValidIdentifiersForTypoCorrection [#43245](https://github.com/ClickHouse/ClickHouse/pull/43245) ([Vladimir C](https://github.com/vdimir)). +* Better message in wait_zookeeper_to_start [#43256](https://github.com/ClickHouse/ClickHouse/pull/43256) ([Vladimir C](https://github.com/vdimir)). +* Make test_global_overcommit_tracker non-parallel [#43266](https://github.com/ClickHouse/ClickHouse/pull/43266) ([Dmitry Novik](https://github.com/novikd)). +* Rename canonicalRand to randCanonical [#43283](https://github.com/ClickHouse/ClickHouse/pull/43283) ([Nikita Taranov](https://github.com/nickitat)). +* check limits for an AST in select parser fuzzer [#43285](https://github.com/ClickHouse/ClickHouse/pull/43285) ([Sema Checherinda](https://github.com/CheSema)). +* Allow autoremoval of old parts if detach_not_byte_identical_parts enabled [#43287](https://github.com/ClickHouse/ClickHouse/pull/43287) ([filimonov](https://github.com/filimonov)). 
+* `pmod`: compatibility with Spark, better documentation [#43313](https://github.com/ClickHouse/ClickHouse/pull/43313) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + diff --git a/docs/changelogs/v22.11.2.30-stable.md b/docs/changelogs/v22.11.2.30-stable.md new file mode 100644 index 00000000000..a220c469f7f --- /dev/null +++ b/docs/changelogs/v22.11.2.30-stable.md @@ -0,0 +1,33 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.11.2.30-stable (28f72d8ab09) FIXME as compared to v22.11.1.1360-stable (0d211ed1984) + +#### Backward Incompatible Change +* Backported in [#43488](https://github.com/ClickHouse/ClickHouse/issues/43488): Fixed backward incompatibility in (de)serialization of states of `min`, `max`, `any*`, `argMin`, `argMax` aggregate functions with `String` argument. The incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/41431 and affects 22.9, 22.10 and 22.11 branches (fixed since 22.9.6, 22.10.4 and 22.11.2 correspondingly). Some minor releases of 22.3, 22.7 and 22.8 branches are also affected: 22.3.13...22.3.14 (fixed since 22.3.15), 22.8.6...22.8.9 (fixed since 22.8.10), 22.7.6 and newer (will not be fixed in 22.7, we recommend to upgrade from 22.7.* to 22.8.10 or newer). This release note does not concern users that have never used affected versions. Incompatible versions append extra `'\0'` to strings when reading states of the aggregate functions mentioned above. For example, if an older version saved state of `anyState('foobar')` to `state_column` then incompatible version will print `'foobar\0'` on `anyMerge(state_column)`. Also incompatible versions write states of the aggregate functions without trailing `'\0'`. Newer versions (that have the fix) can correctly read data written by all versions including incompatible versions, except one corner case. If an incompatible version saved a state with a string that actually ends with null character, then newer version will trim trailing `'\0'` when reading state of affected aggregate function. For example, if an incompatible version saved state of `anyState('abrac\0dabra\0')` to `state_column` then incompatible versions will print `'abrac\0dabra'` on `anyMerge(state_column)`. The issue also affects distributed queries when an incompatible version works in a cluster together with older or newer versions. [#43038](https://github.com/ClickHouse/ClickHouse/pull/43038) ([Raúl Marín](https://github.com/Algunenano)). + +#### Improvement +* Backported in [#43511](https://github.com/ClickHouse/ClickHouse/issues/43511): Restrict default access to named collections for user defined in config. It must have explicit `show_named_collections=1` to be able to see them. [#43325](https://github.com/ClickHouse/ClickHouse/pull/43325) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#43716](https://github.com/ClickHouse/ClickHouse/issues/43716): An issue with the following exception has been reported while trying to read a Parquet file from S3 into ClickHouse:. [#43297](https://github.com/ClickHouse/ClickHouse/pull/43297) ([Arthur Passos](https://github.com/arthurpassos)). +* Backported in [#43431](https://github.com/ClickHouse/ClickHouse/issues/43431): Fixed queries with `SAMPLE BY` with prewhere optimization on tables using `Merge` engine. 
[#43315](https://github.com/ClickHouse/ClickHouse/pull/43315) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#43577](https://github.com/ClickHouse/ClickHouse/issues/43577): Fix possible `Cannot create non-empty column with type Nothing` in functions if/multiIf. Closes [#43356](https://github.com/ClickHouse/ClickHouse/issues/43356). [#43368](https://github.com/ClickHouse/ClickHouse/pull/43368) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#43507](https://github.com/ClickHouse/ClickHouse/issues/43507): Fix a bug when row level filter uses default value of column. [#43387](https://github.com/ClickHouse/ClickHouse/pull/43387) ([Alexander Gololobov](https://github.com/davenger)). +* Backported in [#43724](https://github.com/ClickHouse/ClickHouse/issues/43724): Fixed primary key analysis with conditions involving `toString(enum)`. [#43596](https://github.com/ClickHouse/ClickHouse/pull/43596) ([Nikita Taranov](https://github.com/nickitat)). +* Backported in [#43807](https://github.com/ClickHouse/ClickHouse/issues/43807): Optimized number of List requests to ZooKeeper when selecting a part to merge. Previously it could produce thousands of requests in some cases. Fixes [#43647](https://github.com/ClickHouse/ClickHouse/issues/43647). [#43675](https://github.com/ClickHouse/ClickHouse/pull/43675) ([Alexander Tokmakov](https://github.com/tavplubix)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Update SECURITY.md on new stable tags [#43365](https://github.com/ClickHouse/ClickHouse/pull/43365) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use all parameters with prefixes from ssm [#43467](https://github.com/ClickHouse/ClickHouse/pull/43467) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Temporarily disable `test_hive_query` [#43542](https://github.com/ClickHouse/ClickHouse/pull/43542) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Do not checkout submodules recursively [#43637](https://github.com/ClickHouse/ClickHouse/pull/43637) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use docker images cache from merged PRs in master and release branches [#43664](https://github.com/ClickHouse/ClickHouse/pull/43664) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix pagination issue in GITHUB_JOB_ID() [#43681](https://github.com/ClickHouse/ClickHouse/pull/43681) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.3.15.33-lts.md b/docs/changelogs/v22.3.15.33-lts.md new file mode 100644 index 00000000000..8f7e9442406 --- /dev/null +++ b/docs/changelogs/v22.3.15.33-lts.md @@ -0,0 +1,34 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.3.15.33-lts (4ef30f2c4b6) FIXME as compared to v22.3.14.23-lts (74956bfee4d) + +#### Backward Incompatible Change +* Backported in [#43484](https://github.com/ClickHouse/ClickHouse/issues/43484): Fixed backward incompatibility in (de)serialization of states of `min`, `max`, `any*`, `argMin`, `argMax` aggregate functions with `String` argument. The incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/41431 and affects 22.9, 22.10 and 22.11 branches (fixed since 22.9.6, 22.10.4 and 22.11.2 correspondingly). Some minor releases of 22.3, 22.7 and 22.8 branches are also affected: 22.3.13...22.3.14 (fixed since 22.3.15), 22.8.6...22.8.9 (fixed since 22.8.10), 22.7.6 and newer (will not be fixed in 22.7, we recommend to upgrade from 22.7.* to 22.8.10 or newer). 
This release note does not concern users that have never used affected versions. Incompatible versions append extra `'\0'` to strings when reading states of the aggregate functions mentioned above. For example, if an older version saved state of `anyState('foobar')` to `state_column` then incompatible version will print `'foobar\0'` on `anyMerge(state_column)`. Also incompatible versions write states of the aggregate functions without trailing `'\0'`. Newer versions (that have the fix) can correctly read data written by all versions including incompatible versions, except one corner case. If an incompatible version saved a state with a string that actually ends with null character, then newer version will trim trailing `'\0'` when reading state of affected aggregate function. For example, if an incompatible version saved state of `anyState('abrac\0dabra\0')` to `state_column` then incompatible versions will print `'abrac\0dabra'` on `anyMerge(state_column)`. The issue also affects distributed queries when an incompatible version works in a cluster together with older or newer versions. [#43038](https://github.com/ClickHouse/ClickHouse/pull/43038) ([Raúl Marín](https://github.com/Algunenano)). + +#### Improvement +* Backported in [#42839](https://github.com/ClickHouse/ClickHouse/issues/42839): Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. [#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#43050](https://github.com/ClickHouse/ClickHouse/issues/43050): Wait for all files are in sync before archiving them in integration tests. [#42891](https://github.com/ClickHouse/ClickHouse/pull/42891) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#42963](https://github.com/ClickHouse/ClickHouse/issues/42963): Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. The PR fixes it and won't replace the user's files from packages. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#43039](https://github.com/ClickHouse/ClickHouse/issues/43039): Add a CI step to mark commits as ready for release; soft-forbid launching a release script from branches but master. [#43017](https://github.com/ClickHouse/ClickHouse/pull/43017) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#43427](https://github.com/ClickHouse/ClickHouse/issues/43427): Fixed queries with `SAMPLE BY` with prewhere optimization on tables using `Merge` engine. [#43315](https://github.com/ClickHouse/ClickHouse/pull/43315) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#43720](https://github.com/ClickHouse/ClickHouse/issues/43720): Fixed primary key analysis with conditions involving `toString(enum)`. [#43596](https://github.com/ClickHouse/ClickHouse/pull/43596) ([Nikita Taranov](https://github.com/nickitat)). 
+ +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Always run `BuilderReport` and `BuilderSpecialReport` in all CI types [#42684](https://github.com/ClickHouse/ClickHouse/pull/42684) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update SECURITY.md on new stable tags [#43365](https://github.com/ClickHouse/ClickHouse/pull/43365) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use all parameters with prefixes from ssm [#43467](https://github.com/ClickHouse/ClickHouse/pull/43467) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Temporarily disable `test_hive_query` [#43542](https://github.com/ClickHouse/ClickHouse/pull/43542) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Do not checkout submodules recursively [#43637](https://github.com/ClickHouse/ClickHouse/pull/43637) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use docker images cache from merged PRs in master and release branches [#43664](https://github.com/ClickHouse/ClickHouse/pull/43664) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.8.10.29-lts.md b/docs/changelogs/v22.8.10.29-lts.md new file mode 100644 index 00000000000..8f866d2aa40 --- /dev/null +++ b/docs/changelogs/v22.8.10.29-lts.md @@ -0,0 +1,32 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.8.10.29-lts (d568a57f7af) FIXME as compared to v22.8.9.24-lts (a1b69551d40) + +#### Backward Incompatible Change +* Backported in [#43485](https://github.com/ClickHouse/ClickHouse/issues/43485): Fixed backward incompatibility in (de)serialization of states of `min`, `max`, `any*`, `argMin`, `argMax` aggregate functions with `String` argument. The incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/41431 and affects 22.9, 22.10 and 22.11 branches (fixed since 22.9.6, 22.10.4 and 22.11.2 correspondingly). Some minor releases of 22.3, 22.7 and 22.8 branches are also affected: 22.3.13...22.3.14 (fixed since 22.3.15), 22.8.6...22.8.9 (fixed since 22.8.10), 22.7.6 and newer (will not be fixed in 22.7, we recommend to upgrade from 22.7.* to 22.8.10 or newer). This release note does not concern users that have never used affected versions. Incompatible versions append extra `'\0'` to strings when reading states of the aggregate functions mentioned above. For example, if an older version saved state of `anyState('foobar')` to `state_column` then incompatible version will print `'foobar\0'` on `anyMerge(state_column)`. Also incompatible versions write states of the aggregate functions without trailing `'\0'`. Newer versions (that have the fix) can correctly read data written by all versions including incompatible versions, except one corner case. If an incompatible version saved a state with a string that actually ends with null character, then newer version will trim trailing `'\0'` when reading state of affected aggregate function. For example, if an incompatible version saved state of `anyState('abrac\0dabra\0')` to `state_column` then incompatible versions will print `'abrac\0dabra'` on `anyMerge(state_column)`. The issue also affects distributed queries when an incompatible version works in a cluster together with older or newer versions. [#43038](https://github.com/ClickHouse/ClickHouse/pull/43038) ([Raúl Marín](https://github.com/Algunenano)). 
+ +#### Build/Testing/Packaging Improvement +* Backported in [#43051](https://github.com/ClickHouse/ClickHouse/issues/43051): Wait for all files are in sync before archiving them in integration tests. [#42891](https://github.com/ClickHouse/ClickHouse/pull/42891) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#43513](https://github.com/ClickHouse/ClickHouse/issues/43513): - Fix several buffer over-reads. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#43428](https://github.com/ClickHouse/ClickHouse/issues/43428): Fixed queries with `SAMPLE BY` with prewhere optimization on tables using `Merge` engine. [#43315](https://github.com/ClickHouse/ClickHouse/pull/43315) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#43580](https://github.com/ClickHouse/ClickHouse/issues/43580): Fix a bug when row level filter uses default value of column. [#43387](https://github.com/ClickHouse/ClickHouse/pull/43387) ([Alexander Gololobov](https://github.com/davenger)). +* Backported in [#43721](https://github.com/ClickHouse/ClickHouse/issues/43721): Fixed primary key analysis with conditions involving `toString(enum)`. [#43596](https://github.com/ClickHouse/ClickHouse/pull/43596) ([Nikita Taranov](https://github.com/nickitat)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix 02267_file_globs_schema_inference.sql flakiness [#41877](https://github.com/ClickHouse/ClickHouse/pull/41877) ([Kruglov Pavel](https://github.com/Avogar)). +* Update SECURITY.md on new stable tags [#43365](https://github.com/ClickHouse/ClickHouse/pull/43365) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use all parameters with prefixes from ssm [#43467](https://github.com/ClickHouse/ClickHouse/pull/43467) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Temporarily disable `test_hive_query` [#43542](https://github.com/ClickHouse/ClickHouse/pull/43542) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Do not checkout submodules recursively [#43637](https://github.com/ClickHouse/ClickHouse/pull/43637) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use docker images cache from merged PRs in master and release branches [#43664](https://github.com/ClickHouse/ClickHouse/pull/43664) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix pagination issue in GITHUB_JOB_ID() [#43681](https://github.com/ClickHouse/ClickHouse/pull/43681) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.8.11.15-lts.md b/docs/changelogs/v22.8.11.15-lts.md new file mode 100644 index 00000000000..b0c4a7cc168 --- /dev/null +++ b/docs/changelogs/v22.8.11.15-lts.md @@ -0,0 +1,23 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.8.11.15-lts (65c9506d161) FIXME as compared to v22.8.10.29-lts (d568a57f7af) + +#### Bug Fix +* Backported in [#43098](https://github.com/ClickHouse/ClickHouse/issues/43098): Updated normaliser to clone the alias ast. resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452) Implementation: * Updated QueryNormalizer to clone alias ast, when its replaced. Previously just assigning the same leads to exception in LogicalExpressinsOptimizer as it would be the same parent being inserted again. * This bug is not seen with new analyser (allow_experimental_analyzer), so no changes for it. 
I added a test for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#43751](https://github.com/ClickHouse/ClickHouse/issues/43751): An issue with the following exception has been reported while trying to read a Parquet file from S3 into ClickHouse:. [#43297](https://github.com/ClickHouse/ClickHouse/pull/43297) ([Arthur Passos](https://github.com/arthurpassos)). +* Backported in [#43617](https://github.com/ClickHouse/ClickHouse/issues/43617): Fix sumMap() for Nullable(Decimal()). [#43414](https://github.com/ClickHouse/ClickHouse/pull/43414) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#43886](https://github.com/ClickHouse/ClickHouse/issues/43886): Fixed `ALTER ... RESET SETTING` with `ON CLUSTER`. It could be applied to one replica only. Fixes [#43843](https://github.com/ClickHouse/ClickHouse/issues/43843). [#43848](https://github.com/ClickHouse/ClickHouse/pull/43848) ([Elena Torró](https://github.com/elenatorro)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Use only PRs to our repository in pr_info on push [#43895](https://github.com/ClickHouse/ClickHouse/pull/43895) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix tags workflow [#43942](https://github.com/ClickHouse/ClickHouse/pull/43942) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.8.9.24-lts.md b/docs/changelogs/v22.8.9.24-lts.md new file mode 100644 index 00000000000..e1f4c2bcdf0 --- /dev/null +++ b/docs/changelogs/v22.8.9.24-lts.md @@ -0,0 +1,31 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.8.9.24-lts (a1b69551d40) FIXME as compared to v22.8.8.3-lts (ac5a6cababc) + +#### Performance Improvement +* Backported in [#43012](https://github.com/ClickHouse/ClickHouse/issues/43012): Keeper performance improvement: improve commit performance for cases when many different nodes have uncommitted states. This should help with cases when a follower node can't sync fast enough. [#42926](https://github.com/ClickHouse/ClickHouse/pull/42926) ([Antonio Andelic](https://github.com/antonio2368)). + +#### Improvement +* Backported in [#42840](https://github.com/ClickHouse/ClickHouse/issues/42840): Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. [#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#42964](https://github.com/ClickHouse/ClickHouse/issues/42964): Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. The PR fixes it and won't replace the user's files from packages. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#43040](https://github.com/ClickHouse/ClickHouse/issues/43040): Add a CI step to mark commits as ready for release; soft-forbid launching a release script from branches but master. [#43017](https://github.com/ClickHouse/ClickHouse/pull/43017) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). 
+ +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#42720](https://github.com/ClickHouse/ClickHouse/issues/42720): Fixed `Unknown identifier (aggregate-function)` exception which appears when a user tries to calculate WINDOW ORDER BY/PARTITION BY expressions over aggregate functions: ``` CREATE TABLE default.tenk1 ( `unique1` Int32, `unique2` Int32, `ten` Int32 ) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192; SELECT ten, sum(unique1) + sum(unique2) AS res, rank() OVER (ORDER BY sum(unique1) + sum(unique2) ASC) AS rank FROM _complex GROUP BY ten ORDER BY ten ASC; ``` which gives: ``` Code: 47. DB::Exception: Received from localhost:9000. DB::Exception: Unknown identifier: sum(unique1); there are columns: unique1, unique2, ten: While processing sum(unique1) + sum(unique2) ASC. (UNKNOWN_IDENTIFIER) ```. [#39762](https://github.com/ClickHouse/ClickHouse/pull/39762) ([Vladimir Chebotaryov](https://github.com/quickhouse)). +* Backported in [#42748](https://github.com/ClickHouse/ClickHouse/issues/42748): A segmentation fault related to DNS & c-ares has been reported. The below error ocurred in multiple threads: ``` 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008088 [ 356 ] {} BaseDaemon: ######################################## 2022-09-28 15:41:19.008,"2022.09.28 15:41:19.008147 [ 356 ] {} BaseDaemon: (version 22.8.5.29 (official build), build id: 92504ACA0B8E2267) (from thread 353) (no query) Received signal Segmentation fault (11)" 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008196 [ 356 ] {} BaseDaemon: Address: 0xf Access: write. Address not mapped to object. 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008216 [ 356 ] {} BaseDaemon: Stack trace: 0x188f8212 0x1626851b 0x1626a69e 0x16269b3f 0x16267eab 0x13cf8284 0x13d24afc 0x13c5217e 0x14ec2495 0x15ba440f 0x15b9d13b 0x15bb2699 0x1891ccb3 0x1891e00d 0x18ae0769 0x18ade022 0x7f76aa985609 0x7f76aa8aa133 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008274 [ 356 ] {} BaseDaemon: 2. Poco::Net::IPAddress::family() const @ 0x188f8212 in /usr/bin/clickhouse 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008297 [ 356 ] {} BaseDaemon: 3. ? @ 0x1626851b in /usr/bin/clickhouse 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008309 [ 356 ] {} BaseDaemon: 4. ? @ 0x1626a69e in /usr/bin/clickhouse ```. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). +* Backported in [#43062](https://github.com/ClickHouse/ClickHouse/issues/43062): Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when projection is possible to use but there is no projection available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771) . The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Do not warn about kvm-clock [#41217](https://github.com/ClickHouse/ClickHouse/pull/41217) ([Sergei Trifonov](https://github.com/serxa)). +* Revert revert 41268 disable s3 parallel write for part moves to disk s3 [#42617](https://github.com/ClickHouse/ClickHouse/pull/42617) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Always run `BuilderReport` and `BuilderSpecialReport` in all CI types [#42684](https://github.com/ClickHouse/ClickHouse/pull/42684) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). 
+ diff --git a/docs/changelogs/v22.9.5.25-stable.md b/docs/changelogs/v22.9.5.25-stable.md new file mode 100644 index 00000000000..e94f97ed662 --- /dev/null +++ b/docs/changelogs/v22.9.5.25-stable.md @@ -0,0 +1,30 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.9.5.25-stable (68ba857aa82) FIXME as compared to v22.9.4.32-stable (3db8bcf1a70) + +#### Improvement +* Backported in [#42841](https://github.com/ClickHouse/ClickHouse/issues/42841): Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. [#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#42965](https://github.com/ClickHouse/ClickHouse/issues/42965): Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. The PR fixes it and won't replace the user's files from packages. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#43041](https://github.com/ClickHouse/ClickHouse/issues/43041): Add a CI step to mark commits as ready for release; soft-forbid launching a release script from branches but master. [#43017](https://github.com/ClickHouse/ClickHouse/pull/43017) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#42749](https://github.com/ClickHouse/ClickHouse/issues/42749): A segmentation fault related to DNS & c-ares has been reported. The below error ocurred in multiple threads: ``` 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008088 [ 356 ] {} BaseDaemon: ######################################## 2022-09-28 15:41:19.008,"2022.09.28 15:41:19.008147 [ 356 ] {} BaseDaemon: (version 22.8.5.29 (official build), build id: 92504ACA0B8E2267) (from thread 353) (no query) Received signal Segmentation fault (11)" 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008196 [ 356 ] {} BaseDaemon: Address: 0xf Access: write. Address not mapped to object. 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008216 [ 356 ] {} BaseDaemon: Stack trace: 0x188f8212 0x1626851b 0x1626a69e 0x16269b3f 0x16267eab 0x13cf8284 0x13d24afc 0x13c5217e 0x14ec2495 0x15ba440f 0x15b9d13b 0x15bb2699 0x1891ccb3 0x1891e00d 0x18ae0769 0x18ade022 0x7f76aa985609 0x7f76aa8aa133 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008274 [ 356 ] {} BaseDaemon: 2. Poco::Net::IPAddress::family() const @ 0x188f8212 in /usr/bin/clickhouse 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008297 [ 356 ] {} BaseDaemon: 3. ? @ 0x1626851b in /usr/bin/clickhouse 2022-09-28 15:41:19.008,2022.09.28 15:41:19.008309 [ 356 ] {} BaseDaemon: 4. ? @ 0x1626a69e in /usr/bin/clickhouse ```. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). +* Backported in [#42863](https://github.com/ClickHouse/ClickHouse/issues/42863): Fix lowerUTF8()/upperUTF8() in case of symbol was in between 16-byte boundary (very frequent case of you have strings > 16 bytes long). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)). 
+* Backported in [#43063](https://github.com/ClickHouse/ClickHouse/issues/43063): Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when projection is possible to use but there is no projection available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771) . The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#43443](https://github.com/ClickHouse/ClickHouse/issues/43443): - Fix several buffer over-reads. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#43429](https://github.com/ClickHouse/ClickHouse/issues/43429): Fixed queries with `SAMPLE BY` with prewhere optimization on tables using `Merge` engine. [#43315](https://github.com/ClickHouse/ClickHouse/pull/43315) ([Antonio Andelic](https://github.com/antonio2368)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Always run `BuilderReport` and `BuilderSpecialReport` in all CI types [#42684](https://github.com/ClickHouse/ClickHouse/pull/42684) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update SECURITY.md on new stable tags [#43365](https://github.com/ClickHouse/ClickHouse/pull/43365) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use all parameters with prefixes from ssm [#43467](https://github.com/ClickHouse/ClickHouse/pull/43467) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.9.6.20-stable.md b/docs/changelogs/v22.9.6.20-stable.md new file mode 100644 index 00000000000..a7127643fd3 --- /dev/null +++ b/docs/changelogs/v22.9.6.20-stable.md @@ -0,0 +1,28 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.9.6.20-stable (ef6343f9579) FIXME as compared to v22.9.5.25-stable (68ba857aa82) + +#### Backward Incompatible Change +* Backported in [#43486](https://github.com/ClickHouse/ClickHouse/issues/43486): Fixed backward incompatibility in (de)serialization of states of `min`, `max`, `any*`, `argMin`, `argMax` aggregate functions with `String` argument. The incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/41431 and affects 22.9, 22.10 and 22.11 branches (fixed since 22.9.6, 22.10.4 and 22.11.2 correspondingly). Some minor releases of 22.3, 22.7 and 22.8 branches are also affected: 22.3.13...22.3.14 (fixed since 22.3.15), 22.8.6...22.8.9 (fixed since 22.8.10), 22.7.6 and newer (will not be fixed in 22.7, we recommend to upgrade from 22.7.* to 22.8.10 or newer). This release note does not concern users that have never used affected versions. Incompatible versions append extra `'\0'` to strings when reading states of the aggregate functions mentioned above. For example, if an older version saved state of `anyState('foobar')` to `state_column` then incompatible version will print `'foobar\0'` on `anyMerge(state_column)`. Also incompatible versions write states of the aggregate functions without trailing `'\0'`. Newer versions (that have the fix) can correctly read data written by all versions including incompatible versions, except one corner case. If an incompatible version saved a state with a string that actually ends with null character, then newer version will trim trailing `'\0'` when reading state of affected aggregate function. 
For example, if an incompatible version saved state of `anyState('abrac\0dabra\0')` to `state_column` then incompatible versions will print `'abrac\0dabra'` on `anyMerge(state_column)`. The issue also affects distributed queries when an incompatible version works in a cluster together with older or newer versions. [#43038](https://github.com/ClickHouse/ClickHouse/pull/43038) ([Raúl Marín](https://github.com/Algunenano)). + +#### Build/Testing/Packaging Improvement +* Backported in [#43052](https://github.com/ClickHouse/ClickHouse/issues/43052): Wait for all files are in sync before archiving them in integration tests. [#42891](https://github.com/ClickHouse/ClickHouse/pull/42891) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#43505](https://github.com/ClickHouse/ClickHouse/issues/43505): Fix a bug when row level filter uses default value of column. [#43387](https://github.com/ClickHouse/ClickHouse/pull/43387) ([Alexander Gololobov](https://github.com/davenger)). +* Backported in [#43722](https://github.com/ClickHouse/ClickHouse/issues/43722): Fixed primary key analysis with conditions involving `toString(enum)`. [#43596](https://github.com/ClickHouse/ClickHouse/pull/43596) ([Nikita Taranov](https://github.com/nickitat)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix 02267_file_globs_schema_inference.sql flakiness [#41877](https://github.com/ClickHouse/ClickHouse/pull/41877) ([Kruglov Pavel](https://github.com/Avogar)). +* Temporarily disable `test_hive_query` [#43542](https://github.com/ClickHouse/ClickHouse/pull/43542) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Do not checkout submodules recursively [#43637](https://github.com/ClickHouse/ClickHouse/pull/43637) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Use docker images cache from merged PRs in master and release branches [#43664](https://github.com/ClickHouse/ClickHouse/pull/43664) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix pagination issue in GITHUB_JOB_ID() [#43681](https://github.com/ClickHouse/ClickHouse/pull/43681) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index c8aa65bdd91..7dd43858416 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -77,15 +77,15 @@ While turning on `gtid_mode` you should also specify `enforce_gtid_consistency = ## Virtual Columns {#virtual-columns} -When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree](../../engines/table-engines/mergetree-family/replacingmergetree.md) tables are used with virtual `_sign` and `_version` columns. +When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree](/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md) tables are used with virtual `_sign` and `_version` columns. ### \_version -`_version` — Transaction counter. Type [UInt64](../../sql-reference/data-types/int-uint.md). +`_version` — Transaction counter. Type [UInt64](/docs/en/sql-reference/data-types/int-uint.md). ### \_sign -`_sign` — Deletion mark. Type [Int8](../../sql-reference/data-types/int-uint.md). Possible values: +`_sign` — Deletion mark. Type [Int8](/docs/en/sql-reference/data-types/int-uint.md). Possible values: - `1` — Row is not deleted, - `-1` — Row is deleted. 
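+
+For illustration, here is a hedged sketch of how the virtual columns can be queried; the `mysql_db` database and `visits` table names are assumptions for the example, not names from this page:
+
+```sql
+-- Illustrative names; any MaterializedMySQL table exposes _sign and _version.
+SELECT *, _sign, _version
+FROM mysql_db.visits
+WHERE _sign = 1      -- keep only rows that are not deleted
+ORDER BY _version DESC;
+```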
@@ -93,29 +93,29 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree]( | MySQL | ClickHouse | |-------------------------|--------------------------------------------------------------| -| TINY | [Int8](../../sql-reference/data-types/int-uint.md) | -| SHORT | [Int16](../../sql-reference/data-types/int-uint.md) | -| INT24 | [Int32](../../sql-reference/data-types/int-uint.md) | -| LONG | [UInt32](../../sql-reference/data-types/int-uint.md) | -| LONGLONG | [UInt64](../../sql-reference/data-types/int-uint.md) | -| FLOAT | [Float32](../../sql-reference/data-types/float.md) | -| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | -| DECIMAL, NEWDECIMAL | [Decimal](../../sql-reference/data-types/decimal.md) | -| DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) | -| DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | -| DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) | -| YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) | -| TIME | [Int64](../../sql-reference/data-types/int-uint.md) | -| ENUM | [Enum](../../sql-reference/data-types/enum.md) | -| STRING | [String](../../sql-reference/data-types/string.md) | -| VARCHAR, VAR_STRING | [String](../../sql-reference/data-types/string.md) | -| BLOB | [String](../../sql-reference/data-types/string.md) | -| GEOMETRY | [String](../../sql-reference/data-types/string.md) | -| BINARY | [FixedString](../../sql-reference/data-types/fixedstring.md) | -| BIT | [UInt64](../../sql-reference/data-types/int-uint.md) | -| SET | [UInt64](../../sql-reference/data-types/int-uint.md) | +| TINY | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | +| SHORT | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | +| INT24 | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | +| LONG | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | +| LONGLONG | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | +| FLOAT | [Float32](/docs/en/sql-reference/data-types/float.md) | +| DOUBLE | [Float64](/docs/en/sql-reference/data-types/float.md) | +| DECIMAL, NEWDECIMAL | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | +| DATE, NEWDATE | [Date](/docs/en/sql-reference/data-types/date.md) | +| DATETIME, TIMESTAMP | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | +| DATETIME2, TIMESTAMP2 | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | +| YEAR | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | +| TIME | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | +| ENUM | [Enum](/docs/en/sql-reference/data-types/enum.md) | +| STRING | [String](/docs/en/sql-reference/data-types/string.md) | +| VARCHAR, VAR_STRING | [String](/docs/en/sql-reference/data-types/string.md) | +| BLOB | [String](/docs/en/sql-reference/data-types/string.md) | +| GEOMETRY | [String](/docs/en/sql-reference/data-types/string.md) | +| BINARY | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | +| BIT | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | +| SET | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | -[Nullable](../../sql-reference/data-types/nullable.md) is supported. +[Nullable](/docs/en/sql-reference/data-types/nullable.md) is supported. The data of TIME type in MySQL is converted to microseconds in ClickHouse. 
@@ -133,7 +133,7 @@ Apart of the data types limitations there are few restrictions comparing to `MyS ### DDL Queries {#ddl-queries} -MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create/index.md), [DROP](../../sql-reference/statements/drop), [RENAME](../../sql-reference/statements/rename.md)). If ClickHouse cannot parse some DDL query, the query is ignored. +MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ALTER](/docs/en/sql-reference/statements/alter/index.md), [CREATE](/docs/en/sql-reference/statements/create/index.md), [DROP](/docs/en/sql-reference/statements/drop.md), [RENAME](/docs/en/sql-reference/statements/rename.md)). If ClickHouse cannot parse some DDL query, the query is ignored. ### Data Replication {#data-replication} @@ -151,7 +151,7 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ `SELECT` query from `MaterializedMySQL` tables has some specifics: - If `_version` is not specified in the `SELECT` query, the - [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier is used, so only rows with + [FINAL](/docs/en/sql-reference/statements/select/from.md/#select-from-final) modifier is used, so only rows with `MAX(_version)` are returned for each primary key value. - If `_sign` is not specified in the `SELECT` query, `WHERE _sign=1` is used by default. So the deleted rows are not @@ -164,7 +164,7 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ MySQL `PRIMARY KEY` and `INDEX` clauses are converted into `ORDER BY` tuples in ClickHouse tables. ClickHouse has only one physical order, which is determined by `ORDER BY` clause. To create a new physical order, use -[materialized views](../../sql-reference/statements/create/view.md#materialized). +[materialized views](/docs/en/sql-reference/statements/create/view.md/#materialized). **Notes** @@ -173,7 +173,7 @@ ClickHouse has only one physical order, which is determined by `ORDER BY` clause MySQL binlog. - Replication can be easily broken. - Manual operations on database and tables are forbidden. -- `MaterializedMySQL` is affected by the [optimize_on_insert](../../operations/settings/settings.md#optimize-on-insert) +- `MaterializedMySQL` is affected by the [optimize_on_insert](/docs/en/operations/settings/settings.md/#optimize-on-insert) setting. Data is merged in the corresponding table in the `MaterializedMySQL` database when a table in the MySQL server changes. @@ -187,19 +187,19 @@ These are the schema conversion manipulations you can do with table overrides fo * Modify column type. Must be compatible with the original type, or replication will fail. For example, you can modify a UInt32 column to UInt64, but you can not modify a String column to Array(String). - * Modify [column TTL](../table-engines/mergetree-family/mergetree/#mergetree-column-ttl). - * Modify [column compression codec](../../sql-reference/statements/create/table/#codecs). - * Add [ALIAS columns](../../sql-reference/statements/create/table/#alias). - * Add [skipping indexes](../table-engines/mergetree-family/mergetree/#table_engine-mergetree-data_skipping-indexes) - * Add [projections](../table-engines/mergetree-family/mergetree/#projections). Note that projection optimizations are + * Modify [column TTL](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#mergetree-column-ttl). 
+ * Modify [column compression codec](/docs/en/sql-reference/statements/create/table.md/#codecs). + * Add [ALIAS columns](/docs/en/sql-reference/statements/create/table.md/#alias). + * Add [skipping indexes](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-data_skipping-indexes) + * Add [projections](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#projections). Note that projection optimizations are disabled when using `SELECT ... FINAL` (which MaterializedMySQL does by default), so their utility is limited here. `INDEX ... TYPE hypothesis` as [described in the v21.12 blog post](https://clickhouse.com/blog/en/2021/clickhouse-v21.12-released/) may be more useful in this case. - * Modify [PARTITION BY](../table-engines/mergetree-family/custom-partitioning-key/) - * Modify [ORDER BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) - * Modify [PRIMARY KEY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) - * Add [SAMPLE BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) - * Add [table TTL](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * Modify [PARTITION BY](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key/) + * Modify [ORDER BY](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#mergetree-query-clauses) + * Modify [PRIMARY KEY](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#mergetree-query-clauses) + * Add [SAMPLE BY](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#mergetree-query-clauses) + * Add [table TTL](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#mergetree-query-clauses) ```sql CREATE DATABASE db_name ENGINE = MaterializedMySQL(...) diff --git a/docs/en/engines/database-engines/replicated.md b/docs/en/engines/database-engines/replicated.md index f0ef1e981fe..43d1ce5ec3f 100644 --- a/docs/en/engines/database-engines/replicated.md +++ b/docs/en/engines/database-engines/replicated.md @@ -86,7 +86,7 @@ node1 :) SELECT materialize(hostName()) AS host, groupArray(n) FROM r.d GROUP BY ``` text ┌─hosts─┬─groupArray(n)─┐ -│ node1 │ [1,3,5,7,9] │ +│ node3 │ [1,3,5,7,9] │ │ node2 │ [0,2,4,6,8] │ └───────┴───────────────┘ ``` diff --git a/docs/en/engines/table-engines/integrations/index.md b/docs/en/engines/table-engines/integrations/index.md index 7e67bcb6249..09e89209ea9 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -6,7 +6,7 @@ sidebar_label: Integrations # Table Engines for Integrations -ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system. This transparent querying is one of the key advantages of this approach over alternative integration methods, like external dictionaries or table functions, which require to use custom query methods on each use. +ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system.
This transparent querying is one of the key advantages of this approach over alternative integration methods, like dictionaries or table functions, which require custom query methods on each use. List of supported integrations: diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index 7c9c4cfea53..9f637c50989 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md +++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -180,6 +180,6 @@ Default value: `300`. ## See Also {#see-also} - [The mysql table function](../../../sql-reference/table-functions/mysql.md) -- [Using MySQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) +- [Using MySQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) [Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/mysql/) diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index 043d5170654..e21a64bc5b2 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -126,7 +126,7 @@ SELECT * FROM odbc_t ## See Also {#see-also} -- [ODBC external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) - [ODBC table function](../../../sql-reference/table-functions/odbc.md) [Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/odbc/) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 4bb8033de9c..c07512cf0ce 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -174,6 +174,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32) **See Also** - [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md) -- [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +- [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/postgresql/) diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index db983ab9c68..484fd265c3d 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -127,6 +127,10 @@ The following settings can be set before query execution or placed into configur - `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`. - `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`.
- `s3_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. +- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_put_burst` — The maximum number of requests that can be issued simultaneously before hitting the requests-per-second limit. By default (value `0`) it equals `s3_max_put_rps`. +- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_get_burst` — The maximum number of requests that can be issued simultaneously before hitting the requests-per-second limit. By default (value `0`) it equals `s3_max_get_rps`. A usage sketch for these throttling settings is shown below. Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. @@ -142,6 +146,7 @@ The following settings can be specified in configuration file for given endpoint - `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. - `max_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. Optional. +- `max_put_rps`, `max_put_burst`, `max_get_rps` and `max_get_burst` - Throttling settings (see the description above) to use for a specific endpoint instead of per-query settings. Optional. **Example:** diff --git a/docs/en/engines/table-engines/log-family/index.md b/docs/en/engines/table-engines/log-family/index.md index 98bc4dbad04..21f857510f7 100644 --- a/docs/en/engines/table-engines/log-family/index.md +++ b/docs/en/engines/table-engines/log-family/index.md @@ -10,11 +10,11 @@ These engines were developed for scenarios when you need to quickly write many s Engines of the family: -- [StripeLog](../../../engines/table-engines/log-family/stripelog.md) -- [Log](../../../engines/table-engines/log-family/log.md) -- [TinyLog](../../../engines/table-engines/log-family/tinylog.md) +- [StripeLog](/docs/en/engines/table-engines/log-family/stripelog.md) +- [Log](/docs/en/engines/table-engines/log-family/log.md) +- [TinyLog](/docs/en/engines/table-engines/log-family/tinylog.md) -`Log` family table engines can store data to [HDFS](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-hdfs) or [S3](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3) distributed file systems. +`Log` family table engines can store data in [HDFS](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-hdfs) or [S3](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-s3) distributed file systems. ## Common Properties {#common-properties} @@ -28,7 +28,7 @@ Engines: During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently. -- Do not support [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations). +- Do not support [mutations](/docs/en/sql-reference/statements/alter/index.md#alter-mutations). - Do not support indexes.
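+
+As a usage sketch for the S3 throttling settings described above (the bucket URL and limit values are illustrative, and this assumes the query-level settings are passed through an ordinary `SETTINGS` clause):
+
+```sql
+-- Illustrative: cap the GET request rate of a single query reading from S3.
+SELECT count()
+FROM s3('https://my-bucket.s3.amazonaws.com/data/*.csv.gz', 'CSVWithNames')
+SETTINGS s3_max_get_rps = 100, s3_max_get_burst = 200;
+```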
diff --git a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md index ba518f51657..267e5c81dda 100644 --- a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md @@ -68,36 +68,57 @@ In the results of `SELECT` query, the values of `AggregateFunction` type have im ## Example of an Aggregated Materialized View {#example-of-an-aggregated-materialized-view} -`AggregatingMergeTree` materialized view that watches the `test.visits` table: +We will create the table `test.visits` that contains the raw data: ``` sql -CREATE MATERIALIZED VIEW test.basic -ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate) +CREATE TABLE test.visits + ( + StartDate DateTime64 NOT NULL, + CounterID UInt64, + Sign Nullable(Int32), + UserID Nullable(Int32) +) ENGINE = MergeTree ORDER BY (StartDate, CounterID); +``` + +An `AggregatingMergeTree` materialized view that watches the `test.visits` table and uses the `AggregateFunction` type: + +``` sql +CREATE MATERIALIZED VIEW test.mv_visits +( + StartDate DateTime64 NOT NULL, + CounterID UInt64, + Visits AggregateFunction(sum, Nullable(Int32)), + Users AggregateFunction(uniq, Nullable(Int32)) +) +ENGINE = AggregatingMergeTree() ORDER BY (StartDate, CounterID) AS SELECT - CounterID, StartDate, - sumState(Sign) AS Visits, + CounterID, + sumState(Sign) AS Visits, uniqState(UserID) AS Users FROM test.visits -GROUP BY CounterID, StartDate; +GROUP BY StartDate, CounterID; ``` Inserting data into the `test.visits` table. ``` sql -INSERT INTO test.visits ... +INSERT INTO test.visits (StartDate, CounterID, Sign, UserID) + VALUES (1667446031, 1, 3, 4); +INSERT INTO test.visits (StartDate, CounterID, Sign, UserID) + VALUES (1667446031, 1, 6, 3); ``` -The data are inserted in both the table and view `test.basic` that will perform the aggregation. +The data are inserted into both the table and the materialized view `test.mv_visits`. -To get the aggregated data, we need to execute a query such as `SELECT ... GROUP BY ...` from the view `test.basic`: +To get the aggregated data, we need to execute a query such as `SELECT ... GROUP BY ...` from the materialized view `test.mv_visits`: ``` sql SELECT StartDate, sumMerge(Visits) AS Visits, uniqMerge(Users) AS Users -FROM test.basic +FROM test.mv_visits GROUP BY StartDate ORDER BY StartDate; ``` diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 3b2431e4b5b..e482926f400 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -2,13 +2,20 @@ The main task that indexes achieve is to quickly find nearest neighbors for multidimensional data. An example of such a problem can be finding similar pictures (texts) for a given picture (text). That problem can be reduced to finding the nearest [embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning). They can be created from data using [UDF](../../../sql-reference/functions/index.md#executable-user-defined-functions).
-The next query finds the closest neighbors in N-dimensional space using the L2 (Euclidean) distance: +The next queries find the closest neighbors in N-dimensional space using the L2 (Euclidean) distance: ``` sql SELECT * FROM table_name WHERE L2Distance(Column, Point) < MaxDistance LIMIT N ``` + +``` sql +SELECT * +FROM table_name +ORDER BY L2Distance(Column, Point) +LIMIT N +``` But it will take some time for execution because of the long calculation of the distance between `TargetEmbedding` and all other vectors. This is where ANN indexes can help. They store a compact approximation of the search space (e.g. using clustering, search trees, etc.) and are able to compute approximate neighbors quickly. ## Indexes Structure @@ -34,26 +41,27 @@ Approximate Nearest Neighbor Search Indexes (`ANNIndexes`) are similar to skip i In these queries, `DistanceFunction` is selected from [distance functions](../../../sql-reference/functions/distance-functions). `Point` is a known vector (something like `(0.1, 0.1, ... )`). To avoid writing large vectors, use [client parameters](../../../interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters). `Value` - a float value that will bound the neighbourhood. -!!! note "Note" - ANN index can't speed up query that satisfies both types(`where + order by`, only one of them). All queries must have the limit, as algorithms are used to find nearest neighbors and need a specific number of them. +:::note +An ANN index can't speed up a query that combines both forms (`WHERE` and `ORDER BY`); only one of them can be used. All queries must have a `LIMIT`, as the algorithms are used to find nearest neighbors and need a specific number of them. +::: -!!! note "Note" - Indexes are applied only to queries with a limit less than the `max_limit_for_ann_queries` setting. This helps to avoid memory overflows in queries with a large limit. `max_limit_for_ann_queries` setting can be changed if you know you can provide enough memory. The default value is `1000000`. +:::note +Indexes are applied only to queries with a limit less than the `max_limit_for_ann_queries` setting. This helps to avoid memory overflows in queries with a large limit. The `max_limit_for_ann_queries` setting can be changed if you know you can provide enough memory. The default value is `1000000`. +::: Both types of queries are handled the same way. The indexes get `n` neighbors (where `n` is taken from the `LIMIT` clause) and work with them. In `ORDER BY` query they remember the numbers of all parts of the granule that have at least one of neighbor. In `WHERE` query they remember only those parts that satisfy the requirements. - ## Create table with ANNIndex -This feature is disabled by default. To enable it, set `allow_experimental_annoy_index` to 1. Also, this feature is disabled for arm, due to likely problems with the algorithm. +This feature is disabled by default. To enable it, set `allow_experimental_annoy_index` to 1. Also, this feature is disabled on ARM, due to likely problems with the algorithm.
```sql CREATE TABLE t ( `id` Int64, - `number` Tuple(Float32, Float32, Float32), - INDEX x number TYPE annoy GRANULARITY N + `data` Tuple(Float32, Float32, Float32), + INDEX ann_index_name data TYPE ann_index_type(ann_index_parameters) GRANULARITY N ) ENGINE = MergeTree ORDER BY id; @@ -63,8 +71,8 @@ ORDER BY id; CREATE TABLE t ( `id` Int64, - `number` Array(Float32), - INDEX x number TYPE annoy GRANULARITY N + `data` Array(Float32), + INDEX ann_index_name data TYPE ann_index_type(ann_index_parameters) GRANULARITY N ) ENGINE = MergeTree ORDER BY id; @@ -73,7 +81,7 @@ ORDER BY id; With a greater `GRANULARITY`, indexes remember the data structure better. The `GRANULARITY` indicates how many granules will be used to construct the index. The more data is provided for the index, the more of it can be handled by one index and the more chances that with the right hyperparameters the index will remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes. As the indexes are built only during insertions into the table, `INSERT` and `OPTIMIZE` queries are slower than for an ordinary table. At this stage indexes remember all the information about the given data. ANNIndexes should be used if you have immutable or rarely changed data and many read requests. - + You can create your table with an index that uses a certain algorithm. Currently, only indexes based on the following algorithms are supported: # Index list @@ -91,8 +99,8 @@ __Examples__: CREATE TABLE t ( id Int64, - number Tuple(Float32, Float32, Float32), - INDEX x number TYPE annoy(T) GRANULARITY N + data Tuple(Float32, Float32, Float32), + INDEX ann_index_name data TYPE annoy(NumTrees, DistanceName) GRANULARITY N ) ENGINE = MergeTree ORDER BY id; @@ -102,18 +110,30 @@ ORDER BY id; CREATE TABLE t ( id Int64, - number Array(Float32), - INDEX x number TYPE annoy(T) GRANULARITY N + data Array(Float32), + INDEX ann_index_name data TYPE annoy(NumTrees, DistanceName) GRANULARITY N ) ENGINE = MergeTree ORDER BY id; ``` -!!! note "Note" - Table with array field will work faster, but all arrays **must** have same length. Use [CONSTRAINT](../../../sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 CHECK length(number) = 256`. -Parameter `T` is the number of trees which algorithm will create. The bigger it is, the slower (approximately linear) it works (in both `CREATE` and `SELECT` requests), but the better accuracy you get (adjusted for randomness). +:::note +A table with an array field will work faster, but all arrays **must** have the same length. Use [CONSTRAINT](../../../sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 CHECK length(data) = 256`. +::: -Annoy supports only `L2Distance`. +Parameter `NumTrees` is the number of trees which the algorithm will create. The bigger it is, the slower (approximately linear) it works (in both `CREATE` and `SELECT` requests), but the better accuracy you get (adjusted for randomness). By default it is set to `100`. Parameter `DistanceName` is the name of the distance function. By default it is set to `L2Distance`.
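Both parameters can also be spelled out explicitly, as in the following sketch (the tree count `256` is an arbitrary illustration, not a recommendation):

```sql
CREATE TABLE t
(
    id Int64,
    data Array(Float32),
    -- 256 trees with an explicit L2 metric; both values are illustrative
    INDEX ann_index_name data TYPE annoy(256, 'L2Distance') GRANULARITY N
)
ENGINE = MergeTree
ORDER BY id;
```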
The distance function can be set without changing the first parameter, for example: +```sql +CREATE TABLE t +( + id Int64, + data Array(Float32), + INDEX ann_index_name data TYPE annoy('cosineDistance') GRANULARITY N +) +ENGINE = MergeTree +ORDER BY id; +``` + +Annoy supports `L2Distance` and `cosineDistance`. In a `SELECT` query you can specify the size of the internal buffer in the settings (`ann_index_select_query_params`; more details in the description above or in the [original repository](https://github.com/spotify/annoy)). During the query it will inspect up to `search_k` nodes, which defaults to `n_trees * n` if not provided. `search_k` gives you a run-time tradeoff between better accuracy and speed. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 486baac2310..7614a09c018 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -16,20 +16,20 @@ Main features: This allows you to create a small sparse index that helps find data faster. -- Partitions can be used if the [partitioning key](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md) is specified. +- Partitions can be used if the [partitioning key](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md) is specified. ClickHouse supports certain operations with partitions that are more efficient than general operations on the same data with the same result. ClickHouse also automatically cuts off the partition data where the partitioning key is specified in the query. - Data replication support. The family of `ReplicatedMergeTree` tables provides data replication. For more information, see [Data replication](../../../engines/table-engines/mergetree-family/replication.md). + The family of `ReplicatedMergeTree` tables provides data replication. For more information, see [Data replication](/docs/en/engines/table-engines/mergetree-family/replication.md). - Data sampling support. If necessary, you can set the data sampling method in the table. :::info -The [Merge](../../../engines/table-engines/special/merge.md#merge) engine does not belong to the `*MergeTree` family. +The [Merge](/docs/en/engines/table-engines/special/merge.md/#merge) engine does not belong to the `*MergeTree` family. ::: ## Creating a Table {#table_engine-mergetree-creating-a-table} @@ -57,7 +57,7 @@ ORDER BY expr [SETTINGS name=value, ...] ``` -For a description of parameters, see the [CREATE query description](../../../sql-reference/statements/create/table.md). +For a description of parameters, see the [CREATE query description](/docs/en/sql-reference/statements/create/table.md). ### Query Clauses {#mergetree-query-clauses} @@ -77,9 +77,9 @@ Use the `ORDER BY tuple()` syntax, if you do not need sorting. See [Selecting th #### PARTITION BY -`PARTITION BY` — The [partitioning key](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md). Optional. In most cases you don't need partition key, and in most other cases you don't need partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead make client identifier or name the first column in the ORDER BY expression). +`PARTITION BY` — The [partitioning key](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md).
Optional. In most cases you don't need a partition key, and in most other cases you don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use overly granular partitioning. Don't partition your data by client identifiers or names (instead, make the client identifier or name the first column in the ORDER BY expression). -For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](../../../sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format. +For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format. #### PRIMARY KEY @@ -127,7 +127,7 @@ Additional parameters that control the behavior of the `MergeTree` (optional): #### use_minimalistic_part_header_in_zookeeper -`use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../../../operations/server-configuration-parameters/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in “Server configuration parameters”. +`use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](/docs/en/operations/server-configuration-parameters/settings.md/#server-settings-use_minimalistic_part_header_in_zookeeper) in “Server configuration parameters”. #### min_merge_bytes_to_use_direct_io @@ -166,15 +166,15 @@ Additional parameters that control the behavior of the `MergeTree` (optional): #### max_compress_block_size -`max_compress_block_size` — Maximum size of blocks of uncompressed data before compressing for writing to a table. You can also specify this setting in the global settings (see [max_compress_block_size](../../../operations/settings/settings.md#max-compress-block-size) setting). The value specified when table is created overrides the global value for this setting. +`max_compress_block_size` — Maximum size of blocks of uncompressed data before compressing for writing to a table. You can also specify this setting in the global settings (see [max_compress_block_size](/docs/en/operations/settings/settings.md/#max-compress-block-size) setting). The value specified when the table is created overrides the global value for this setting. #### min_compress_block_size -`min_compress_block_size` — Minimum size of blocks of uncompressed data required for compression when writing the next mark. You can also specify this setting in the global settings (see [min_compress_block_size](../../../operations/settings/settings.md#min-compress-block-size) setting). The value specified when table is created overrides the global value for this setting. +`min_compress_block_size` — Minimum size of blocks of uncompressed data required for compression when writing the next mark. You can also specify this setting in the global settings (see [min_compress_block_size](/docs/en/operations/settings/settings.md/#min-compress-block-size) setting). The value specified when the table is created overrides the global value for this setting.
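As a hedged illustration of the override behavior described for these two settings, they might be pinned at table creation like this (the table name and values are hypothetical examples, not recommendations):

```sql
CREATE TABLE compress_example
(
    EventDate Date,
    CounterID UInt32
)
ENGINE = MergeTree
ORDER BY CounterID
-- table-level values override the global settings of the same name
SETTINGS min_compress_block_size = 65536, max_compress_block_size = 1048576;
```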
#### max_partitions_to_read -`max_partitions_to_read` — Limits the maximum number of partitions that can be accessed in one query. You can also specify setting [max_partitions_to_read](../../../operations/settings/merge-tree-settings.md#max-partitions-to-read) in the global setting. +`max_partitions_to_read` — Limits the maximum number of partitions that can be accessed in one query. You can also specify setting [max_partitions_to_read](/docs/en/operations/settings/merge-tree-settings.md/#max-partitions-to-read) in the global setting. **Example of Sections Setting** @@ -184,7 +184,7 @@ ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDa In the example, we set partitioning by month. -We also set an expression for sampling as a hash by the user ID. This allows you to pseudorandomize the data in the table for each `CounterID` and `EventDate`. If you define a [SAMPLE](../../../sql-reference/statements/select/sample.md#select-sample-clause) clause when selecting the data, ClickHouse will return an evenly pseudorandom data sample for a subset of users. +We also set an expression for sampling as a hash by the user ID. This allows you to pseudorandomize the data in the table for each `CounterID` and `EventDate`. If you define a [SAMPLE](/docs/en/sql-reference/statements/select/sample.md/#select-sample-clause) clause when selecting the data, ClickHouse will return an evenly pseudorandom data sample for a subset of users. The `index_granularity` setting can be omitted because 8192 is the default value. @@ -207,9 +207,9 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **MergeTree() Parameters** -- `date-column` — The name of a column of the [Date](../../../sql-reference/data-types/date.md) type. ClickHouse automatically creates partitions by month based on this column. The partition names are in the `"YYYYMM"` format. +- `date-column` — The name of a column of the [Date](/docs/en/sql-reference/data-types/date.md) type. ClickHouse automatically creates partitions by month based on this column. The partition names are in the `"YYYYMM"` format. - `sampling_expression` — An expression for sampling. -- `(primary, key)` — Primary key. Type: [Tuple()](../../../sql-reference/data-types/tuple.md) +- `(primary, key)` — Primary key. Type: [Tuple()](/docs/en/sql-reference/data-types/tuple.md) - `index_granularity` — The granularity of an index. The number of data rows between the “marks” of an index. The value 8192 is appropriate for most tasks. **Example** @@ -262,7 +262,7 @@ Sparse indexes allow you to work with a very large number of table rows, because ClickHouse does not require a unique primary key. You can insert multiple rows with the same primary key. -You can use `Nullable`-typed expressions in the `PRIMARY KEY` and `ORDER BY` clauses but it is strongly discouraged. To allow this feature, turn on the [allow_nullable_key](../../../operations/settings/settings.md#allow-nullable-key) setting. The [NULLS_LAST](../../../sql-reference/statements/select/order-by.md#sorting-of-special-values) principle applies for `NULL` values in the `ORDER BY` clause. +You can use `Nullable`-typed expressions in the `PRIMARY KEY` and `ORDER BY` clauses but it is strongly discouraged. To allow this feature, turn on the [allow_nullable_key](/docs/en/operations/settings/settings.md/#allow-nullable-key) setting. The [NULLS_LAST](/docs/en/sql-reference/statements/select/order-by.md/#sorting-of-special-values) principle applies for `NULL` values in the `ORDER BY` clause. 
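A minimal sketch of the (discouraged) nullable-key setup described above; the table and column names are hypothetical:

```sql
SET allow_nullable_key = 1;

CREATE TABLE nullable_key_example
(
    k Nullable(UInt32),  -- NULL values in the key follow the NULLS_LAST principle
    v String
)
ENGINE = MergeTree
ORDER BY k;
```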
### Selecting the Primary Key {#selecting-the-primary-key} @@ -279,26 +279,26 @@ The number of columns in the primary key is not explicitly limited. Depending on ClickHouse sorts data by primary key, so the higher the consistency, the better the compression. -- Provide additional logic when merging data parts in the [CollapsingMergeTree](../../../engines/table-engines/mergetree-family/collapsingmergetree.md#table_engine-collapsingmergetree) and [SummingMergeTree](../../../engines/table-engines/mergetree-family/summingmergetree.md) engines. +- Provide additional logic when merging data parts in the [CollapsingMergeTree](/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md/#table_engine-collapsingmergetree) and [SummingMergeTree](/docs/en/engines/table-engines/mergetree-family/summingmergetree.md) engines. In this case it makes sense to specify the *sorting key* that is different from the primary key. A long primary key will negatively affect the insert performance and memory consumption, but extra columns in the primary key do not affect ClickHouse performance during `SELECT` queries. -You can create a table without a primary key using the `ORDER BY tuple()` syntax. In this case, ClickHouse stores data in the order of inserting. If you want to save data order when inserting data by `INSERT ... SELECT` queries, set [max_insert_threads = 1](../../../operations/settings/settings.md#settings-max-insert-threads). +You can create a table without a primary key using the `ORDER BY tuple()` syntax. In this case, ClickHouse stores data in the order of inserting. If you want to save data order when inserting data by `INSERT ... SELECT` queries, set [max_insert_threads = 1](/docs/en/operations/settings/settings.md/#settings-max-insert-threads). -To select data in the initial order, use [single-threaded](../../../operations/settings/settings.md#settings-max_threads) `SELECT` queries. +To select data in the initial order, use [single-threaded](/docs/en/operations/settings/settings.md/#settings-max_threads) `SELECT` queries. ### Choosing a Primary Key that Differs from the Sorting Key {#choosing-a-primary-key-that-differs-from-the-sorting-key} It is possible to specify a primary key (an expression with values that are written in the index file for each mark) that is different from the sorting key (an expression for sorting the rows in data parts). In this case the primary key expression tuple must be a prefix of the sorting key expression tuple. -This feature is helpful when using the [SummingMergeTree](../../../engines/table-engines/mergetree-family/summingmergetree.md) and -[AggregatingMergeTree](../../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engines. In a common case when using these engines, the table has two types of columns: *dimensions* and *measures*. Typical queries aggregate values of measure columns with arbitrary `GROUP BY` and filtering by dimensions. Because SummingMergeTree and AggregatingMergeTree aggregate rows with the same value of the sorting key, it is natural to add all dimensions to it. As a result, the key expression consists of a long list of columns and this list must be frequently updated with newly added dimensions. +This feature is helpful when using the [SummingMergeTree](/docs/en/engines/table-engines/mergetree-family/summingmergetree.md) and +[AggregatingMergeTree](/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md) table engines. 
In a common case when using these engines, the table has two types of columns: *dimensions* and *measures*. Typical queries aggregate values of measure columns with arbitrary `GROUP BY` and filtering by dimensions. Because SummingMergeTree and AggregatingMergeTree aggregate rows with the same value of the sorting key, it is natural to add all dimensions to it. As a result, the key expression consists of a long list of columns and this list must be frequently updated with newly added dimensions. In this case it makes sense to leave only a few columns in the primary key that will provide efficient range scans and add the remaining dimension columns to the sorting key tuple. -[ALTER](../../../sql-reference/statements/alter/index.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts do not need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification. +[ALTER](/docs/en/sql-reference/statements/alter/index.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts do not need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification. ### Use of Indexes and Partitions in Queries {#use-of-indexes-and-partitions-in-queries} @@ -342,7 +342,7 @@ In the example below, the index can’t be used. SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' ``` -To check whether ClickHouse can use the index when running a query, use the settings [force_index_by_date](../../../operations/settings/settings.md#settings-force_index_by_date) and [force_primary_key](../../../operations/settings/settings.md#force-primary-key). +To check whether ClickHouse can use the index when running a query, use the settings [force_index_by_date](/docs/en/operations/settings/settings.md/#settings-force_index_by_date) and [force_primary_key](/docs/en/operations/settings/settings.md/#force-primary-key). The key for partitioning by month allows reading only those data blocks which contain dates from the proper range. In this case, the data block may contain data for many dates (up to an entire month). Within a block, data is sorted by primary key, which might not contain the date as the first column. Because of this, using a query with only a date condition that does not specify the primary key prefix will cause more data to be read than for a single date. @@ -400,7 +400,7 @@ Stores unique values of the specified expression (no more than `max_rows` rows, #### `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` -Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. Works only with datatypes: [String](../../../sql-reference/data-types/string.md), [FixedString](../../../sql-reference/data-types/fixedstring.md) and [Map](../../../sql-reference/data-types/map.md). Can be used for optimization of `EQUALS`, `LIKE` and `IN` expressions. +Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. 
Works only with data types: [String](/docs/en/sql-reference/data-types/string.md), [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) and [Map](/docs/en/sql-reference/data-types/map.md). Can be used for optimization of `EQUALS`, `LIKE` and `IN` expressions. - `n` — ngram size, - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, for example, 256 or 512, because it can be compressed well). @@ -417,11 +417,11 @@ The optional `false_positive` parameter is the probability of receiving a false Supported data types: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`, `UUID`, `Map`. -For `Map` data type client can specify if index should be created for keys or values using [mapKeys](../../../sql-reference/functions/tuple-map-functions.md#mapkeys) or [mapValues](../../../sql-reference/functions/tuple-map-functions.md#mapvalues) function. +For the `Map` data type, the client can specify whether the index should be created for keys or values, using the [mapKeys](/docs/en/sql-reference/functions/tuple-map-functions.md/#mapkeys) or [mapValues](/docs/en/sql-reference/functions/tuple-map-functions.md/#mapvalues) function. There are also special-purpose and experimental indexes to support approximate nearest neighbor (ANN) queries. See [here](annindexes.md) for details. -The following functions can use the filter: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions), [notIn](../../../sql-reference/functions/in-functions), [has](../../../sql-reference/functions/array-functions#hasarr-elem), [hasAny](../../../sql-reference/functions/array-functions#hasany), [hasAll](../../../sql-reference/functions/array-functions#hasall). +The following functions can use the filter: [equals](/docs/en/sql-reference/functions/comparison-functions.md), [notEquals](/docs/en/sql-reference/functions/comparison-functions.md), [in](/docs/en/sql-reference/functions/in-functions), [notIn](/docs/en/sql-reference/functions/in-functions), [has](/docs/en/sql-reference/functions/array-functions#hasarr-elem), [hasAny](/docs/en/sql-reference/functions/array-functions#hasany), [hasAll](/docs/en/sql-reference/functions/array-functions#hasall). Example of index creation for `Map` data type @@ -445,21 +445,21 @@ The `set` index can be used with all functions.
Function subsets for other index | Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter | |------------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------| -| [equals (=, ==)](../../../sql-reference/functions/comparison-functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notEquals(!=, <>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [like](../../../sql-reference/functions/string-search-functions.md#function-like) | ✔ | ✔ | ✔ | ✔ | ✗ | -| [notLike](../../../sql-reference/functions/string-search-functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ | -| [startsWith](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | -| [endsWith](../../../sql-reference/functions/string-functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | -| [multiSearchAny](../../../sql-reference/functions/string-search-functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | -| [in](../../../sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notIn](../../../sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [less (<)](../../../sql-reference/functions/comparison-functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [greater (>)](../../../sql-reference/functions/comparison-functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [lessOrEquals (<=)](../../../sql-reference/functions/comparison-functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [greaterOrEquals (>=)](../../../sql-reference/functions/comparison-functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [empty](../../../sql-reference/functions/array-functions#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [notEmpty](../../../sql-reference/functions/array-functions#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [equals (=, ==)](/docs/en/sql-reference/functions/comparison-functions.md/#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notEquals(!=, <>)](/docs/en/sql-reference/functions/comparison-functions.md/#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [like](/docs/en/sql-reference/functions/string-search-functions.md/#function-like) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [notLike](/docs/en/sql-reference/functions/string-search-functions.md/#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [startsWith](/docs/en/sql-reference/functions/string-functions.md/#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [endsWith](/docs/en/sql-reference/functions/string-functions.md/#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | +| [multiSearchAny](/docs/en/sql-reference/functions/string-search-functions.md/#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | +| [in](/docs/en/sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notIn](/docs/en/sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [less (<)](/docs/en/sql-reference/functions/comparison-functions.md/#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greater (>)](/docs/en/sql-reference/functions/comparison-functions.md/#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [lessOrEquals (<=)](/docs/en/sql-reference/functions/comparison-functions.md/#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greaterOrEquals (>=)](/docs/en/sql-reference/functions/comparison-functions.md/#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [empty](/docs/en/sql-reference/functions/array-functions#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| 
[notEmpty](/docs/en/sql-reference/functions/array-functions#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | | hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization. @@ -485,16 +485,16 @@ For example: ## Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex} -In addition to skip indices, there are also [Approximate Nearest Neighbor Search Indexes](../../../engines/table-engines/mergetree-family/annindexes.md). +In addition to skip indices, there are also [Approximate Nearest Neighbor Search Indexes](/docs/en/engines/table-engines/mergetree-family/annindexes.md). ## Projections {#projections} -Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries. +Projections are like [materialized views](/docs/en/sql-reference/statements/create/view.md/#materialized) but defined at the part level. They provide consistency guarantees along with automatic usage in queries. :::note -When you are implementing projections you should also consider the [force_optimize_projection](../../../operations/settings/settings.md#force-optimize-projection) setting. +When you are implementing projections you should also consider the [force_optimize_projection](/docs/en/operations/settings/settings.md/#force-optimize-projection) setting. ::: -Projections are not supported in the `SELECT` statements with the [FINAL](../../../sql-reference/statements/select/from.md#select-from-final) modifier. +Projections are not supported in `SELECT` statements with the [FINAL](/docs/en/sql-reference/statements/select/from.md/#select-from-final) modifier. ### Projection Query {#projection-query} A projection query is what defines a projection. It implicitly selects data from the parent table. @@ -504,7 +504,7 @@ A projection query is what defines a projection. It implicitly selects data from SELECT [GROUP BY] [ORDER BY] ``` -Projections can be modified or dropped with the [ALTER](../../../sql-reference/statements/alter/projection.md) statement. +Projections can be modified or dropped with the [ALTER](/docs/en/sql-reference/statements/alter/projection.md) statement. ### Projection Storage {#projection-storage} Projections are stored inside the part directory. It's similar to an index but contains a subdirectory that stores an anonymous `MergeTree` table's part. The table is induced by the definition query of the projection. If there is a `GROUP BY` clause, the underlying storage engine becomes [AggregatingMergeTree](aggregatingmergetree.md), and all aggregate functions are converted to `AggregateFunction`. If there is an `ORDER BY` clause, the `MergeTree` table uses it as its primary key expression. During the merge process the projection part is merged via its storage's merge routine. The checksum of the parent table's part is combined with the projection's part. Other maintenance jobs are similar to skip indices. @@ -526,7 +526,7 @@ Determines the lifetime of values. The `TTL` clause can be set for the whole table and for each individual column. Table-level `TTL` can also specify the logic of automatically moving data between disks and volumes, or recompressing parts where all the data has been expired. -Expressions must evaluate to [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md) data type.
+Expressions must evaluate to [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md) data type. **Syntax** @@ -537,7 +537,7 @@ TTL time_column TTL time_column + interval ``` -To define `interval`, use [time interval](../../../sql-reference/operators/index.md#operators-datetime) operators, for example: +To define `interval`, use [time interval](/docs/en/sql-reference/operators/index.md#operators-datetime) operators, for example: ``` sql TTL date_time + INTERVAL 1 MONTH @@ -684,11 +684,11 @@ Data with an expired `TTL` is removed when ClickHouse merges data parts. When ClickHouse detects that data is expired, it performs an off-schedule merge. To control the frequency of such merges, you can set `merge_with_ttl_timeout`. If the value is too low, it will perform many off-schedule merges that may consume a lot of resources. -If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](../../../sql-reference/statements/optimize.md) query before `SELECT`. +If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](/docs/en/sql-reference/statements/optimize.md) query before `SELECT`. **See Also** -- [ttl_only_drop_parts](../../../operations/settings/settings.md#ttl_only_drop_parts) setting +- [ttl_only_drop_parts](/docs/en/operations/settings/settings.md/#ttl_only_drop_parts) setting ## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes} @@ -697,16 +697,16 @@ If you perform the `SELECT` query between merges, you may get expired data. To a `MergeTree` family table engines can store data on multiple block devices. For example, it can be useful when the data of a certain table are implicitly split into “hot” and “cold”. The most recent data is regularly requested but requires only a small amount of space. On the contrary, the fat-tailed historical data is requested rarely. If several disks are available, the “hot” data may be located on fast disks (for example, NVMe SSDs or in memory), while the “cold” data - on relatively slow ones (for example, HDD). -Data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../../sql-reference/statements/alter/partition.md#alter_move-partition) queries. +Data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](/docs/en/sql-reference/statements/alter/partition.md/#alter_move-partition) queries. ### Terms {#terms} - Disk — Block device mounted to the filesystem. -- Default disk — Disk that stores the path specified in the [path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-path) server setting. +- Default disk — Disk that stores the path specified in the [path](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-path) server setting. - Volume — Ordered set of equal disks (similar to [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures)). - Storage policy — Set of volumes and the rules for moving data between them. 
-The names given to the described entities can be found in the system tables, [system.storage_policies](../../../operations/system-tables/storage_policies.md#system_tables-storage_policies) and [system.disks](../../../operations/system-tables/disks.md#system_tables-disks). To apply one of the configured storage policies for a table, use the `storage_policy` setting of `MergeTree`-engine family tables. +The names given to the described entities can be found in the system tables, [system.storage_policies](/docs/en/operations/system-tables/storage_policies.md/#system_tables-storage_policies) and [system.disks](/docs/en/operations/system-tables/disks.md/#system_tables-disks). To apply one of the configured storage policies for a table, use the `storage_policy` setting of `MergeTree`-engine family tables. ### Configuration {#table_engine-mergetree-multiple-volumes_configure} @@ -853,16 +853,16 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' The `default` storage policy implies using only one volume, which consists of only one disk given in `<path>`. You can change the storage policy after table creation with the [ALTER TABLE ... MODIFY SETTING] query; the new policy should include all old disks and volumes with the same names. -The number of threads performing background moves of data parts can be changed by [background_move_pool_size](../../../operations/settings/settings.md#background_move_pool_size) setting. +The number of threads performing background moves of data parts can be changed by [background_move_pool_size](/docs/en/operations/settings/settings.md/#background_move_pool_size) setting. ### Details {#details} In the case of `MergeTree` tables, data gets to disk in different ways: - As a result of an insert (`INSERT` query). -- During background merges and [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations). +- During background merges and [mutations](/docs/en/sql-reference/statements/alter/index.md#alter-mutations). - When downloading from another replica. -- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter/partition.md#alter_freeze-partition). +- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](/docs/en/sql-reference/statements/alter/partition.md/#alter_freeze-partition). In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy: @@ -872,16 +872,16 @@ In all these cases except for mutations and partition freezing, a part is stored Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). Hard links between different disks are not supported; therefore, in such cases the resulting parts are stored on the same disks as the initial ones. In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file. -Data is never transferred from the last one and into the first one.
One may use system tables [system.part_log](/docs/en/operations/system-tables/part_log.md/#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](/docs/en/operations/system-tables/parts.md/#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs. -User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter/partition.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. +A user can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](/docs/en/sql-reference/statements/alter/partition.md/#alter_move-partition); all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. The user will get an error message if not enough free space is available or if any of the required conditions are not met. Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas. After the completion of background merges and mutations, old parts are removed only after a certain amount of time (`old_parts_lifetime`). During this time, they are not moved to other volumes or disks. Therefore, until the parts are finally removed, they are still taken into account for evaluation of the occupied disk space. -User can assign new big parts to different disks of a [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures) volume in a balanced way using the [min_bytes_to_rebalance_partition_over_jbod](../../../operations/settings/merge-tree-settings.md#min-bytes-to-rebalance-partition-over-jbod) setting. +A user can assign new big parts to different disks of a [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures) volume in a balanced way using the [min_bytes_to_rebalance_partition_over_jbod](/docs/en/operations/settings/merge-tree-settings.md/#min-bytes-to-rebalance-partition-over-jbod) setting. ## Using S3 for Data Storage {#table_engine-mergetree-s3} @@ -940,6 +940,10 @@ Optional parameters: - `cache_path` — Path on local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks/<disk_name>/cache/`. - `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. +- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting the request-per-second limit. By default (`0` value) it equals `s3_max_put_rps`. +- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting the request-per-second limit. By default (`0` value) it equals `s3_max_get_rps`.
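Once such a disk is declared in the server configuration, you can check that the server picked it up by querying the `system.disks` table mentioned earlier (an illustrative query; the exact column set may vary between versions):

```sql
-- list all configured disks with their capacity figures
SELECT name, path, free_space, total_space
FROM system.disks;
```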
S3 disk can be configured as `main` or `cold` storage: ``` xml diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index 06faceab8ec..4867140789f 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -20,7 +20,7 @@ Replication works at the level of an individual table, not the entire server. A Replication does not depend on sharding. Each shard has its own independent replication. -Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](../../../sql-reference/statements/alter/index.md#query_language_queries_alter)). +Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](/docs/en/sql-reference/statements/alter/index.md#query_language_queries_alter)). `CREATE`, `DROP`, `ATTACH`, `DETACH` and `RENAME` queries are executed on a single server and are not replicated: @@ -28,9 +28,9 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa - The `DROP TABLE` query deletes the replica located on the server where the query is run. - The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. -ClickHouse uses [ClickHouse Keeper](../../../guides/sre/keeper/clickhouse-keeper.md) for storing replicas meta information. It is possible to use ZooKeeper version 3.4.5 or newer, but ClickHouse Keeper is recommended. +ClickHouse uses [ClickHouse Keeper](/docs/en/guides/sre/keeper/clickhouse-keeper.md) for storing replica metadata. It is possible to use ZooKeeper version 3.4.5 or newer, but ClickHouse Keeper is recommended. -To use replication, set parameters in the [zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) server configuration section. +To use replication, set parameters in the [zookeeper](/docs/en/operations/server-configuration-parameters/settings.md/#server-settings_zookeeper) server configuration section. :::warning Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. @@ -85,7 +85,7 @@ Example of setting the addresses of the auxiliary ZooKeeper cluster: ``` -To store table datameta in a auxiliary ZooKeeper cluster instead of default ZooKeeper cluster, we can use the SQL to create table with +To store table metadata in an auxiliary ZooKeeper cluster instead of the default ZooKeeper cluster, we can use SQL to create a table with the ReplicatedMergeTree engine as follows: ``` @@ -95,21 +95,21 @@ You can specify any existing ZooKeeper cluster and the system will use a directo If ZooKeeper isn’t set in the config file, you can’t create replicated tables, and any existing replicated tables will be read-only. -ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables.
When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries). +ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max_replica_delay_for_distributed_queries](/docs/en/operations/settings/settings.md/#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](/docs/en/operations/settings/settings.md/#settings-fallback_to_stale_replicas_for_distributed_queries). For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it does not create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data. For very large clusters, you can use different ZooKeeper clusters for different shards. However, from our experience this has not proven necessary based on production clusters with approximately 300 servers. -Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. The number of threads performing background tasks for replicated tables can be set by [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) setting. +Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. The number of threads performing background tasks for replicated tables can be set by [background_schedule_pool_size](/docs/en/operations/settings/settings.md/#background_schedule_pool_size) setting. -`ReplicatedMergeTree` engine uses a separate thread pool for replicated fetches. 
Size of the pool is limited by the [background_fetches_pool_size](../../../operations/settings/settings.md#background_fetches_pool_size) setting which can be tuned with a server restart. +The `ReplicatedMergeTree` engine uses a separate thread pool for replicated fetches. The size of the pool is limited by the [background_fetches_pool_size](/docs/en/operations/settings/settings.md/#background_fetches_pool_size) setting, which can be tuned with a server restart. By default, an INSERT query waits for confirmation of writing the data from only one replica. If the data was successfully written to only one replica and the server with this replica ceases to exist, the stored data will be lost. To enable getting confirmation of data writes from multiple replicas, use the `insert_quorum` option. Each block of data is written atomically. The INSERT query is divided into blocks up to `max_insert_block_size = 1048576` rows. In other words, if the `INSERT` query has fewer than 1048576 rows, it is made atomically. -Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application does not know if the data was written to the DB, so the `INSERT` query can simply be repeated. It does not matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge_tree](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-merge_tree) server settings. +Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is that in case of network failures the client application may not know if the data was written to the DB, so the `INSERT` query can simply be repeated. It does not matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge_tree](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-merge_tree) server settings. During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.) @@ -165,7 +165,7 @@ CREATE TABLE table_name -As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the [macros](../../../operations/server-configuration-parameters/settings.md#macros) section of the configuration file. +As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the [macros](/docs/en/operations/server-configuration-parameters/settings.md/#macros) section of the configuration file.
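For instance, a table definition using such substitutions might look like the following sketch (it assumes that `shard` and `replica` macros are defined in the server configuration):

```sql
CREATE TABLE table_name
(
    EventDate Date,
    CounterID UInt32
)
-- {shard} and {replica} are expanded from the macros section of the config
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/table_name', '{replica}')
ORDER BY (CounterID, EventDate);
```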
Example: @@ -295,10 +295,10 @@ If the data in ClickHouse Keeper was lost or damaged, you can save data by movin **See Also** -- [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) -- [background_fetches_pool_size](../../../operations/settings/settings.md#background_fetches_pool_size) -- [execute_merges_on_single_replica_time_threshold](../../../operations/settings/settings.md#execute-merges-on-single-replica-time-threshold) -- [max_replicated_fetches_network_bandwidth](../../../operations/settings/merge-tree-settings.md#max_replicated_fetches_network_bandwidth) -- [max_replicated_sends_network_bandwidth](../../../operations/settings/merge-tree-settings.md#max_replicated_sends_network_bandwidth) +- [background_schedule_pool_size](/docs/en/operations/settings/settings.md/#background_schedule_pool_size) +- [background_fetches_pool_size](/docs/en/operations/settings/settings.md/#background_fetches_pool_size) +- [execute_merges_on_single_replica_time_threshold](/docs/en/operations/settings/settings.md/#execute-merges-on-single-replica-time-threshold) +- [max_replicated_fetches_network_bandwidth](/docs/en/operations/settings/merge-tree-settings.md/#max_replicated_fetches_network_bandwidth) +- [max_replicated_sends_network_bandwidth](/docs/en/operations/settings/merge-tree-settings.md/#max_replicated_sends_network_bandwidth) [Original article](https://clickhouse.com/docs/en/operations/table_engines/replication/) diff --git a/docs/en/engines/table-engines/special/join.md b/docs/en/engines/table-engines/special/join.md index 161896e5550..a49214bd00a 100644 --- a/docs/en/engines/table-engines/special/join.md +++ b/docs/en/engines/table-engines/special/join.md @@ -6,10 +6,10 @@ sidebar_label: Join # Join Table Engine -Optional prepared data structure for usage in [JOIN](../../../sql-reference/statements/select/join.md#select-join) operations. +Optional prepared data structure for usage in [JOIN](/docs/en/sql-reference/statements/select/join.md/#select-join) operations. :::note -This is not an article about the [JOIN clause](../../../sql-reference/statements/select/join.md#select-join) itself. +This is not an article about the [JOIN clause](/docs/en/sql-reference/statements/select/join.md/#select-join) itself. ::: ## Creating a Table {#creating-a-table} @@ -22,17 +22,17 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ) ENGINE = Join(join_strictness, join_type, k1[, k2, ...]) ``` -See the detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. +See the detailed description of the [CREATE TABLE](/docs/en/sql-reference/statements/create/table.md/#create-table-query) query. ## Engine Parameters ### join_strictness -`join_strictness` – [JOIN strictness](../../../sql-reference/statements/select/join.md#select-join-types). +`join_strictness` – [JOIN strictness](/docs/en/sql-reference/statements/select/join.md/#select-join-types). ### join_type -`join_type` – [JOIN type](../../../sql-reference/statements/select/join.md#select-join-types). +`join_type` – [JOIN type](/docs/en/sql-reference/statements/select/join.md/#select-join-types). ### Key columns @@ -55,11 +55,11 @@ You can use `INSERT` queries to add data to the `Join`-engine tables. If the tab Main use-cases for `Join`-engine tables are following: - Place the table to the right side in a `JOIN` clause. 
-- Call the [joinGet](../../../sql-reference/functions/other-functions.md#joinget) function, which lets you extract data from the table the same way as from a dictionary. +- Call the [joinGet](/docs/en/sql-reference/functions/other-functions.md/#joinget) function, which lets you extract data from the table the same way as from a dictionary. ### Deleting Data {#deleting-data} -`ALTER DELETE` queries for `Join`-engine tables are implemented as [mutations](../../../sql-reference/statements/alter/index.md#mutations). `DELETE` mutation reads filtered data and overwrites data of memory and disk. +`ALTER DELETE` queries for `Join`-engine tables are implemented as [mutations](/docs/en/sql-reference/statements/alter/index.md#mutations). The `DELETE` mutation reads filtered data and overwrites data in memory and on disk. ### Limitations and Settings {#join-limitations-and-settings} @@ -67,30 +67,30 @@ When creating a table, the following settings are applied: #### join_use_nulls -[join_use_nulls](../../../operations/settings/settings.md#join_use_nulls) +[join_use_nulls](/docs/en/operations/settings/settings.md/#join_use_nulls) #### max_rows_in_join -[max_rows_in_join](../../../operations/settings/query-complexity.md#settings-max_rows_in_join) +[max_rows_in_join](/docs/en/operations/settings/query-complexity.md/#settings-max_rows_in_join) #### max_bytes_in_join -[max_bytes_in_join](../../../operations/settings/query-complexity.md#settings-max_bytes_in_join) +[max_bytes_in_join](/docs/en/operations/settings/query-complexity.md/#settings-max_bytes_in_join) #### join_overflow_mode -[join_overflow_mode](../../../operations/settings/query-complexity.md#settings-join_overflow_mode) +[join_overflow_mode](/docs/en/operations/settings/query-complexity.md/#settings-join_overflow_mode) #### join_any_take_last_row -[join_any_take_last_row](../../../operations/settings/settings.md#settings-join_any_take_last_row) +[join_any_take_last_row](/docs/en/operations/settings/settings.md/#settings-join_any_take_last_row) #### persistent -[persistent](../../../operations/settings/settings.md#persistent) +[persistent](/docs/en/operations/settings/settings.md/#persistent) The `Join`-engine tables can’t be used in `GLOBAL JOIN` operations. -The `Join`-engine allows to specify [join_use_nulls](../../../operations/settings/settings.md#join_use_nulls) setting in the `CREATE TABLE` statement. [SELECT](../../../sql-reference/statements/select/index.md) query should have the same `join_use_nulls` value. +The `Join`-engine allows specifying the [join_use_nulls](/docs/en/operations/settings/settings.md/#join_use_nulls) setting in the `CREATE TABLE` statement. The [SELECT](/docs/en/sql-reference/statements/select/index.md) query should have the same `join_use_nulls` value.
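For example, the setting can be attached to the table definition (a sketch; the table name and key column are hypothetical):

```sql
CREATE TABLE id_val_join
(
    `id` UInt32,
    `val` UInt8
)
ENGINE = Join(ANY, LEFT, id) -- join_strictness, join_type, key column
SETTINGS join_use_nulls = 1; -- SELECT queries should use the same value
```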
## Usage Examples {#example} diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md index 3d993c3e224..b19d09c777a 100644 --- a/docs/en/getting-started/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -4,25 +4,39 @@ sidebar_label: Cell Towers sidebar_position: 3 title: "Cell Towers" --- +import ConnectionDetails from '@site/docs/en/_snippets/_gather_your_details_http.mdx'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import CodeBlock from '@theme/CodeBlock'; import ActionsMenu from '@site/docs/en/_snippets/_service_actions_menu.md'; import SQLConsoleDetail from '@site/docs/en/_snippets/_launch_sql_console.md'; +import SupersetDocker from '@site/docs/en/_snippets/_add_superset_detail.md'; -This dataset is from [OpenCellid](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. +## Goal + +In this guide you will learn how to: +- Load the OpenCelliD data in ClickHouse +- Connect Apache Superset to ClickHouse +- Build a dashboard based on data available in the dataset + +Here is a preview of the dashboard created in this guide: + +![Dashboard of cell towers by radio type in mcc 204](@site/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png) + +## Get the Dataset {#get-the-dataset} + +This dataset is from [OpenCelliD](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc). OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License, and we redistribute a snapshot of this dataset under the terms of the same license. The up-to-date version of the dataset is available to download after sign in. - -## Get the Dataset {#get-the-dataset} - +### Load the sample data + ClickHouse Cloud provides an easy-button for uploading this dataset from S3. Log in to your ClickHouse Cloud organization, or create a free trial at [ClickHouse.cloud](https://clickhouse.cloud). @@ -30,13 +44,33 @@ Choose the **Cell Towers** dataset from the **Sample data** tab, and **Load data ![Load cell towers dataset](@site/docs/en/_snippets/images/cloud-load-data-sample.png) -Examine the schema of the cell_towers table: +### Examine the schema of the cell_towers table ```sql DESCRIBE TABLE cell_towers ``` +This is the output of `DESCRIBE`. Down further in this guide the field type choices will be described. +```response +┌─name──────────┬─type──────────────────────────────────────────────────────────────────┬ +│ radio │ Enum8('' = 0, 'CDMA' = 1, 'GSM' = 2, 'LTE' = 3, 'NR' = 4, 'UMTS' = 5) │ +│ mcc │ UInt16 │ +│ net │ UInt16 │ +│ area │ UInt16 │ +│ cell │ UInt64 │ +│ unit │ Int16 │ +│ lon │ Float64 │ +│ lat │ Float64 │ +│ range │ UInt32 │ +│ samples │ UInt32 │ +│ changeable │ UInt8 │ +│ created │ DateTime │ +│ updated │ DateTime │ +│ averageSignal │ UInt8 │ +└───────────────┴───────────────────────────────────────────────────────────────────────┴ +``` + @@ -86,7 +120,7 @@ clickhouse-client --query "INSERT INTO cell_towers FORMAT CSVWithNames" < cell_t -## Example queries {#examples} +## Run some example queries {#examples} 1. A number of cell towers by type: @@ -127,13 +161,13 @@ SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 10 rows in set. 
Elapsed: 0.019 sec. Processed 43.28 million rows, 86.55 MB (2.33 billion rows/s., 4.65 GB/s.)
 ```
 
-So, the top countries are: the USA, Germany, and Russia.
+Based on the above query and the [MCC list](https://en.wikipedia.org/wiki/Mobile_country_code), the countries with the most cell towers are: the USA, Germany, and Russia.
 
-You may want to create an [External Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values.
+You may want to create a [Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values.
 
 ## Use case: Incorporate geo data {#use-case}
 
-Using `pointInPolygon` function.
+Using the [`pointInPolygon`](/docs/en/sql-reference/functions/geo/coordinates.md/#pointinpolygon) function.
 
 1. Create a table where we will store polygons:
 
@@ -224,6 +258,110 @@ WHERE pointInPolygon((lon, lat), (SELECT * FROM moscow))
 
 1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.)
 ```
 
-The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play), [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=).
+## Review of the schema
 
-Although you cannot create temporary tables there.
+Before building visualizations in Superset, have a look at the columns that you will use. This dataset primarily provides the location (longitude and latitude) and radio types of mobile cellular towers worldwide. The column descriptions can be found in the [community forum](https://community.opencellid.org/t/documenting-the-columns-in-the-downloadable-cells-database-csv/186). The columns used in the visualizations that will be built are described below.
+
+Here is a description of the columns taken from the OpenCelliD forum:
+
+| Column       | Description                                            |
+|--------------|--------------------------------------------------------|
+| radio        | Technology generation: CDMA, GSM, UMTS, 5G NR          |
+| mcc          | Mobile Country Code: `204` is The Netherlands          |
+| lon          | Longitude: With Latitude, approximate tower location   |
+| lat          | Latitude: With Longitude, approximate tower location   |
+
+:::tip mcc
+To find your MCC, check [Mobile network codes](https://en.wikipedia.org/wiki/Mobile_country_code), and use the three digits in the **Mobile country code** column.
+:::
+
+The schema for this table was designed for compact storage on disk and query speed.
+- The `radio` data is stored as an `Enum8` (`UInt8`) rather than a string.
+- `mcc`, or Mobile Country Code, is stored as a `UInt16` because we know the range is 1 - 999.
+- `lon` and `lat` are `Float64`.
+
+None of the other fields are used in the queries or visualizations in this guide, but they are described in the forum linked above if you are interested.
+
+## Build visualizations with Apache Superset
+
+Superset is easy to run from Docker. If you already have Superset running, all you need to do is add ClickHouse Connect with `pip install clickhouse-connect`. If you need to install Superset, open **Launch Apache Superset in Docker** directly below.
+
+
+
+To build a Superset dashboard using the OpenCelliD dataset you should:
+- Add your ClickHouse service as a Superset **database**
+- Add the table **cell_towers** as a Superset **dataset**
+- Create some **charts**
+- Add the charts to a **dashboard**
+
+### Add your ClickHouse service as a Superset database
+
+
+
+
+ In Superset, a database can be added by choosing the database type and then providing the connection details. Open Superset and look for the **+**; it has a menu with **Data** and then **Connect database** options.
+
+ ![Add a database](@site/docs/en/getting-started/example-datasets/images/superset-add.png)
+
+ Choose **ClickHouse Connect** from the list:
+
+ ![Choose clickhouse connect as database type](@site/docs/en/getting-started/example-datasets/images/superset-choose-a-database.png)
+
+:::note
+ If **ClickHouse Connect** is not one of your options, then you will need to install it. The command is `pip install clickhouse-connect`, and more info is [available here](https://pypi.org/project/clickhouse-connect/).
+:::
+
+#### Add your connection details:
+
+:::tip
+ Make sure that you set **SSL** on when connecting to ClickHouse Cloud or other ClickHouse systems that enforce the use of SSL.
+:::
+
+ ![Add ClickHouse as a Superset datasource](@site/docs/en/getting-started/example-datasets/images/superset-connect-a-database.png)
+
+### Add the table **cell_towers** as a Superset **dataset**
+
+ In Superset, a **dataset** maps to a table within a database. Click on add a dataset, choose your ClickHouse service and the database containing your table (`default`), and choose the `cell_towers` table:
+
+![Add cell_towers table as a dataset](@site/docs/en/getting-started/example-datasets/images/superset-add-dataset.png)
+
+### Create some **charts**
+
+When you choose to add a chart in Superset, you have to specify the dataset (`cell_towers`) and the chart type. Since the OpenCelliD dataset provides longitude and latitude coordinates for cell towers, we will create a **Map** chart. The **deck.gl Scatterplot** type is suited to this dataset, as it works well with dense data points on a map.
+
+![Create a map in Superset](@site/docs/en/getting-started/example-datasets/images/superset-create-map.png)
+
+#### Specify the query used for the map
+
+A deck.gl Scatterplot requires a longitude and latitude, and one or more filters can also be applied to the query. In this example, two filters are applied: one for cell towers with UMTS radios, and one for the Mobile Country Code assigned to The Netherlands.
+
+The fields `lon` and `lat` contain the longitude and latitude:
+
+![Specify longitude and latitude fields](@site/docs/en/getting-started/example-datasets/images/superset-lon-lat.png)
+
+Add a filter with `mcc` = `204` (or substitute any other `mcc` value):
+
+![Filter on MCC 204](@site/docs/en/getting-started/example-datasets/images/superset-mcc-204.png)
+
+Add a filter with `radio` = `'UMTS'` (or substitute any other `radio` value; you can see the choices in the output of `DESCRIBE TABLE cell_towers`):
+
+![Filter on radio = UMTS](@site/docs/en/getting-started/example-datasets/images/superset-radio-umts.png)
+
+This is the full configuration for the chart that filters on `radio = 'UMTS'` and `mcc = 204`:
+
+![Chart for UMTS radios in MCC 204](@site/docs/en/getting-started/example-datasets/images/superset-umts-netherlands.png)
+
+Click on **UPDATE CHART** to render the visualization.
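+To sanity check the chart, you can run the equivalent query directly against ClickHouse. This is a minimal sketch (the `LIMIT` is illustrative; the chart itself issues its own generated query):
+
+```sql
+-- The same filters the chart applies: UMTS radios in mcc 204 (The Netherlands).
+SELECT lon, lat
+FROM cell_towers
+WHERE (radio = 'UMTS') AND (mcc = 204)
+LIMIT 10
+```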
+ +### Add the charts to a **dashboard** + +This screenshot shows cell tower locations with LTE, UMTS, and GSM radios. The charts are all created in the same way and they are added to a dashboard. + + ![Dashboard of cell towers by radio type in mcc 204](@site/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png) + +:::tip +The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play). + +This [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=) will populate the username and even the query for you. + +Although you cannot create tables in the Playground, you can run all of the queries and even use Superset (adjust the hostname and port number). +::: diff --git a/docs/en/getting-started/example-datasets/github.md b/docs/en/getting-started/example-datasets/github.md new file mode 100644 index 00000000000..239637a34e9 --- /dev/null +++ b/docs/en/getting-started/example-datasets/github.md @@ -0,0 +1,2504 @@ +--- +slug: /en/getting-started/example-datasets/github +sidebar_label: GitHub Repo Analysis +description: Analyze the ClickHouse GitHub repo or any repository of your choosing +--- + +# ClickHouse GitHub data + +This dataset contains all of the commits and changes for the ClickHouse repository. It can be generated using the native `git-import` tool distributed with ClickHouse. + +The generated data provides a `tsv` file for each of the following tables: + +- `commits` - commits with statistics. +- `file_changes` - files changed in every commit with the info about the change and statistics. +- `line_changes` - every changed line in every changed file in every commit with full info about the line and the information about the previous change of this line. 
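+
+Once the tables are loaded (the DDL and `INSERT` commands are given in [Downloading and inserting the data](#downloading-and-inserting-the-data) below), a quick sanity check is to count the rows in each table. This is a minimal sketch, assuming the `git` database created later in this guide:
+
+```sql
+-- Row counts should roughly match the figures quoted below.
+SELECT 'commits' AS tbl, count() AS rows FROM git.commits
+UNION ALL
+SELECT 'file_changes', count() FROM git.file_changes
+UNION ALL
+SELECT 'line_changes', count() FROM git.line_changes;
+```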
+
+As of November 8th, 2022, each TSV is approximately the following size and number of rows:
+
+- `commits` - 7.8M - ~62.78 thousand rows
+- `file_changes` - 53M - 266,051 rows
+- `line_changes` - 2.7G - 7,535,157 rows
+
+# Table of Contents
+
+- [ClickHouse GitHub data](#clickhouse-github-data)
+- [Table of Contents](#table-of-contents)
+- [Generating the data](#generating-the-data)
+- [Downloading and inserting the data](#downloading-and-inserting-the-data)
+- [Queries](#queries)
+  - [History of a single file](#history-of-a-single-file)
+  - [Find the current active files](#find-the-current-active-files)
+  - [List files with most modifications](#list-files-with-most-modifications)
+  - [What day of the week do commits usually occur?](#what-day-of-the-week-do-commits-usually-occur)
+  - [History of subdirectory/file - number of lines, commits and contributors over time](#history-of-subdirectoryfile---number-of-lines-commits-and-contributors-over-time)
+  - [List files with maximum number of authors](#list-files-with-maximum-number-of-authors)
+  - [Oldest lines of code in the repository](#oldest-lines-of-code-in-the-repository)
+  - [Files with longest history](#files-with-longest-history)
+  - [Distribution of contributors with respect to docs and code over the month](#distribution-of-contributors-with-respect-to-docs-and-code-over-the-month)
+  - [Authors with the most diverse impact](#authors-with-the-most-diverse-impact)
+  - [Favorite files for an author](#favorite-files-for-an-author)
+  - [Largest files with lowest number of authors](#largest-files-with-lowest-number-of-authors)
+  - [Commits and lines of code distribution by time; by weekday, by author; for specific subdirectories](#commits-and-lines-of-code-distribution-by-time-by-weekday-by-author-for-specific-subdirectories)
+  - [Matrix of authors that shows which authors tend to rewrite other authors' code](#matrix-of-authors-that-shows-what-authors-tends-to-rewrite-another-authors-code)
+  - [Who is the highest percentage contributor per day of week?](#who-is-the-highest-percentage-contributor-per-day-of-week)
+  - [Distribution of code age across repository](#distribution-of-code-age-across-repository)
+  - [What percentage of code for an author has been removed by other authors?](#what-percentage-of-code-for-an-author-has-been-removed-by-other-authors)
+  - [List files that were rewritten the most times](#list-files-that-were-rewritten-most-number-of-times)
+  - [What weekday does the code have the highest chance to stay in the repository?](#what-weekday-does-the-code-have-the-highest-chance-to-stay-in-the-repository)
+  - [Files sorted by average code age](#files-sorted-by-average-code-age)
+  - [Who tends to write more tests / CPP code / comments?](#who-tends-to-write-more-tests--cpp-code--comments)
+  - [How does an author's commits change over time with respect to code/comments percentage?](#how-does-an-authors-commits-change-over-time-with-respect-to-codecomments-percentage)
+  - [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay)
+  - [What is the worst time to write code, in the sense that it has the highest chance of being rewritten?](#what-is-the-worst-time-to-write-code-in-sense-that-the-code-has-highest-chance-to-be-re-written)
+  - [Which author's code is the most sticky?](#which-authors-code-is-the-most-sticky)
+  - [Most consecutive days of commits by an author](#most-consecutive-days-of-commits-by-an-author)
+  - [Line by line commit history of a file](#line-by-line-commit-history-of-a-file)
+- [Unsolved Questions](#unsolved-questions)
+  - [Git blame](#git-blame)
+- [Related Content](#related-content)
+
+# Generating the data
+
+This is optional. We distribute the data freely - see [Downloading and inserting the data](#downloading-and-inserting-the-data).
+
+```bash
+git clone git@github.com:ClickHouse/ClickHouse.git
+cd ClickHouse
+clickhouse git-import --skip-paths 'generated\.cpp|^(contrib|docs?|website|libs/(libcityhash|liblz4|libdivide|libvectorclass|libdouble-conversion|libcpuid|libzstd|libfarmhash|libmetrohash|libpoco|libwidechar_width))/' --skip-commits-with-messages '^Merge branch '
+```
+
+This will take around 3 minutes (as of November 8th, 2022, on a MacBook Pro 2021) to complete for the ClickHouse repository.
+
+A full list of available options can be obtained from the tool's native help.
+
+```bash
+clickhouse git-import -h
+```
+
+This help also provides the DDL for each of the above tables, e.g.:
+
+```sql
+CREATE TABLE git.commits
+(
+    hash String,
+    author LowCardinality(String),
+    time DateTime,
+    message String,
+    files_added UInt32,
+    files_deleted UInt32,
+    files_renamed UInt32,
+    files_modified UInt32,
+    lines_added UInt32,
+    lines_deleted UInt32,
+    hunks_added UInt32,
+    hunks_removed UInt32,
+    hunks_changed UInt32
+) ENGINE = MergeTree ORDER BY time;
+```
+
+**These queries should work on any repository. Feel free to explore and report your findings.** Some guidelines with respect to execution times (as of November 2022):
+
+- Linux - `~/clickhouse git-import` - 160 mins
+
+# Downloading and inserting the data
+
+The following data can be used to reproduce a working environment. Alternatively, this dataset is available in play.clickhouse.com - see [Queries](#queries) for further details.
+ +Generated files for the following repositories can be found below: + +- ClickHouse (Nov 8th 2022) + - https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz - 2.5 MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz - 4.5MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz - 127.4 MB +- Linux (Nov 8th 2022) + - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/commits.tsv.xz - 44 MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/file_changes.tsv.xz - 467MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/line_changes.tsv.xz - 1.1G + +To insert this data, prepare the database by executing the following queries: + +```sql +DROP DATABASE IF EXISTS git; +CREATE DATABASE git; + +CREATE TABLE git.commits +( + hash String, + author LowCardinality(String), + time DateTime, + message String, + files_added UInt32, + files_deleted UInt32, + files_renamed UInt32, + files_modified UInt32, + lines_added UInt32, + lines_deleted UInt32, + hunks_added UInt32, + hunks_removed UInt32, + hunks_changed UInt32 +) ENGINE = MergeTree ORDER BY time; + +CREATE TABLE git.file_changes +( + change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6), + path LowCardinality(String), + old_path LowCardinality(String), + file_extension LowCardinality(String), + lines_added UInt32, + lines_deleted UInt32, + hunks_added UInt32, + hunks_removed UInt32, + hunks_changed UInt32, + + commit_hash String, + author LowCardinality(String), + time DateTime, + commit_message String, + commit_files_added UInt32, + commit_files_deleted UInt32, + commit_files_renamed UInt32, + commit_files_modified UInt32, + commit_lines_added UInt32, + commit_lines_deleted UInt32, + commit_hunks_added UInt32, + commit_hunks_removed UInt32, + commit_hunks_changed UInt32 +) ENGINE = MergeTree ORDER BY time; + +CREATE TABLE git.line_changes +( + sign Int8, + line_number_old UInt32, + line_number_new UInt32, + hunk_num UInt32, + hunk_start_line_number_old UInt32, + hunk_start_line_number_new UInt32, + hunk_lines_added UInt32, + hunk_lines_deleted UInt32, + hunk_context LowCardinality(String), + line LowCardinality(String), + indent UInt8, + line_type Enum('Empty' = 0, 'Comment' = 1, 'Punct' = 2, 'Code' = 3), + + prev_commit_hash String, + prev_author LowCardinality(String), + prev_time DateTime, + + file_change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6), + path LowCardinality(String), + old_path LowCardinality(String), + file_extension LowCardinality(String), + file_lines_added UInt32, + file_lines_deleted UInt32, + file_hunks_added UInt32, + file_hunks_removed UInt32, + file_hunks_changed UInt32, + + commit_hash String, + author LowCardinality(String), + time DateTime, + commit_message String, + commit_files_added UInt32, + commit_files_deleted UInt32, + commit_files_renamed UInt32, + commit_files_modified UInt32, + commit_lines_added UInt32, + commit_lines_deleted UInt32, + commit_hunks_added UInt32, + commit_hunks_removed UInt32, + commit_hunks_changed UInt32 +) ENGINE = MergeTree ORDER BY time; +``` + +Insert the data using `INSERT INTO SELECT` and the [s3 function](https://clickhouse.com/docs/en/integrations/s3/s3-table-functions/). 
For example, below, we insert the ClickHouse files into each of their respective tables: + +*commits* + +```sql +INSERT INTO git.commits SELECT * +FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz', 'TSV', 'hash String,author LowCardinality(String), time DateTime, message String, files_added UInt32, files_deleted UInt32, files_renamed UInt32, files_modified UInt32, lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32') + +0 rows in set. Elapsed: 1.826 sec. Processed 62.78 thousand rows, 8.50 MB (34.39 thousand rows/s., 4.66 MB/s.) +``` + +*file_changes* + +```sql +INSERT INTO git.file_changes SELECT * +FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz', 'TSV', 'change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6), path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32, commit_hash String, author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') + +0 rows in set. Elapsed: 2.688 sec. Processed 266.05 thousand rows, 48.30 MB (98.97 thousand rows/s., 17.97 MB/s.) +``` + +*line_changes* + +```sql +INSERT INTO git.line_changes SELECT * +FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz', 'TSV', ' sign Int8, line_number_old UInt32, line_number_new UInt32, hunk_num UInt32, hunk_start_line_number_old UInt32, hunk_start_line_number_new UInt32, hunk_lines_added UInt32,\n hunk_lines_deleted UInt32, hunk_context LowCardinality(String), line LowCardinality(String), indent UInt8, line_type Enum(\'Empty\' = 0, \'Comment\' = 1, \'Punct\' = 2, \'Code\' = 3), prev_commit_hash String, prev_author LowCardinality(String), prev_time DateTime, file_change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6),\n path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), file_lines_added UInt32, file_lines_deleted UInt32, file_hunks_added UInt32, file_hunks_removed UInt32, file_hunks_changed UInt32, commit_hash String,\n author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') + +0 rows in set. Elapsed: 50.535 sec. Processed 7.54 million rows, 2.09 GB (149.11 thousand rows/s., 41.40 MB/s.) +``` + +# Queries + +The tool suggests several queries via its help output. We have answered these in addition to some additional supplementary questions of interest. These queries are of approximately increasing complexity vs. the tool's arbitrary order. + +This dataset is available in [play.clickhouse.com](https://play.clickhouse.com/play?user=play#U0hPVyBUQUJMRVMgSU4gZ2l0X2NsaWNraG91c2U=) in the `git_clickhouse` databases. 
We provide a link to this environment for all queries, adapting the database name as required. Note that play results may vary from those presented here due to differences in the time of data collection.
+
+## History of a single file
+
+The simplest of queries. Here we look at all commit messages for `StorageReplicatedMergeTree.cpp`. Since more recent messages are likely to be more interesting, we sort by the most recent first.
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgY2hhbmdlX3R5cGUsCiAgICBhdXRob3IsCiAgICBwYXRoLAogICAgb2xkX3BhdGgsCiAgICBsaW5lc19hZGRlZCwKICAgIGxpbmVzX2RlbGV0ZWQsCiAgICBjb21taXRfbWVzc2FnZQpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoID0gJ3NyYy9TdG9yYWdlcy9TdG9yYWdlUmVwbGljYXRlZE1lcmdlVHJlZS5jcHAnCk9SREVSIEJZIHRpbWUgREVTQwpMSU1JVCAxMA==)
+
+```sql
+SELECT
+    time,
+    substring(commit_hash, 1, 11) AS commit,
+    change_type,
+    author,
+    path,
+    old_path,
+    lines_added,
+    lines_deleted,
+    commit_message
+FROM git.file_changes
+WHERE path = 'src/Storages/StorageReplicatedMergeTree.cpp'
+ORDER BY time DESC
+LIMIT 10
+
+┌────────────────time─┬─commit──────┬─change_type─┬─author─────────────┬─path────────────────────────────────────────┬─old_path─┬─lines_added─┬─lines_deleted─┬─commit_message───────────────────────────────────┐
+│ 2022-10-30 16:30:51 │ c68ab231f91 │ Modify      │ Alexander Tokmakov │ src/Storages/StorageReplicatedMergeTree.cpp │          │          13 │            10 │ fix accessing part in Deleting state             │
+│ 2022-10-23 16:24:20 │ b40d9200d20 │ Modify      │ Anton Popov        │ src/Storages/StorageReplicatedMergeTree.cpp │          │          28 │            30 │ better semantic of constsness of DataPartStorage │
+│ 2022-10-23 01:23:15 │ 56e5daba0c9 │ Modify      │ Anton Popov        │ src/Storages/StorageReplicatedMergeTree.cpp │          │          28 │            44 │ remove DataPartStorageBuilder                    │
+│ 2022-10-21 13:35:37 │ 851f556d65a │ Modify      │ Igor Nikonov       │ src/Storages/StorageReplicatedMergeTree.cpp │          │           3 │             2 │ Remove unused parameter                          │
+│ 2022-10-21 13:02:52 │ 13d31eefbc3 │ Modify      │ Igor Nikonov       │ src/Storages/StorageReplicatedMergeTree.cpp │          │           4 │             4 │ Replicated merge tree polishing                  │
+│ 2022-10-21 12:25:19 │ 4e76629aafc │ Modify      │ Azat Khuzhin       │ src/Storages/StorageReplicatedMergeTree.cpp │          │           3 │             2 │ Fixes for -Wshorten-64-to-32                     │
+│ 2022-10-19 13:59:28 │ 05e6b94b541 │ Modify      │ Antonio Andelic    │ src/Storages/StorageReplicatedMergeTree.cpp │          │           4 │             0 │ Polishing                                        │
+│ 2022-10-19 13:34:20 │ e5408aac991 │ Modify      │ Antonio Andelic    │ src/Storages/StorageReplicatedMergeTree.cpp │          │           3 │            53 │ Simplify logic                                   │
+│ 2022-10-18 15:36:11 │ 7befe2825c9 │ Modify      │ Alexey Milovidov   │ src/Storages/StorageReplicatedMergeTree.cpp │          │           2 │             2 │ Update StorageReplicatedMergeTree.cpp            │
+│ 2022-10-18 15:35:44 │ 0623ad4e374 │ Modify      │ Alexey Milovidov   │ src/Storages/StorageReplicatedMergeTree.cpp │          │           1 │             1 │ Update StorageReplicatedMergeTree.cpp            │
+└─────────────────────┴─────────────┴─────────────┴────────────────────┴─────────────────────────────────────────────┴──────────┴─────────────┴───────────────┴──────────────────────────────────────────────────┘
+
+10 rows in set. Elapsed: 0.006 sec. Processed 12.10 thousand rows, 1.60 MB (1.93 million rows/s., 255.40 MB/s.)
+```
+
+
+We can also review the line changes, excluding renames, i.e.
we won't show changes before a rename event when the file existed under a different name: + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgc2lnbiwKICAgIGxpbmVfbnVtYmVyX29sZCwKICAgIGxpbmVfbnVtYmVyX25ldywKICAgIGF1dGhvciwKICAgIGxpbmUKRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKV0hFUkUgcGF0aCA9ICdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJwpPUkRFUiBCWSBsaW5lX251bWJlcl9uZXcgQVNDCkxJTUlUIDEw) + +```sql +SELECT + time, + substring(commit_hash, 1, 11) AS commit, + sign, + line_number_old, + line_number_new, + author, + line +FROM git.line_changes +WHERE path = 'src/Storages/StorageReplicatedMergeTree.cpp' +ORDER BY line_number_new ASC +LIMIT 10 + +┌────────────────time─┬─commit──────┬─sign─┬─line_number_old─┬─line_number_new─┬─author───────────┬─line──────────────────────────────────────────────────┐ +│ 2020-04-16 02:06:10 │ cdeda4ab915 │ -1 │ 1 │ 1 │ Alexey Milovidov │ #include │ +│ 2020-04-16 02:06:10 │ cdeda4ab915 │ 1 │ 2 │ 1 │ Alexey Milovidov │ #include │ +│ 2020-04-16 02:06:10 │ cdeda4ab915 │ 1 │ 2 │ 2 │ Alexey Milovidov │ │ +│ 2021-05-03 23:46:51 │ 02ce9cc7254 │ -1 │ 3 │ 2 │ Alexey Milovidov │ #include │ +│ 2021-05-27 22:21:02 │ e2f29b9df02 │ -1 │ 3 │ 2 │ s-kat │ #include │ +│ 2022-10-03 22:30:50 │ 210882b9c4d │ 1 │ 2 │ 3 │ alesapin │ #include │ +│ 2022-10-23 16:24:20 │ b40d9200d20 │ 1 │ 2 │ 3 │ Anton Popov │ #include │ +│ 2021-06-20 09:24:43 │ 4c391f8e994 │ 1 │ 2 │ 3 │ Mike Kot │ #include "Common/hex.h" │ +│ 2021-12-29 09:18:56 │ 8112a712336 │ -1 │ 6 │ 5 │ avogar │ #include │ +│ 2022-04-21 20:19:13 │ 9133e398b8c │ 1 │ 11 │ 12 │ Nikolai Kochetov │ #include │ +└─────────────────────┴─────────────┴──────┴─────────────────┴─────────────────┴──────────────────┴───────────────────────────────────────────────────────┘ + +10 rows in set. Elapsed: 0.258 sec. Processed 7.54 million rows, 654.92 MB (29.24 million rows/s., 2.54 GB/s.) +``` + +Note a more complex variant of this query exists where we find the [line-by-line commit history of a file](#line-by-line-commit-history-of-a-file) considering renames. + +## Find the current active files + +This is important for later analysis when we only want to consider the current files in the repository. We estimate this set as the files which haven't been renamed or deleted (and then re-added/re-named). + +**Note there appears to have been a broken commit history in relation to files under the `dbms`, `libs`, `tests/testflows/` directories during their renames. 
We also thus exclude these.** + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIHBhdGgKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIEdST1VQIEJZIG9sZF9wYXRoCiAgICBVTklPTiBBTEwKICAgIFNFTEVDVAogICAgICAgIHBhdGgsCiAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgR1JPVVAgQlkgcGF0aAopCkdST1VQIEJZIHBhdGgKSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIE5PVCBtYXRjaChwYXRoLCAnKF5kYm1zLyl8KF5saWJzLyl8KF50ZXN0cy90ZXN0Zmxvd3MvKXwoXnByb2dyYW1zL3NlcnZlci9zdG9yZS8pJykgT1JERVIgQlkgcGF0aApMSU1JVCAxMA==) + +```sql +SELECT path +FROM +( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path +) +GROUP BY path +HAVING (argMax(change_type, last_time) != 2) AND NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)') ORDER BY path +LIMIT 10 + +┌─path────────────────────────────────────────────────────────────┐ +│ tests/queries/0_stateless/01054_random_printable_ascii_ubsan.sh │ +│ tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh │ +│ tests/performance/file_table_function.xml │ +│ tests/queries/0_stateless/01902_self_aliases_in_columns.sql │ +│ tests/queries/0_stateless/01070_h3_get_base_cell.reference │ +│ src/Functions/ztest.cpp │ +│ src/Interpreters/InterpreterShowTablesQuery.h │ +│ src/Parsers/Kusto/ParserKQLStatement.h │ +│ tests/queries/0_stateless/00938_dataset_test.sql │ +│ src/Dictionaries/Embedded/GeodataProviders/Types.h │ +└─────────────────────────────────────────────────────────────────┘ + +10 rows in set. Elapsed: 0.085 sec. Processed 532.10 thousand rows, 8.68 MB (6.30 million rows/s., 102.64 MB/s.) +``` + +Note that this allows for files to be renamed and then re-renamed to their original values. First we aggregate `old_path` for a list of deleted files as a result of renaming. We union this with the last operation for every `path`. Finally, we filter this list to those where the final event is not a `Delete`. 
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIHVuaXEocGF0aCkKRlJPTQooCiAgICBTRUxFQ1QgcGF0aAogICAgRlJPTQogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAyIEFTIGNoYW5nZV90eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgIFVOSU9OIEFMTAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICApCiAgICBHUk9VUCBCWSBwYXRoCiAgICBIQVZJTkcgKGFyZ01heChjaGFuZ2VfdHlwZSwgbGFzdF90aW1lKSAhPSAyKSBBTkQgTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSBPUkRFUiBCWSBwYXRoCikK)
+
+```sql
+SELECT uniq(path)
+FROM
+(
+    SELECT path
+    FROM
+    (
+        SELECT
+            old_path AS path,
+            max(time) AS last_time,
+            2 AS change_type
+        FROM git.file_changes
+        GROUP BY old_path
+        UNION ALL
+        SELECT
+            path,
+            max(time) AS last_time,
+            argMax(change_type, time) AS change_type
+        FROM git.file_changes
+        GROUP BY path
+    )
+    GROUP BY path
+    HAVING (argMax(change_type, last_time) != 2) AND NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)') ORDER BY path
+)
+
+┌─uniq(path)─┐
+│      18559 │
+└────────────┘
+1 row in set. Elapsed: 0.089 sec. Processed 532.10 thousand rows, 8.68 MB (6.01 million rows/s., 97.99 MB/s.)
+```
+
+Note that we skipped several directories during import, i.e.
+
+`--skip-paths 'generated\.cpp|^(contrib|docs?|website|libs/(libcityhash|liblz4|libdivide|libvectorclass|libdouble-conversion|libcpuid|libzstd|libfarmhash|libmetrohash|libpoco|libwidechar_width))/'`
+
+Applying this pattern to `git ls-files` reports 18155.
+
+```bash
+git ls-files | grep -v -E 'generated\.cpp|^(contrib|docs?|website|libs/(libcityhash|liblz4|libdivide|libvectorclass|libdouble-conversion|libcpuid|libzstd|libfarmhash|libmetrohash|libpoco|libwidechar_width))/' | wc -l
+ 18155
+```
+
+**Our current solution is therefore an estimate of the current files.**
+
+The difference here is caused by a few factors:
+
+- A rename can occur alongside other modifications to the file. These are listed as separate events in `file_changes` but with the same time. The `argMax` function has no way of distinguishing these - it picks the first value. The natural ordering of the inserts (the only means of knowing the correct order) is not maintained across the union, so Modify events can be selected. For example, below, the `src/Functions/geometryFromColumn.h` file has several modifications before being renamed to `src/Functions/geometryConverters.h`. Our current solution may pick a Modify event as the latest change, causing `src/Functions/geometryFromColumn.h` to be retained.
+ +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICAgIGNoYW5nZV90eXBlLAogICAgICBwYXRoLAogICAgICBvbGRfcGF0aCwKICAgICAgdGltZSwKICAgICAgY29tbWl0X2hhc2gKICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogIFdIRVJFIChwYXRoID0gJ3NyYy9GdW5jdGlvbnMvZ2VvbWV0cnlGcm9tQ29sdW1uLmgnKSBPUiAob2xkX3BhdGggPSAnc3JjL0Z1bmN0aW9ucy9nZW9tZXRyeUZyb21Db2x1bW4uaCcpCg==) + +```sql + SELECT + change_type, + path, + old_path, + time, + commit_hash + FROM git.file_changes + WHERE (path = 'src/Functions/geometryFromColumn.h') OR (old_path = 'src/Functions/geometryFromColumn.h') + + ┌─change_type─┬─path───────────────────────────────┬─old_path───────────────────────────┬────────────────time─┬─commit_hash──────────────────────────────┐ + │ Add │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 9376b676e9a9bb8911b872e1887da85a45f7479d │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 6d59be5ea4768034f6526f7f9813062e0c369f7b │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 33acc2aa5dc091a7cb948f78c558529789b2bad8 │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 78e0db268ceadc42f82bc63a77ee1a4da6002463 │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 14a891057d292a164c4179bfddaef45a74eaf83a │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ d0d6e6953c2a2af9fb2300921ff96b9362f22edb │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ fe8382521139a58c0ba277eb848e88894658db66 │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 3be3d5cde8788165bc0558f1e2a22568311c3103 │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ afad9bf4d0a55ed52a3f55483bc0973456e10a56 │ + │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ e3290ecc78ca3ea82b49ebcda22b5d3a4df154e6 │ + │ Rename │ src/Functions/geometryConverters.h │ src/Functions/geometryFromColumn.h │ 2021-03-11 12:08:16 │ 125945769586baf6ffd15919b29565b1b2a63218 │ + └─────────────┴────────────────────────────────────┴────────────────────────────────────┴─────────────────────┴──────────────────────────────────────────┘ + 11 rows in set. Elapsed: 0.030 sec. Processed 266.05 thousand rows, 6.61 MB (8.89 million rows/s., 220.82 MB/s.) +``` +- Broken commit history - missing delete events. Source and cause TBD. + +These differences shouldn't meaningfully impact our analysis. **We welcome improved versions of this query**. + +## List files with most modifications + +Limiting to current files, we consider the number of modifications to be the sum of deletes and additions. 
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgKyBzdW0obGluZXNfZGVsZXRlZCkgQVMgbW9kaWZpY2F0aW9ucwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAocGF0aCBJTiAoY3VycmVudF9maWxlcykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBtb2RpZmljYXRpb25zIERFU0MKTElNSVQgMTA=) + +```sql +WITH current_files AS + ( + SELECT path + FROM + ( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path + ) + GROUP BY path + HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) + ORDER BY path ASC + ) +SELECT + path, + sum(lines_added) + sum(lines_deleted) AS modifications +FROM git.file_changes +WHERE (path IN (current_files)) AND (file_extension IN ('h', 'cpp', 'sql')) +GROUP BY path +ORDER BY modifications DESC +LIMIT 10 + +┌─path───────────────────────────────────────────────────┬─modifications─┐ +│ src/Storages/StorageReplicatedMergeTree.cpp │ 21871 │ +│ src/Storages/MergeTree/MergeTreeData.cpp │ 17709 │ +│ programs/client/Client.cpp │ 15882 │ +│ src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp │ 14249 │ +│ src/Interpreters/InterpreterSelectQuery.cpp │ 12636 │ +│ src/Parsers/ExpressionListParsers.cpp │ 11794 │ +│ src/Analyzer/QueryAnalysisPass.cpp │ 11760 │ +│ src/Coordination/KeeperStorage.cpp │ 10225 │ +│ src/Functions/FunctionsConversion.h │ 9247 │ +│ src/Parsers/ExpressionElementParsers.cpp │ 8197 │ +└────────────────────────────────────────────────────────┴───────────────┘ + +10 rows in set. Elapsed: 0.134 sec. Processed 798.15 thousand rows, 16.46 MB (5.95 million rows/s., 122.62 MB/s.) +``` + +## What day of the week do commits usually occur? + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKR1JPVVAgQlkgZGF5T2ZXZWVrKHRpbWUpIEFTIGRheV9vZl93ZWVrCg==) + +```sql +SELECT + day_of_week, + count() AS c +FROM git.commits +GROUP BY dayOfWeek(time) AS day_of_week + +┌─day_of_week─┬─────c─┐ +│ 1 │ 10575 │ +│ 2 │ 10645 │ +│ 3 │ 10748 │ +│ 4 │ 10944 │ +│ 5 │ 10090 │ +│ 6 │ 4617 │ +│ 7 │ 5166 │ +└─────────────┴───────┘ +7 rows in set. Elapsed: 0.262 sec. Processed 62.78 thousand rows, 251.14 KB (239.73 thousand rows/s., 958.93 KB/s.) +``` + +This makes sense with some productivity drop-off on Fridays. 
Great to see people committing code at weekends! Big thanks to our contributors! + +## History of subdirectory/file - number of lines, commits and contributors over time + +This would produce a large query result that is unrealistic to show or visualize if unfiltered. We, therefore, allow a file or subdirectory to be filtered in the following example. Here we group by week using the `toStartOfWeek` function - adapt as required. + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB3ZWVrLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkLAogICAgdW5pcShjb21taXRfaGFzaCkgQVMgbnVtX2NvbW1pdHMsCiAgICB1bmlxKGF1dGhvcikgQVMgYXV0aG9ycwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoIExJS0UgJ3NyYy9TdG9yYWdlcyUnCkdST1VQIEJZIHRvU3RhcnRPZldlZWsodGltZSkgQVMgd2VlawpPUkRFUiBCWSB3ZWVrIEFTQwpMSU1JVCAxMAo=) + +```sql +SELECT + week, + sum(lines_added) AS lines_added, + sum(lines_deleted) AS lines_deleted, + uniq(commit_hash) AS num_commits, + uniq(author) AS authors +FROM git.file_changes +WHERE path LIKE 'src/Storages%' +GROUP BY toStartOfWeek(time) AS week +ORDER BY week ASC +LIMIT 10 + +┌───────week─┬─lines_added─┬─lines_deleted─┬─num_commits─┬─authors─┐ +│ 2020-03-29 │ 49 │ 35 │ 4 │ 3 │ +│ 2020-04-05 │ 940 │ 601 │ 55 │ 14 │ +│ 2020-04-12 │ 1472 │ 607 │ 32 │ 11 │ +│ 2020-04-19 │ 917 │ 841 │ 39 │ 12 │ +│ 2020-04-26 │ 1067 │ 626 │ 36 │ 10 │ +│ 2020-05-03 │ 514 │ 435 │ 27 │ 10 │ +│ 2020-05-10 │ 2552 │ 537 │ 48 │ 12 │ +│ 2020-05-17 │ 3585 │ 1913 │ 83 │ 9 │ +│ 2020-05-24 │ 2851 │ 1812 │ 74 │ 18 │ +│ 2020-05-31 │ 2771 │ 2077 │ 77 │ 16 │ +└────────────┴─────────────┴───────────────┴─────────────┴─────────┘ +10 rows in set. Elapsed: 0.043 sec. Processed 266.05 thousand rows, 15.85 MB (6.12 million rows/s., 364.61 MB/s.) +``` + +This data visualizes well. Below we use Superset. + +**For lines added and deleted:** + +![](./images/superset-github-lines-added-deleted.png) + +**For commits and authors:** + +![](./images/superset-commits-authors.png) + +## List files with maximum number of authors + +Limit to current files only. 
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHVuaXEoYXV0aG9yKSBBUyBudW1fYXV0aG9ycwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIChjdXJyZW50X2ZpbGVzKQpHUk9VUCBCWSBwYXRoCk9SREVSIEJZIG51bV9hdXRob3JzIERFU0MKTElNSVQgMTA=) + +```sql +WITH current_files AS + ( + SELECT path + FROM + ( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path + ) + GROUP BY path + HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) + ORDER BY path ASC + ) +SELECT + path, + uniq(author) AS num_authors +FROM git.file_changes +WHERE path IN (current_files) +GROUP BY path +ORDER BY num_authors DESC +LIMIT 10 + +┌─path────────────────────────────────────────┬─num_authors─┐ +│ src/Core/Settings.h │ 127 │ +│ CMakeLists.txt │ 96 │ +│ .gitmodules │ 85 │ +│ src/Storages/MergeTree/MergeTreeData.cpp │ 72 │ +│ src/CMakeLists.txt │ 71 │ +│ programs/server/Server.cpp │ 70 │ +│ src/Interpreters/Context.cpp │ 64 │ +│ src/Storages/StorageReplicatedMergeTree.cpp │ 63 │ +│ src/Common/ErrorCodes.cpp │ 61 │ +│ src/Interpreters/InterpreterSelectQuery.cpp │ 59 │ +└─────────────────────────────────────────────┴─────────────┘ + +10 rows in set. Elapsed: 0.239 sec. Processed 798.15 thousand rows, 14.13 MB (3.35 million rows/s., 59.22 MB/s.) +``` + +## Oldest lines of code in the repository + +Limited to current files only. 
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgYW55KHBhdGgpIEFTIGZpbGVfcGF0aCwKICAgIGxpbmUsCiAgICBtYXgodGltZSkgQVMgbGF0ZXN0X2NoYW5nZSwKICAgIGFueShmaWxlX2NoYW5nZV90eXBlKQpGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIChjdXJyZW50X2ZpbGVzKQpHUk9VUCBCWSBsaW5lCk9SREVSIEJZIGxhdGVzdF9jaGFuZ2UgQVNDCkxJTUlUIDEw) + +```sql +WITH current_files AS + ( + SELECT path + FROM + ( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path + ) + GROUP BY path + HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) + ORDER BY path ASC + ) +SELECT + any(path) AS file_path, + line, + max(time) AS latest_change, + any(file_change_type) +FROM git.line_changes +WHERE path IN (current_files) +GROUP BY line +ORDER BY latest_change ASC +LIMIT 10 + +┌─file_path───────────────────────────────────┬─line────────────────────────────────────────────────────────┬───────latest_change─┬─any(file_change_type)─┐ +│ utils/compressor/test.sh │ ./compressor -d < compressor.snp > compressor2 │ 2011-06-17 22:19:39 │ Modify │ +│ utils/compressor/test.sh │ ./compressor < compressor > compressor.snp │ 2011-06-17 22:19:39 │ Modify │ +│ utils/compressor/test.sh │ ./compressor -d < compressor.qlz > compressor2 │ 2014-02-24 03:14:30 │ Add │ +│ utils/compressor/test.sh │ ./compressor < compressor > compressor.qlz │ 2014-02-24 03:14:30 │ Add │ +│ utils/config-processor/config-processor.cpp │ if (argc != 2) │ 2014-02-26 19:10:00 │ Add │ +│ utils/config-processor/config-processor.cpp │ std::cerr << "std::exception: " << e.what() << std::endl; │ 2014-02-26 19:10:00 │ Add │ +│ utils/config-processor/config-processor.cpp │ std::cerr << "Exception: " << e.displayText() << std::endl; │ 2014-02-26 19:10:00 │ Add │ +│ utils/config-processor/config-processor.cpp │ Poco::XML::DOMWriter().writeNode(std::cout, document); │ 2014-02-26 19:10:00 │ Add │ +│ utils/config-processor/config-processor.cpp │ std::cerr << "Some exception" << std::endl; │ 2014-02-26 19:10:00 │ Add │ +│ utils/config-processor/config-processor.cpp │ std::cerr << "usage: " << argv[0] << " path" << std::endl; │ 2014-02-26 19:10:00 │ Add │ +└─────────────────────────────────────────────┴─────────────────────────────────────────────────────────────┴─────────────────────┴───────────────────────┘ + +10 rows in set. Elapsed: 1.101 sec. 
Processed 8.07 million rows, 905.86 MB (7.33 million rows/s., 823.13 MB/s.)
+```
+
+## Files with longest history
+
+Limited to current files only.
+
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgY291bnQoKSBBUyBjLAogICAgcGF0aCwKICAgIG1heCh0aW1lKSBBUyBsYXRlc3RfY2hhbmdlCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpCkdST1VQIEJZIHBhdGgKT1JERVIgQlkgYyBERVNDCkxJTUlUIDEw)
+
+```sql
+WITH current_files AS
+    (
+        SELECT path
+        FROM
+        (
+            SELECT
+                old_path AS path,
+                max(time) AS last_time,
+                2 AS change_type
+            FROM git.file_changes
+            GROUP BY old_path
+            UNION ALL
+            SELECT
+                path,
+                max(time) AS last_time,
+                argMax(change_type, time) AS change_type
+            FROM git.file_changes
+            GROUP BY path
+        )
+        GROUP BY path
+        HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)'))
+        ORDER BY path ASC
+    )
+SELECT
+    count() AS c,
+    path,
+    max(time) AS latest_change
+FROM git.file_changes
+WHERE path IN (current_files)
+GROUP BY path
+ORDER BY c DESC
+LIMIT 10
+
+┌───c─┬─path────────────────────────────────────────┬───────latest_change─┐
+│ 790 │ src/Storages/StorageReplicatedMergeTree.cpp │ 2022-10-30 16:30:51 │
+│ 788 │ src/Storages/MergeTree/MergeTreeData.cpp    │ 2022-11-04 09:26:44 │
+│ 752 │ src/Core/Settings.h                         │ 2022-10-25 11:35:25 │
+│ 749 │ CMakeLists.txt                              │ 2022-10-05 21:00:49 │
+│ 575 │ src/Interpreters/InterpreterSelectQuery.cpp │ 2022-11-01 10:20:10 │
+│ 563 │ CHANGELOG.md                                │ 2022-10-27 08:19:50 │
+│ 491 │ src/Interpreters/Context.cpp                │ 2022-10-25 12:26:29 │
+│ 437 │ programs/server/Server.cpp                  │ 2022-10-21 12:25:19 │
+│ 375 │ programs/client/Client.cpp                  │ 2022-11-03 03:16:55 │
+│ 350 │ src/CMakeLists.txt                          │ 2022-10-24 09:22:37 │
+└─────┴─────────────────────────────────────────────┴─────────────────────┘
+
+10 rows in set. Elapsed: 0.124 sec. Processed 798.15 thousand rows, 14.71 MB (6.44 million rows/s., 118.61 MB/s.)
+```
+
+Our core data structure, the Merge Tree, is obviously under constant evolution with a long history of edits!
+
+## Distribution of contributors with respect to docs and code over the month
+
+**During data capture, the changes on the `docs/` folder were filtered out due to a very dirty commit history. The results of this query are therefore not accurate.**
+
+Do we write more docs at certain times of the month, e.g., around release dates? We can use the `countIf` function to compute a simple ratio, visualizing the result using the `bar` function.
+ +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXksCiAgICBiYXIoZG9jc19yYXRpbyAqIDEwMDAsIDAsIDEwMCwgMTAwKSBBUyBiYXIKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBkYXksCiAgICAgICAgY291bnRJZihmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKSBBUyBjb2RlLAogICAgICAgIGNvdW50SWYoZmlsZV9leHRlbnNpb24gPSAnbWQnKSBBUyBkb2NzLAogICAgICAgIGRvY3MgLyAoY29kZSArIGRvY3MpIEFTIGRvY3NfcmF0aW8KICAgIEZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCiAgICBXSEVSRSAoc2lnbiA9IDEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnLCAnbWQnKSkKICAgIEdST1VQIEJZIGRheU9mTW9udGgodGltZSkgQVMgZGF5CikK) + +```sql +SELECT + day, + bar(docs_ratio * 1000, 0, 100, 100) AS bar +FROM +( + SELECT + day, + countIf(file_extension IN ('h', 'cpp', 'sql')) AS code, + countIf(file_extension = 'md') AS docs, + docs / (code + docs) AS docs_ratio + FROM git.line_changes + WHERE (sign = 1) AND (file_extension IN ('h', 'cpp', 'sql', 'md')) + GROUP BY dayOfMonth(time) AS day +) + +┌─day─┬─bar─────────────────────────────────────────────────────────────┐ +│ 1 │ ███████████████████████████████████▍ │ +│ 2 │ ███████████████████████▋ │ +│ 3 │ ████████████████████████████████▋ │ +│ 4 │ █████████████ │ +│ 5 │ █████████████████████▎ │ +│ 6 │ ████████ │ +│ 7 │ ███▋ │ +│ 8 │ ████████▌ │ +│ 9 │ ██████████████▎ │ +│ 10 │ █████████████████▏ │ +│ 11 │ █████████████▎ │ +│ 12 │ ███████████████████████████████████▋ │ +│ 13 │ █████████████████████████████▎ │ +│ 14 │ ██████▋ │ +│ 15 │ █████████████████████████████████████████▊ │ +│ 16 │ ██████████▎ │ +│ 17 │ ██████████████████████████████████████▋ │ +│ 18 │ █████████████████████████████████▌ │ +│ 19 │ ███████████ │ +│ 20 │ █████████████████████████████████▊ │ +│ 21 │ █████ │ +│ 22 │ ███████████████████████▋ │ +│ 23 │ ███████████████████████████▌ │ +│ 24 │ ███████▌ │ +│ 25 │ ██████████████████████████████████▎ │ +│ 26 │ ███████████▏ │ +│ 27 │ ███████████████████████████████████████████████████████████████ │ +│ 28 │ ████████████████████████████████████████████████████▏ │ +│ 29 │ ███▌ │ +│ 30 │ ████████████████████████████████████████▎ │ +│ 31 │ █████████████████████████████████▏ │ +└─────┴─────────────────────────────────────────────────────────────────┘ + +31 rows in set. Elapsed: 0.043 sec. Processed 7.54 million rows, 40.53 MB (176.71 million rows/s., 950.40 MB/s.) +``` + +Maybe a little more near the end of the month, but overall we keep a good even distribution. Again this is unrealiable due to the filtering of the docs filter during data insertion. + +## Authors with the most diverse impact + +We consider diversity here to be the number of unique files an author has contributed to. + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICB1bmlxKHBhdGgpIEFTIG51bV9maWxlcwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgYXV0aG9yCk9SREVSIEJZIG51bV9maWxlcyBERVNDCkxJTUlUIDEw) + +```sql +SELECT + author, + uniq(path) AS num_files +FROM git.file_changes +WHERE (change_type IN ('Add', 'Modify')) AND (file_extension IN ('h', 'cpp', 'sql')) +GROUP BY author +ORDER BY num_files DESC +LIMIT 10 + +┌─author─────────────┬─num_files─┐ +│ Alexey Milovidov │ 8433 │ +│ Nikolai Kochetov │ 3257 │ +│ Vitaly Baranov │ 2316 │ +│ Maksim Kita │ 2172 │ +│ Azat Khuzhin │ 1988 │ +│ alesapin │ 1818 │ +│ Alexander Tokmakov │ 1751 │ +│ Amos Bird │ 1641 │ +│ Ivan │ 1629 │ +│ alexey-milovidov │ 1581 │ +└────────────────────┴───────────┘ + +10 rows in set. Elapsed: 0.041 sec. 
Processed 266.05 thousand rows, 4.92 MB (6.56 million rows/s., 121.21 MB/s.) +``` + +Let's see who has the most diverse commits in their recent work. Rather than limit by date, we'll restrict to an author's last N commits (in this case, we've used 3 but feel free to modify): + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBzdW0obnVtX2ZpbGVzX2NvbW1pdCkgQVMgbnVtX2ZpbGVzCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgIHVuaXEocGF0aCkgQVMgbnVtX2ZpbGVzX2NvbW1pdCwKICAgICAgICBtYXgodGltZSkgQVMgY29tbWl0X3RpbWUKICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICBXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgIEdST1VQIEJZCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoCiAgICBPUkRFUiBCWQogICAgICAgIGF1dGhvciBBU0MsCiAgICAgICAgY29tbWl0X3RpbWUgREVTQwogICAgTElNSVQgMyBCWSBhdXRob3IKKQpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgbnVtX2ZpbGVzIERFU0MKTElNSVQgMTA=) + +```sql +SELECT + author, + sum(num_files_commit) AS num_files +FROM +( + SELECT + author, + commit_hash, + uniq(path) AS num_files_commit, + max(time) AS commit_time + FROM git.file_changes + WHERE (change_type IN ('Add', 'Modify')) AND (file_extension IN ('h', 'cpp', 'sql')) + GROUP BY + author, + commit_hash + ORDER BY + author ASC, + commit_time DESC + LIMIT 3 BY author +) +GROUP BY author +ORDER BY num_files DESC +LIMIT 10 + +┌─author───────────────┬─num_files─┐ +│ Mikhail │ 782 │ +│ Li Yin │ 553 │ +│ Roman Peshkurov │ 119 │ +│ Vladimir Smirnov │ 88 │ +│ f1yegor │ 65 │ +│ maiha │ 54 │ +│ Vitaliy Lyudvichenko │ 53 │ +│ Pradeep Chhetri │ 40 │ +│ Orivej Desh │ 38 │ +│ liyang │ 36 │ +└──────────────────────┴───────────┘ + +10 rows in set. Elapsed: 0.106 sec. Processed 266.05 thousand rows, 21.04 MB (2.52 million rows/s., 198.93 MB/s.) +``` + +## Favorite files for an author + +Here we select our founder [Alexey Milovidov](https://github.com/alexey-milovidov) and limit our analysis to current files. 
+
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAoYXV0aG9yID0gJ0FsZXhleSBNaWxvdmlkb3YnKSBBTkQgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKQpHUk9VUCBCWSBwYXRoCk9SREVSIEJZIGMgREVTQwpMSU1JVCAxMA==)
+
+```sql
+WITH current_files AS
+    (
+        SELECT path
+        FROM
+        (
+            SELECT
+                old_path AS path,
+                max(time) AS last_time,
+                2 AS change_type
+            FROM git.file_changes
+            GROUP BY old_path
+            UNION ALL
+            SELECT
+                path,
+                max(time) AS last_time,
+                argMax(change_type, time) AS change_type
+            FROM git.file_changes
+            GROUP BY path
+        )
+        GROUP BY path
+        HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)'))
+        ORDER BY path ASC
+    )
+SELECT
+    path,
+    count() AS c
+FROM git.file_changes
+WHERE (author = 'Alexey Milovidov') AND (path IN (current_files))
+GROUP BY path
+ORDER BY c DESC
+LIMIT 10
+
+┌─path────────────────────────────────────────┬───c─┐
+│ CMakeLists.txt                              │ 165 │
+│ CHANGELOG.md                                │ 126 │
+│ programs/server/Server.cpp                  │  73 │
+│ src/Storages/MergeTree/MergeTreeData.cpp    │  71 │
+│ src/Storages/StorageReplicatedMergeTree.cpp │  68 │
+│ src/Core/Settings.h                         │  65 │
+│ programs/client/Client.cpp                  │  57 │
+│ programs/server/play.html                   │  48 │
+│ .gitmodules                                 │  47 │
+│ programs/install/Install.cpp                │  37 │
+└─────────────────────────────────────────────┴─────┘
+
+10 rows in set. Elapsed: 0.106 sec. Processed 798.15 thousand rows, 13.97 MB (7.51 million rows/s., 131.41 MB/s.)
+```
+
+This makes sense because Alexey has been responsible for maintaining the changelog. But what if we use the basename of the file to identify his popular files? This allows for renames and should focus on code contributions.
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBiYXNlLAogICAgY291bnQoKSBBUyBjCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIChhdXRob3IgPSAnQWxleGV5IE1pbG92aWRvdicpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgYmFzZW5hbWUocGF0aCkgQVMgYmFzZQpPUkRFUiBCWSBjIERFU0MKTElNSVQgMTA=)
+
+```sql
+SELECT
+    base,
+    count() AS c
+FROM git.file_changes
+WHERE (author = 'Alexey Milovidov') AND (file_extension IN ('h', 'cpp', 'sql'))
+GROUP BY basename(path) AS base
+ORDER BY c DESC
+LIMIT 10
+
+┌─base───────────────────────────┬───c─┐
+│ StorageReplicatedMergeTree.cpp │ 393 │
+│ InterpreterSelectQuery.cpp     │ 299 │
+│ Aggregator.cpp                 │ 297 │
+│ Client.cpp                     │ 280 │
+│ MergeTreeData.cpp              │ 274 │
+│ Server.cpp                     │ 264 │
+│ ExpressionAnalyzer.cpp         │ 259 │
+│ StorageMergeTree.cpp           │ 239 │
+│ Settings.h                     │ 225 │
+│ TCPHandler.cpp                 │ 205 │
+└────────────────────────────────┴─────┘
+10 rows in set. Elapsed: 0.032 sec. Processed 266.05 thousand rows, 5.68 MB (8.22 million rows/s., 175.50 MB/s.)
+```
+
+This is perhaps more reflective of his areas of interest.
+
+## Largest files with the lowest number of authors
+
+For this, we first need to identify the largest files. Estimating this via a full reconstruction of every file from the commit history would be very expensive!
+
+To estimate, assuming we restrict to current files, we sum line additions and subtract deletions. We can then compute a ratio of length to the number of authors.
+
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgcGF0aCBJTiAoY3VycmVudF9maWxlcykKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBsaW5lc19hdXRob3JfcmF0aW8gREVTQwpMSU1JVCAxMA==)
+
+```sql
+WITH current_files AS
+    (
+        SELECT path
+        FROM
+        (
+            SELECT
+                old_path AS path,
+                max(time) AS last_time,
+                2 AS change_type
+            FROM git.file_changes
+            GROUP BY old_path
+            UNION ALL
+            SELECT
+                path,
+                max(time) AS last_time,
+                argMax(change_type, time) AS change_type
+            FROM git.file_changes
+            GROUP BY path
+        )
+        GROUP BY path
+        HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)'))
+        ORDER BY path ASC
+    )
+SELECT
+    path,
+    sum(lines_added) - sum(lines_deleted) AS num_lines,
+    uniqExact(author) AS num_authors,
+    num_lines / num_authors AS lines_author_ratio
+FROM git.file_changes
+WHERE path IN (current_files)
+GROUP BY path
+ORDER BY lines_author_ratio DESC
+LIMIT 10
+
+┌─path──────────────────────────────────────────────────────────────────┬─num_lines─┬─num_authors─┬─lines_author_ratio─┐
+│ src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt    │    148590 │           1 │             148590 │
+│ src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt │     55533 │           1 │              55533 │
+│ src/Functions/ClassificationDictionaries/charset_freq.txt             │     35722 │           1 │              35722 │
+│ src/Common/ClassificationDictionaries/charset_freq.txt                │     35722 │           1 │              35722 │
+│ tests/integration/test_storage_meilisearch/movies.json                │     19549 │           1 │              19549 │
+│ tests/queries/0_stateless/02364_multiSearch_function_family.reference │     12874 │           1 │              12874 │
+│ src/Functions/ClassificationDictionaries/programming_freq.txt         │      9434 │           1 │               9434 │
+│ src/Common/ClassificationDictionaries/programming_freq.txt            │      9434 │           1 │               9434 │
+│ tests/performance/explain_ast.xml                                     │      5911 │           1 │               5911 │
+│ src/Analyzer/QueryAnalysisPass.cpp                                    │      5686 │           1 │               5686 │
+└───────────────────────────────────────────────────────────────────────┴───────────┴─────────────┴────────────────────┘
+
+10 rows in set. Elapsed: 0.138 sec. Processed 798.15 thousand rows, 16.57 MB (5.79 million rows/s., 120.11 MB/s.)
+```
+
+Text dictionaries maybe aren't realistic, so let's restrict to code only via a file extension filter!
+
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCkdST1VQIEJZIHBhdGgKT1JERVIgQlkgbGluZXNfYXV0aG9yX3JhdGlvIERFU0MKTElNSVQgMTA=)
+
+```sql
+WITH current_files AS
+    (
+        SELECT path
+        FROM
+        (
+            SELECT
+                old_path AS path,
+                max(time) AS last_time,
+                2 AS change_type
+            FROM git.file_changes
+            GROUP BY old_path
+            UNION ALL
+            SELECT
+                path,
+                max(time) AS last_time,
+                argMax(change_type, time) AS change_type
+            FROM git.file_changes
+            GROUP BY path
+        )
+        GROUP BY path
+        HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)'))
+        ORDER BY path ASC
+    )
+SELECT
+    path,
+    sum(lines_added) - sum(lines_deleted) AS num_lines,
+    uniqExact(author) AS num_authors,
+    num_lines / num_authors AS lines_author_ratio
+FROM git.file_changes
+WHERE (path IN (current_files)) AND (file_extension IN ('h', 'cpp', 'sql'))
+GROUP BY path
+ORDER BY lines_author_ratio DESC
+LIMIT 10
+
+┌─path──────────────────────────────────┬─num_lines─┬─num_authors─┬─lines_author_ratio─┐
+│ src/Analyzer/QueryAnalysisPass.cpp    │      5686 │           1 │               5686 │
+│ src/Analyzer/QueryTreeBuilder.cpp     │       880 │           1 │                880 │
+│ src/Planner/Planner.cpp               │       873 │           1 │                873 │
+│ src/Backups/RestorerFromBackup.cpp    │       869 │           1 │                869 │
+│ utils/memcpy-bench/FastMemcpy.h       │       770 │           1 │                770 │
+│ src/Planner/PlannerActionsVisitor.cpp │       765 │           1 │                765 │
+│ src/Functions/sphinxstemen.cpp        │       728 │           1 │                728 │
+│ src/Planner/PlannerJoinTree.cpp       │       708 │           1 │                708 │
+│ src/Planner/PlannerJoins.cpp          │       695 │           1 │                695 │
+│ src/Analyzer/QueryNode.h              │       607 │           1 │                607 │
+└───────────────────────────────────────┴───────────┴─────────────┴────────────────────┘
+10 rows in set. Elapsed: 0.140 sec. Processed 798.15 thousand rows, 16.84 MB (5.70 million rows/s., 120.32 MB/s.)
+```
+
+There is some recency bias in this: newer files have had fewer opportunities for commits. What if we restrict to files at least one year old?
+
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgbWluKHRpbWUpIEFTIG1pbl9kYXRlLAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCkdST1VQIEJZIHBhdGgKSEFWSU5HIG1pbl9kYXRlIDw9IChub3coKSAtIHRvSW50ZXJ2YWxZZWFyKDEpKQpPUkRFUiBCWSBsaW5lc19hdXRob3JfcmF0aW8gREVTQwpMSU1JVCAxMA==)
+
+```sql
+WITH current_files AS
+    (
+        SELECT path
+        FROM
+        (
+            SELECT
+                old_path AS path,
+                max(time) AS last_time,
+                2 AS change_type
+            FROM git.file_changes
+            GROUP BY old_path
+            UNION ALL
+            SELECT
+                path,
+                max(time) AS last_time,
+                argMax(change_type, time) AS change_type
+            FROM git.file_changes
+            GROUP BY path
+        )
+        GROUP BY path
+        HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)'))
+        ORDER BY path ASC
+    )
+SELECT
+    min(time) AS min_date,
+    path,
+    sum(lines_added) - sum(lines_deleted) AS num_lines,
+    uniqExact(author) AS num_authors,
+    num_lines / num_authors AS lines_author_ratio
+FROM git.file_changes
+WHERE (path IN (current_files)) AND (file_extension IN ('h', 'cpp', 'sql'))
+GROUP BY path
+HAVING min_date <= (now() - toIntervalYear(1))
+ORDER BY lines_author_ratio DESC
+LIMIT 10
+
+┌────────────min_date─┬─path───────────────────────────────────────────────────────────┬─num_lines─┬─num_authors─┬─lines_author_ratio─┐ +│ 2021-03-08 07:00:54 │ utils/memcpy-bench/FastMemcpy.h │ 770 │ 1 │ 770 │ +│ 2021-05-04 13:47:34 │ src/Functions/sphinxstemen.cpp │ 728 │ 1 │ 728 │ +│ 2021-03-14 16:52:51 │ utils/memcpy-bench/glibc/dwarf2.h │ 592 │ 1 │ 592 │ +│ 2021-03-08 09:04:52 │ utils/memcpy-bench/FastMemcpy_Avx.h │ 496 │ 1 │ 496 │ +│ 2020-10-19 01:10:50 │ tests/queries/0_stateless/01518_nullable_aggregate_states2.sql │ 411 │ 1 │ 411 │ +│ 2020-11-24 14:53:34 │ programs/server/GRPCHandler.cpp │ 399 │ 1 │ 399 │ +│ 2021-03-09 14:10:28 │ src/DataTypes/Serializations/SerializationSparse.cpp │ 363 │ 1 │ 363 │ +│ 2021-08-20 15:06:57 │ src/Functions/vectorFunctions.cpp │ 1327 │ 4 │ 331.75 │ +│ 2020-08-04 03:26:23 │ src/Interpreters/MySQL/CreateQueryConvertVisitor.cpp │ 311 │ 1 │ 311 │ +│ 2020-11-06 15:45:13 │ src/Storages/Rocksdb/StorageEmbeddedRocksdb.cpp │ 611 │ 2 │ 305.5 │ +└─────────────────────┴────────────────────────────────────────────────────────────────┴───────────┴─────────────┴────────────────────┘ + +10 rows in set. Elapsed: 0.143 sec. Processed 798.15 thousand rows, 18.00 MB (5.58 million rows/s., 125.87 MB/s.) +``` + +## Commits and lines of code distribution by time; by weekday, by author; for specific subdirectories + +We interpret this as the number of lines added and removed by the day of the week. In this case, we focus on the [Functions directory](https://github.com/ClickHouse/ClickHouse/tree/master/src/Functions) + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlPZldlZWssCiAgICB1bmlxKGNvbW1pdF9oYXNoKSBBUyBjb21taXRzLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggTElLRSAnc3JjL0Z1bmN0aW9ucyUnCkdST1VQIEJZIHRvRGF5T2ZXZWVrKHRpbWUpIEFTIGRheU9mV2Vlaw==) + +```sql +SELECT + dayOfWeek, + uniq(commit_hash) AS commits, + sum(lines_added) AS lines_added, + sum(lines_deleted) AS lines_deleted +FROM git.file_changes +WHERE path LIKE 'src/Functions%' +GROUP BY toDayOfWeek(time) AS dayOfWeek + +┌─dayOfWeek─┬─commits─┬─lines_added─┬─lines_deleted─┐ +│ 1 │ 476 │ 24619 │ 15782 │ +│ 2 │ 434 │ 18098 │ 9938 │ +│ 3 │ 496 │ 26562 │ 20883 │ +│ 4 │ 587 │ 65674 │ 18862 │ +│ 5 │ 504 │ 85917 │ 14518 │ +│ 6 │ 314 │ 13604 │ 10144 │ +│ 7 │ 294 │ 11938 │ 6451 │ +└───────────┴─────────┴─────────────┴───────────────┘ + +7 rows in set. Elapsed: 0.034 sec. Processed 266.05 thousand rows, 14.66 MB (7.73 million rows/s., 425.56 MB/s.) 
+```
+
+And by time of day,
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBob3VyT2ZEYXksCiAgICB1bmlxKGNvbW1pdF9oYXNoKSBBUyBjb21taXRzLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggTElLRSAnc3JjL0Z1bmN0aW9ucyUnCkdST1VQIEJZIHRvSG91cih0aW1lKSBBUyBob3VyT2ZEYXk=)
+
+```sql
+SELECT
+    hourOfDay,
+    uniq(commit_hash) AS commits,
+    sum(lines_added) AS lines_added,
+    sum(lines_deleted) AS lines_deleted
+FROM git.file_changes
+WHERE path LIKE 'src/Functions%'
+GROUP BY toHour(time) AS hourOfDay
+
+┌─hourOfDay─┬─commits─┬─lines_added─┬─lines_deleted─┐
+│         0 │      71 │        4169 │          3404 │
+│         1 │      90 │        2174 │          1927 │
+│         2 │      65 │        2343 │          1515 │
+│         3 │      76 │        2552 │           493 │
+│         4 │      62 │        1480 │          1304 │
+│         5 │      38 │        1644 │           253 │
+│         6 │     104 │        4434 │          2979 │
+│         7 │     117 │        4171 │          1678 │
+│         8 │     106 │        4604 │          4673 │
+│         9 │     135 │       60550 │          2678 │
+│        10 │     149 │        6133 │          3482 │
+│        11 │     182 │        8040 │          3833 │
+│        12 │     209 │       29428 │         15040 │
+│        13 │     187 │       10204 │          5491 │
+│        14 │     204 │        9028 │          6060 │
+│        15 │     231 │       15179 │         10077 │
+│        16 │     196 │        9568 │          5925 │
+│        17 │     138 │        4941 │          3849 │
+│        18 │     123 │        4193 │          3036 │
+│        19 │     165 │        8817 │          6646 │
+│        20 │     140 │        3749 │          2379 │
+│        21 │     132 │       41585 │          4182 │
+│        22 │      85 │        4094 │          3955 │
+│        23 │     100 │        3332 │          1719 │
+└───────────┴─────────┴─────────────┴───────────────┘
+
+24 rows in set. Elapsed: 0.039 sec. Processed 266.05 thousand rows, 14.66 MB (6.77 million rows/s., 372.89 MB/s.)
+```
+
+This distribution makes sense given most of our development team is in Amsterdam. The `bar` function helps us visualize these distributions:
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBob3VyT2ZEYXksCiAgICBiYXIoY29tbWl0cywgMCwgNDAwLCA1MCkgQVMgY29tbWl0cywKICAgIGJhcihsaW5lc19hZGRlZCwgMCwgMzAwMDAsIDUwKSBBUyBsaW5lc19hZGRlZCwKICAgIGJhcihsaW5lc19kZWxldGVkLCAwLCAxNTAwMCwgNTApIEFTIGxpbmVzX2RlbGV0ZWQKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBob3VyT2ZEYXksCiAgICAgICAgdW5pcShjb21taXRfaGFzaCkgQVMgY29tbWl0cywKICAgICAgICBzdW0obGluZXNfYWRkZWQpIEFTIGxpbmVzX2FkZGVkLAogICAgICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgV0hFUkUgcGF0aCBMSUtFICdzcmMvRnVuY3Rpb25zJScKICAgIEdST1VQIEJZIHRvSG91cih0aW1lKSBBUyBob3VyT2ZEYXkKKQ==)
+
+```sql
+SELECT
+    hourOfDay,
+    bar(commits, 0, 400, 50) AS commits,
+    bar(lines_added, 0, 30000, 50) AS lines_added,
+    bar(lines_deleted, 0, 15000, 50) AS lines_deleted
+FROM
+(
+    SELECT
+        hourOfDay,
+        uniq(commit_hash) AS commits,
+        sum(lines_added) AS lines_added,
+        sum(lines_deleted) AS lines_deleted
+    FROM git.file_changes
+    WHERE path LIKE 'src/Functions%'
+    GROUP BY toHour(time) AS hourOfDay
+)
+
+┌─hourOfDay─┬─commits───────────────────────┬─lines_added────────────────────────────────────────┬─lines_deleted──────────────────────────────────────┐
+│         0 │ ████████▊                     │ ██████▊                                            │ ███████████▎                                       │
+│         1 │ ███████████▎                  │ ███▌                                               │ ██████▍                                            │
+│         2 │ ████████                      │ ███▊                                               │ █████                                              │
+│         3 │ █████████▌                    │ ████▎                                              │ █▋                                                 │
+│         4 │ ███████▋                      │ ██▍                                                │ ████▎                                              │
+│         5 │ ████▋                         │ ██▋                                                │ ▋                                                  │
+│         6 │ █████████████                 │ ███████▍                                           │ █████████▊                                         │
+│         7 │ ██████████████▋               │ ██████▊                                            │ █████▌                                             │
+│         8 │ █████████████▎                │ ███████▋                                           │ ███████████████▌                                   │
+│         9 │ ████████████████▊             │ ██████████████████████████████████████████████████ │ ████████▊                                          │
+│        10 │ ██████████████████▋           │ ██████████▏                                        │ ███████████▌                                       │
+│        11 │ ██████████████████████▋       │ █████████████▍                                     │ ████████████▋                                      │
+│        12 │ ██████████████████████████    │ █████████████████████████████████████████████████  │ ██████████████████████████████████████████████████ │
+│        13 │ ███████████████████████▍      │ █████████████████                                  │ ██████████████████▎                                │
+│        14 │ █████████████████████████▌    │ ███████████████                                    │ ████████████████████▏                              │
+│        15 │ ████████████████████████████▊ │ █████████████████████████▎                         │ █████████████████████████████████▌                 │
+│        16 │ ████████████████████████▌     │ ███████████████▊                                   │ ███████████████████▋                               │
+│        17 │ █████████████████▎            │ ████████▏                                          │ ████████████▋                                      │
+│        18 │ ███████████████▍              │ ██████▊                                            │ ██████████                                         │
+│        19 │ ████████████████████▋         │ ██████████████▋                                    │ ██████████████████████▏                            │
+│        20 │ █████████████████▌            │ ██████▏                                            │ ███████▊                                           │
+│        21 │ ████████████████▌             │ ██████████████████████████████████████████████████ │ █████████████▊                                     │
+│        22 │ ██████████▋                   │ ██████▋                                            │ █████████████▏                                     │
+│        23 │ ████████████▌                 │ █████▌                                             │ █████▋                                             │
+└───────────┴───────────────────────────────┴────────────────────────────────────────────────────┴────────────────────────────────────────────────────┘
+
+24 rows in set. Elapsed: 0.038 sec. Processed 266.05 thousand rows, 14.66 MB (7.09 million rows/s., 390.69 MB/s.)
+```
+
+## Matrix of authors that shows which authors tend to rewrite other authors' code
+
+The `sign = -1` indicates a code deletion. We exclude punctuation and the insertion of empty lines.
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBwcmV2X2F1dGhvciB8fCAnKGEpJyBhcyBhZGRfYXV0aG9yLAogICAgYXV0aG9yICB8fCAnKGQpJyBhcyBkZWxldGVfYXV0aG9yLAogICAgY291bnQoKSBBUyBjCkZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCldIRVJFIChzaWduID0gLTEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcpKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKSBBTkQgKGF1dGhvciAhPSBwcmV2X2F1dGhvcikgQU5EIChwcmV2X2F1dGhvciAhPSAnJykKR1JPVVAgQlkKICAgIHByZXZfYXV0aG9yLAogICAgYXV0aG9yCk9SREVSIEJZIGMgREVTQwpMSU1JVCAxIEJZIHByZXZfYXV0aG9yCkxJTUlUIDEwMA==)
+
+```sql
+SELECT
+    prev_author || '(a)' as add_author,
+    author  || '(d)' as delete_author,
+    count() AS c
+FROM git.line_changes
+WHERE (sign = -1) AND (file_extension IN ('h', 'cpp')) AND (line_type NOT IN ('Punct', 'Empty')) AND (author != prev_author) AND (prev_author != '')
+GROUP BY
+    prev_author,
+    author
+ORDER BY c DESC
+LIMIT 1 BY prev_author
+LIMIT 100
+
+┌─prev_author──────────┬─author───────────┬─────c─┐
+│ Ivan                 │ Alexey Milovidov │ 18554 │
+│ Alexey Arno          │ Alexey Milovidov │ 18475 │
+│ Michael Kolupaev     │ Alexey Milovidov │ 14135 │
+│ Alexey Milovidov     │ Nikolai Kochetov │ 13435 │
+│ Andrey Mironov       │ Alexey Milovidov │ 10418 │
+│ proller              │ Alexey Milovidov │  7280 │
+│ Nikolai Kochetov     │ Alexey Milovidov │  6806 │
+│ alexey-milovidov     │ Alexey Milovidov │  5027 │
+│ Vitaliy Lyudvichenko │ Alexey Milovidov │  4390 │
+│ Amos Bird            │ Ivan Lezhankin   │  3125 │
+│ f1yegor              │ Alexey Milovidov │  3119 │
+│ Pavel Kartavyy       │ Alexey Milovidov │  3087 │
+│ Alexey Zatelepin     │ Alexey Milovidov │  2978 │
+│ alesapin             │ Alexey Milovidov │  2949 │
+│ Sergey Fedorov       │ Alexey Milovidov │  2727 │
+│ Ivan Lezhankin       │ Alexey Milovidov │  2618 │
+│ Vasily Nemkov        │ Alexey Milovidov │  2547 │
+│ Alexander Tokmakov   │ Alexey Milovidov │  2493 │
+│ Nikita Vasilev       │ Maksim Kita      │  2420 │
+│ Anton Popov          │ Amos Bird        │  2127 │
+└──────────────────────┴──────────────────┴───────┘
+
+20 rows in set. Elapsed: 0.098 sec. Processed 7.54 million rows, 42.16 MB (76.67 million rows/s., 428.99 MB/s.)
+```
+
+A Sankey chart (Superset) allows this to be visualized nicely. Note we increase our `LIMIT BY` to 3 to get the top 3 code removers for each author, improving the variety in the visual.
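+
+As a rough sketch, that variant is simply the query above with `LIMIT BY` raised to 3; the commented-out predicate (an assumption on our part, not the only way) shows how a single author can be excluded, as we do for the second chart below:
+
+```sql
+-- same query as above, but keeping the top 3 code removers per author
+SELECT
+    prev_author || '(a)' as add_author,
+    author || '(d)' as delete_author,
+    count() AS c
+FROM git.line_changes
+WHERE (sign = -1) AND (file_extension IN ('h', 'cpp')) AND (line_type NOT IN ('Punct', 'Empty')) AND (author != prev_author) AND (prev_author != '')
+    -- AND (author != 'Alexey Milovidov') AND (prev_author != 'Alexey Milovidov') -- uncomment to exclude an author
+GROUP BY
+    prev_author,
+    author
+ORDER BY c DESC
+LIMIT 3 BY prev_author
+LIMIT 100
+```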
+
+
+![](./images/superset-authors-matrix.png)
+
+
+Alexey clearly likes removing other people's code. Let's exclude him for a more balanced view of code removal.
+
+![](./images/superset-authors-matrix_v2.png)
+
+## Who is the highest percentage contributor per day of week?
+
+If we consider just the number of commits:
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGF1dGhvciwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKR1JPVVAgQlkKICAgIGRheU9mV2Vlayh0aW1lKSBBUyBkYXlfb2Zfd2VlaywKICAgIGF1dGhvcgpPUkRFUiBCWQogICAgZGF5X29mX3dlZWsgQVNDLAogICAgYyBERVNDCkxJTUlUIDEgQlkgZGF5X29mX3dlZWs=)
+
+```sql
+SELECT
+    day_of_week,
+    author,
+    count() AS c
+FROM git.commits
+GROUP BY
+    dayOfWeek(time) AS day_of_week,
+    author
+ORDER BY
+    day_of_week ASC,
+    c DESC
+LIMIT 1 BY day_of_week
+
+┌─day_of_week─┬─author───────────┬────c─┐
+│           1 │ Alexey Milovidov │ 2204 │
+│           2 │ Alexey Milovidov │ 1588 │
+│           3 │ Alexey Milovidov │ 1725 │
+│           4 │ Alexey Milovidov │ 1915 │
+│           5 │ Alexey Milovidov │ 1940 │
+│           6 │ Alexey Milovidov │ 1851 │
+│           7 │ Alexey Milovidov │ 2400 │
+└─────────────┴──────────────────┴──────┘
+
+7 rows in set. Elapsed: 0.012 sec. Processed 62.78 thousand rows, 395.47 KB (5.44 million rows/s., 34.27 MB/s.)
+```
+
+OK, there is possibly some advantage here to the longest-serving contributor - our founder Alexey. Let's limit our analysis to the last year.
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGF1dGhvciwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKV0hFUkUgdGltZSA+IChub3coKSAtIHRvSW50ZXJ2YWxZZWFyKDEpKQpHUk9VUCBCWQogICAgZGF5T2ZXZWVrKHRpbWUpIEFTIGRheV9vZl93ZWVrLAogICAgYXV0aG9yCk9SREVSIEJZCiAgICBkYXlfb2Zfd2VlayBBU0MsCiAgICBjIERFU0MKTElNSVQgMSBCWSBkYXlfb2Zfd2Vlaw==)
+
+```sql
+SELECT
+    day_of_week,
+    author,
+    count() AS c
+FROM git.commits
+WHERE time > (now() - toIntervalYear(1))
+GROUP BY
+    dayOfWeek(time) AS day_of_week,
+    author
+ORDER BY
+    day_of_week ASC,
+    c DESC
+LIMIT 1 BY day_of_week
+
+┌─day_of_week─┬─author───────────┬───c─┐
+│           1 │ Alexey Milovidov │ 198 │
+│           2 │ alesapin         │ 162 │
+│           3 │ alesapin         │ 163 │
+│           4 │ Azat Khuzhin     │ 166 │
+│           5 │ alesapin         │ 191 │
+│           6 │ Alexey Milovidov │ 179 │
+│           7 │ Alexey Milovidov │ 243 │
+└─────────────┴──────────────────┴─────┘
+
+7 rows in set. Elapsed: 0.004 sec. Processed 21.82 thousand rows, 140.02 KB (4.88 million rows/s., 31.29 MB/s.)
+```
+
+This is still a little simple and doesn't reflect people's work.
+
+A better metric might be who is the top contributor each day as a fraction of the total work performed in the last year. Note that we treat deleting and adding code equally.
+ +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0b3BfYXV0aG9yLmRheV9vZl93ZWVrLAogICAgdG9wX2F1dGhvci5hdXRob3IsCiAgICB0b3BfYXV0aG9yLmF1dGhvcl93b3JrIC8gYWxsX3dvcmsudG90YWxfd29yayBBUyB0b3BfYXV0aG9yX3BlcmNlbnQKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBkYXlfb2Zfd2VlaywKICAgICAgICBhdXRob3IsCiAgICAgICAgc3VtKGxpbmVzX2FkZGVkKSArIHN1bShsaW5lc19kZWxldGVkKSBBUyBhdXRob3Jfd29yawogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIFdIRVJFIHRpbWUgPiAobm93KCkgLSB0b0ludGVydmFsWWVhcigxKSkKICAgIEdST1VQIEJZCiAgICAgICAgYXV0aG9yLAogICAgICAgIGRheU9mV2Vlayh0aW1lKSBBUyBkYXlfb2Zfd2VlawogICAgT1JERVIgQlkKICAgICAgICBkYXlfb2Zfd2VlayBBU0MsCiAgICAgICAgYXV0aG9yX3dvcmsgREVTQwogICAgTElNSVQgMSBCWSBkYXlfb2Zfd2VlawopIEFTIHRvcF9hdXRob3IKSU5ORVIgSk9JTgooCiAgICBTRUxFQ1QKICAgICAgICBkYXlfb2Zfd2VlaywKICAgICAgICBzdW0obGluZXNfYWRkZWQpICsgc3VtKGxpbmVzX2RlbGV0ZWQpIEFTIHRvdGFsX3dvcmsKICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICBXSEVSRSB0aW1lID4gKG5vdygpIC0gdG9JbnRlcnZhbFllYXIoMSkpCiAgICBHUk9VUCBCWSBkYXlPZldlZWsodGltZSkgQVMgZGF5X29mX3dlZWsKKSBBUyBhbGxfd29yayBVU0lORyAoZGF5X29mX3dlZWsp) + +```sql +SELECT + top_author.day_of_week, + top_author.author, + top_author.author_work / all_work.total_work AS top_author_percent +FROM +( + SELECT + day_of_week, + author, + sum(lines_added) + sum(lines_deleted) AS author_work + FROM git.file_changes + WHERE time > (now() - toIntervalYear(1)) + GROUP BY + author, + dayOfWeek(time) AS day_of_week + ORDER BY + day_of_week ASC, + author_work DESC + LIMIT 1 BY day_of_week +) AS top_author +INNER JOIN +( + SELECT + day_of_week, + sum(lines_added) + sum(lines_deleted) AS total_work + FROM git.file_changes + WHERE time > (now() - toIntervalYear(1)) + GROUP BY dayOfWeek(time) AS day_of_week +) AS all_work USING (day_of_week) + +┌─day_of_week─┬─author──────────────┬──top_author_percent─┐ +│ 1 │ Alexey Milovidov │ 0.3168282877768332 │ +│ 2 │ Mikhail f. Shiryaev │ 0.3523434231193969 │ +│ 3 │ vdimir │ 0.11859742484577324 │ +│ 4 │ Nikolay Degterinsky │ 0.34577318920318467 │ +│ 5 │ Alexey Milovidov │ 0.13208704423684223 │ +│ 6 │ Alexey Milovidov │ 0.18895257783624633 │ +│ 7 │ Robert Schulze │ 0.3617405888930302 │ +└─────────────┴─────────────────────┴─────────────────────┘ + +7 rows in set. Elapsed: 0.014 sec. Processed 106.12 thousand rows, 1.38 MB (7.61 million rows/s., 98.65 MB/s.) +``` + +## Distribution of code age across repository + +We limit the analysis to the current files. For brevity, we restrict the results to a depth of 2 with 5 files per root folder. Adjust as required. 
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgY29uY2F0KHJvb3QsICcvJywgc3ViX2ZvbGRlcikgQVMgZm9sZGVyLAogICAgcm91bmQoYXZnKGRheXNfcHJlc2VudCkpIEFTIGF2Z19hZ2Vfb2ZfZmlsZXMsCiAgICBtaW4oZGF5c19wcmVzZW50KSBBUyBtaW5fYWdlX2ZpbGVzLAogICAgbWF4KGRheXNfcHJlc2VudCkgQVMgbWF4X2FnZV9maWxlcywKICAgIGNvdW50KCkgQVMgYwpGUk9NCigKICAgIFNFTEVDVAogICAgICAgIHBhdGgsCiAgICAgICAgZGF0ZURpZmYoJ2RheScsIG1pbih0aW1lKSwgdG9EYXRlKCcyMDIyLTExLTAzJykpIEFTIGRheXNfcHJlc2VudAogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIFdIRVJFIChwYXRoIElOIChjdXJyZW50X2ZpbGVzKSkgQU5EIChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKQogICAgR1JPVVAgQlkgcGF0aAopCkdST1VQIEJZCiAgICBzcGxpdEJ5Q2hhcignLycsIHBhdGgpWzFdIEFTIHJvb3QsCiAgICBzcGxpdEJ5Q2hhcignLycsIHBhdGgpWzJdIEFTIHN1Yl9mb2xkZXIKT1JERVIgQlkKICAgIHJvb3QgQVNDLAogICAgYyBERVNDCkxJTUlUIDUgQlkgcm9vdAo=) + +```sql +WITH current_files AS + ( + SELECT path + FROM + ( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path + ) + GROUP BY path + HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) + ORDER BY path ASC + ) +SELECT + concat(root, '/', sub_folder) AS folder, + round(avg(days_present)) AS avg_age_of_files, + min(days_present) AS min_age_files, + max(days_present) AS max_age_files, + count() AS c +FROM +( + SELECT + path, + dateDiff('day', min(time), toDate('2022-11-03')) AS days_present + FROM git.file_changes + WHERE (path IN (current_files)) AND (file_extension IN ('h', 'cpp', 'sql')) + GROUP BY path +) +GROUP BY + splitByChar('/', path)[1] AS root, + splitByChar('/', path)[2] AS sub_folder +ORDER BY + root ASC, + c DESC +LIMIT 5 BY root + +┌─folder───────────────────────────┬─avg_age_of_files─┬─min_age_files─┬─max_age_files─┬────c─┐ +│ base/base │ 387 │ 201 │ 397 │ 84 │ +│ base/glibc-compatibility │ 887 │ 59 │ 993 │ 19 │ +│ base/consistent-hashing │ 993 │ 993 │ 993 │ 5 │ +│ base/widechar_width │ 993 │ 993 │ 993 │ 2 │ +│ base/consistent-hashing-sumbur │ 993 │ 993 │ 993 │ 2 │ +│ docker/test │ 1043 │ 1043 │ 1043 │ 1 │ +│ programs/odbc-bridge │ 835 │ 91 │ 945 │ 25 │ +│ programs/copier │ 587 │ 14 │ 945 │ 22 │ +│ programs/library-bridge │ 155 │ 47 │ 608 │ 21 │ +│ programs/disks │ 144 │ 62 │ 150 │ 14 │ +│ programs/server │ 874 │ 709 │ 945 │ 10 │ +│ rust/BLAKE3 │ 52 │ 52 │ 52 │ 1 │ +│ src/Functions │ 752 │ 0 │ 944 │ 809 │ +│ src/Storages │ 700 │ 8 │ 944 │ 736 │ +│ 
src/Interpreters                 │              684 │             3 │           944 │  490 │
+│ src/Processors                   │              703 │            44 │           944 │  482 │
+│ src/Common                       │              673 │             7 │           944 │  473 │
+│ tests/queries                    │              674 │            -5 │           945 │ 3777 │
+│ tests/integration                │              656 │           132 │           945 │    4 │
+│ utils/memcpy-bench               │              601 │           599 │           605 │   10 │
+│ utils/keeper-bench               │              570 │           569 │           570 │    7 │
+│ utils/durability-test            │              793 │           793 │           793 │    4 │
+│ utils/self-extracting-executable │              143 │           143 │           143 │    3 │
+│ utils/self-extr-exec             │              224 │           224 │           224 │    2 │
+└──────────────────────────────────┴──────────────────┴───────────────┴───────────────┴──────┘
+
+24 rows in set. Elapsed: 0.129 sec. Processed 798.15 thousand rows, 15.11 MB (6.19 million rows/s., 117.08 MB/s.)
+```
+
+## What percentage of code for an author has been removed by other authors?
+
+For this question, we need the number of lines an author has had removed by other contributors, divided by the total number of lines they have written.
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBrLAogICAgd3JpdHRlbl9jb2RlLmMsCiAgICByZW1vdmVkX2NvZGUuYywKICAgIHJlbW92ZWRfY29kZS5jIC8gd3JpdHRlbl9jb2RlLmMgQVMgcmVtb3ZlX3JhdGlvCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yIEFTIGssCiAgICAgICAgY291bnQoKSBBUyBjCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgV0hFUkUgKHNpZ24gPSAxKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnKSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgIEdST1VQIEJZIGsKKSBBUyB3cml0dGVuX2NvZGUKSU5ORVIgSk9JTgooCiAgICBTRUxFQ1QKICAgICAgICBwcmV2X2F1dGhvciBBUyBrLAogICAgICAgIGNvdW50KCkgQVMgYwogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgIFdIRVJFIChzaWduID0gLTEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcpKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKSBBTkQgKGF1dGhvciAhPSBwcmV2X2F1dGhvcikKICAgIEdST1VQIEJZIGsKKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKGspCldIRVJFIHdyaXR0ZW5fY29kZS5jID4gMTAwMApPUkRFUiBCWSByZW1vdmVfcmF0aW8gREVTQwpMSU1JVCAxMAo=)
+
+```sql
+SELECT
+    k,
+    written_code.c,
+    removed_code.c,
+    removed_code.c / written_code.c AS remove_ratio
+FROM
+(
+    SELECT
+        author AS k,
+        count() AS c
+    FROM git.line_changes
+    WHERE (sign = 1) AND (file_extension IN ('h', 'cpp')) AND (line_type NOT IN ('Punct', 'Empty'))
+    GROUP BY k
+) AS written_code
+INNER JOIN
+(
+    SELECT
+        prev_author AS k,
+        count() AS c
+    FROM git.line_changes
+    WHERE (sign = -1) AND (file_extension IN ('h', 'cpp')) AND (line_type NOT IN ('Punct', 'Empty')) AND (author != prev_author)
+    GROUP BY k
+) AS removed_code USING (k)
+WHERE written_code.c > 1000
+ORDER BY remove_ratio DESC
+LIMIT 10
+
+┌─k────────────────────┬─────c─┬─removed_code.c─┬───────remove_ratio─┐
+│ Marek Vavruša        │  1458 │           1318 │ 0.9039780521262003 │
+│ Ivan                 │ 32715 │          27500 │ 0.8405930001528351 │
+│ artpaul              │  3450 │           2840 │ 0.8231884057971014 │
+│ Silviu Caragea       │  1542 │           1209 │ 0.7840466926070039 │
+│ Ruslan               │  1027 │            802 │ 0.7809152872444012 │
+│ Tsarkova Anastasia   │  1755 │           1364 │ 0.7772079772079772 │
+│ Vyacheslav Alipov    │  3526 │           2727 │ 0.7733976176971072 │
+│ Marek Vavruša        │  1467 │           1124 │ 0.7661895023858214 │
+│ f1yegor              │  7194 │           5213 │ 0.7246316374756742 │
+│ kreuzerkrieg         │  3406 │           2468 │  0.724603640634175 │
+└──────────────────────┴───────┴────────────────┴────────────────────┘
+
+10 rows in set. Elapsed: 0.126 sec. Processed 15.07 million rows, 73.51 MB (119.97 million rows/s., 585.16 MB/s.)
+```
+
+## Which files were rewritten the most times?
+
+
+The simplest approach to this question might be to count the number of line modifications per path (restricted to current files), e.g.:
+
+```sql
+WITH current_files AS
+    (
+        SELECT path
+        FROM
+        (
+            SELECT
+                old_path AS path,
+                max(time) AS last_time,
+                2 AS change_type
+            FROM git.file_changes
+            GROUP BY old_path
+            UNION ALL
+            SELECT
+                path,
+                max(time) AS last_time,
+                argMax(change_type, time) AS change_type
+            FROM git.file_changes
+            GROUP BY path
+        )
+        GROUP BY path
+        HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)'))
+        ORDER BY path ASC
+    )
+SELECT
+    path,
+    count() AS c
+FROM git.line_changes
+WHERE (file_extension IN ('h', 'cpp', 'sql')) AND (path IN (current_files))
+GROUP BY path
+ORDER BY c DESC
+LIMIT 10
+
+┌─path───────────────────────────────────────────────────┬─────c─┐
+│ src/Storages/StorageReplicatedMergeTree.cpp            │ 21871 │
+│ src/Storages/MergeTree/MergeTreeData.cpp               │ 17709 │
+│ programs/client/Client.cpp                             │ 15882 │
+│ src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp │ 14249 │
+│ src/Interpreters/InterpreterSelectQuery.cpp            │ 12636 │
+│ src/Parsers/ExpressionListParsers.cpp                  │ 11794 │
+│ src/Analyzer/QueryAnalysisPass.cpp                     │ 11760 │
+│ src/Coordination/KeeperStorage.cpp                     │ 10225 │
+│ src/Functions/FunctionsConversion.h                    │  9247 │
+│ src/Parsers/ExpressionElementParsers.cpp               │  8197 │
+└────────────────────────────────────────────────────────┴───────┘
+
+10 rows in set. Elapsed: 0.160 sec. Processed 8.07 million rows, 98.99 MB (50.49 million rows/s., 619.49 MB/s.)
+```
+
+This doesn't capture the notion of a "rewrite", however, where a large portion of the file changes in a single commit. This requires a more complex query. We consider a rewrite to be when over 50% of the file is deleted and 50% added; you can adjust the query to your own interpretation of what constitutes a rewrite.
+
+The query is limited to the current files only. We list all file changes by grouping by `path` and `commit_hash`, returning the number of lines added and removed. Using a window function, we estimate the file's total size at any moment in time by performing a cumulative sum and estimating the impact of any change on file size as `lines added - lines removed`. Using this statistic, we can calculate the percentage of the file that has been added or removed for each change. Finally, we count the number of file changes that constitute a rewrite per file, i.e. `(percent_add >= 0.5) AND (percent_delete >= 0.5) AND current_size > 50`. Note we require files to be more than 50 lines to avoid early contributions to a file being counted as a rewrite. This also avoids a bias to very small files, which may be more likely to be rewritten.
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY3VycmVudF9maWxlcyBBUwogICAgKAogICAgICAgIFNFTEVDVCBwYXRoCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIDIgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgb2xkX3BhdGgKICAgICAgICAgICAgVU5JT04gQUxMCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICAgICAgICAgIEdST1VQIEJZIHBhdGgKICAgICAgICApCiAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgIEhBVklORyAoYXJnTWF4KGNoYW5nZV90eXBlLCBsYXN0X3RpbWUpICE9IDIpIEFORCAoTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSkKICAgICAgICBPUkRFUiBCWSBwYXRoIEFTQwogICAgKSwKICAgIGNoYW5nZXMgQVMKICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIG1heF90aW1lLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgYW55KGxpbmVzX2FkZGVkKSBBUyBudW1fYWRkZWQsCiAgICAgICAgICAgIGFueShsaW5lc19kZWxldGVkKSBBUyBudW1fZGVsZXRlZCwKICAgICAgICAgICAgYW55KGNoYW5nZV90eXBlKSBBUyB0eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAocGF0aCBJTiAoY3VycmVudF9maWxlcykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaAogICAgICAgIE9SREVSIEJZCiAgICAgICAgICAgIHBhdGggQVNDLAogICAgICAgICAgICBtYXhfdGltZSBBU0MKICAgICksCiAgICByZXdyaXRlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00gY2hhbmdlcwogICAgKQpTRUxFQ1QKICAgIHBhdGgsCiAgICBjb3VudCgpIEFTIG51bV9yZXdyaXRlcwpGUk9NIHJld3JpdGVzCldIRVJFICh0eXBlID0gJ01vZGlmeScpIEFORCAocGVyY2VudF9hZGQgPj0gMC41KSBBTkQgKHBlcmNlbnRfZGVsZXRlID49IDAuNSkgQU5EIChjdXJyZW50X3NpemUgPiA1MCkKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBudW1fcmV3cml0ZXMgREVTQwpMSU1JVCAxMA==) + +```sql +WITH + current_files AS + ( + SELECT path + FROM + ( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path + ) + GROUP BY path + HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) + ORDER BY path ASC + ), + changes AS + ( + SELECT + path, + max(time) AS max_time, + commit_hash, + any(lines_added) AS num_added, + any(lines_deleted) AS num_deleted, + any(change_type) AS type + FROM git.file_changes + WHERE (change_type IN ('Add', 'Modify')) AND (path IN (current_files)) AND (file_extension IN ('h', 'cpp', 'sql')) + GROUP BY + path, + commit_hash + ORDER BY + path ASC, + max_time ASC + ), + rewrites AS + ( + SELECT + path, + commit_hash, 
+            max_time,
+            type,
+            num_added,
+            num_deleted,
+            sum(num_added - num_deleted) OVER (PARTITION BY path ORDER BY max_time ASC) AS current_size,
+            if(current_size > 0, num_added / current_size, 0) AS percent_add,
+            if(current_size > 0, num_deleted / current_size, 0) AS percent_delete
+        FROM changes
+    )
+SELECT
+    path,
+    count() AS num_rewrites
+FROM rewrites
+WHERE (type = 'Modify') AND (percent_add >= 0.5) AND (percent_delete >= 0.5) AND (current_size > 50)
+GROUP BY path
+ORDER BY num_rewrites DESC
+LIMIT 10
+
+┌─path──────────────────────────────────────────────────┬─num_rewrites─┐
+│ src/Storages/WindowView/StorageWindowView.cpp          │            8 │
+│ src/Functions/array/arrayIndex.h                       │            7 │
+│ src/Dictionaries/CacheDictionary.cpp                   │            6 │
+│ src/Dictionaries/RangeHashedDictionary.cpp             │            5 │
+│ programs/client/Client.cpp                             │            4 │
+│ src/Functions/polygonPerimeter.cpp                     │            4 │
+│ src/Functions/polygonsEquals.cpp                       │            4 │
+│ src/Functions/polygonsWithin.cpp                       │            4 │
+│ src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp  │            4 │
+│ src/Functions/polygonsSymDifference.cpp                │            4 │
+└────────────────────────────────────────────────────────┴──────────────┘
+
+10 rows in set. Elapsed: 0.299 sec. Processed 798.15 thousand rows, 31.52 MB (2.67 million rows/s., 105.29 MB/s.)
+```
+
+## What weekday does the code have the highest chance to stay in the repository?
+
+For this, we need to identify a line of code uniquely. We estimate this (as the same line may appear multiple times in a file) using the path and line contents.
+
+We query for lines added, joining this with the lines removed, filtering to cases where the latter occurs more recently than the former. This gives us the deleted lines, from which we can compute the time between these two events.
+
+Finally, we aggregate across this dataset to compute the average number of days lines stay in the repository by the day of the week.
+ +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2Vla19hZGRlZCwKICAgIGNvdW50KCkgQVMgbnVtLAogICAgYXZnKGRheXNfcHJlc2VudCkgQVMgYXZnX2RheXNfcHJlc2VudApGUk9NCigKICAgIFNFTEVDVAogICAgICAgIGFkZGVkX2NvZGUubGluZSwKICAgICAgICBhZGRlZF9jb2RlLnRpbWUgQVMgYWRkZWRfZGF5LAogICAgICAgIGRhdGVEaWZmKCdkYXknLCBhZGRlZF9jb2RlLnRpbWUsIHJlbW92ZWRfY29kZS50aW1lKSBBUyBkYXlzX3ByZXNlbnQKICAgIEZST00KICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZSwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgIFdIRVJFIChzaWduID0gMSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBsaW5lCiAgICApIEFTIGFkZGVkX2NvZGUKICAgIElOTkVSIEpPSU4KICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZSwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgIFdIRVJFIChzaWduID0gLTEpIEFORCAobGluZV90eXBlIE5PVCBJTiAoJ1B1bmN0JywgJ0VtcHR5JykpCiAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZQogICAgKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKHBhdGgsIGxpbmUpCiAgICBXSEVSRSByZW1vdmVkX2NvZGUudGltZSA+IGFkZGVkX2NvZGUudGltZQopCkdST1VQIEJZIGRheU9mV2VlayhhZGRlZF9kYXkpIEFTIGRheV9vZl93ZWVrX2FkZGVk) + +```sql +SELECT + day_of_week_added, + count() AS num, + avg(days_present) AS avg_days_present +FROM +( + SELECT + added_code.line, + added_code.time AS added_day, + dateDiff('day', added_code.time, removed_code.time) AS days_present + FROM + ( + SELECT + path, + line, + max(time) AS time + FROM git.line_changes + WHERE (sign = 1) AND (line_type NOT IN ('Punct', 'Empty')) + GROUP BY + path, + line + ) AS added_code + INNER JOIN + ( + SELECT + path, + line, + max(time) AS time + FROM git.line_changes + WHERE (sign = -1) AND (line_type NOT IN ('Punct', 'Empty')) + GROUP BY + path, + line + ) AS removed_code USING (path, line) + WHERE removed_code.time > added_code.time +) +GROUP BY dayOfWeek(added_day) AS day_of_week_added + +┌─day_of_week_added─┬────num─┬───avg_days_present─┐ +│ 1 │ 171879 │ 193.81759260875384 │ +│ 2 │ 141448 │ 153.0931013517335 │ +│ 3 │ 161230 │ 137.61553681076722 │ +│ 4 │ 255728 │ 121.14149799787273 │ +│ 5 │ 203907 │ 141.60181847606998 │ +│ 6 │ 62305 │ 202.43449161383518 │ +│ 7 │ 70904 │ 220.0266134491707 │ +└───────────────────┴────────┴────────────────────┘ + +7 rows in set. Elapsed: 3.965 sec. Processed 15.07 million rows, 1.92 GB (3.80 million rows/s., 483.50 MB/s.) +``` + +## Files sorted by average code age + +This query uses the same principle as [What weekday does the code have the highest chance to stay in the repository](#what-weekday-does-the-code-have-the-highest-chance-to-stay-in-the-repository) - by aiming to uniquely identify a line of code using the path and line contents. +This allows us to identify the time between when a line was added and removed. We filter to current files and code only, however, and average the time for each file across lines. 
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY3VycmVudF9maWxlcyBBUwogICAgKAogICAgICAgIFNFTEVDVCBwYXRoCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIDIgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgb2xkX3BhdGgKICAgICAgICAgICAgVU5JT04gQUxMCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICAgICAgICAgIEdST1VQIEJZIHBhdGgKICAgICAgICApCiAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgIEhBVklORyAoYXJnTWF4KGNoYW5nZV90eXBlLCBsYXN0X3RpbWUpICE9IDIpIEFORCAoTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSkKICAgICAgICBPUkRFUiBCWSBwYXRoIEFTQwogICAgKSwKICAgIGxpbmVzX3JlbW92ZWQgQVMKICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgYWRkZWRfY29kZS5wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgIGFkZGVkX2NvZGUubGluZSwKICAgICAgICAgICAgYWRkZWRfY29kZS50aW1lIEFTIGFkZGVkX2RheSwKICAgICAgICAgICAgZGF0ZURpZmYoJ2RheScsIGFkZGVkX2NvZGUudGltZSwgcmVtb3ZlZF9jb2RlLnRpbWUpIEFTIGRheXNfcHJlc2VudAogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGxpbmUsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgdGltZSwKICAgICAgICAgICAgICAgIGFueShmaWxlX2V4dGVuc2lvbikgQVMgZmlsZV9leHRlbnNpb24KICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKHNpZ24gPSAxKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKQogICAgICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGxpbmUKICAgICAgICApIEFTIGFkZGVkX2NvZGUKICAgICAgICBJTk5FUiBKT0lOCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBsaW5lLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKHNpZ24gPSAtMSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBsaW5lCiAgICAgICAgKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKHBhdGgsIGxpbmUpCiAgICAgICAgV0hFUkUgKHJlbW92ZWRfY29kZS50aW1lID4gYWRkZWRfY29kZS50aW1lKSBBTkQgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIGF2ZyhkYXlzX3ByZXNlbnQpIEFTIGF2Z19jb2RlX2FnZQpGUk9NIGxpbmVzX3JlbW92ZWQKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBhdmdfY29kZV9hZ2UgREVTQwpMSU1JVCAxMA==) + +```sql +WITH + current_files AS + ( + SELECT path + FROM + ( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path + ) + GROUP BY path + HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) + ORDER BY path ASC + ), + lines_removed AS + ( + SELECT + added_code.path AS path, + added_code.line, + added_code.time AS added_day, + dateDiff('day', added_code.time, removed_code.time) AS days_present + FROM + ( + SELECT + path, + line, + max(time) AS time, + any(file_extension) AS file_extension + FROM git.line_changes + WHERE (sign = 1) AND (line_type NOT IN ('Punct', 'Empty')) + GROUP BY + path, + line + ) AS added_code + INNER JOIN + ( 
+ SELECT + path, + line, + max(time) AS time + FROM git.line_changes + WHERE (sign = -1) AND (line_type NOT IN ('Punct', 'Empty')) + GROUP BY + path, + line + ) AS removed_code USING (path, line) + WHERE (removed_code.time > added_code.time) AND (path IN (current_files)) AND (file_extension IN ('h', 'cpp', 'sql')) + ) +SELECT + path, + avg(days_present) AS avg_code_age +FROM lines_removed +GROUP BY path +ORDER BY avg_code_age DESC +LIMIT 10 + +┌─path────────────────────────────────────────────────────────────┬──────avg_code_age─┐ +│ utils/corrector_utf8/corrector_utf8.cpp │ 1353.888888888889 │ +│ tests/queries/0_stateless/01288_shard_max_network_bandwidth.sql │ 881 │ +│ src/Functions/replaceRegexpOne.cpp │ 861 │ +│ src/Functions/replaceRegexpAll.cpp │ 861 │ +│ src/Functions/replaceOne.cpp │ 861 │ +│ utils/zookeeper-remove-by-list/main.cpp │ 838.25 │ +│ tests/queries/0_stateless/01356_state_resample.sql │ 819 │ +│ tests/queries/0_stateless/01293_create_role.sql │ 819 │ +│ src/Functions/ReplaceStringImpl.h │ 810 │ +│ src/Interpreters/createBlockSelector.cpp │ 795 │ +└─────────────────────────────────────────────────────────────────┴───────────────────┘ + +10 rows in set. Elapsed: 3.134 sec. Processed 16.13 million rows, 1.83 GB (5.15 million rows/s., 582.99 MB/s.) +``` + +## Who tends to write more tests / CPP code / comments? + +There are a few ways we can address this question. Focusing on the code to test ratio, this query is relatively simple - count the number of contributions to folders containing `tests` and compute the ratio to total contributions. + +Note we limit to users with more than 20 changes to focus on regular committers and avoid a bias to one-off contributions. + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBjb3VudElmKChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcsICdzaCcsICdweScsICdleHBlY3QnKSkgQU5EIChwYXRoIExJS0UgJyV0ZXN0cyUnKSkgQVMgdGVzdCwKICAgIGNvdW50SWYoKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpIEFORCAoTk9UIChwYXRoIExJS0UgJyV0ZXN0cyUnKSkpIEFTIGNvZGUsCiAgICBjb2RlIC8gKGNvZGUgKyB0ZXN0KSBBUyByYXRpb19jb2RlCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCkdST1VQIEJZIGF1dGhvcgpIQVZJTkcgY29kZSA+IDIwCk9SREVSIEJZIGNvZGUgREVTQwpMSU1JVCAyMA==) + +```sql +SELECT + author, + countIf((file_extension IN ('h', 'cpp', 'sql', 'sh', 'py', 'expect')) AND (path LIKE '%tests%')) AS test, + countIf((file_extension IN ('h', 'cpp', 'sql')) AND (NOT (path LIKE '%tests%'))) AS code, + code / (code + test) AS ratio_code +FROM git.file_changes +GROUP BY author +HAVING code > 20 +ORDER BY code DESC +LIMIT 20 + +┌─author───────────────┬─test─┬──code─┬─────────ratio_code─┐ +│ Alexey Milovidov │ 6617 │ 41799 │ 0.8633303040317251 │ +│ Nikolai Kochetov │ 916 │ 13361 │ 0.9358408629263851 │ +│ alesapin │ 2408 │ 8796 │ 0.785076758300607 │ +│ kssenii │ 869 │ 6769 │ 0.8862267609321812 │ +│ Maksim Kita │ 799 │ 5862 │ 0.8800480408347096 │ +│ Alexander Tokmakov │ 1472 │ 5727 │ 0.7955271565495208 │ +│ Vitaly Baranov │ 1764 │ 5521 │ 0.7578586135895676 │ +│ Ivan Lezhankin │ 843 │ 4698 │ 0.8478613968597726 │ +│ Anton Popov │ 599 │ 4346 │ 0.8788675429726996 │ +│ Ivan │ 2630 │ 4269 │ 0.6187853312074214 │ +│ Azat Khuzhin │ 1664 │ 3697 │ 0.689610147360567 │ +│ Amos Bird │ 400 │ 2901 │ 0.8788245986064829 │ +│ proller │ 1207 │ 2377 │ 0.6632254464285714 │ +│ chertus │ 453 │ 2359 │ 0.8389046941678521 │ +│ alexey-milovidov │ 303 │ 2321 │ 0.8845274390243902 │ +│ Alexey Arno │ 169 │ 2310 │ 0.9318273497377975 │ +│ Vitaliy Lyudvichenko │ 334 │ 2283 │ 0.8723729461215132 │ +│ Robert 
Schulze │ 182 │ 2196 │ 0.9234650967199327 │ +│ CurtizJ │ 460 │ 2158 │ 0.8242933537051184 │ +│ Alexander Kuzmenkov │ 298 │ 2092 │ 0.8753138075313808 │ +└──────────────────────┴──────┴───────┴────────────────────┘ + +20 rows in set. Elapsed: 0.034 sec. Processed 266.05 thousand rows, 4.65 MB (7.93 million rows/s., 138.76 MB/s.) +``` + +We can plot this distribution as a histogram. + +[play](https://play.clickhouse.com/play?user=play#V0lUSCAoCiAgICAgICAgU0VMRUNUIGhpc3RvZ3JhbSgxMCkocmF0aW9fY29kZSkgQVMgaGlzdAogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgY291bnRJZigoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnLCAnc2gnLCAncHknLCAnZXhwZWN0JykpIEFORCAocGF0aCBMSUtFICcldGVzdHMlJykpIEFTIHRlc3QsCiAgICAgICAgICAgICAgICBjb3VudElmKChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKSBBTkQgKE5PVCAocGF0aCBMSUtFICcldGVzdHMlJykpKSBBUyBjb2RlLAogICAgICAgICAgICAgICAgY29kZSAvIChjb2RlICsgdGVzdCkgQVMgcmF0aW9fY29kZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBhdXRob3IKICAgICAgICAgICAgSEFWSU5HIGNvZGUgPiAyMAogICAgICAgICAgICBPUkRFUiBCWSBjb2RlIERFU0MKICAgICAgICAgICAgTElNSVQgMjAKICAgICAgICApCiAgICApIEFTIGhpc3QKU0VMRUNUCiAgICBhcnJheUpvaW4oaGlzdCkuMSBBUyBsb3dlciwKICAgIGFycmF5Sm9pbihoaXN0KS4yIEFTIHVwcGVyLAogICAgYmFyKGFycmF5Sm9pbihoaXN0KS4zLCAwLCAxMDAsIDUwMCkgQVMgYmFy) + +```sql +WITH ( + SELECT histogram(10)(ratio_code) AS hist + FROM + ( + SELECT + author, + countIf((file_extension IN ('h', 'cpp', 'sql', 'sh', 'py', 'expect')) AND (path LIKE '%tests%')) AS test, + countIf((file_extension IN ('h', 'cpp', 'sql')) AND (NOT (path LIKE '%tests%'))) AS code, + code / (code + test) AS ratio_code + FROM git.file_changes + GROUP BY author + HAVING code > 20 + ORDER BY code DESC + LIMIT 20 + ) + ) AS hist +SELECT + arrayJoin(hist).1 AS lower, + arrayJoin(hist).2 AS upper, + bar(arrayJoin(hist).3, 0, 100, 500) AS bar + +┌──────────────lower─┬──────────────upper─┬─bar───────────────────────────┐ +│ 0.6187853312074214 │ 0.6410053888179964 │ █████ │ +│ 0.6410053888179964 │ 0.6764177968945693 │ █████ │ +│ 0.6764177968945693 │ 0.7237343804750673 │ █████ │ +│ 0.7237343804750673 │ 0.7740802855073157 │ █████▋ │ +│ 0.7740802855073157 │ 0.807297655565091 │ ████████▋ │ +│ 0.807297655565091 │ 0.8338381996094653 │ ██████▎ │ +│ 0.8338381996094653 │ 0.8533566747727687 │ ████████▋ │ +│ 0.8533566747727687 │ 0.871392376017531 │ █████████▍ │ +│ 0.871392376017531 │ 0.904916108899021 │ ████████████████████████████▋ │ +│ 0.904916108899021 │ 0.9358408629263851 │ █████████████████▌ │ +└────────────────────┴────────────────────┴───────────────────────────────┘ +10 rows in set. Elapsed: 0.051 sec. Processed 266.05 thousand rows, 4.65 MB (5.24 million rows/s., 91.64 MB/s.) +``` + +Most contributors write more code than tests, as you'd expect. + +What about who adds the most comments when contributing code? 
+
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBhdmcocmF0aW9fY29tbWVudHMpIEFTIGF2Z19yYXRpb19jb21tZW50cywKICAgIHN1bShjb2RlKSBBUyBjb2RlCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgIGNvdW50SWYobGluZV90eXBlID0gJ0NvbW1lbnQnKSBBUyBjb21tZW50cywKICAgICAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZSwKICAgICAgICBpZihjb21tZW50cyA+IDAsIGNvbW1lbnRzIC8gKGNvbW1lbnRzICsgY29kZSksIDApIEFTIHJhdGlvX2NvbW1lbnRzCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgR1JPVVAgQlkKICAgICAgICBhdXRob3IsCiAgICAgICAgY29tbWl0X2hhc2gKKQpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgY29kZSBERVNDCkxJTUlUIDEwCg==)
+
+```sql
+SELECT
+    author,
+    avg(ratio_comments) AS avg_ratio_comments,
+    sum(code) AS code
+FROM
+(
+    SELECT
+        author,
+        commit_hash,
+        countIf(line_type = 'Comment') AS comments,
+        countIf(line_type = 'Code') AS code,
+        if(comments > 0, comments / (comments + code), 0) AS ratio_comments
+    FROM git.line_changes
+    GROUP BY
+        author,
+        commit_hash
+)
+GROUP BY author
+ORDER BY code DESC
+LIMIT 10
+┌─author─────────────┬──avg_ratio_comments─┬────code─┐
+│ Alexey Milovidov   │  0.1034915408309902 │ 1147196 │
+│ s-kat              │  0.1361718900215362 │  614224 │
+│ Nikolai Kochetov   │ 0.08722993407690126 │  218328 │
+│ alesapin           │  0.1040477684726504 │  198082 │
+│ Vitaly Baranov     │ 0.06446875712939285 │  161801 │
+│ Maksim Kita        │ 0.06863376297549255 │  156381 │
+│ Alexey Arno        │ 0.11252677608033655 │  146642 │
+│ Vitaliy Zakaznikov │ 0.06199215397180561 │  138530 │
+│ kssenii            │ 0.07455322590796751 │  131143 │
+│ Artur              │ 0.12383737231074826 │  121484 │
+└────────────────────┴─────────────────────┴─────────┘
+10 rows in set. Elapsed: 0.290 sec. Processed 7.54 million rows, 394.57 MB (26.00 million rows/s., 1.36 GB/s.)
+```
+
+Note we sort by code contributions. The comment ratio is surprisingly high for all of our largest contributors, and is part of what makes our code so readable.
+
+## How do an author's commits change over time with respect to code/comments percentage?
+ +To compute this by author is trivial, + +[play](#U0VMRUNUCiAgICBhdXRob3IsCiAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZV9saW5lcywKICAgIGNvdW50SWYoKGxpbmVfdHlwZSA9ICdDb21tZW50JykgT1IgKGxpbmVfdHlwZSA9ICdQdW5jdCcpKSBBUyBjb21tZW50cywKICAgIGNvZGVfbGluZXMgLyAoY29tbWVudHMgKyBjb2RlX2xpbmVzKSBBUyByYXRpb19jb2RlLAogICAgdG9TdGFydE9mV2Vlayh0aW1lKSBBUyB3ZWVrCkZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCkdST1VQIEJZCiAgICB0aW1lLAogICAgYXV0aG9yCk9SREVSIEJZCiAgICBhdXRob3IgQVNDLAogICAgdGltZSBBU0MKTElNSVQgMTA=) + +```sql +SELECT + author, + countIf(line_type = 'Code') AS code_lines, + countIf((line_type = 'Comment') OR (line_type = 'Punct')) AS comments, + code_lines / (comments + code_lines) AS ratio_code, + toStartOfWeek(time) AS week +FROM git.line_changes +GROUP BY + time, + author +ORDER BY + author ASC, + time ASC +LIMIT 10 + +┌─author──────────────────────┬─code_lines─┬─comments─┬─────────ratio_code─┬───────week─┐ +│ 1lann │ 8 │ 0 │ 1 │ 2022-03-06 │ +│ 20018712 │ 2 │ 0 │ 1 │ 2020-09-13 │ +│ 243f6a8885a308d313198a2e037 │ 0 │ 2 │ 0 │ 2020-12-06 │ +│ 243f6a8885a308d313198a2e037 │ 0 │ 112 │ 0 │ 2020-12-06 │ +│ 243f6a8885a308d313198a2e037 │ 0 │ 14 │ 0 │ 2020-12-06 │ +│ 3ldar-nasyrov │ 2 │ 0 │ 1 │ 2021-03-14 │ +│ 821008736@qq.com │ 27 │ 2 │ 0.9310344827586207 │ 2019-04-21 │ +│ ANDREI STAROVEROV │ 182 │ 60 │ 0.7520661157024794 │ 2021-05-09 │ +│ ANDREI STAROVEROV │ 7 │ 0 │ 1 │ 2021-05-09 │ +│ ANDREI STAROVEROV │ 32 │ 12 │ 0.7272727272727273 │ 2021-05-09 │ +└─────────────────────────────┴────────────┴──────────┴────────────────────┴────────────┘ + +10 rows in set. Elapsed: 0.145 sec. Processed 7.54 million rows, 51.09 MB (51.83 million rows/s., 351.44 MB/s.) +``` + +Ideally, however, we want to see how this changes in aggregate across all authors from the first day they start committing. Do they slowly reduce the number of comments they write? + +To compute this, we first work out each author's comments ratio over time - similar to [Who tends to write more tests / CPP code / comments?](#who-tends-to-write-more-tests--cpp-code--comments). This is joined against each author's start date, allowing us to calculate the comment ratio by week offset. + +After calculating the average by-week offset across all authors, we sample these results by selecting every 10th week. 
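+
+For intuition, the week offset is simply `dateDiff` counting the number of week boundaries between an author's first week and the week of a later contribution. A minimal, self-contained illustration (the dates here are made up):
+
+```sql
+-- hypothetical dates: an author's first commit week starts on 2022-01-03,
+-- so a commit in the week of 2022-01-24 lands at week offset 3
+SELECT dateDiff('week', toDate('2022-01-03'), toDate('2022-01-24')) AS week_offset
+```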
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSCBhdXRob3JfcmF0aW9zX2J5X29mZnNldCBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGRhdGVEaWZmKCd3ZWVrJywgc3RhcnRfZGF0ZXMuc3RhcnRfZGF0ZSwgY29udHJpYnV0aW9ucy53ZWVrKSBBUyB3ZWVrX29mZnNldCwKICAgICAgICAgICAgcmF0aW9fY29kZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgdG9TdGFydE9mV2VlayhtaW4odGltZSkpIEFTIHN0YXJ0X2RhdGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKQogICAgICAgICAgICBHUk9VUCBCWSBhdXRob3IgQVMgc3RhcnRfZGF0ZXMKICAgICAgICApIEFTIHN0YXJ0X2RhdGVzCiAgICAgICAgSU5ORVIgSk9JTgogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgICAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZSwKICAgICAgICAgICAgICAgIGNvdW50SWYoKGxpbmVfdHlwZSA9ICdDb21tZW50JykgT1IgKGxpbmVfdHlwZSA9ICdQdW5jdCcpKSBBUyBjb21tZW50cywKICAgICAgICAgICAgICAgIGNvbW1lbnRzIC8gKGNvbW1lbnRzICsgY29kZSkgQVMgcmF0aW9fY29kZSwKICAgICAgICAgICAgICAgIHRvU3RhcnRPZldlZWsodGltZSkgQVMgd2VlawogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgICAgICBXSEVSRSAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkgQU5EIChzaWduID0gMSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHRpbWUsCiAgICAgICAgICAgICAgICBhdXRob3IKICAgICAgICAgICAgSEFWSU5HIGNvZGUgPiAyMAogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgYXV0aG9yIEFTQywKICAgICAgICAgICAgICAgIHRpbWUgQVNDCiAgICAgICAgKSBBUyBjb250cmlidXRpb25zIFVTSU5HIChhdXRob3IpCiAgICApClNFTEVDVAogICAgd2Vla19vZmZzZXQsCiAgICBhdmcocmF0aW9fY29kZSkgQVMgYXZnX2NvZGVfcmF0aW8KRlJPTSBhdXRob3JfcmF0aW9zX2J5X29mZnNldApHUk9VUCBCWSB3ZWVrX29mZnNldApIQVZJTkcgKHdlZWtfb2Zmc2V0ICUgMTApID0gMApPUkRFUiBCWSB3ZWVrX29mZnNldCBBU0MKTElNSVQgMjAK) + +```sql +WITH author_ratios_by_offset AS + ( + SELECT + author, + dateDiff('week', start_dates.start_date, contributions.week) AS week_offset, + ratio_code + FROM + ( + SELECT + author, + toStartOfWeek(min(time)) AS start_date + FROM git.line_changes + WHERE file_extension IN ('h', 'cpp', 'sql') + GROUP BY author AS start_dates + ) AS start_dates + INNER JOIN + ( + SELECT + author, + countIf(line_type = 'Code') AS code, + countIf((line_type = 'Comment') OR (line_type = 'Punct')) AS comments, + comments / (comments + code) AS ratio_code, + toStartOfWeek(time) AS week + FROM git.line_changes + WHERE (file_extension IN ('h', 'cpp', 'sql')) AND (sign = 1) + GROUP BY + time, + author + HAVING code > 20 + ORDER BY + author ASC, + time ASC + ) AS contributions USING (author) + ) +SELECT + week_offset, + avg(ratio_code) AS avg_code_ratio +FROM author_ratios_by_offset +GROUP BY week_offset +HAVING (week_offset % 10) = 0 +ORDER BY week_offset ASC +LIMIT 20 + +┌─week_offset─┬──────avg_code_ratio─┐ +│ 0 │ 0.21626798253005078 │ +│ 10 │ 0.18299433892099454 │ +│ 20 │ 0.22847255749045017 │ +│ 30 │ 0.2037816688365288 │ +│ 40 │ 0.1987063517030308 │ +│ 50 │ 0.17341406302829748 │ +│ 60 │ 0.1808884776496144 │ +│ 70 │ 0.18711773536450496 │ +│ 80 │ 0.18905573684766458 │ +│ 90 │ 0.2505147771581594 │ +│ 100 │ 0.2427673990917429 │ +│ 110 │ 0.19088569009169926 │ +│ 120 │ 0.14218574654598348 │ +│ 130 │ 0.20894252550489317 │ +│ 140 │ 0.22316626978848397 │ +│ 150 │ 0.1859507592277053 │ +│ 160 │ 0.22007759757363546 │ +│ 170 │ 0.20406936638195144 │ +│ 180 │ 0.1412102467834332 │ +│ 190 │ 0.20677550885049117 │ +└─────────────┴─────────────────────┘ + +20 rows in set. Elapsed: 0.167 sec. Processed 15.07 million rows, 101.74 MB (90.51 million rows/s., 610.98 MB/s.) 
+``` + +Encouragingly, our comment % is pretty constant and doesn't degrade the longer authors contribute. + +## What is the average time before code will be rewritten and the median (half-life of code decay)? + +We can use the same principle as [List files that were rewritten most number of time or by most of authors](#list-files-that-were-rewritten-most-number-of-time-or-by-most-of-authors) to identify rewrites but consider all files. A window function is used to compute the time between rewrites for each file. From this, we can calculate an average and median across all files. + +[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBtYXhfdGltZSwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICAgICAgYW55KGxpbmVzX2FkZGVkKSBBUyBudW1fYWRkZWQsCiAgICAgICAgICAgICAgICBhbnkobGluZXNfZGVsZXRlZCkgQVMgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgICAgICBhbnkoY2hhbmdlX3R5cGUpIEFTIHR5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKGNoYW5nZV90eXBlIElOICgnQWRkJywgJ01vZGlmeScpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCiAgICAgICAgICAgIEdST1VQIEJZCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgY29tbWl0X2hhc2gKICAgICAgICAgICAgT1JERVIgQlkKICAgICAgICAgICAgICAgIHBhdGggQVNDLAogICAgICAgICAgICAgICAgbWF4X3RpbWUgQVNDCiAgICAgICAgKQogICAgKSwKICAgIHJld3JpdGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICosCiAgICAgICAgICAgIGFueShtYXhfdGltZSkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZpb3VzX3Jld3JpdGUsCiAgICAgICAgICAgIGRhdGVEaWZmKCdkYXknLCBwcmV2aW91c19yZXdyaXRlLCBtYXhfdGltZSkgQVMgcmV3cml0ZV9kYXlzCiAgICAgICAgRlJPTSBjaGFuZ2VzCiAgICAgICAgV0hFUkUgKHR5cGUgPSAnTW9kaWZ5JykgQU5EIChwZXJjZW50X2FkZCA+PSAwLjUpIEFORCAocGVyY2VudF9kZWxldGUgPj0gMC41KSBBTkQgKGN1cnJlbnRfc2l6ZSA+IDUwKQogICAgKQpTRUxFQ1QKICAgIGF2Z0lmKHJld3JpdGVfZGF5cywgcmV3cml0ZV9kYXlzID4gMCkgQVMgYXZnX3Jld3JpdGVfdGltZSwKICAgIHF1YW50aWxlc1RpbWluZ0lmKDAuNSkocmV3cml0ZV9kYXlzLCByZXdyaXRlX2RheXMgPiAwKSBBUyBoYWxmX2xpZmUKRlJPTSByZXdyaXRlcw==) + +```sql +WITH + changes AS + ( + SELECT + path, + commit_hash, + max_time, + type, + num_added, + num_deleted, + sum(num_added - num_deleted) OVER (PARTITION BY path ORDER BY max_time ASC) AS current_size, + if(current_size > 0, num_added / current_size, 0) AS percent_add, + if(current_size > 0, num_deleted / current_size, 0) AS percent_delete + FROM + ( + SELECT + path, + max(time) AS max_time, + commit_hash, + any(lines_added) AS num_added, + any(lines_deleted) AS num_deleted, + any(change_type) AS type + FROM git.file_changes + WHERE (change_type IN ('Add', 'Modify')) AND (file_extension IN ('h', 'cpp', 'sql')) + GROUP BY + path, + commit_hash + ORDER BY + path ASC, + max_time ASC + ) + ), + rewrites AS + ( + SELECT + *, + any(max_time) OVER (PARTITION BY path ORDER BY max_time ASC ROWS BETWEEN 1 PRECEDING 
AND CURRENT ROW) AS previous_rewrite, + dateDiff('day', previous_rewrite, max_time) AS rewrite_days + FROM changes + WHERE (type = 'Modify') AND (percent_add >= 0.5) AND (percent_delete >= 0.5) AND (current_size > 50) + ) +SELECT + avgIf(rewrite_days, rewrite_days > 0) AS avg_rewrite_time, + quantilesTimingIf(0.5)(rewrite_days, rewrite_days > 0) AS half_life +FROM rewrites + +┌─avg_rewrite_time─┬─half_life─┐ +│ 122.2890625 │ [23] │ +└──────────────────┴───────────┘ + +1 row in set. Elapsed: 0.388 sec. Processed 266.05 thousand rows, 22.85 MB (685.82 thousand rows/s., 58.89 MB/s.) +``` + +## What is the worst time to write code in sense that the code has highest chance to be re-written? + +Similar to [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay) and [List files that were rewritten most number of time or by most of authors](#list-files-that-were-rewritten-most-number-of-time-or-by-most-of-authors), except we aggregate by day of week. Adjust as required e.g. month of year. + +[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBtYXhfdGltZSwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICAgICAgYW55KGZpbGVfbGluZXNfYWRkZWQpIEFTIG51bV9hZGRlZCwKICAgICAgICAgICAgICAgIGFueShmaWxlX2xpbmVzX2RlbGV0ZWQpIEFTIG51bV9kZWxldGVkLAogICAgICAgICAgICAgICAgYW55KGZpbGVfY2hhbmdlX3R5cGUpIEFTIHR5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKGZpbGVfY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBjb21taXRfaGFzaAogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgcGF0aCBBU0MsCiAgICAgICAgICAgICAgICBtYXhfdGltZSBBU0MKICAgICAgICApCiAgICApLAogICAgcmV3cml0ZXMgQVMKICAgICgKICAgICAgICBTRUxFQ1QgYW55KG1heF90aW1lKSBPVkVSIChQQVJUSVRJT04gQlkgcGF0aCBPUkRFUiBCWSBtYXhfdGltZSBBU0MgUk9XUyBCRVRXRUVOIDEgUFJFQ0VESU5HIEFORCBDVVJSRU5UIFJPVykgQVMgcHJldmlvdXNfcmV3cml0ZQogICAgICAgIEZST00gY2hhbmdlcwogICAgICAgIFdIRVJFICh0eXBlID0gJ01vZGlmeScpIEFORCAocGVyY2VudF9hZGQgPj0gMC41KSBBTkQgKHBlcmNlbnRfZGVsZXRlID49IDAuNSkgQU5EIChjdXJyZW50X3NpemUgPiA1MCkKICAgICkKU0VMRUNUCiAgICBkYXlPZldlZWsocHJldmlvdXNfcmV3cml0ZSkgQVMgZGF5T2ZXZWVrLAogICAgY291bnQoKSBBUyBudW1fcmVfd3JpdGVzCkZST00gcmV3cml0ZXMKR1JPVVAgQlkgZGF5T2ZXZWVr) + +```sql +WITH + changes AS + ( + SELECT + path, + commit_hash, + max_time, + type, + num_added, + num_deleted, + sum(num_added - num_deleted) OVER (PARTITION BY path ORDER BY max_time ASC) AS current_size, + if(current_size > 0, num_added / current_size, 0) AS percent_add, + if(current_size > 0, num_deleted / current_size, 0) AS percent_delete + FROM + ( + SELECT + path, + max(time) AS max_time, + commit_hash, + any(file_lines_added) 
AS num_added,
+                any(file_lines_deleted) AS num_deleted,
+                any(file_change_type) AS type
+            FROM git.line_changes
+            WHERE (file_change_type IN ('Add', 'Modify')) AND (file_extension IN ('h', 'cpp', 'sql'))
+            GROUP BY
+                path,
+                commit_hash
+            ORDER BY
+                path ASC,
+                max_time ASC
+        )
+    ),
+    rewrites AS
+    (
+        SELECT any(max_time) OVER (PARTITION BY path ORDER BY max_time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS previous_rewrite
+        FROM changes
+        WHERE (type = 'Modify') AND (percent_add >= 0.5) AND (percent_delete >= 0.5) AND (current_size > 50)
+    )
+SELECT
+    dayOfWeek(previous_rewrite) AS dayOfWeek,
+    count() AS num_re_writes
+FROM rewrites
+GROUP BY dayOfWeek
+
+┌─dayOfWeek─┬─num_re_writes─┐
+│         1 │           111 │
+│         2 │           121 │
+│         3 │            91 │
+│         4 │           111 │
+│         5 │            90 │
+│         6 │            64 │
+│         7 │            46 │
+└───────────┴───────────────┘
+
+7 rows in set. Elapsed: 0.466 sec. Processed 7.54 million rows, 701.52 MB (16.15 million rows/s., 1.50 GB/s.)
+```
+
+## Which authors' code is the most sticky?
+
+We define "sticky" as how long an author's code stays before it is rewritten. This is similar to the previous question [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay), and uses the same metric for rewrites, i.e. 50% additions and 50% deletions to the file. We compute the average rewrite time per author and only consider contributors with more than two files.
+
+[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICBtYXhfdGltZSwKICAgICAgICAgICAgdHlwZSwKICAgICAgICAgICAgbnVtX2FkZGVkLAogICAgICAgICAgICBudW1fZGVsZXRlZCwKICAgICAgICAgICAgc3VtKG51bV9hZGRlZCAtIG51bV9kZWxldGVkKSBPVkVSIChQQVJUSVRJT04gQlkgcGF0aCBPUkRFUiBCWSBtYXhfdGltZSBBU0MpIEFTIGN1cnJlbnRfc2l6ZSwKICAgICAgICAgICAgaWYoY3VycmVudF9zaXplID4gMCwgbnVtX2FkZGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2FkZCwKICAgICAgICAgICAgaWYoY3VycmVudF9zaXplID4gMCwgbnVtX2RlbGV0ZWQgLyBjdXJyZW50X3NpemUsIDApIEFTIHBlcmNlbnRfZGVsZXRlCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgYW55KGF1dGhvcikgQVMgYXV0aG9yLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIG1heF90aW1lLAogICAgICAgICAgICAgICAgY29tbWl0X2hhc2gsCiAgICAgICAgICAgICAgICBhbnkoZmlsZV9saW5lc19hZGRlZCkgQVMgbnVtX2FkZGVkLAogICAgICAgICAgICAgICAgYW55KGZpbGVfbGluZXNfZGVsZXRlZCkgQVMgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgICAgICBhbnkoZmlsZV9jaGFuZ2VfdHlwZSkgQVMgdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgICAgICBXSEVSRSAoZmlsZV9jaGFuZ2VfdHlwZSBJTiAoJ0FkZCcsICdNb2RpZnknKSkgQU5EIChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKQogICAgICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoCiAgICAgICAgICAgIE9SREVSIEJZCiAgICAgICAgICAgICAgICBwYXRoIEFTQywKICAgICAgICAgICAgICAgIG1heF90aW1lIEFTQwogICAgICAgICkKICAgICksCiAgICByZXdyaXRlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICAqLAogICAgICAgICAgICBhbnkobWF4X3RpbWUpIE9WRVIgKFBBUlRJVElPTiBCWSBwYXRoIE9SREVSIEJZIG1heF90aW1lIEFTQyBST1dTIEJFVFdFRU4gMSBQUkVDRURJTkcgQU5EIENVUlJFTlQgUk9XKSBBUyBwcmV2aW91c19yZXdyaXRlLAogICAgICAgICAgICBkYXRlRGlmZignZGF5JywgcHJldmlvdXNfcmV3cml0ZSwgbWF4X3RpbWUpIEFTIHJld3JpdGVfZGF5cywKICAgICAgICAgICAgYW55KGF1dGhvcikgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZfYXV0aG9yCiAgICAgICAgRlJPTSBjaGFuZ2VzCiAgICAgICAgV0hFUkUgKHR5cGUgPSAnTW9kaWZ5JykgQU5EIChwZXJjZW50X2FkZCA+PSAwLjUpIEFORCAocGVyY2VudF9kZWxldGUgPj0gMC41KSBBTkQgKGN1cnJlbnRfc2l6ZSA+IDUwKQogICAgKQpTRUxFQ1QKICAgIHByZXZfYXV0aG9yLAogICAgYXZnKHJld3JpdGVfZGF5cykgQVMgYywKICAgIHVuaXEocGF0aCkgQVMgbnVtX2ZpbGVzCkZST00gcmV3cml0ZXMKR1JPVVAgQlkgcHJldl9hdXRob3IKSEFWSU5HIG51bV9maWxlcyA+IDIKT1JERVIgQlkgYyBERVNDCkxJTUlUIDEwCg==)
+
+```sql
+WITH
+    changes AS
+    (
+        SELECT
+            path,
+            author,
+            commit_hash,
+            max_time,
+            type,
+            num_added,
+            num_deleted,
+            sum(num_added - num_deleted) OVER (PARTITION BY path ORDER BY max_time ASC) AS current_size,
+            if(current_size > 0, num_added / current_size, 0) AS percent_add,
+            if(current_size > 0, num_deleted / current_size, 0) AS percent_delete
+        FROM
+        (
+            SELECT
+                path,
+                any(author) AS author,
+                max(time) AS max_time,
+                commit_hash,
+                any(file_lines_added) AS num_added,
+                any(file_lines_deleted) AS num_deleted,
+                any(file_change_type) AS type
+            FROM git.line_changes
+            WHERE (file_change_type IN ('Add', 'Modify')) AND (file_extension IN ('h', 'cpp', 'sql'))
+            GROUP BY
+                path,
+                commit_hash
+            ORDER BY
+                path ASC,
+                max_time ASC
+        )
+    ),
+    rewrites AS
+    (
+        SELECT
+            *,
+            any(max_time) OVER (PARTITION BY path ORDER BY max_time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS previous_rewrite,
+            dateDiff('day', previous_rewrite, max_time) AS rewrite_days,
+            any(author) OVER (PARTITION BY path ORDER BY max_time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS prev_author
+        FROM changes
+        WHERE (type = 'Modify') AND (percent_add >= 0.5) AND (percent_delete >= 0.5) AND (current_size > 50)
+    )
+SELECT
+    prev_author,
+    avg(rewrite_days) AS c,
+    uniq(path) AS num_files
+FROM rewrites
+GROUP BY prev_author
+HAVING num_files > 2
+ORDER BY c DESC
+LIMIT 10
+
+┌─prev_author─────────┬──────────────────c─┬─num_files─┐
+│ Michael Kolupaev    │              304.6 │         4 │
+│ alexey-milovidov    │  81.83333333333333 │         4 │
+│ Alexander Kuzmenkov │               64.5 │         5 │
+│ Pavel Kruglov       │               55.8 │         6 │
+│ Alexey Milovidov    │ 48.416666666666664 │        90 │
+│ Amos Bird           │               42.8 │         4 │
+│ alesapin            │ 38.083333333333336 │        12 │
+│ Nikolai Kochetov    │  33.18421052631579 │        26 │
+│ Alexander Tokmakov  │ 31.866666666666667 │        12 │
+│ Alexey Zatelepin    │               22.5 │         4 │
+└─────────────────────┴────────────────────┴───────────┘
+
+10 rows in set. Elapsed: 0.555 sec. Processed 7.54 million rows, 720.60 MB (13.58 million rows/s., 1.30 GB/s.)
+```
+
+## Most consecutive days of commits by an author
+
+This query first requires us to calculate the days when an author has committed. Using a window function, partitioning by author, we can compute the days between their commits. For each commit, if the time since the last commit was 1 day, we mark it as consecutive (1), and 0 otherwise, storing this result in `consecutive_day`.
+
+Our subsequent array functions compute each author's longest sequence of consecutive ones. First, the `groupArray` function is used to collate all `consecutive_day` values for an author. This array of 1s and 0s is then split on 0 values into subarrays. Finally, we calculate the longest subarray.
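+
+As a minimal illustration of this array pipeline on made-up values - note that each subarray after the first begins with the 0 that triggered the split, which the full query below corrects for by subtracting 1:
+
+```sql
+-- subarrays = [[1,1],[0,1,1,1],[0,1]]; the lengths are [2,4,2], so longest = 4
+SELECT
+    arraySplit(x -> (x = 0), [1, 1, 0, 1, 1, 1, 0, 1]) AS subarrays,
+    arrayMax(arrayMap(x -> length(x), subarrays)) AS longest
+```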
+ +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjb21taXRfZGF5cyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGRheSwKICAgICAgICAgICAgYW55KGRheSkgT1ZFUiAoUEFSVElUSU9OIEJZIGF1dGhvciBPUkRFUiBCWSBkYXkgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZpb3VzX2NvbW1pdCwKICAgICAgICAgICAgZGF0ZURpZmYoJ2RheScsIHByZXZpb3VzX2NvbW1pdCwgZGF5KSBBUyBkYXlzX3NpbmNlX2xhc3QsCiAgICAgICAgICAgIGlmKGRheXNfc2luY2VfbGFzdCA9IDEsIDEsIDApIEFTIGNvbnNlY3V0aXZlX2RheQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgdG9TdGFydE9mRGF5KHRpbWUpIEFTIGRheQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIGF1dGhvciwKICAgICAgICAgICAgICAgIGRheQogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgYXV0aG9yIEFTQywKICAgICAgICAgICAgICAgIGRheSBBU0MKICAgICAgICApCiAgICApClNFTEVDVAogICAgYXV0aG9yLAogICAgYXJyYXlNYXgoYXJyYXlNYXAoeCAtPiBsZW5ndGgoeCksIGFycmF5U3BsaXQoeCAtPiAoeCA9IDApLCBncm91cEFycmF5KGNvbnNlY3V0aXZlX2RheSkpKSkgQVMgbWF4X2NvbnNlY3V0aXZlX2RheXMKRlJPTSBjb21taXRfZGF5cwpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgbWF4X2NvbnNlY3V0aXZlX2RheXMgREVTQwpMSU1JVCAxMA==) + +```sql +WITH commit_days AS + ( + SELECT + author, + day, + any(day) OVER (PARTITION BY author ORDER BY day ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS previous_commit, + dateDiff('day', previous_commit, day) AS days_since_last, + if(days_since_last = 1, 1, 0) AS consecutive_day + FROM + ( + SELECT + author, + toStartOfDay(time) AS day + FROM git.commits + GROUP BY + author, + day + ORDER BY + author ASC, + day ASC + ) + ) +SELECT + author, + arrayMax(arrayMap(x -> length(x), arraySplit(x -> (x = 0), groupArray(consecutive_day)))) - 1 AS max_consecutive_days +FROM commit_days +GROUP BY author +ORDER BY max_consecutive_days DESC +LIMIT 10 + +┌─author───────────┬─max_consecutive_days─┐ +│ kssenii │ 32 │ +│ Alexey Milovidov │ 30 │ +│ alesapin │ 26 │ +│ Azat Khuzhin │ 23 │ +│ Nikolai Kochetov │ 15 │ +│ feng lv │ 11 │ +│ alexey-milovidov │ 11 │ +│ Igor Nikonov │ 11 │ +│ Maksim Kita │ 11 │ +│ Nikita Vasilev │ 11 │ +└──────────────────┴──────────────────────┘ + +10 rows in set. Elapsed: 0.025 sec. Processed 62.78 thousand rows, 395.47 KB (2.54 million rows/s., 16.02 MB/s.) +``` + +## Line by line commit history of a file + +Files can be renamed. When this occurs, we get a rename event, where the `path` column is set to the new path of the file and the `old_path` represents the previous location e.g. 
+ +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgcGF0aCwKICAgIG9sZF9wYXRoLAogICAgY29tbWl0X2hhc2gsCiAgICBjb21taXRfbWVzc2FnZQpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAocGF0aCA9ICdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykgQU5EIChjaGFuZ2VfdHlwZSA9ICdSZW5hbWUnKQ==) + +```sql +SELECT + time, + path, + old_path, + commit_hash, + commit_message +FROM git.file_changes +WHERE (path = 'src/Storages/StorageReplicatedMergeTree.cpp') AND (change_type = 'Rename') + +┌────────────────time─┬─path────────────────────────────────────────┬─old_path─────────────────────────────────────┬─commit_hash──────────────────────────────┬─commit_message─┐ +│ 2020-04-03 16:14:31 │ src/Storages/StorageReplicatedMergeTree.cpp │ dbms/Storages/StorageReplicatedMergeTree.cpp │ 06446b4f08a142d6f1bc30664c47ded88ab51782 │ dbms/ → src/ │ +└─────────────────────┴─────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────┴────────────────┘ + +1 row in set. Elapsed: 0.135 sec. Processed 266.05 thousand rows, 20.73 MB (1.98 million rows/s., 154.04 MB/s.) +``` + +This makes viewing the full history of a file challenging since we don't have a single value connecting all line or file changes. + +To address this, we can use User Defined Functions (UDFs). These cannot, currently, be recursive, so to identify the history of a file we must define a series of UDFs which call each other explicitly. + +This means we can only track renames to a maximum depth - the below example is 5 deep. It is unlikely a file will be renamed more times than this, so for now, this is sufficient. + +```sql +CREATE FUNCTION file_path_history AS (n) -> if(empty(n), [], arrayConcat([n], file_path_history_01((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); +CREATE FUNCTION file_path_history_01 AS (n) -> if(isNull(n), [], arrayConcat([n], file_path_history_02((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); +CREATE FUNCTION file_path_history_02 AS (n) -> if(isNull(n), [], arrayConcat([n], file_path_history_03((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); +CREATE FUNCTION file_path_history_03 AS (n) -> if(isNull(n), [], arrayConcat([n], file_path_history_04((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); +CREATE FUNCTION file_path_history_04 AS (n) -> if(isNull(n), [], arrayConcat([n], file_path_history_05((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); +CREATE FUNCTION file_path_history_05 AS (n) -> if(isNull(n), [], [n]); +``` + +By calling `file_path_history('src/Storages/StorageReplicatedMergeTree.cpp')` we recurse through the rename history, with each function calling the next level with the `old_path`. The results are combined using `arrayConcat`. 
+ +For example, + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIGZpbGVfcGF0aF9oaXN0b3J5KCdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykgQVMgcGF0aHMK) + +```sql +SELECT file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') AS paths + +┌─paths─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ ['src/Storages/StorageReplicatedMergeTree.cpp','dbms/Storages/StorageReplicatedMergeTree.cpp','dbms/src/Storages/StorageReplicatedMergeTree.cpp'] │ +└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +1 row in set. Elapsed: 0.074 sec. Processed 344.06 thousand rows, 6.27 MB (4.65 million rows/s., 84.71 MB/s.) +``` + +We can use this capability to now assemble the commits for the entire history of a file. In this example, we show one commit for each of the `path` values. + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgY2hhbmdlX3R5cGUsCiAgICBhdXRob3IsCiAgICBwYXRoLAogICAgY29tbWl0X21lc3NhZ2UKRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgcGF0aCBJTiBmaWxlX3BhdGhfaGlzdG9yeSgnc3JjL1N0b3JhZ2VzL1N0b3JhZ2VSZXBsaWNhdGVkTWVyZ2VUcmVlLmNwcCcpCk9SREVSIEJZIHRpbWUgREVTQwpMSU1JVCAxIEJZIHBhdGgKRk9STUFUIFByZXR0eUNvbXBhY3RNb25vQmxvY2s=) + +```sql +SELECT + time, + substring(commit_hash, 1, 11) AS commit, + change_type, + author, + path, + commit_message +FROM git.file_changes +WHERE path IN file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') +ORDER BY time DESC +LIMIT 1 BY path +FORMAT PrettyCompactMonoBlock + +┌────────────────time─┬─commit──────┬─change_type─┬─author─────────────┬─path─────────────────────────────────────────────┬─commit_message──────────────────────────────────────────────────────────────────┐ +│ 2022-10-30 16:30:51 │ c68ab231f91 │ Modify │ Alexander Tokmakov │ src/Storages/StorageReplicatedMergeTree.cpp │ fix accessing part in Deleting state │ +│ 2020-04-03 15:21:24 │ 38a50f44d34 │ Modify │ alesapin │ dbms/Storages/StorageReplicatedMergeTree.cpp │ Remove empty line │ +│ 2020-04-01 19:21:27 │ 1d5a77c1132 │ Modify │ alesapin │ dbms/src/Storages/StorageReplicatedMergeTree.cpp │ Tried to add ability to rename primary key columns but just banned this ability │ +└─────────────────────┴─────────────┴─────────────┴────────────────────┴──────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────┘ + +3 rows in set. Elapsed: 0.170 sec. Processed 611.53 thousand rows, 41.76 MB (3.60 million rows/s., 246.07 MB/s.) +``` + +# Unsolved Questions + +## Git blame + +This is particularly difficult to get an exact result due to the inability to currently keep state in array functions. This will be possible with an `arrayFold` or `arrayReduce`, which allows state to be held on each iteration. 
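+
+For intuition, a fold carries an accumulator across array elements. A minimal sketch of that idea follows - it assumes the `arrayFold(acc, x -> ..., arr, init)` function from later ClickHouse releases, so treat it as illustrative rather than something the queries above rely on:
+
+```sql
+-- keep the last non-zero element seen so far, carrying state across the array
+SELECT arrayFold(acc, x -> if(x != 0, x, acc), CAST([0, 3, 0, 0, 7, 0], 'Array(Int64)'), toInt64(0)) AS last_seen
+-- last_seen = 7
+```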
+ +An approximate solution, sufficient for a high-level analysis, may look something like this: + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBsaW5lX251bWJlcl9uZXcsCiAgICBhcmdNYXgoYXV0aG9yLCB0aW1lKSwKICAgIGFyZ01heChsaW5lLCB0aW1lKQpGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIGZpbGVfcGF0aF9oaXN0b3J5KCdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykKR1JPVVAgQlkgbGluZV9udW1iZXJfbmV3Ck9SREVSIEJZIGxpbmVfbnVtYmVyX25ldyBBU0MKTElNSVQgMjA=) + +```sql +SELECT + line_number_new, + argMax(author, time), + argMax(line, time) +FROM git.line_changes +WHERE path IN file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') +GROUP BY line_number_new +ORDER BY line_number_new ASC +LIMIT 20 + +┌─line_number_new─┬─argMax(author, time)─┬─argMax(line, time)────────────────────────────────────────────┐ +│ 1 │ Alexey Milovidov │ #include │ +│ 2 │ s-kat │ #include │ +│ 3 │ Anton Popov │ #include │ +│ 4 │ Alexander Burmak │ #include │ +│ 5 │ avogar │ #include │ +│ 6 │ Alexander Burmak │ #include │ +│ 7 │ Alexander Burmak │ #include │ +│ 8 │ Alexander Burmak │ #include │ +│ 9 │ Alexander Burmak │ #include │ +│ 10 │ Alexander Burmak │ #include │ +│ 11 │ Alexander Burmak │ #include │ +│ 12 │ Nikolai Kochetov │ #include │ +│ 13 │ alesapin │ #include │ +│ 14 │ alesapin │ │ +│ 15 │ Alexey Milovidov │ #include │ +│ 16 │ Alexey Zatelepin │ #include │ +│ 17 │ CurtizJ │ #include │ +│ 18 │ Kirill Shvakov │ #include │ +│ 19 │ s-kat │ #include │ +│ 20 │ Nikita Mikhaylov │ #include │ +└─────────────────┴──────────────────────┴───────────────────────────────────────────────────────────────┘ +20 rows in set. Elapsed: 0.547 sec. Processed 7.88 million rows, 679.20 MB (14.42 million rows/s., 1.24 GB/s.) +``` + +We welcome exact and improved solutions here. 
+ + +# Related Content + +- [Git commits and our community](https://clickhouse.com/blog/clickhouse-git-community-commits) +- [Window and array functions for Git commit sequences](https://clickhouse.com/blog/clickhouse-window-array-functions-git-commits) diff --git a/docs/en/getting-started/example-datasets/images/superset-add-dataset.png b/docs/en/getting-started/example-datasets/images/superset-add-dataset.png new file mode 100644 index 00000000000..aaa976d76ce Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-add-dataset.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-add.png b/docs/en/getting-started/example-datasets/images/superset-add.png new file mode 100644 index 00000000000..54bbf11a014 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-add.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-authors-matrix.png b/docs/en/getting-started/example-datasets/images/superset-authors-matrix.png new file mode 100644 index 00000000000..bdfc6b6f304 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-authors-matrix.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-authors-matrix_v2.png b/docs/en/getting-started/example-datasets/images/superset-authors-matrix_v2.png new file mode 100644 index 00000000000..aad98b5b077 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-authors-matrix_v2.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png b/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png new file mode 100644 index 00000000000..8197ea223c2 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-choose-a-database.png b/docs/en/getting-started/example-datasets/images/superset-choose-a-database.png new file mode 100644 index 00000000000..40c71e0a053 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-choose-a-database.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-commits-authors.png b/docs/en/getting-started/example-datasets/images/superset-commits-authors.png new file mode 100644 index 00000000000..7be831467cf Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-commits-authors.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-connect-a-database.png b/docs/en/getting-started/example-datasets/images/superset-connect-a-database.png new file mode 100644 index 00000000000..f67d0663063 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-connect-a-database.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-create-map.png b/docs/en/getting-started/example-datasets/images/superset-create-map.png new file mode 100644 index 00000000000..5ad4395eb13 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-create-map.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-github-lines-added-deleted.png b/docs/en/getting-started/example-datasets/images/superset-github-lines-added-deleted.png new file mode 100644 index 00000000000..48dbad1934d Binary files /dev/null and 
b/docs/en/getting-started/example-datasets/images/superset-github-lines-added-deleted.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-lon-lat.png b/docs/en/getting-started/example-datasets/images/superset-lon-lat.png new file mode 100644 index 00000000000..f07fb899e72 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-lon-lat.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-mcc-204.png b/docs/en/getting-started/example-datasets/images/superset-mcc-204.png new file mode 100644 index 00000000000..a561c539b58 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-mcc-204.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-radio-umts.png b/docs/en/getting-started/example-datasets/images/superset-radio-umts.png new file mode 100644 index 00000000000..b0b31b6dbc0 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-radio-umts.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-umts-netherlands.png b/docs/en/getting-started/example-datasets/images/superset-umts-netherlands.png new file mode 100644 index 00000000000..5cb887cb5c1 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-umts-netherlands.png differ diff --git a/docs/en/getting-started/example-datasets/recipes.md b/docs/en/getting-started/example-datasets/recipes.md index cc059f6bd26..6a003571f6e 100644 --- a/docs/en/getting-started/example-datasets/recipes.md +++ b/docs/en/getting-started/example-datasets/recipes.md @@ -4,7 +4,7 @@ sidebar_label: Recipes Dataset title: "Recipes Dataset" --- -RecipeNLG dataset is available for download [here](https://recipenlg.cs.put.poznan.pl/dataset). It contains 2.2 million recipes. The size is slightly less than 1 GB. +The RecipeNLG dataset is available for download [here](https://recipenlg.cs.put.poznan.pl/dataset). It contains 2.2 million recipes. The size is slightly less than 1 GB. ## Download and Unpack the Dataset diff --git a/docs/en/getting-started/index.md b/docs/en/getting-started/index.md index 0bb3ae1ca71..e72e23208ac 100644 --- a/docs/en/getting-started/index.md +++ b/docs/en/getting-started/index.md @@ -22,5 +22,8 @@ functions in ClickHouse. The sample datasets include: - The [Cell Towers dataset](../getting-started/example-datasets/cell-towers.md) imports a CSV into ClickHouse - The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables - The ["What's on the Menu?" dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data +- The [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) provides examples of defining a schema and loading a small Hacker News dataset +- The [Getting Data Into ClickHouse - Part 2 - A JSON detour](https://clickhouse.com/blog/getting-data-into-clickhouse-part-2-json) shows how JSON data can be loaded +- The [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) has examples of loading data from s3 -View the **Tutorials and Datasets** menu for a complete list of sample datasets. \ No newline at end of file +View the **Tutorials and Datasets** menu for a complete list of sample datasets. 
diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index e88e9e06a68..391d3a3f59a 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -8,8 +8,8 @@ slug: /en/install You have two options for getting up and running with ClickHouse: -- **[ClickHouse Cloud](https://clickhouse.cloud/):** the official ClickHouse as a service, - built by, maintained, and supported by the creators of ClickHouse -- **Self-managed ClickHouse:** ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture +- **[ClickHouse Cloud](https://clickhouse.com/cloud/):** the official ClickHouse as a service, - built by, maintained, and supported by the creators of ClickHouse +- **[Self-managed ClickHouse](https://github.com/ClickHouse/ClickHouse):** ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture ## ClickHouse Cloud @@ -406,4 +406,3 @@ SELECT 1 **Congratulations, the system works!** To continue experimenting, you can download one of the test data sets or go through [tutorial](/docs/en/tutorial.md). - diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 58e986cc2f3..58998a6f491 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -5,7 +5,7 @@ sidebar_label: Input and Output Formats title: Formats for Input and Output Data --- -ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read an external dictionary. A format supported for output can be used to arrange the +ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read a dictionary. A format supported for output can be used to arrange the results of a `SELECT`, and to perform `INSERT`s into a file-backed table. The supported formats are: @@ -13,7 +13,7 @@ The supported formats are: | Format | Input | Output | |-------------------------------------------------------------------------------------------|------|--------| | [TabSeparated](#tabseparated) | ✔ | ✔ | -| [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ | +| [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ | | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | | [TabSeparatedRawWithNames](#tabseparatedrawwithnames) | ✔ | ✔ | @@ -48,6 +48,7 @@ The supported formats are: | [JSONCompactStringsEachRowWithNames](#jsoncompactstringseachrowwithnames) | ✔ | ✔ | | [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ | | [JSONObjectEachRow](#jsonobjecteachrow) | ✔ | ✔ | +| [BSONEachRow](#bsoneachrow) | ✔ | ✔ | | [TSKV](#tskv) | ✔ | ✔ | | [Pretty](#pretty) | ✗ | ✔ | | [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | @@ -1201,6 +1202,7 @@ SELECT * FROM json_each_row_nested - [input_format_import_nested_json](../operations/settings/settings.md#input_format_import_nested_json) - map nested JSON data to nested tables (it works for JSONEachRow format). Default value - `false`. - [input_format_json_read_bools_as_numbers](../operations/settings/settings.md#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. 
Default value - `true`. - [input_format_json_read_numbers_as_strings](../operations/settings/settings.md#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`. +- [input_format_json_read_objects_as_strings](../operations/settings/settings.md#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`. - [output_format_json_quote_64bit_integers](../operations/settings/settings.md#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`. - [output_format_json_quote_64bit_floats](../operations/settings/settings.md#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`. - [output_format_json_quote_denormals](../operations/settings/settings.md#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`. @@ -1210,6 +1212,69 @@ SELECT * FROM json_each_row_nested - [output_format_json_array_of_rows](../operations/settings/settings.md#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`. - [output_format_json_validate_utf8](../operations/settings/settings.md#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`. +## BSONEachRow {#bsoneachrow} + +In this format, ClickHouse formats/parses data as a sequence of BSON documents without any separator between them. +Each row is formatted as a single document and each column is formatted as a single BSON document field with column name as a key. 
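+
+For example, here is a minimal sketch of writing query results to a file in this format and reading them back (the file name and structure are illustrative):
+
+```sql
+SELECT number AS id, concat('user_', toString(number)) AS name
+FROM numbers(3)
+INTO OUTFILE 'users.bson'
+FORMAT BSONEachRow;
+
+-- read the documents back, supplying an explicit structure
+SELECT * FROM file('users.bson', BSONEachRow, 'id UInt64, name String');
+```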
+
+For output it uses the following correspondence between ClickHouse types and BSON types:
+
+| ClickHouse type | BSON Type |
+|-----------------|-----------|
+| [Bool](../sql-reference/data-types/boolean.md) | `\x08` boolean |
+| [Int8/UInt8](../sql-reference/data-types/int-uint.md) | `\x10` int32 |
+| [Int16/UInt16](../sql-reference/data-types/int-uint.md) | `\x10` int32 |
+| [Int32](../sql-reference/data-types/int-uint.md) | `\x10` int32 |
+| [UInt32](../sql-reference/data-types/int-uint.md) | `\x12` int64 |
+| [Int64/UInt64](../sql-reference/data-types/int-uint.md) | `\x12` int64 |
+| [Float32/Float64](../sql-reference/data-types/float.md) | `\x01` double |
+| [Date](../sql-reference/data-types/date.md)/[Date32](../sql-reference/data-types/date32.md) | `\x10` int32 |
+| [DateTime](../sql-reference/data-types/datetime.md) | `\x12` int64 |
+| [DateTime64](../sql-reference/data-types/datetime64.md) | `\x09` datetime |
+| [Decimal32](../sql-reference/data-types/decimal.md) | `\x10` int32 |
+| [Decimal64](../sql-reference/data-types/decimal.md) | `\x12` int64 |
+| [Decimal128](../sql-reference/data-types/decimal.md) | `\x05` binary, `\x00` binary subtype, size = 16 |
+| [Decimal256](../sql-reference/data-types/decimal.md) | `\x05` binary, `\x00` binary subtype, size = 32 |
+| [Int128/UInt128](../sql-reference/data-types/int-uint.md) | `\x05` binary, `\x00` binary subtype, size = 16 |
+| [Int256/UInt256](../sql-reference/data-types/int-uint.md) | `\x05` binary, `\x00` binary subtype, size = 32 |
+| [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) | `\x05` binary, `\x00` binary subtype or `\x02` string if setting `output_format_bson_string_as_string` is enabled |
+| [UUID](../sql-reference/data-types/uuid.md) | `\x05` binary, `\x04` uuid subtype, size = 16 |
+| [Array](../sql-reference/data-types/array.md) | `\x04` array |
+| [Tuple](../sql-reference/data-types/tuple.md) | `\x04` array |
+| [Named Tuple](../sql-reference/data-types/tuple.md) | `\x03` document |
+| [Map](../sql-reference/data-types/map.md) (with String keys) | `\x03` document |
+
+For input it uses the following correspondence between BSON types and ClickHouse types:
+
+| BSON Type | ClickHouse Type |
+|-----------|-----------------|
+| `\x01` double | [Float32/Float64](../sql-reference/data-types/float.md) |
+| `\x02` string | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) |
+| `\x03` document | [Map](../sql-reference/data-types/map.md)/[Named Tuple](../sql-reference/data-types/tuple.md) |
+| `\x04` array | [Array](../sql-reference/data-types/array.md)/[Tuple](../sql-reference/data-types/tuple.md) |
+| `\x05` binary, `\x00` binary subtype | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) |
+| `\x05` binary, `\x02` old binary subtype | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) |
+| `\x05` binary, `\x03` old uuid subtype | [UUID](../sql-reference/data-types/uuid.md) |
+| `\x05` binary, `\x04` uuid subtype | [UUID](../sql-reference/data-types/uuid.md) |
+| `\x07` ObjectId | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) |
+| `\x08` boolean | [Bool](../sql-reference/data-types/boolean.md) |
+| `\x09` datetime | [DateTime64](../sql-reference/data-types/datetime64.md) |
+| `\x0A` null value | [NULL](../sql-reference/data-types/nullable.md) |
+| `\x0D` JavaScript code | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) |
+| `\x0E` symbol | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) |
+| `\x10` int32 | [Int32/UInt32](../sql-reference/data-types/int-uint.md)/[Decimal32](../sql-reference/data-types/decimal.md) |
+| `\x12` int64 | [Int64/UInt64](../sql-reference/data-types/int-uint.md)/[Decimal64](../sql-reference/data-types/decimal.md)/[DateTime64](../sql-reference/data-types/datetime64.md) |
+
+Other BSON types are not supported. Also, this format performs conversion between different integer types (for example, you can insert a BSON int32 value into a ClickHouse UInt8).
+Big integers and decimals (Int128/UInt128/Int256/UInt256/Decimal128/Decimal256) can be parsed from a BSON Binary value with the `\x00` binary subtype. In this case, the format validates that the size of the binary data equals the size of the expected value.
+
+Note: this format doesn't work properly on Big-Endian platforms.
+
+### BSON format settings {#bson-format-settings}
+
+- [output_format_bson_string_as_string](../operations/settings/settings.md#output_format_bson_string_as_string) - use BSON String type instead of Binary for String columns. Default value - `false`.
+- [input_format_bson_skip_fields_with_unsupported_types_in_schema_inference](../operations/settings/settings.md#input_format_bson_skip_fields_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types during schema inference for the BSONEachRow format. Default value - `false`.
+
 ## Native {#native}
 
 The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is “columnar” – it does not convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients.
@@ -1392,6 +1457,10 @@ If setting [input_format_with_types_use_header](../operations/settings/settings.
 the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
 :::
 
+## RowBinary format settings {#row-binary-format-settings}
+
+- [format_binary_max_string_size](../operations/settings/settings.md#format_binary_max_string_size) - The maximum allowed size for String in RowBinary format. Default value - `1GiB`.
+
 ## Values {#data-format-values}
 
 Prints every row in brackets. Rows are separated by commas. There is no comma after the last row. The values inside the brackets are also comma-separated. Numbers are output in a decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are similar to the [TabSeparated](#tabseparated) format.
During formatting, extra spaces aren’t inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). [NULL](../sql-reference/syntax.md) is represented as `NULL`.
diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md
index 2c8044d38f7..9af6df0c87d 100644
--- a/docs/en/interfaces/http.md
+++ b/docs/en/interfaces/http.md
@@ -244,7 +244,7 @@ The username and password can be indicated in one of three ways:
 $ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @-
 ```
 
-1. In the ‘user’ and ‘password’ URL parameters. Example:
+2. In the ‘user’ and ‘password’ URL parameters (*We do not recommend using this method as the parameter might be logged by a web proxy and cached in the browser*). Example:
 
@@ -252,7 +252,7 @@ $ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @-
 ```
 
-1. Using ‘X-ClickHouse-User’ and ‘X-ClickHouse-Key’ headers. Example:
+3. Using ‘X-ClickHouse-User’ and ‘X-ClickHouse-Key’ headers. Example:
 
diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/_troubleshooting.md
similarity index 86%
rename from docs/en/operations/troubleshooting.md
rename to docs/en/operations/_troubleshooting.md
index 6a1ca3176ad..aed63ec4d0f 100644
--- a/docs/en/operations/troubleshooting.md
+++ b/docs/en/operations/_troubleshooting.md
@@ -1,9 +1,5 @@
----
-slug: /en/operations/troubleshooting
-sidebar_position: 46
-sidebar_label: Troubleshooting
-title: Troubleshooting
----
+
+[//]: # (This file is included in FAQ > Troubleshooting)
 
 - [Installation](#troubleshooting-installation-errors)
 - [Connecting to the server](#troubleshooting-accepts-no-connections)
@@ -28,18 +24,34 @@ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D7
 sudo apt-get update
 ```
 
-### You Get the Unsupported Architecture Warning with Apt-get {#you-get-the-unsupported-architecture-warning-with-apt-get}
+### You Get Different Warnings with `apt-get update` {#you-get-different-warnings-with-apt-get-update}
 
-- The completed warning message is as follows:
+- The completed warning message is one of the following:
 
 ```
 N: Skipping acquire of configured file 'main/binary-i386/Packages' as repository 'https://packages.clickhouse.com/deb stable InRelease' doesn't support architecture 'i386'
 ```
 
+```
+E: Failed to fetch https://packages.clickhouse.com/deb/dists/stable/main/binary-amd64/Packages.gz File has unexpected size (30451 != 28154). Mirror sync in progress?
+```
+
+```
+E: Repository 'https://packages.clickhouse.com/deb stable InRelease' changed its 'Origin' value from 'Artifactory' to 'ClickHouse'
+E: Repository 'https://packages.clickhouse.com/deb stable InRelease' changed its 'Label' value from 'Artifactory' to 'ClickHouse'
+N: Repository 'https://packages.clickhouse.com/deb stable InRelease' changed its 'Suite' value from 'stable' to ''
+N: This must be accepted explicitly before updates for this repository can be applied. See apt-secure(8) manpage for details.
+``` + +``` +Err:11 https://packages.clickhouse.com/deb stable InRelease + 400 Bad Request [IP: 172.66.40.249 443] +``` + To resolve the above issue, please use the following script: ```bash -sudo rm /var/lib/apt/lists/packages.clickhouse.com_* /var/lib/dpkg/arch +sudo rm /var/lib/apt/lists/packages.clickhouse.com_* /var/lib/dpkg/arch /var/lib/apt/lists/partial/packages.clickhouse.com_* sudo apt-get clean sudo apt-get autoclean ``` diff --git a/docs/en/operations/_update.md b/docs/en/operations/_update.md deleted file mode 100644 index 86981da2be6..00000000000 --- a/docs/en/operations/_update.md +++ /dev/null @@ -1,30 +0,0 @@ - -[//]: # (This file is included in Manage > Updates) - -## Self-managed ClickHouse Upgrade - -If ClickHouse was installed from `deb` packages, execute the following commands on the server: - -``` bash -$ sudo apt-get update -$ sudo apt-get install clickhouse-client clickhouse-server -$ sudo service clickhouse-server restart -``` - -If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method. - -:::note -You can update multiple servers at once as soon as there is no moment when all replicas of one shard are offline. -::: - -The upgrade of older version of ClickHouse to specific version: - -As an example: - -`xx.yy.a.b` is a current stable version. The latest stable version could be found [here](https://github.com/ClickHouse/ClickHouse/releases) - -```bash -$ sudo apt-get update -$ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b -$ sudo service clickhouse-server restart -``` diff --git a/docs/en/operations/_backup.md b/docs/en/operations/backup.md similarity index 98% rename from docs/en/operations/_backup.md rename to docs/en/operations/backup.md index d694c51cee6..061d95c1152 100644 --- a/docs/en/operations/_backup.md +++ b/docs/en/operations/backup.md @@ -1,5 +1,8 @@ +--- +slug: /en/operations/backup +--- -[//]: # (This file is included in Manage > Backups) +# Backup and Restore - [Backup to a local disk](#backup-to-a-local-disk) - [Configuring backup/restore to use an S3 endpoint](#configuring-backuprestore-to-use-an-s3-endpoint) @@ -55,7 +58,7 @@ The BACKUP and RESTORE statements take a list of DATABASE and TABLE names, a des - SETTINGS: - [`compression_method`](en/sql-reference/statements/create/table/#column-compression-codecs) and compression_level - `password` for the file on disk - - `base_backup`: the destination of the previous backup of this source. For example, `Disk('backups', '1.zip')` + - `base_backup`: the destination of the previous backup of this source. For example, `Disk('backups', '1.zip')` ### Usage examples @@ -72,7 +75,7 @@ RESTORE TABLE test.table FROM Disk('backups', '1.zip') :::note The above RESTORE would fail if the table `test.table` contains data, you would have to drop the table in order to test the RESTORE, or use the setting `allow_non_empty_tables=true`: ``` -RESTORE TABLE test.table FROM Disk('backups', '1.zip') +RESTORE TABLE test.table FROM Disk('backups', '1.zip') SETTINGS allow_non_empty_tables=true ``` ::: @@ -101,7 +104,7 @@ BACKUP TABLE test.table TO Disk('backups', 'incremental-a.zip') Restore all data from the incremental backup and the base_backup into a new table `test.table2`: ``` -RESTORE TABLE test.table AS test.table2 +RESTORE TABLE test.table AS test.table2 FROM Disk('backups', 'incremental-a.zip'); ``` @@ -356,4 +359,3 @@ Data can be restored from backup using the `ALTER TABLE ... 
ATTACH PARTITION ... For more information about queries related to partition manipulations, see the [ALTER documentation](../sql-reference/statements/alter/partition.md#alter_manipulations-with-partitions). A third-party tool is available to automate this approach: [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup). - diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index 3aeae7d1c9d..86760ec245f 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -11,6 +11,7 @@ Main cache types: - `mark_cache` — Cache of marks used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family. - `uncompressed_cache` — Cache of uncompressed data used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family. +- Operating system page cache (used indirectly, for files with actual data). Additional cache types: @@ -22,10 +23,4 @@ Additional cache types: - Schema inference cache. - [Filesystem cache](storing-data.md) over S3, Azure, Local and other disks. -Indirectly used: - -- OS page cache. - -To drop cache, use [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md) statements. - -[Original article](https://clickhouse.com/docs/en/operations/caches/) +To drop one of the caches, use [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md#drop-mark-cache) statements. diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index aad20da0010..3f11cc3cf7b 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -57,7 +57,7 @@ Internal coordination settings are located in the `..` section and contain servers description. @@ -126,7 +126,7 @@ clickhouse keeper --config /etc/your_path_to_config/config.xml ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. -The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value `conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro`. +The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value `conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld`. You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. @@ -309,6 +309,31 @@ Sessions with Ephemerals (1): /clickhouse/task_queue/ddl ``` +- `csnp`: Schedule a snapshot creation task. Return the last committed log index of the scheduled snapshot if success or `Failed to schedule snapshot creation task.` if failed. Note that `lgif` command can help you determine whether the snapshot is done. + +``` +100 +``` + +- `lgif`: Keeper log information. `first_log_idx` : my first log index in log store; `first_log_term` : my first log term; `last_log_idx` : my last log index in log store; `last_log_term` : my last log term; `last_committed_log_idx` : my last committed log index in state machine; `leader_committed_log_idx` : leader's committed log index from my perspective; `target_committed_log_idx` : target log index should be committed to; `last_snapshot_idx` : the largest committed log index in last snapshot. 
+
+```
+first_log_idx	1
+first_log_term	1
+last_log_idx	101
+last_log_term	1
+last_committed_log_idx	100
+leader_committed_log_idx	101
+target_committed_log_idx	101
+last_snapshot_idx	50
+```
+
+- `rqld`: Request to become the new leader. Returns `Sent leadership request to leader.` if the request was sent, or `Failed to send leadership request to leader.` if it was not. Note that if the node is already the leader, the outcome is the same as if the request had been sent.
+
+```
+Sent leadership request to leader.
+```
+
## Migration from ZooKeeper {#migration-from-zookeeper}

Seamless migration from ZooKeeper to ClickHouse Keeper is impossible: you have to stop your ZooKeeper cluster, convert the data, and start ClickHouse Keeper. The `clickhouse-keeper-converter` tool allows converting ZooKeeper logs and snapshots to a ClickHouse Keeper snapshot. It works only with ZooKeeper > 3.4. Steps for migration:

diff --git a/docs/en/operations/named-collections.md b/docs/en/operations/named-collections.md
index f605045a0ad..cbb8d0a4c02 100644
--- a/docs/en/operations/named-collections.md
+++ b/docs/en/operations/named-collections.md
@@ -130,7 +130,7 @@ SHOW TABLES FROM mydatabase;
└────────┘
```

-### Example of using named collections with an external dictionary with source MySQL
+### Example of using named collections with a dictionary with source MySQL

```sql
CREATE DICTIONARY dict (A Int64, B String)
@@ -213,7 +213,7 @@ SHOW TABLES FROM mydatabase
└──────┘
```

-### Example of using named collections with an external dictionary with source POSTGRESQL
+### Example of using named collections with a dictionary with source POSTGRESQL

```sql
CREATE DICTIONARY dict (a Int64, b String)
@@ -270,7 +270,7 @@ SELECT * FROM remote(remote1, database = default, table = test);
└───┴───┘
```

-### Example of using named collections with an external dictionary with source ClickHouse
+### Example of using named collections with a dictionary with source ClickHouse

```sql
CREATE DICTIONARY dict(a Int64, b String)
diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index dcda7536935..5faf3819d7e 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -268,14 +268,14 @@ The path to the table in ZooKeeper.

## dictionaries_config {#server_configuration_parameters-dictionaries_config}

-The path to the config file for external dictionaries.
+The path to the config file for dictionaries.

Path:

- Specify the absolute path or the path relative to the server config file.
- The path can contain wildcards \* and ?.

-See also “[External dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)”.
+See also “[Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)”.

**Example**

diff --git a/docs/en/operations/settings/constraints-on-settings.md b/docs/en/operations/settings/constraints-on-settings.md
index 651b6465f7e..bb015f80834 100644
--- a/docs/en/operations/settings/constraints-on-settings.md
+++ b/docs/en/operations/settings/constraints-on-settings.md
@@ -91,4 +91,21 @@ Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should n

**Note:** the `default` profile has special handling: all the constraints defined for the `default` profile become the default constraints, so they restrict all the users until they’re overridden explicitly for these users.
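For illustration, a minimal sketch of how such a constraint behaves in practice (assuming a profile that declares `force_index_by_date` as `readonly`, as in the example above):

```sql
-- The user's profile constrains force_index_by_date as readonly,
-- so any attempt to change it fails with error code 452:
SET force_index_by_date = 1;
-- Code: 452. DB::Exception: Setting force_index_by_date should not be changed.
```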
+## Constraints on Merge Tree Settings
+It is possible to set constraints for [merge tree settings](merge-tree-settings.md). These constraints are applied when a table with a merge tree engine is created or its storage settings are altered. The name of a merge tree setting must be prepended with the `merge_tree_` prefix when referenced in the `<constraints>` section.
+
+**Example:** Forbid creating new tables with explicitly specified `storage_policy`
+
+``` xml
+<profiles>
+  <default>
+    <constraints>
+      <merge_tree_storage_policy>
+        <readonly/>
+      </merge_tree_storage_policy>
+    </constraints>
+  </default>
+</profiles>
+```
+
[Original article](https://clickhouse.com/docs/en/operations/settings/constraints_on_settings/)
diff --git a/docs/en/operations/settings/index.md b/docs/en/operations/settings/index.md
index 8603257ea55..eee4058c230 100644
--- a/docs/en/operations/settings/index.md
+++ b/docs/en/operations/settings/index.md
@@ -26,7 +26,7 @@ Ways to configure settings, in order of priority:

    - When starting the ClickHouse console client in non-interactive mode, set the startup parameter `--setting=value`.
    - When using the HTTP API, pass CGI parameters (`URL?setting_1=value&setting_2=value...`).
-    - Make settings in the [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select) clause of the SELECT query. The setting value is applied only to that query and is reset to default or previous value after the query is executed.
+    - Make settings in the [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select-query) clause of the SELECT query. The setting value is applied only to that query and is reset to default or previous value after the query is executed.

Settings that can only be made in the server config file are not covered in this section.
diff --git a/docs/en/operations/settings/permissions-for-queries.md b/docs/en/operations/settings/permissions-for-queries.md
index 3ba62b78cfe..c565de9b21a 100644
@@ -16,44 +16,54 @@ Queries in ClickHouse can be divided into several types:

The following settings regulate user permissions by the type of query:

-- [readonly](#settings_readonly) — Restricts permissions for all types of queries except DDL queries.
-- [allow_ddl](#settings_allow_ddl) — Restricts permissions for DDL queries.
+## readonly
+Restricts permissions for read-data, write-data, and change-settings queries.

-`KILL QUERY` can be performed with any settings.
+When set to 1, allows:

-## readonly {#settings_readonly}
+- All types of read queries (like SELECT and equivalent queries).
+- Queries that modify only session context (like USE).

-Restricts permissions for reading data, write data and change settings queries.
+When set to 2, allows the above plus:
+- SET and CREATE TEMPORARY TABLE

-See how the queries are divided into types [above](#permissions_for_queries).
+  :::tip
+  Queries like EXISTS, DESCRIBE, EXPLAIN, SHOW PROCESSLIST, etc. are equivalent to SELECT, because they just select from system tables.
+  :::

Possible values:

-- 0 — All queries are allowed.
-- 1 — Only read data queries are allowed.
-- 2 — Read data and change settings queries are allowed.
+- 0 — Read, Write, and Change settings queries are allowed.
+- 1 — Only Read data queries are allowed.
+- 2 — Read data and Change settings queries are allowed.

+Default value: 0
+
+:::note
After setting `readonly = 1`, the user can’t change `readonly` and `allow_ddl` settings in the current session.

When using the `GET` method in the [HTTP interface](../../interfaces/http.md), `readonly = 1` is set automatically.
To modify data, use the `POST` method. -Setting `readonly = 1` prohibit the user from changing all the settings. There is a way to prohibit the user from changing only specific settings. Also there is a way to allow changing only specific settings under `readonly = 1` restrictions. For details see [constraints on settings](../../operations/settings/constraints-on-settings.md). +Setting `readonly = 1` prohibits the user from changing settings. There is a way to prohibit the user from changing only specific settings. Also there is a way to allow changing only specific settings under `readonly = 1` restrictions. For details see [constraints on settings](../../operations/settings/constraints-on-settings.md). +::: -Default value: 0 ## allow_ddl {#settings_allow_ddl} Allows or denies [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries. -See how the queries are divided into types [above](#permissions_for_queries). - Possible values: - 0 — DDL queries are not allowed. - 1 — DDL queries are allowed. -You can’t execute `SET allow_ddl = 1` if `allow_ddl = 0` for the current session. - Default value: 1 -[Original article](https://clickhouse.com/docs/en/operations/settings/permissions_for_queries/) +:::note +You cannot run `SET allow_ddl = 1` if `allow_ddl = 0` for the current session. +::: + + +:::note KILL QUERY +`KILL QUERY` can be performed with any combination of readonly and allow_ddl settings. +::: diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index dfcef4ae200..ddfaab02159 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -70,7 +70,7 @@ Another use case of `prefer_global_in_and_join` is accessing tables created by **See also:** -- [Distributed subqueries](../../sql-reference/operators/in.md#select-distributed-subqueries) for more information on how to use `GLOBAL IN`/`GLOBAL JOIN` +- [Distributed subqueries](../../sql-reference/operators/in.md/#select-distributed-subqueries) for more information on how to use `GLOBAL IN`/`GLOBAL JOIN` ## enable_optimize_predicate_expression {#enable-optimize-predicate-expression} @@ -170,7 +170,7 @@ It makes sense to disable it if the server has millions of tiny tables that are ## function_range_max_elements_in_block {#settings-function_range_max_elements_in_block} -Sets the safety threshold for data volume generated by function [range](../../sql-reference/functions/array-functions.md#range). Defines the maximum number of values generated by function per block of data (sum of array sizes for every row in a block). +Sets the safety threshold for data volume generated by function [range](../../sql-reference/functions/array-functions.md/#range). Defines the maximum number of values generated by function per block of data (sum of array sizes for every row in a block). Possible values: @@ -273,10 +273,10 @@ Default value: 0. ## insert_null_as_default {#insert_null_as_default} -Enables or disables the insertion of [default values](../../sql-reference/statements/create/table.md#create-default-values) instead of [NULL](../../sql-reference/syntax.md#null-literal) into columns with not [nullable](../../sql-reference/data-types/nullable.md#data_type-nullable) data type. 
+Enables or disables the insertion of [default values](../../sql-reference/statements/create/table.md/#create-default-values) instead of [NULL](../../sql-reference/syntax.md/#null-literal) into columns with not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable) data type. If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. -This setting is applicable to [INSERT ... SELECT](../../sql-reference/statements/insert-into.md#insert_query_insert-select) queries. Note that `SELECT` subqueries may be concatenated with `UNION ALL` clause. +This setting is applicable to [INSERT ... SELECT](../../sql-reference/statements/insert-into.md/#inserting-the-results-of-select) queries. Note that `SELECT` subqueries may be concatenated with `UNION ALL` clause. Possible values: @@ -287,7 +287,7 @@ Default value: `1`. ## join_default_strictness {#settings-join_default_strictness} -Sets default strictness for [JOIN clauses](../../sql-reference/statements/select/join.md#select-join). +Sets default strictness for [JOIN clauses](../../sql-reference/statements/select/join.md/#select-join). Possible values: @@ -322,7 +322,7 @@ When using `partial_merge` algorithm, ClickHouse sorts the data and dumps it to - `direct` - can be applied when the right storage supports key-value requests. -The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It's supported only by special storage such as [Dictionary](../../engines/table-engines/special/dictionary.md#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only the `LEFT` and `INNER` JOINs. +The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It's supported only by special storage such as [Dictionary](../../engines/table-engines/special/dictionary.md/#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only the `LEFT` and `INNER` JOINs. - `auto` — try `hash` join and switch on the fly to another algorithm if the memory limit is violated. @@ -348,7 +348,7 @@ Default value: 0. See also: -- [JOIN clause](../../sql-reference/statements/select/join.md#select-join) +- [JOIN clause](../../sql-reference/statements/select/join.md/#select-join) - [Join table engine](../../engines/table-engines/special/join.md) - [join_default_strictness](#settings-join_default_strictness) @@ -359,7 +359,7 @@ Sets the type of [JOIN](../../sql-reference/statements/select/join.md) behaviour Possible values: - 0 — The empty cells are filled with the default value of the corresponding field type. -- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable), and empty cells are filled with [NULL](../../sql-reference/syntax.md). +- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable), and empty cells are filled with [NULL](../../sql-reference/syntax.md). Default value: 0. @@ -431,7 +431,7 @@ Default value: 0. 
See also: -- [JOIN strictness](../../sql-reference/statements/select/join.md#join-settings) +- [JOIN strictness](../../sql-reference/statements/select/join.md/#join-settings) ## temporary_files_codec {#temporary_files_codec} @@ -532,7 +532,7 @@ Default value: 8. If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it does not use the cache of uncompressed blocks. -The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. Possible values: @@ -544,7 +544,7 @@ Default value: 128 ✕ 8192. If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it does not use the cache of uncompressed blocks. -The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. Possible values: @@ -594,7 +594,7 @@ Default value: `1`. Setting up query logging. -Queries sent to ClickHouse with this setup are logged according to the rules in the [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) server configuration parameter. +Queries sent to ClickHouse with this setup are logged according to the rules in the [query_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query-log) server configuration parameter. Example: @@ -639,7 +639,7 @@ log_queries_min_type='EXCEPTION_WHILE_PROCESSING' Setting up query threads logging. -Query threads log into [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting have effect only when [log_queries](#settings-log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_thread_log) server configuration parameter. 
+Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#settings-log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.

Possible values:

@@ -658,7 +658,7 @@ log_query_threads=1

Setting up query views logging.

-When a query run by ClickHouse with this setup on has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_views_log) server configuration parameter.
+When a query run by ClickHouse with this setup on has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_views_log) server configuration parameter.

Example:

@@ -668,7 +668,7 @@ log_query_views=1

## log_formatted_queries {#settings-log-formatted-queries}

-Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table (populates `formatted_query` column in the [system.query_log](../../operations/system-tables/query_log.md)).
+Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table (populates `formatted_query` column in the [system.query_log](../../operations/system-tables/query_log.md)).

Possible values:

@@ -884,7 +884,7 @@ Default value: `5`.

## max_replicated_fetches_network_bandwidth_for_server {#max_replicated_fetches_network_bandwidth_for_server}

-Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) fetches for the server. Only has meaning at server startup. You can also limit the speed for a particular table with [max_replicated_fetches_network_bandwidth](../../operations/settings/merge-tree-settings.md#max_replicated_fetches_network_bandwidth) setting.
+Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) fetches for the server. Only has meaning at server startup. You can also limit the speed for a particular table with [max_replicated_fetches_network_bandwidth](../../operations/settings/merge-tree-settings.md/#max_replicated_fetches_network_bandwidth) setting.

The setting isn't followed perfectly accurately.

@@ -905,7 +905,7 @@ Could be used for throttling speed when replicating the data to add or replace new nodes.

## max_replicated_sends_network_bandwidth_for_server {#max_replicated_sends_network_bandwidth_for_server}

-Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) sends for the server. Only has meaning at server startup. You can also limit the speed for a particular table with [max_replicated_sends_network_bandwidth](../../operations/settings/merge-tree-settings.md#max_replicated_sends_network_bandwidth) setting.
+Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) sends for the server. Only has meaning at server startup.
You can also limit the speed for a particular table with [max_replicated_sends_network_bandwidth](../../operations/settings/merge-tree-settings.md/#max_replicated_sends_network_bandwidth) setting. The setting isn't followed perfectly accurately. @@ -955,7 +955,7 @@ For more information, see the section “Extreme values”. ## kafka_max_wait_ms {#kafka-max-wait-ms} -The wait time in milliseconds for reading messages from [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) before retry. +The wait time in milliseconds for reading messages from [Kafka](../../engines/table-engines/integrations/kafka.md/#kafka) before retry. Possible values: @@ -977,7 +977,7 @@ Default value: false. ## use_uncompressed_cache {#setting-use_uncompressed_cache} Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). -Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. +Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically to save space for truly small queries. This means that you can keep the ‘use_uncompressed_cache’ setting always set to 1. @@ -1124,7 +1124,7 @@ This setting is useful for replicated tables with a sampling key. A query may be - The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency. :::warning -This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md#max_parallel_replica-subqueries) for more details. +This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md/#max_parallel_replica-subqueries) for more details. ::: ## compile_expressions {#compile-expressions} @@ -1261,7 +1261,7 @@ Possible values: Default value: 1. By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). 
-For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
+For replicated tables, by default only the 100 most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window).

## deduplicate_blocks_in_dependent_materialized_views {#settings-deduplicate-blocks-in-dependent-materialized-views}

@@ -1296,7 +1296,7 @@ Default value: empty string (disabled)

`insert_deduplication_token` is used for deduplication _only_ when not empty.

-For the replicated tables by default the only 100 of the most recent inserts for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
+For replicated tables, by default only the 100 most recent inserts for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window).

Example:

@@ -1373,15 +1373,15 @@ Default value: 0.

## count_distinct_implementation {#settings-count_distinct_implementation}

-Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) construction.
+Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction.

Possible values:

-- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)
-- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined)
-- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64)
-- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12)
-- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)
+- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md/#agg_function-uniq)
+- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md/#agg_function-uniqcombined)
+- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md/#agg_function-uniqcombined64)
+- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md/#agg_function-uniqhll12)
+- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md/#agg_function-uniqexact)

Default value: `uniqExact`.
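For illustration, a minimal sketch of `count_distinct_implementation` in action (the table `hits` and the column `user_id` are hypothetical):

```sql
-- Use the approximate uniqCombined function instead of the exact
-- (and more memory-hungry) default uniqExact:
SET count_distinct_implementation = 'uniqCombined';

-- This COUNT(DISTINCT ...) is now executed as uniqCombined(user_id):
SELECT COUNT(DISTINCT user_id) FROM hits;
```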
@@ -1616,14 +1616,14 @@ Enables or disables optimization by transforming some functions to reading subco These functions can be transformed: -- [length](../../sql-reference/functions/array-functions.md#array_functions-length) to read the [size0](../../sql-reference/data-types/array.md#array-size) subcolumn. -- [empty](../../sql-reference/functions/array-functions.md#function-empty) to read the [size0](../../sql-reference/data-types/array.md#array-size) subcolumn. -- [notEmpty](../../sql-reference/functions/array-functions.md#function-notempty) to read the [size0](../../sql-reference/data-types/array.md#array-size) subcolumn. -- [isNull](../../sql-reference/operators/index.md#operator-is-null) to read the [null](../../sql-reference/data-types/nullable.md#finding-null) subcolumn. -- [isNotNull](../../sql-reference/operators/index.md#is-not-null) to read the [null](../../sql-reference/data-types/nullable.md#finding-null) subcolumn. -- [count](../../sql-reference/aggregate-functions/reference/count.md) to read the [null](../../sql-reference/data-types/nullable.md#finding-null) subcolumn. -- [mapKeys](../../sql-reference/functions/tuple-map-functions.md#mapkeys) to read the [keys](../../sql-reference/data-types/map.md#map-subcolumns) subcolumn. -- [mapValues](../../sql-reference/functions/tuple-map-functions.md#mapvalues) to read the [values](../../sql-reference/data-types/map.md#map-subcolumns) subcolumn. +- [length](../../sql-reference/functions/array-functions.md/#array_functions-length) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. +- [empty](../../sql-reference/functions/array-functions.md/#function-empty) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. +- [notEmpty](../../sql-reference/functions/array-functions.md/#function-notempty) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. +- [isNull](../../sql-reference/operators/index.md#operator-is-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [isNotNull](../../sql-reference/operators/index.md#is-not-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [count](../../sql-reference/aggregate-functions/reference/count.md) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [mapKeys](../../sql-reference/functions/tuple-map-functions.md/#mapkeys) to read the [keys](../../sql-reference/data-types/map.md/#map-subcolumns) subcolumn. +- [mapValues](../../sql-reference/functions/tuple-map-functions.md/#mapvalues) to read the [values](../../sql-reference/data-types/map.md/#map-subcolumns) subcolumn. Possible values: @@ -1782,7 +1782,7 @@ Default value: 1000000000 nanoseconds (once a second). See also: -- System table [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log) +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) ## query_profiler_cpu_time_period_ns {#query_profiler_cpu_time_period_ns} @@ -1805,7 +1805,42 @@ Default value: 1000000000 nanoseconds. See also: -- System table [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log) +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) + +## memory_profiler_step {#memory_profiler_step} + +Sets the step of memory profiler. 
Whenever query memory usage exceeds the next multiple of this step (in bytes), the memory profiler collects the allocating stacktrace and writes it into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log).
+
+Possible values:
+
+- A positive integer number of bytes.
+
+- 0 for turning off the memory profiler.
+
+Default value: 4,194,304 bytes (4 MiB).
+
+## memory_profiler_sample_probability {#memory_profiler_sample_probability}
+
+Sets the probability of collecting stacktraces at random allocations and deallocations and writing them into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log).
+
+Possible values:
+
+- A positive floating-point number in the range [0..1].
+
+- 0.0 for turning off the memory sampling.
+
+Default value: 0.0.
+
+## trace_profile_events {#trace_profile_events}
+
+Enables or disables collecting stacktraces on each update of profile events, along with the name of the profile event and the value of the increment, and sending them into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log).
+
+Possible values:
+
+- 1 — Tracing of profile events enabled.
+- 0 — Tracing of profile events disabled.
+
+Default value: 0.

## allow_introspection_functions {#settings-allow_introspection_functions}

@@ -1821,11 +1856,11 @@ Default value: 0.

**See Also**

- [Sampling Query Profiler](../../operations/optimizing-performance/sampling-query-profiler.md)
-- System table [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log)
+- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log)

## input_format_parallel_parsing {#input-format-parallel-parsing}

-Enables or disables order-preserving parallel parsing of data formats. Supported only for [TSV](../../interfaces/formats.md#tabseparated), [TKSV](../../interfaces/formats.md#tskv), [CSV](../../interfaces/formats.md#csv) and [JSONEachRow](../../interfaces/formats.md#jsoneachrow) formats.
+Enables or disables order-preserving parallel parsing of data formats. Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats.

Possible values:

@@ -1836,7 +1871,7 @@ Default value: `1`.

## output_format_parallel_formatting {#output-format-parallel-formatting}

-Enables or disables parallel formatting of data formats. Supported only for [TSV](../../interfaces/formats.md#tabseparated), [TKSV](../../interfaces/formats.md#tskv), [CSV](../../interfaces/formats.md#csv) and [JSONEachRow](../../interfaces/formats.md#jsoneachrow) formats.
+Enables or disables parallel formatting of data formats. Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats.

Possible values:

@@ -1878,7 +1913,7 @@ Default value: 0.

## insert_distributed_sync {#insert_distributed_sync}

-Enables or disables synchronous data insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table.
+Enables or disables synchronous data insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table.

By default, when inserting data into a `Distributed` table, the ClickHouse server sends data to cluster nodes in asynchronous mode.
When `insert_distributed_sync=1`, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true). @@ -1891,12 +1926,12 @@ Default value: `0`. **See Also** -- [Distributed Table Engine](../../engines/table-engines/special/distributed.md#distributed) -- [Managing Distributed Tables](../../sql-reference/statements/system.md#query-language-system-distributed) +- [Distributed Table Engine](../../engines/table-engines/special/distributed.md/#distributed) +- [Managing Distributed Tables](../../sql-reference/statements/system.md/#query-language-system-distributed) ## insert_shard_id {#insert_shard_id} -If not `0`, specifies the shard of [Distributed](../../engines/table-engines/special/distributed.md#distributed) table into which the data will be inserted synchronously. +If not `0`, specifies the shard of [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table into which the data will be inserted synchronously. If `insert_shard_id` value is incorrect, the server will throw an exception. @@ -1909,7 +1944,7 @@ SELECT uniq(shard_num) FROM system.clusters WHERE cluster = 'requested_cluster'; Possible values: - 0 — Disabled. -- Any number from `1` to `shards_num` of corresponding [Distributed](../../engines/table-engines/special/distributed.md#distributed) table. +- Any number from `1` to `shards_num` of corresponding [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table. Default value: `0`. @@ -1969,7 +2004,7 @@ Default value: 16. ## background_move_pool_size {#background_move_pool_size} -Sets the number of threads performing background moves of data parts for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)-engine tables. This setting is applied at the ClickHouse server start and can’t be changed in a user session. +Sets the number of threads performing background moves of data parts for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-multiple-volumes)-engine tables. This setting is applied at the ClickHouse server start and can’t be changed in a user session. Possible values: @@ -1979,7 +2014,7 @@ Default value: 8. ## background_schedule_pool_size {#background_schedule_pool_size} -Sets the number of threads performing background tasks for [replicated](../../engines/table-engines/mergetree-family/replication.md) tables, [Kafka](../../engines/table-engines/integrations/kafka.md) streaming, [DNS cache updates](../../operations/server-configuration-parameters/settings.md#server-settings-dns-cache-update-period). This setting is applied at ClickHouse server start and can’t be changed in a user session. +Sets the number of threads performing background tasks for [replicated](../../engines/table-engines/mergetree-family/replication.md) tables, [Kafka](../../engines/table-engines/integrations/kafka.md) streaming, [DNS cache updates](../../operations/server-configuration-parameters/settings.md/#server-settings-dns-cache-update-period). This setting is applied at ClickHouse server start and can’t be changed in a user session. Possible values: @@ -2036,8 +2071,8 @@ Default value: 16. **See Also** -- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine. -- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine. 
+- [Kafka](../../engines/table-engines/integrations/kafka.md/#kafka) engine. +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md/#rabbitmq-engine) engine. ## validate_polygons {#validate_polygons} @@ -2052,7 +2087,7 @@ Default value: 1. ## transform_null_in {#transform_null_in} -Enables equality of [NULL](../../sql-reference/syntax.md#null-literal) values for [IN](../../sql-reference/operators/in.md) operator. +Enables equality of [NULL](../../sql-reference/syntax.md/#null-literal) values for [IN](../../sql-reference/operators/in.md) operator. By default, `NULL` values can’t be compared because `NULL` means undefined value. Thus, comparison `expr = NULL` must always return `false`. With this setting `NULL = NULL` returns `true` for `IN` operator. @@ -2106,7 +2141,7 @@ Result: **See Also** -- [NULL Processing in IN Operators](../../sql-reference/operators/in.md#in-null-processing) +- [NULL Processing in IN Operators](../../sql-reference/operators/in.md/#in-null-processing) ## low_cardinality_max_dictionary_size {#low_cardinality_max_dictionary_size} @@ -2133,7 +2168,7 @@ Default value: 0. ## low_cardinality_allow_in_native_format {#low_cardinality_allow_in_native_format} -Allows or restricts using the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type with the [Native](../../interfaces/formats.md#native) format. +Allows or restricts using the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type with the [Native](../../interfaces/formats.md/#native) format. If usage of `LowCardinality` is restricted, ClickHouse server converts `LowCardinality`-columns to ordinary ones for `SELECT` queries, and convert ordinary columns to `LowCardinality`-columns for `INSERT` queries. @@ -2197,7 +2232,7 @@ Default value: 268435456. ## optimize_read_in_order {#optimize_read_in_order} -Enables [ORDER BY](../../sql-reference/statements/select/order-by.md#optimize_read_in_order) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for reading data from [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. +Enables [ORDER BY](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for reading data from [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. Possible values: @@ -2208,7 +2243,7 @@ Default value: `1`. **See Also** -- [ORDER BY Clause](../../sql-reference/statements/select/order-by.md#optimize_read_in_order) +- [ORDER BY Clause](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) ## optimize_aggregation_in_order {#optimize_aggregation_in_order} @@ -2223,7 +2258,7 @@ Default value: `0`. **See Also** -- [GROUP BY optimization](../../sql-reference/statements/select/group-by.md#aggregation-in-order) +- [GROUP BY optimization](../../sql-reference/statements/select/group-by.md/#aggregation-in-order) ## mutations_sync {#mutations_sync} @@ -2261,8 +2296,8 @@ Default value: `0`. 
**See Also** -- [CREATE TABLE query clauses and settings](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) (`merge_with_ttl_timeout` setting) -- [Table TTL](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl) +- [CREATE TABLE query clauses and settings](../../engines/table-engines/mergetree-family/mergetree.md/#mergetree-query-clauses) (`merge_with_ttl_timeout` setting) +- [Table TTL](../../engines/table-engines/mergetree-family/mergetree.md/#mergetree-table-ttl) ## lock_acquire_timeout {#lock_acquire_timeout} @@ -2279,7 +2314,7 @@ Default value: `120` seconds. ## cast_keep_nullable {#cast_keep_nullable} -Enables or disables keeping of the `Nullable` data type in [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) operations. +Enables or disables keeping of the `Nullable` data type in [CAST](../../sql-reference/functions/type-conversion-functions.md/#type_conversion_function-cast) operations. When the setting is enabled and the argument of `CAST` function is `Nullable`, the result is also transformed to `Nullable` type. When the setting is disabled, the result always has the destination type exactly. @@ -2324,7 +2359,7 @@ Result: **See Also** -- [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function +- [CAST](../../sql-reference/functions/type-conversion-functions.md/#type_conversion_function-cast) function ## system_events_show_zero_values {#system_events_show_zero_values} @@ -2369,7 +2404,7 @@ Result ## persistent {#persistent} -Disables persistency for the [Set](../../engines/table-engines/special/set.md#set) and [Join](../../engines/table-engines/special/join.md#join) table engines. +Disables persistency for the [Set](../../engines/table-engines/special/set.md/#set) and [Join](../../engines/table-engines/special/join.md/#join) table engines. Reduces the I/O overhead. Suitable for scenarios that pursue performance and do not require persistence. @@ -2382,7 +2417,7 @@ Default value: `1`. ## allow_nullable_key {#allow-nullable-key} -Allows using of the [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable)-typed values in a sorting and a primary key for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) tables. +Allows using of the [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable)-typed values in a sorting and a primary key for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md/#table_engines-mergetree) tables. Possible values: @@ -2401,7 +2436,7 @@ Do not enable this feature in version `<= 21.8`. It's not properly implemented a ## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty} -Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility. +Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md/#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility. It is implemented via query rewrite (similar to [count_distinct_implementation](#settings-count_distinct_implementation) setting) to get consistent results for distributed queries. 
Possible values: @@ -2448,7 +2483,7 @@ See examples in [UNION](../../sql-reference/statements/select/union.md). ## data_type_default_nullable {#data_type_default_nullable} -Allows data types without explicit modifiers [NULL or NOT NULL](../../sql-reference/statements/create/table.md#null-modifiers) in column definition will be [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable). +Allows data types without explicit modifiers [NULL or NOT NULL](../../sql-reference/statements/create/table.md/#null-modifiers) in column definition will be [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable). Possible values: @@ -2478,7 +2513,7 @@ It can be useful when merges are CPU bounded not IO bounded (performing heavy da ## max_final_threads {#max-final-threads} -Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. +Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md/#select-from-final) modifier. Possible values: @@ -2551,7 +2586,7 @@ Result: └─────────────┘ ``` -Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md) behaviour. +Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md/#materialized) and [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md) behaviour. ## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} @@ -2608,7 +2643,7 @@ Default value: `0`. ## allow_experimental_live_view {#allow-experimental-live-view} -Allows creation of experimental [live views](../../sql-reference/statements/create/view.md#live-view). +Allows creation of experimental [live views](../../sql-reference/statements/create/view.md/#live-view). Possible values: @@ -2619,19 +2654,19 @@ Default value: `0`. ## live_view_heartbeat_interval {#live-view-heartbeat-interval} -Sets the heartbeat interval in seconds to indicate [live view](../../sql-reference/statements/create/view.md#live-view) is alive . +Sets the heartbeat interval in seconds to indicate [live view](../../sql-reference/statements/create/view.md/#live-view) is alive . Default value: `15`. ## max_live_view_insert_blocks_before_refresh {#max-live-view-insert-blocks-before-refresh} -Sets the maximum number of inserted blocks after which mergeable blocks are dropped and query for [live view](../../sql-reference/statements/create/view.md#live-view) is re-executed. +Sets the maximum number of inserted blocks after which mergeable blocks are dropped and query for [live view](../../sql-reference/statements/create/view.md/#live-view) is re-executed. Default value: `64`. ## periodic_live_view_refresh {#periodic-live-view-refresh} -Sets the interval in seconds after which periodically refreshed [live view](../../sql-reference/statements/create/view.md#live-view) is forced to refresh. +Sets the interval in seconds after which periodically refreshed [live view](../../sql-reference/statements/create/view.md/#live-view) is forced to refresh. Default value: `60`. @@ -2670,7 +2705,7 @@ Default value: 180. 
## check_query_single_value_result {#check_query_single_value_result}

-Defines the level of detail for the [CHECK TABLE](../../sql-reference/statements/check-table.md#checking-mergetree-tables) query result for `MergeTree` family engines .
+Defines the level of detail for the [CHECK TABLE](../../sql-reference/statements/check-table.md/#checking-mergetree-tables) query result for `MergeTree` family engines.

Possible values:

@@ -2681,7 +2716,7 @@ Default value: `0`.

## prefer_column_name_to_alias {#prefer-column-name-to-alias}

-Enables or disables using the original column names instead of aliases in query expressions and clauses. It especially matters when alias is the same as the column name, see [Expression Aliases](../../sql-reference/syntax.md#notes-on-usage). Enable this setting to make aliases syntax rules in ClickHouse more compatible with most other database engines.
+Enables or disables using the original column names instead of aliases in query expressions and clauses. It especially matters when an alias is the same as the column name, see [Expression Aliases](../../sql-reference/syntax.md/#notes-on-usage). Enable this setting to make alias syntax rules in ClickHouse more compatible with most other database engines.

Possible values:

@@ -2725,7 +2760,7 @@ Result:

## limit {#limit}

-Sets the maximum number of rows to get from the query result. It adjusts the value set by the [LIMIT](../../sql-reference/statements/select/limit.md#limit-clause) clause, so that the limit, specified in the query, cannot exceed the limit, set by this setting.
+Sets the maximum number of rows to get from the query result. It adjusts the value set by the [LIMIT](../../sql-reference/statements/select/limit.md/#limit-clause) clause, so that the limit specified in the query cannot exceed the limit set by this setting.

Possible values:

@@ -2736,7 +2771,7 @@ Default value: `0`.

## offset {#offset}

-Sets the number of rows to skip before starting to return rows from the query. It adjusts the offset set by the [OFFSET](../../sql-reference/statements/select/offset.md#offset-fetch) clause, so that these two values are summarized.
+Sets the number of rows to skip before starting to return rows from the query. It adjusts the offset set by the [OFFSET](../../sql-reference/statements/select/offset.md/#offset-fetch) clause, so that these two values are summed.

Possible values:

@@ -2773,7 +2808,7 @@ Result:

## optimize_syntax_fuse_functions {#optimize_syntax_fuse_functions}

-Enables to fuse aggregate functions with identical argument. It rewrites query contains at least two aggregate functions from [sum](../../sql-reference/aggregate-functions/reference/sum.md#agg_function-sum), [count](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) or [avg](../../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg) with identical argument to [sumCount](../../sql-reference/aggregate-functions/reference/sumcount.md#agg_function-sumCount).
+Enables fusing aggregate functions with an identical argument. It rewrites a query that contains at least two aggregate functions from [sum](../../sql-reference/aggregate-functions/reference/sum.md/#agg_function-sum), [count](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) or [avg](../../sql-reference/aggregate-functions/reference/avg.md/#agg_function-avg) with an identical argument to use [sumCount](../../sql-reference/aggregate-functions/reference/sumcount.md/#agg_function-sumCount).
Possible values: @@ -2932,18 +2967,18 @@ If the setting is set to `0`, the table function does not make Nullable columns ## allow_experimental_projection_optimization {#allow-experimental-projection-optimization} -Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md#projections) optimization when processing `SELECT` queries. +Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md/#projections) optimization when processing `SELECT` queries. Possible values: - 0 — Projection optimization disabled. - 1 — Projection optimization enabled. -Default value: `0`. +Default value: `1`. ## force_optimize_projection {#force-optimize-projection} -Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md#projections) in `SELECT` queries, when projection optimization is enabled (see [allow_experimental_projection_optimization](#allow-experimental-projection-optimization) setting). +Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md/#projections) in `SELECT` queries, when projection optimization is enabled (see [allow_experimental_projection_optimization](#allow-experimental-projection-optimization) setting). Possible values: @@ -2978,7 +3013,7 @@ Default value: `120` seconds. ## regexp_max_matches_per_row {#regexp-max-matches-per-row} -Sets the maximum number of matches for a single regular expression per row. Use it to protect against memory overload when using greedy regular expression in the [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md#extractallgroups-horizontal) function. +Sets the maximum number of matches for a single regular expression per row. Use it to protect against memory overload when using greedy regular expression in the [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md/#extractallgroups-horizontal) function. Possible values: @@ -3010,7 +3045,7 @@ Default value: `1`. ## short_circuit_function_evaluation {#short-circuit-function-evaluation} -Allows calculating the [if](../../sql-reference/functions/conditional-functions.md#if), [multiIf](../../sql-reference/functions/conditional-functions.md#multiif), [and](../../sql-reference/functions/logical-functions.md#logical-and-function), and [or](../../sql-reference/functions/logical-functions.md#logical-or-function) functions according to a [short scheme](https://en.wikipedia.org/wiki/Short-circuit_evaluation). This helps optimize the execution of complex expressions in these functions and prevent possible exceptions (such as division by zero when it is not expected). +Allows calculating the [if](../../sql-reference/functions/conditional-functions.md/#if), [multiIf](../../sql-reference/functions/conditional-functions.md/#multiif), [and](../../sql-reference/functions/logical-functions.md/#logical-and-function), and [or](../../sql-reference/functions/logical-functions.md/#logical-or-function) functions according to a [short scheme](https://en.wikipedia.org/wiki/Short-circuit_evaluation). This helps optimize the execution of complex expressions in these functions and prevent possible exceptions (such as division by zero when it is not expected). Possible values: @@ -3022,7 +3057,7 @@ Default value: `enable`. 
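For illustration, a minimal sketch of what short-circuit evaluation changes: with `short_circuit_function_evaluation = 'enable'` (the default), the division below is computed only for rows where the condition does not already determine the result, so the query does not throw a division-by-zero exception:

```sql
-- intDiv(42, number) is skipped for the row where number = 0:
SELECT number, if(number = 0, 0, intDiv(42, number)) FROM numbers(5);
```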
## max_hyperscan_regexp_length {#max-hyperscan-regexp-length} -Defines the maximum length for each regular expression in the [hyperscan multi-match functions](../../sql-reference/functions/string-search-functions.md#multimatchanyhaystack-pattern1-pattern2-patternn). +Defines the maximum length for each regular expression in the [hyperscan multi-match functions](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). Possible values: @@ -3065,7 +3100,7 @@ Exception: Regexp length too large. ## max_hyperscan_regexp_total_length {#max-hyperscan-regexp-total-length} -Sets the maximum length total of all regular expressions in each [hyperscan multi-match function](../../sql-reference/functions/string-search-functions.md#multimatchanyhaystack-pattern1-pattern2-patternn). +Sets the maximum length total of all regular expressions in each [hyperscan multi-match function](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). Possible values: @@ -3142,8 +3177,8 @@ Result: ## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions} Enables or disables returning results of type: -- `Date32` with extended range (compared to type `Date`) for functions [toStartOfYear](../../sql-reference/functions/date-time-functions.md#tostartofyear), [toStartOfISOYear](../../sql-reference/functions/date-time-functions.md#tostartofisoyear), [toStartOfQuarter](../../sql-reference/functions/date-time-functions.md#tostartofquarter), [toStartOfMonth](../../sql-reference/functions/date-time-functions.md#tostartofmonth), [toStartOfWeek](../../sql-reference/functions/date-time-functions.md#tostartofweek), [toMonday](../../sql-reference/functions/date-time-functions.md#tomonday) and [toLastDayOfMonth](../../sql-reference/functions/date-time-functions.md#tolastdayofmonth). -- `DateTime64` with extended range (compared to type `DateTime`) for functions [toStartOfDay](../../sql-reference/functions/date-time-functions.md#tostartofday), [toStartOfHour](../../sql-reference/functions/date-time-functions.md#tostartofhour), [toStartOfMinute](../../sql-reference/functions/date-time-functions.md#tostartofminute), [toStartOfFiveMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffiveminutes), [toStartOfTenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoftenminutes), [toStartOfFifteenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffifteenminutes) and [timeSlot](../../sql-reference/functions/date-time-functions.md#timeslot). +- `Date32` with extended range (compared to type `Date`) for functions [toStartOfYear](../../sql-reference/functions/date-time-functions.md/#tostartofyear), [toStartOfISOYear](../../sql-reference/functions/date-time-functions.md/#tostartofisoyear), [toStartOfQuarter](../../sql-reference/functions/date-time-functions.md/#tostartofquarter), [toStartOfMonth](../../sql-reference/functions/date-time-functions.md/#tostartofmonth), [toStartOfWeek](../../sql-reference/functions/date-time-functions.md/#tostartofweek), [toMonday](../../sql-reference/functions/date-time-functions.md/#tomonday) and [toLastDayOfMonth](../../sql-reference/functions/date-time-functions.md/#tolastdayofmonth). 
+- `DateTime64` with extended range (compared to type `DateTime`) for functions [toStartOfDay](../../sql-reference/functions/date-time-functions.md/#tostartofday), [toStartOfHour](../../sql-reference/functions/date-time-functions.md/#tostartofhour), [toStartOfMinute](../../sql-reference/functions/date-time-functions.md/#tostartofminute), [toStartOfFiveMinutes](../../sql-reference/functions/date-time-functions.md/#tostartoffiveminutes), [toStartOfTenMinutes](../../sql-reference/functions/date-time-functions.md/#tostartoftenminutes), [toStartOfFifteenMinutes](../../sql-reference/functions/date-time-functions.md/#tostartoffifteenminutes) and [timeSlot](../../sql-reference/functions/date-time-functions.md/#timeslot). Possible values: @@ -3167,7 +3202,7 @@ Default value: `1`. ## optimize_move_to_prewhere_if_final {#optimize_move_to_prewhere_if_final} -Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md/#select-from-final) modifier. Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. @@ -3184,7 +3219,7 @@ Default value: `0`. ## describe_include_subcolumns {#describe_include_subcolumns} -Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md#finding-null) or an [Array](../../sql-reference/data-types/array.md#array-size) data type. +Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md/#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md/#finding-null) or an [Array](../../sql-reference/data-types/array.md/#array-size) data type. Possible values: @@ -3283,7 +3318,7 @@ Default value: `0`. ## alter_partition_verbose_result {#alter-partition-verbose-result} Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. -Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition). +Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md/#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md/#alter_freeze-partition). Possible values: @@ -3399,6 +3434,17 @@ Use schema from cache for URL with last modification time validation (for urls w Default value: `true`. +## use_structure_from_insertion_table_in_table_functions {#use_structure_from_insertion_table_in_table_functions} + +Use the structure from the insertion table instead of inferring a schema from the data. + +Possible values: +- 0 - disabled +- 1 - enabled +- 2 - auto + +Default value: 2.
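A minimal sketch of the new setting's effect (the table and file names are hypothetical): when inserting from a table function into an existing table, the target table's column list is used instead of inferring a schema from the data.

```sql
SET use_structure_from_insertion_table_in_table_functions = 1;

CREATE TABLE events (id UInt64, name String) ENGINE = Memory;

-- The structure (id UInt64, name String) is taken from `events`,
-- so no schema inference is performed on events.jsonl.
INSERT INTO events SELECT * FROM file('events.jsonl', JSONEachRow);
```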
+ ## compatibility {#compatibility} This setting changes other settings according to provided ClickHouse version. @@ -3418,11 +3464,11 @@ When writing data, ClickHouse throws an exception if input data contain columns Supported formats: -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) -- [TSKV](../../interfaces/formats.md#tskv) +- [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) +- [TSKV](../../interfaces/formats.md/#tskv) - All formats with suffixes WithNames/WithNamesAndTypes -- [JSONColumns](../../interfaces/formats.md#jsoncolumns) -- [MySQLDump](../../interfaces/formats.md#mysqldump) +- [JSONColumns](../../interfaces/formats.md/#jsoncolumns) +- [MySQLDump](../../interfaces/formats.md/#mysqldump) Possible values: @@ -3439,18 +3485,18 @@ To improve insert performance, we recommend disabling this check if you are sure Supported formats: -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) -- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes) -- [JSONCompactEachRowWithNames](../../interfaces/formats.md#jsoncompacteachrowwithnames) -- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes) -- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md#jsoncompactstringseachrowwithnames) -- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes) -- [RowBinaryWithNames](../../interfaces/formats.md#rowbinarywithnames) -- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes) -- [CustomSeparatedWithNames](../../interfaces/formats.md#customseparatedwithnames) -- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md#customseparatedwithnamesandtypes) +- [CSVWithNames](../../interfaces/formats.md/#csvwithnames) +- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) +- [TabSeparatedWithNames](../../interfaces/formats.md/#tabseparatedwithnames) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNames](../../interfaces/formats.md/#jsoncompacteachrowwithnames) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md/#jsoncompactstringseachrowwithnames) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNames](../../interfaces/formats.md/#rowbinarywithnames) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes) +- [CustomSeparatedWithNames](../../interfaces/formats.md/#customseparatedwithnames) +- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) Possible values: @@ -3465,12 +3511,12 @@ Controls whether format parser should check if data types from the input data ma Supported formats: -- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes) -- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes) -- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes) -- 
[JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes) -- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes-rowbinarywithnamesandtypes) -- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md#customseparatedwithnamesandtypes) +- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes-rowbinarywithnamesandtypes) +- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) Possible values: @@ -3481,7 +3527,7 @@ Default value: 1. ## input_format_defaults_for_omitted_fields {#input_format_defaults_for_omitted_fields} -When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv), [TabSeparated](../../interfaces/formats.md#tabseparated) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. +When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow), [CSV](../../interfaces/formats.md/#csv), [TabSeparated](../../interfaces/formats.md/#tabseparated) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. :::note When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. @@ -3496,7 +3542,7 @@ Default value: 1. ## input_format_null_as_default {#input_format_null_as_default} -Enables or disables the initialization of [NULL](../../sql-reference/syntax.md#null-literal) fields with [default values](../../sql-reference/statements/create/table.md#create-default-values), if data type of these fields is not [nullable](../../sql-reference/data-types/nullable.md#data_type-nullable). +Enables or disables the initialization of [NULL](../../sql-reference/syntax.md/#null-literal) fields with [default values](../../sql-reference/statements/create/table.md/#create-default-values), if data type of these fields is not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable). If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. This setting is applicable to [INSERT ... VALUES](../../sql-reference/statements/insert-into.md) queries for text input formats. @@ -3663,7 +3709,7 @@ Enabled by default ## insert_distributed_one_random_shard {#insert_distributed_one_random_shard} -Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key. +Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table when there is no distributed key. 
By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. @@ -3682,7 +3728,7 @@ Enables or disables the insertion of JSON data with nested objects. Supported formats: -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) Possible values: @@ -3693,7 +3739,7 @@ Default value: 0. See also: -- [Usage of Nested Structures](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format. +- [Usage of Nested Structures](../../interfaces/formats.md/#jsoneachrow-nested) with the `JSONEachRow` format. ### input_format_json_read_bools_as_numbers {#input_format_json_read_bools_as_numbers} @@ -3707,6 +3753,29 @@ Allow parsing numbers as strings in JSON input formats. Disabled by default. +### input_format_json_read_objects_as_strings {#input_format_json_read_objects_as_strings} + +Allow parsing JSON objects as strings in JSON input formats. + +Example: + +```sql +SET input_format_json_read_objects_as_strings = 1; +CREATE TABLE test (id UInt64, obj String, date Date) ENGINE=Memory(); +INSERT INTO test FORMAT JSONEachRow {"id" : 1, "obj" : {"a" : 1, "b" : "Hello"}, "date" : "2020-01-01"}; +SELECT * FROM test; +``` + +Result: + +``` +┌─id─┬─obj──────────────────────┬───────date─┐ +│ 1 │ {"a" : 1, "b" : "Hello"} │ 2020-01-01 │ +└────┴──────────────────────────┴────────────┘ +``` + +Disabled by default. + ### input_format_json_validate_types_from_metadata {#input_format_json_validate_types_from_metadata} For JSON/JSONCompact/JSONColumnsWithMetadata input formats, if this setting is set to 1, @@ -3716,7 +3785,7 @@ Enabled by default. ### output_format_json_quote_64bit_integers {#output_format_json_quote_64bit_integers} -Controls quoting of 64-bit or bigger [integers](../../sql-reference/data-types/int-uint.md) (like `UInt64` or `Int128`) when they are output in a [JSON](../../interfaces/formats.md#json) format. +Controls quoting of 64-bit or bigger [integers](../../sql-reference/data-types/int-uint.md) (like `UInt64` or `Int128`) when they are output in a [JSON](../../interfaces/formats.md/#json) format. Such integers are enclosed in quotes by default. This behavior is compatible with most JavaScript implementations. Possible values: @@ -3734,7 +3803,7 @@ Disabled by default. ### output_format_json_quote_denormals {#output_format_json_quote_denormals} -Enables `+nan`, `-nan`, `+inf`, `-inf` outputs in [JSON](../../interfaces/formats.md#json) output format. +Enables `+nan`, `-nan`, `+inf`, `-inf` outputs in [JSON](../../interfaces/formats.md/#json) output format. Possible values: @@ -3851,7 +3920,7 @@ Disabled by default. ### output_format_json_array_of_rows {#output_format_json_array_of_rows} -Enables the ability to output all rows as a JSON array in the [JSONEachRow](../../interfaces/formats.md#jsoneachrow) format. +Enables the ability to output all rows as a JSON array in the [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) format. Possible values: @@ -3904,7 +3973,7 @@ Disabled by default. ### format_json_object_each_row_column_for_object_name {#format_json_object_each_row_column_for_object_name} -The name of column that will be used for storing/writing object names in [JSONObjectEachRow](../../interfaces/formats.md#jsonobjecteachrow) format. 
+The name of the column that will be used for storing/writing object names in [JSONObjectEachRow](../../interfaces/formats.md/#jsonobjecteachrow) format. Column type should be String. If the value is empty, default names `row_{i}` will be used for object names. Default value: ''. @@ -4005,7 +4074,7 @@ Disabled by default. ### format_tsv_null_representation {#format_tsv_null_representation} -Defines the representation of `NULL` for [TSV](../../interfaces/formats.md#tabseparated) output and input formats. User can set any string as a value, for example, `My NULL`. +Defines the representation of `NULL` for [TSV](../../interfaces/formats.md/#tabseparated) output and input formats. User can set any string as a value, for example, `My NULL`. Default value: `\N`. @@ -4159,7 +4228,7 @@ Default value: `0`. ### format_csv_null_representation {#format_csv_null_representation} -Defines the representation of `NULL` for [CSV](../../interfaces/formats.md#csv) output and input formats. User can set any string as a value, for example, `My NULL`. +Defines the representation of `NULL` for [CSV](../../interfaces/formats.md/#csv) output and input formats. User can set any string as a value, for example, `My NULL`. Default value: `\N`. @@ -4198,7 +4267,7 @@ My NULL ### input_format_values_interpret_expressions {#input_format_values_interpret_expressions} -Enables or disables the full SQL parser if the fast stream parser can’t parse the data. This setting is used only for the [Values](../../interfaces/formats.md#data-format-values) format at the data insertion. For more information about syntax parsing, see the [Syntax](../../sql-reference/syntax.md) section. +Enables or disables the full SQL parser if the fast stream parser can’t parse the data. This setting is used only for the [Values](../../interfaces/formats.md/#data-format-values) format at the data insertion. For more information about syntax parsing, see the [Syntax](../../sql-reference/syntax.md) section. Possible values: @@ -4248,7 +4317,7 @@ Ok. ### input_format_values_deduce_templates_of_expressions {#input_format_values_deduce_templates_of_expressions} -Enables or disables template deduction for SQL expressions in [Values](../../interfaces/formats.md#data-format-values) format. It allows parsing and interpreting expressions in `Values` much faster if expressions in consecutive rows have the same structure. ClickHouse tries to deduce the template of an expression, parse the following rows using this template and evaluate the expression on a batch of successfully parsed rows. +Enables or disables template deduction for SQL expressions in [Values](../../interfaces/formats.md/#data-format-values) format. It allows parsing and interpreting expressions in `Values` much faster if expressions in consecutive rows have the same structure. ClickHouse tries to deduce the template of an expression, parse the following rows using this template and evaluate the expression on a batch of successfully parsed rows. Possible values: @@ -4293,7 +4362,7 @@ Default value: 1. ### input_format_arrow_import_nested {#input_format_arrow_import_nested} -Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format.
+Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Arrow](../../interfaces/formats.md/#data_types-matching-arrow) input format. Possible values: @@ -4322,7 +4391,7 @@ Disabled by default. ### output_format_arrow_low_cardinality_as_dictionary {#output_format_arrow_low_cardinality_as_dictionary} -Allows to convert the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) type to the `DICTIONARY` type of the [Arrow](../../interfaces/formats.md#data-format-arrow) format for `SELECT` queries. +Allows to convert the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) type to the `DICTIONARY` type of the [Arrow](../../interfaces/formats.md/#data-format-arrow) format for `SELECT` queries. Possible values: @@ -4341,7 +4410,7 @@ Disabled by default. ### input_format_orc_import_nested {#input_format_orc_import_nested} -Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [ORC](../../interfaces/formats.md#data-format-orc) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [ORC](../../interfaces/formats.md/#data-format-orc) input format. Possible values: @@ -4384,7 +4453,7 @@ Disabled by default. ## input_format_parquet_import_nested {#input_format_parquet_import_nested} -Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Parquet](../../interfaces/formats.md#data-format-parquet) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Parquet](../../interfaces/formats.md/#data-format-parquet) input format. Possible values: @@ -4481,7 +4550,7 @@ Disabled by default. ### input_format_avro_allow_missing_fields {#input_format_avro_allow_missing_fields} -Enables using fields that are not specified in [Avro](../../interfaces/formats.md#data-format-avro) or [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) format schema. When a field is not found in the schema, ClickHouse uses the default value instead of throwing an exception. +Enables using fields that are not specified in [Avro](../../interfaces/formats.md/#data-format-avro) or [AvroConfluent](../../interfaces/formats.md/#data-format-avro-confluent) format schema. When a field is not found in the schema, ClickHouse uses the default value instead of throwing an exception. Possible values: @@ -4492,7 +4561,7 @@ Default value: 0. ### format_avro_schema_registry_url {#format_avro_schema_registry_url} -Sets [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html) URL to use with [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) format. +Sets [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html) URL to use with [AvroConfluent](../../interfaces/formats.md/#data-format-avro-confluent) format. Default value: `Empty`. @@ -4549,7 +4618,7 @@ Default value: `250`. ### output_format_pretty_max_value_width {#output_format_pretty_max_value_width} -Limits the width of value displayed in [Pretty](../../interfaces/formats.md#pretty) formats. 
If the value width exceeds the limit, the value is cut. +Limits the width of value displayed in [Pretty](../../interfaces/formats.md/#pretty) formats. If the value width exceeds the limit, the value is cut. Possible values: @@ -4625,7 +4694,7 @@ SELECT * FROM a; ### output_format_pretty_row_numbers {#output_format_pretty_row_numbers} -Adds row numbers to output in the [Pretty](../../interfaces/formats.md#pretty) format. +Adds row numbers to output in the [Pretty](../../interfaces/formats.md/#pretty) format. Possible values: @@ -4670,52 +4739,52 @@ Delimiter between rows (for Template format). ### format_custom_escaping_rule {#format_custom_escaping_rule} -Sets the field escaping rule for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. +Sets the field escaping rule for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. Possible values: -- `'Escaped'` — Similarly to [TSV](../../interfaces/formats.md#tabseparated). -- `'Quoted'` — Similarly to [Values](../../interfaces/formats.md#data-format-values). -- `'CSV'` — Similarly to [CSV](../../interfaces/formats.md#csv). -- `'JSON'` — Similarly to [JSONEachRow](../../interfaces/formats.md#jsoneachrow). -- `'XML'` — Similarly to [XML](../../interfaces/formats.md#xml). -- `'Raw'` — Extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](../../interfaces/formats.md#tabseparatedraw). +- `'Escaped'` — Similarly to [TSV](../../interfaces/formats.md/#tabseparated). +- `'Quoted'` — Similarly to [Values](../../interfaces/formats.md/#data-format-values). +- `'CSV'` — Similarly to [CSV](../../interfaces/formats.md/#csv). +- `'JSON'` — Similarly to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow). +- `'XML'` — Similarly to [XML](../../interfaces/formats.md/#xml). +- `'Raw'` — Extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](../../interfaces/formats.md/#tabseparatedraw). Default value: `'Escaped'`. ### format_custom_field_delimiter {#format_custom_field_delimiter} -Sets the character that is interpreted as a delimiter between the fields for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. +Sets the character that is interpreted as a delimiter between the fields for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. Default value: `'\t'`. ### format_custom_row_before_delimiter {#format_custom_row_before_delimiter} -Sets the character that is interpreted as a delimiter before the field of the first column for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. +Sets the character that is interpreted as a delimiter before the field of the first column for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. Default value: `''`. ### format_custom_row_after_delimiter {#format_custom_row_after_delimiter} -Sets the character that is interpreted as a delimiter after the field of the last column for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. +Sets the character that is interpreted as a delimiter after the field of the last column for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. Default value: `'\n'`. ### format_custom_row_between_delimiter {#format_custom_row_between_delimiter} -Sets the character that is interpreted as a delimiter between the rows for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. 
+Sets the character that is interpreted as a delimiter between the rows for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. Default value: `''`. ### format_custom_result_before_delimiter {#format_custom_result_before_delimiter} -Sets the character that is interpreted as a prefix before the result set for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. +Sets the character that is interpreted as a prefix before the result set for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. Default value: `''`. ### format_custom_result_after_delimiter {#format_custom_result_after_delimiter} -Sets the character that is interpreted as a suffix after the result set for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. +Sets the character that is interpreted as a suffix after the result set for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. Default value: `''`. @@ -4727,12 +4796,12 @@ Field escaping rule. Possible values: -- `'Escaped'` — Similarly to [TSV](../../interfaces/formats.md#tabseparated). -- `'Quoted'` — Similarly to [Values](../../interfaces/formats.md#data-format-values). -- `'CSV'` — Similarly to [CSV](../../interfaces/formats.md#csv). -- `'JSON'` — Similarly to [JSONEachRow](../../interfaces/formats.md#jsoneachrow). -- `'XML'` — Similarly to [XML](../../interfaces/formats.md#xml). -- `'Raw'` — Extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](../../interfaces/formats.md#tabseparatedraw). +- `'Escaped'` — Similarly to [TSV](../../interfaces/formats.md/#tabseparated). +- `'Quoted'` — Similarly to [Values](../../interfaces/formats.md/#data-format-values). +- `'CSV'` — Similarly to [CSV](../../interfaces/formats.md/#csv). +- `'JSON'` — Similarly to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow). +- `'XML'` — Similarly to [XML](../../interfaces/formats.md/#xml). +- `'Raw'` — Extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](../../interfaces/formats.md/#tabseparatedraw). Default value: `Raw`. @@ -4746,7 +4815,7 @@ Disabled by default. ### format_capn_proto_enum_comparising_mode {#format_capn_proto_enum_comparising_mode} -Determines how to map ClickHouse `Enum` data type and [CapnProto](../../interfaces/formats.md#capnproto) `Enum` data type from schema. +Determines how to map ClickHouse `Enum` data type and [CapnProto](../../interfaces/formats.md/#capnproto) `Enum` data type from schema. Possible values: @@ -4773,7 +4842,7 @@ Possible values: Default value: 1. -## SQLInsert format settings {$sqlinsert-format-settings} +## SQLInsert format settings {#sqlinsert-format-settings} ### output_format_sql_insert_max_batch_size {#output_format_sql_insert_max_batch_size} @@ -4804,3 +4873,25 @@ Default value: `false`. Quote column names with "`" characters Default value: `true`. + +## BSONEachRow format settings {#bson-each-row-format-settings} + +### output_format_bson_string_as_string {#output_format_bson_string_as_string} + +Use BSON String type instead of Binary for String columns. + +Disabled by default. + +### input_format_bson_skip_fields_with_unsupported_types_in_schema_inference {#input_format_bson_skip_fields_with_unsupported_types_in_schema_inference} + +Allows skipping columns with unsupported types during schema inference for the BSONEachRow format. + +Disabled by default.
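As a rough illustration of the schema-inference setting above (the file name is hypothetical): when enabled, fields whose BSON types have no ClickHouse counterpart are dropped from the inferred structure instead of failing the query.

```sql
SET input_format_bson_skip_fields_with_unsupported_types_in_schema_inference = 1;

-- Columns with unsupported BSON types are omitted from the inferred schema.
DESCRIBE TABLE file('data.bson', BSONEachRow);
```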
+ +## RowBinary format settings {#row-binary-format-settings} + +### format_binary_max_string_size {#format_binary_max_string_size} + +The maximum allowed size for String in RowBinary format. It prevents allocating a large amount of memory in case of corrupted data. 0 means there is no limit. + +Default value: `1GiB` diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 43623577e66..203fe4e42d2 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -7,13 +7,13 @@ title: "External Disks for Storing Data" Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely — on [Amazon S3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)). -To work with data stored on `Amazon S3` disks use [S3](../engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](../engines/table-engines/integrations/hdfs.md) table engine. +To work with data stored on `Amazon S3` disks use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine. To load data from a web server with static files use a disk with type [web](#storing-data-on-webserver). ## Configuring HDFS {#configuring-hdfs} -[MergeTree](../engines/table-engines/mergetree-family/mergetree.md) and [Log](../engines/table-engines/log-family/log.md) family table engines can store data to HDFS using a disk with type `HDFS`. +[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to HDFS using a disk with type `HDFS`. Configuration markup: @@ -53,7 +53,7 @@ Optional parameters: ## Using Virtual File System for Data Encryption {#encrypted-virtual-file-system} -You can encrypt the data stored on [S3](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one. +You can encrypt the data stored on [S3](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one. Example of disk configuration: @@ -80,14 +80,14 @@ Required parameters: - `type` — `encrypted`. Otherwise the encrypted disk is not created. - `disk` — Type of disk for data storage. -- `key` — The key for encryption and decryption.
Type: [Uint64](../sql-reference/data-types/int-uint.md). You can use `key_hex` parameter to encrypt in hexadecimal form. +- `key` — The key for encryption and decryption. Type: [Uint64](/docs/en/sql-reference/data-types/int-uint.md). You can use `key_hex` parameter to encrypt in hexadecimal form. You can specify multiple keys using the `id` attribute (see example above). Optional parameters: - `path` — Path to the location on the disk where the data will be saved. If not specified, the data will be saved in the root directory. - `current_key_id` — The key used for encryption. All the specified keys can be used for decryption, and you can always switch to another key while maintaining access to previously encrypted data. -- `algorithm` — [Algorithm](../sql-reference/statements/create/table.md#create-query-encryption-codecs) for encryption. Possible values: `AES_128_CTR`, `AES_192_CTR` or `AES_256_CTR`. Default value: `AES_128_CTR`. The key length depends on the algorithm: `AES_128_CTR` — 16 bytes, `AES_192_CTR` — 24 bytes, `AES_256_CTR` — 32 bytes. +- `algorithm` — [Algorithm](/docs/en/sql-reference/statements/create/table.md/#create-query-encryption-codecs) for encryption. Possible values: `AES_128_CTR`, `AES_192_CTR` or `AES_256_CTR`. Default value: `AES_128_CTR`. The key length depends on the algorithm: `AES_128_CTR` — 16 bytes, `AES_192_CTR` — 24 bytes, `AES_256_CTR` — 32 bytes. Example of disk configuration: @@ -265,9 +265,9 @@ Cache profile events: There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`. -This is a read-only disk. Its data is only read and never modified. A new table is loaded to this disk via `ATTACH TABLE` query (see example below). Local disk is not actually used, each `SELECT` query will result in a `http` request to fetch required data. All modification of the table data will result in an exception, i.e. the following types of queries are not allowed: [CREATE TABLE](../sql-reference/statements/create/table.md), [ALTER TABLE](../sql-reference/statements/alter/index.md), [RENAME TABLE](../sql-reference/statements/rename.md#misc_operations-rename_table), [DETACH TABLE](../sql-reference/statements/detach.md) and [TRUNCATE TABLE](../sql-reference/statements/truncate.md). +This is a read-only disk. Its data is only read and never modified. A new table is loaded to this disk via `ATTACH TABLE` query (see example below). Local disk is not actually used, each `SELECT` query will result in a `http` request to fetch required data. All modification of the table data will result in an exception, i.e. the following types of queries are not allowed: [CREATE TABLE](/docs/en/sql-reference/statements/create/table.md), [ALTER TABLE](/docs/en/sql-reference/statements/alter/index.md), [RENAME TABLE](/docs/en/sql-reference/statements/rename.md/#misc_operations-rename_table), [DETACH TABLE](/docs/en/sql-reference/statements/detach.md) and [TRUNCATE TABLE](/docs/en/sql-reference/statements/truncate.md). -Web server storage is supported only for the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) and [Log](../engines/table-engines/log-family/log.md) engine families. 
To access the data stored on a `web` disk, use the [storage_policy](../engines/table-engines/mergetree-family/mergetree.md#terms) setting when executing the query. For example, `ATTACH TABLE table_web UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'`. +Web server storage is supported only for the [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) engine families. To access the data stored on a `web` disk, use the [storage_policy](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#terms) setting when executing the query. For example, `ATTACH TABLE table_web UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'`. A ready test case. You need to add this configuration to config: @@ -451,7 +451,7 @@ Optional parameters: - `remote_fs_read_backoff_threashold` — The maximum wait time when trying to read data for remote disk. Default value: `10000` seconds. - `remote_fs_read_backoff_max_tries` — The maximum number of attempts to read with backoff. Default value: `5`. -If a query fails with an exception `DB:Exception Unreachable URL`, then you can try to adjust the settings: [http_connection_timeout](../operations/settings/settings.md#http_connection_timeout), [http_receive_timeout](../operations/settings/settings.md#http_receive_timeout), [keep_alive_timeout](../operations/server-configuration-parameters/settings.md#keep-alive-timeout). +If a query fails with an exception `DB:Exception Unreachable URL`, then you can try to adjust the settings: [http_connection_timeout](/docs/en/operations/settings/settings.md/#http_connection_timeout), [http_receive_timeout](/docs/en/operations/settings/settings.md/#http_receive_timeout), [keep_alive_timeout](/docs/en/operations/server-configuration-parameters/settings.md/#keep-alive-timeout). To get files for upload run: `clickhouse static-files-disk-uploader --metadata-path --output-dir ` (`--metadata-path` can be found in query `SELECT data_paths FROM system.tables WHERE name = 'table_name'`). @@ -460,7 +460,7 @@ When loading files by `endpoint`, they must be loaded into `/store/` p If URL is not reachable on disk load when the server is starting up tables, then all errors are caught. If in this case there were errors, tables can be reloaded (become visible) via `DETACH TABLE table_name` -> `ATTACH TABLE table_name`. If metadata was successfully loaded at server startup, then tables are available straight away. -Use [http_max_single_read_retries](../operations/settings/settings.md#http-max-single-read-retries) setting to limit the maximum number of retries during a single HTTP read. +Use [http_max_single_read_retries](/docs/en/operations/settings/settings.md/#http-max-single-read-retries) setting to limit the maximum number of retries during a single HTTP read. ## Zero-copy Replication (not ready for production) {#zero-copy} diff --git a/docs/en/operations/system-tables/crash-log.md b/docs/en/operations/system-tables/crash-log.md index 0c0a4cd967d..a44b0db8e9b 100644 --- a/docs/en/operations/system-tables/crash-log.md +++ b/docs/en/operations/system-tables/crash-log.md @@ -7,8 +7,8 @@ Contains information about stack traces for fatal errors. The table does not exi Columns: -- `event_date` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date of the event. -- `event_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Time of the event. 
+- `event_date` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date of the event. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time of the event. - `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Timestamp of the event with nanoseconds. - `signal` ([Int32](../../sql-reference/data-types/int-uint.md)) — Signal number. - `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Thread ID. diff --git a/docs/en/operations/system-tables/dictionaries.md b/docs/en/operations/system-tables/dictionaries.md index 112e2cc2cdf..4b256f0de97 100644 --- a/docs/en/operations/system-tables/dictionaries.md +++ b/docs/en/operations/system-tables/dictionaries.md @@ -3,7 +3,7 @@ slug: /en/operations/system-tables/dictionaries --- # dictionaries -Contains information about [external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +Contains information about [dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). Columns: @@ -33,7 +33,7 @@ Columns: - `lifetime_min` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Minimum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. - `lifetime_max` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Maximum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. - `loading_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Start time for loading the dictionary. -- `last_successful_update_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — End time for loading or updating the dictionary. Helps to monitor some troubles with external sources and investigate causes. +- `last_successful_update_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — End time for loading or updating the dictionary. Helps to monitor some troubles with dictionary sources and investigate the causes. - `loading_duration` ([Float32](../../sql-reference/data-types/float.md)) — Duration of a dictionary loading. - `last_exception` ([String](../../sql-reference/data-types/string.md)) — Text of the error that occurs when creating or reloading the dictionary if the dictionary couldn’t be created. - `comment` ([String](../../sql-reference/data-types/string.md)) — Text of the comment to dictionary. 
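For example, a quick health check of loaded dictionaries might look like the following sketch, using the columns documented above:

```sql
-- Dictionaries that are not fully loaded, with the last error seen (if any)
SELECT name, status, last_exception
FROM system.dictionaries
WHERE status != 'LOADED';
```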
diff --git a/docs/en/operations/system-tables/index.md b/docs/en/operations/system-tables/index.md index e08a727a62a..5fc302cad34 100644 --- a/docs/en/operations/system-tables/index.md +++ b/docs/en/operations/system-tables/index.md @@ -1,7 +1,8 @@ --- slug: /en/operations/system-tables/ sidebar_position: 52 -sidebar_label: System Tables +sidebar_label: Overview +pagination_next: 'en/operations/system-tables/asynchronous_metric_log' --- # System Tables @@ -72,4 +73,3 @@ If procfs is supported and enabled on the system, ClickHouse server collects the - `OSReadBytes` - `OSWriteBytes` -[Original article](https://clickhouse.com/docs/en/operations/system-tables/) diff --git a/docs/en/operations/system-tables/mutations.md b/docs/en/operations/system-tables/mutations.md index 45447f3644e..d8fb91a63f5 100644 --- a/docs/en/operations/system-tables/mutations.md +++ b/docs/en/operations/system-tables/mutations.md @@ -3,31 +3,31 @@ slug: /en/operations/system-tables/mutations --- # mutations -The table contains information about [mutations](../../sql-reference/statements/alter/index.md#mutations) of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables and their progress. Each mutation command is represented by a single row. +The table contains information about [mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) of [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) tables and their progress. Each mutation command is represented by a single row. Columns: -- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database to which the mutation was applied. +- `database` ([String](/docs/en/sql-reference/data-types/string.md)) — The name of the database to which the mutation was applied. -- `table` ([String](../../sql-reference/data-types/string.md)) — The name of the table to which the mutation was applied. +- `table` ([String](/docs/en/sql-reference/data-types/string.md)) — The name of the table to which the mutation was applied. -- `mutation_id` ([String](../../sql-reference/data-types/string.md)) — The ID of the mutation. For replicated tables these IDs correspond to znode names in the `/mutations/` directory in ClickHouse Keeper. For non-replicated tables the IDs correspond to file names in the data directory of the table. +- `mutation_id` ([String](/docs/en/sql-reference/data-types/string.md)) — The ID of the mutation. For replicated tables these IDs correspond to znode names in the `/mutations/` directory in ClickHouse Keeper. For non-replicated tables the IDs correspond to file names in the data directory of the table. -- `command` ([String](../../sql-reference/data-types/string.md)) — The mutation command string (the part of the query after `ALTER TABLE [db.]table`). +- `command` ([String](/docs/en/sql-reference/data-types/string.md)) — The mutation command string (the part of the query after `ALTER TABLE [db.]table`). -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the mutation command was submitted for execution. +- `create_time` ([DateTime](/docs/en/sql-reference/data-types/datetime.md)) — Date and time when the mutation command was submitted for execution. -- `block_numbers.partition_id` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — For mutations of replicated tables, the array contains the partitions' IDs (one record for each partition). For mutations of non-replicated tables the array is empty. 
+- `block_numbers.partition_id` ([Array](/docs/en/sql-reference/data-types/array.md)([String](/docs/en/sql-reference/data-types/string.md))) — For mutations of replicated tables, the array contains the partitions' IDs (one record for each partition). For mutations of non-replicated tables the array is empty. -- `block_numbers.number` ([Array](../../sql-reference/data-types/array.md)([Int64](../../sql-reference/data-types/int-uint.md))) — For mutations of replicated tables, the array contains one record for each partition, with the block number that was acquired by the mutation. Only parts that contain blocks with numbers less than this number will be mutated in the partition. +- `block_numbers.number` ([Array](/docs/en/sql-reference/data-types/array.md)([Int64](/docs/en/sql-reference/data-types/int-uint.md))) — For mutations of replicated tables, the array contains one record for each partition, with the block number that was acquired by the mutation. Only parts that contain blocks with numbers less than this number will be mutated in the partition. In non-replicated tables, block numbers in all partitions form a single sequence. This means that for mutations of non-replicated tables, the column will contain one record with a single block number acquired by the mutation. -- `parts_to_do_names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — An array of names of data parts that need to be mutated for the mutation to complete. +- `parts_to_do_names` ([Array](/docs/en/sql-reference/data-types/array.md)([String](/docs/en/sql-reference/data-types/string.md))) — An array of names of data parts that need to be mutated for the mutation to complete. -- `parts_to_do` ([Int64](../../sql-reference/data-types/int-uint.md)) — The number of data parts that need to be mutated for the mutation to complete. +- `parts_to_do` ([Int64](/docs/en/sql-reference/data-types/int-uint.md)) — The number of data parts that need to be mutated for the mutation to complete. -- `is_done` ([UInt8](../../sql-reference/data-types/int-uint.md)) — The flag whether the mutation is done or not. Possible values: +- `is_done` ([UInt8](/docs/en/sql-reference/data-types/int-uint.md)) — The flag whether the mutation is done or not. Possible values: - `1` if the mutation is completed, - `0` if the mutation is still in process. @@ -37,16 +37,16 @@ Even if `parts_to_do = 0` it is possible that a mutation of a replicated table i If there were problems with mutating some data parts, the following columns contain additional information: -- `latest_failed_part` ([String](../../sql-reference/data-types/string.md)) — The name of the most recent part that could not be mutated. +- `latest_failed_part` ([String](/docs/en/sql-reference/data-types/string.md)) — The name of the most recent part that could not be mutated. -- `latest_fail_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — The date and time of the most recent part mutation failure. +- `latest_fail_time` ([DateTime](/docs/en/sql-reference/data-types/datetime.md)) — The date and time of the most recent part mutation failure. -- `latest_fail_reason` ([String](../../sql-reference/data-types/string.md)) — The exception message that caused the most recent part mutation failure. +- `latest_fail_reason` ([String](/docs/en/sql-reference/data-types/string.md)) — The exception message that caused the most recent part mutation failure. 
**See Also** -- [Mutations](../../sql-reference/statements/alter/index.md#mutations) -- [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine -- [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md) family +- [Mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) +- [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) table engine +- [ReplicatedMergeTree](/docs/en/engines/table-engines/mergetree-family/replication.md) family [Original article](https://clickhouse.com/docs/en/operations/system-tables/mutations) diff --git a/docs/en/operations/system-tables/parts.md b/docs/en/operations/system-tables/parts.md index f1d60896a2e..bbd5385f44b 100644 --- a/docs/en/operations/system-tables/parts.md +++ b/docs/en/operations/system-tables/parts.md @@ -75,7 +75,7 @@ Columns: - `primary_key_bytes_in_memory_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The amount of memory (in bytes) reserved for primary key values. -- `is_frozen` ([UInt8](../../sql-reference/data-types/int-uint.md)) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup does not exist. For more details, see [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition) +- `is_frozen` ([UInt8](../../sql-reference/data-types/int-uint.md)) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup does not exist. For more details, see [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md/#alter_freeze-partition) - `database` ([String](../../sql-reference/data-types/string.md)) – Name of the database. @@ -87,25 +87,25 @@ Columns: - `disk_name` ([String](../../sql-reference/data-types/string.md)) – Name of a disk that stores the data part. -- `hash_of_all_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) of compressed files. +- `hash_of_all_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md/#hash_functions-siphash128) of compressed files. -- `hash_of_uncompressed_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) of uncompressed files (files with marks, index file etc.). +- `hash_of_uncompressed_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md/#hash_functions-siphash128) of uncompressed files (files with marks, index file etc.). -- `uncompressed_hash_of_compressed_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) of data in the compressed files as if they were uncompressed. +- `uncompressed_hash_of_compressed_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md/#hash_functions-siphash128) of data in the compressed files as if they were uncompressed. -- `delete_ttl_info_min` ([DateTime](../../sql-reference/data-types/datetime.md)) — The minimum value of the date and time key for [TTL DELETE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). 
+- `delete_ttl_info_min` ([DateTime](../../sql-reference/data-types/datetime.md)) — The minimum value of the date and time key for [TTL DELETE rule](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl). -- `delete_ttl_info_max` ([DateTime](../../sql-reference/data-types/datetime.md)) — The maximum value of the date and time key for [TTL DELETE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). +- `delete_ttl_info_max` ([DateTime](../../sql-reference/data-types/datetime.md)) — The maximum value of the date and time key for [TTL DELETE rule](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl). -- `move_ttl_info.expression` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of expressions. Each expression defines a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). +- `move_ttl_info.expression` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of expressions. Each expression defines a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl). :::warning The `move_ttl_info.expression` array is kept mostly for backward compatibility, now the simpliest way to check `TTL MOVE` rule is to use the `move_ttl_info.min` and `move_ttl_info.max` fields. ::: -- `move_ttl_info.min` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the minimum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). +- `move_ttl_info.min` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the minimum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl). -- `move_ttl_info.max` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the maximum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). +- `move_ttl_info.max` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the maximum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl). - `bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Alias for `bytes_on_disk`. 
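Following the recommendation in the warning above, a sketch of inspecting `TTL MOVE` boundaries through the `min`/`max` fields (the table name is hypothetical):

```sql
-- Per-part TTL MOVE key boundaries for the active parts of a table
SELECT name, move_ttl_info.min, move_ttl_info.max
FROM system.parts
WHERE table = 'table_name' AND active;
```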
@@ -166,6 +166,6 @@ move_ttl_info.max: [] **See Also** - [MergeTree family](../../engines/table-engines/mergetree-family/mergetree.md) -- [TTL for Columns and Tables](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) +- [TTL for Columns and Tables](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl) [Original article](https://clickhouse.com/docs/en/operations/system-tables/parts) diff --git a/docs/en/operations/system-tables/replication_queue.md b/docs/en/operations/system-tables/replication_queue.md index ced20b0048a..dff3bce246a 100644 --- a/docs/en/operations/system-tables/replication_queue.md +++ b/docs/en/operations/system-tables/replication_queue.md @@ -29,7 +29,7 @@ Columns: - `MUTATE_PART` — Apply one or several mutations to the part. - `ALTER_METADATA` — Apply alter modification according to global /metadata and /columns paths. -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution. +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution. - `required_quorum` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of replicas waiting for the task to complete with confirmation of completion. This column is only relevant for the `GET_PARTS` task. @@ -47,13 +47,13 @@ Columns: - `last_exception` ([String](../../sql-reference/data-types/string.md)) — Text message about the last error that occurred (if any). -- `last_attempt_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last attempted. +- `last_attempt_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last attempted. - `num_postponed` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of postponed tasks. - `postpone_reason` ([String](../../sql-reference/data-types/string.md)) — The reason why the task was postponed. -- `last_postpone_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last postponed. +- `last_postpone_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last postponed. - `merge_type` ([String](../../sql-reference/data-types/string.md)) — Type of the current merge. Empty if it's a mutation. diff --git a/docs/en/operations/system-tables/session_log.md b/docs/en/operations/system-tables/session_log.md index 79c8ea184ce..cdf86b57ef6 100644 --- a/docs/en/operations/system-tables/session_log.md +++ b/docs/en/operations/system-tables/session_log.md @@ -24,6 +24,7 @@ Columns: - `DOUBLE_SHA1_PASSWORD` - `LDAP` - `KERBEROS` + - `SSL_CERTIFICATE` - `profiles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — The list of profiles set for all roles and/or users. - `roles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — The list of roles to which the profile is applied. - `settings` ([Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md), [String](../../sql-reference/data-types/string.md)))) — Settings that were changed when the client logged in/out. 
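With the new `SSL_CERTIFICATE` value added to `auth_type`, filtering recent certificate-based logins could look like this sketch:

```sql
-- Most recent sessions authenticated with an SSL certificate
SELECT event_time, user, auth_type
FROM system.session_log
WHERE auth_type = 'SSL_CERTIFICATE'
ORDER BY event_time DESC
LIMIT 10;
```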
diff --git a/docs/en/operations/system-tables/trace_log.md b/docs/en/operations/system-tables/trace_log.md index 0effe085b80..6299aafcae2 100644 --- a/docs/en/operations/system-tables/trace_log.md +++ b/docs/en/operations/system-tables/trace_log.md @@ -5,7 +5,8 @@ slug: /en/operations/system-tables/trace_log
Contains stack traces collected by the sampling query profiler.
-ClickHouse creates this table when the [trace_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-trace_log) server configuration section is set. Also the [query_profiler_real_time_period_ns](../../operations/settings/settings.md#query_profiler_real_time_period_ns) and [query_profiler_cpu_time_period_ns](../../operations/settings/settings.md#query_profiler_cpu_time_period_ns) settings should be set.
+ClickHouse creates this table when the [trace_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-trace_log) server configuration section is set. Also see settings: [query_profiler_real_time_period_ns](../../operations/settings/settings.md#query_profiler_real_time_period_ns), [query_profiler_cpu_time_period_ns](../../operations/settings/settings.md#query_profiler_cpu_time_period_ns), [memory_profiler_step](../../operations/settings/settings.md#memory_profiler_step),
+[memory_profiler_sample_probability](../../operations/settings/settings.md#memory_profiler_sample_probability), [trace_profile_events](../../operations/settings/settings.md#trace_profile_events).
To analyze logs, use the `addressToLine`, `addressToLineWithInlines`, `addressToSymbol` and `demangle` introspection functions. @@ -29,6 +30,8 @@ Columns:
- `CPU` represents collecting stack traces by CPU time.
- `Memory` represents collecting allocations and deallocations when memory allocation exceeds the subsequent watermark.
- `MemorySample` represents collecting random allocations and deallocations.
+ - `MemoryPeak` represents collecting updates of peak memory usage.
+ - `ProfileEvent` represents collecting increments of profile events.
- `thread_number` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Thread identifier. @@ -36,6 +39,12 @@ Columns:
- `trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process.
+- `size` ([Int64](../../sql-reference/data-types/int-uint.md)) — For trace types `Memory`, `MemorySample` or `MemoryPeak`, the amount of memory allocated; for other trace types, 0.
+
+- `event` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — For trace type `ProfileEvent`, the name of the updated profile event; for other trace types, an empty string.
+
+- `increment` ([UInt64](../../sql-reference/data-types/int-uint.md)) — For trace type `ProfileEvent`, the amount of the increment of the profile event; for other trace types, 0.
+
**Example**
``` sql
diff --git a/docs/en/operations/system-tables/users.md b/docs/en/operations/system-tables/users.md index eaeabab131b..6ef9b7b18a4 100644 --- a/docs/en/operations/system-tables/users.md +++ b/docs/en/operations/system-tables/users.md @@ -12,7 +12,7 @@ Columns:
- `storage` ([String](../../sql-reference/data-types/string.md)) — Path to the storage of users. Configured in the `access_control_path` parameter.
-- `auth_type` ([Enum8](../../sql-reference/data-types/enum.md)('no_password' = 0,'plaintext_password' = 1, 'sha256_password' = 2, 'double_sha1_password' = 3)) — Shows the authentication type. There are multiple ways of user identification: with no password, with plain text password, with [SHA256](https://ru.wikipedia.org/wiki/SHA-2)-encoded password or with [double SHA-1](https://ru.wikipedia.org/wiki/SHA-1)-encoded password.
+- `auth_type` ([Enum8](../../sql-reference/data-types/enum.md)('no_password' = 0,'plaintext_password' = 1, 'sha256_password' = 2, 'double_sha1_password' = 3, 'ldap' = 4, 'kerberos' = 5, 'ssl_certificate' = 6)) — Shows the authentication type. There are multiple ways of user identification: with no password, with plain text password, with [SHA256](https://ru.wikipedia.org/wiki/SHA-2)-encoded password or with [double SHA-1](https://ru.wikipedia.org/wiki/SHA-1)-encoded password.
- `auth_params` ([String](../../sql-reference/data-types/string.md)) — Authentication parameters in the JSON format depending on the `auth_type`.
diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index 827a7e33ea3..da34a6b7e9c 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -189,10 +189,12 @@ preAllocSize=131072
# especially if there are a lot of clients. To prevent ZooKeeper from running # out of memory due to queued requests, ZooKeeper will throttle clients so that # there is no more than globalOutstandingLimit outstanding requests in the
-# system. The default limit is 1,000.ZooKeeper logs transactions to a -# transaction log. After snapCount transactions are written to a log file a -# snapshot is started and a new transaction log file is started. The default -# snapCount is 10,000.
+# system. The default limit is 1000.
+# globalOutstandingLimit=1000
+
+# ZooKeeper logs transactions to a transaction log. After snapCount transactions
+# are written to a log file, a snapshot is started and a new transaction log file
+# is started. The default snapCount is 100000.
snapCount=3000000
# If this option is defined, requests will be logged to a trace file named @@ -284,3 +286,7 @@ end script
If you use antivirus software, configure it to skip folders with ClickHouse datafiles (`/var/lib/clickhouse`); otherwise performance may be reduced and you may experience unexpected errors during data ingestion and background merges.
[Original article](https://clickhouse.com/docs/en/operations/tips/)
+
+## Related Content
+
+- [Getting started with ClickHouse? Here are 13 "Deadly Sins" and how to avoid them](https://clickhouse.com/blog/common-getting-started-issues-with-clickhouse)
diff --git a/docs/en/operations/update.md b/docs/en/operations/update.md new file mode 100644 index 00000000000..58d4b690cb9 --- /dev/null +++ b/docs/en/operations/update.md @@ -0,0 +1,105 @@
+---
+slug: /en/operations/update
+sidebar_title: Self-managed Upgrade
+title: Self-managed Upgrade
+---
+
+## ClickHouse upgrade overview
+
+This document contains:
+- general guidelines
+- a recommended plan
+- specifics for upgrading the binaries on your systems
+
+## General guidelines
+
+These notes should help you with planning, and help you understand why we make the recommendations that we do later in the document.
+
+### Upgrade ClickHouse server separately from ClickHouse Keeper or ZooKeeper
+Unless there is a security fix needed for ClickHouse Keeper or Apache ZooKeeper, it is not necessary to upgrade Keeper when you upgrade ClickHouse server.
Keeper stability is required during the upgrade process, so complete the ClickHouse server upgrades before considering an upgrade of Keeper.
+
+### Minor version upgrades should be adopted often
+It is highly recommended to always upgrade to the newest minor version as soon as it is released. Minor releases do not have breaking changes but do have important bug fixes (and may have security fixes).
+
+
+### Test experimental features on a separate ClickHouse server running the target version
+
+The compatibility of experimental features can be broken at any moment in any way. If you are using experimental features, then check the changelogs and consider setting up a separate ClickHouse server with the target version installed and test your use of the experimental features there.
+
+### Downgrades
+If you upgrade and then realize that the new version is not compatible with some feature that you depend on, you may be able to downgrade to a recent (less than one year old) version if you have not started to use any of the new features. Once the new features are used, the downgrade will not work.
+
+### Multiple ClickHouse server versions in a cluster
+
+We make an effort to maintain a one-year compatibility window (which includes 2 LTS versions). This means that any two versions should be able to work together in a cluster if the difference between them is less than one year (or if there are fewer than two LTS versions between them). However, it is recommended to upgrade all members of a cluster to the same version as quickly as possible, as some minor issues are possible (like slowdown of distributed queries, retriable errors in some background operations in ReplicatedMergeTree, etc.).
+
+We never recommend running different versions in the same cluster when the release dates are more than one year apart. While we do not expect that you will have data loss, the cluster may become unusable. The issues that you should expect if you have more than one year's difference in versions include:
+
+- the cluster may not work
+- some (or even all) queries may fail with arbitrary errors
+- arbitrary errors/warnings may appear in the logs
+- it may be impossible to downgrade
+
+### Incremental upgrades
+
+If the difference between the current version and the target version is more than one year, then it is recommended to either:
+- Upgrade with downtime (stop all servers, upgrade all servers, start all servers).
+- Or upgrade through an intermediate version (a version less than one year more recent than the current version).
+
+
+
+## Recommended plan
+
+These are the recommended steps for a zero-downtime ClickHouse upgrade:
+
+1. Make sure that your configuration changes are not in the default `/etc/clickhouse-server/config.xml` file and that they are instead in `/etc/clickhouse-server/config.d/`, as `/etc/clickhouse-server/config.xml` could be overwritten during an upgrade.
+2. Read through the [changelogs](/docs/en/whats-new/changelog/index.md) for breaking changes (going back from the target release to the release you are currently on).
+3. Make any updates identified in the breaking changes that can be made before upgrading, and make a list of the changes that will need to be made after the upgrade.
+4. Identify one or more replicas for each shard to keep up while the rest of the replicas for each shard are upgraded.
+5.
On the replicas that will be upgraded, one at a time:
   - shut down ClickHouse server
   - upgrade the server to the target version
   - bring ClickHouse server up
   - wait for the Keeper messages to indicate that the system is stable
   - continue to the next replica
+6. Check for errors in the Keeper log and the ClickHouse log
+7. Upgrade the replicas identified in step 4 to the new version
+8. Refer to the list of changes made in steps 1 through 3 and make the changes that need to be made after the upgrade.
+
+:::note
+This error message is expected when there are multiple versions of ClickHouse running in a replicated environment. You will stop seeing these messages when all replicas are upgraded to the same version.
+```
+MergeFromLogEntryTask: Code: 40. DB::Exception: Checksums of parts don't match:
+hash of uncompressed files doesn't match. (CHECKSUM_DOESNT_MATCH) Data after merge is not
+byte-identical to data on another replicas.
+```
+:::
+
+
+## ClickHouse server binary upgrade process
+
+If ClickHouse was installed from `deb` packages, execute the following commands on the server:
+
+``` bash
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-client clickhouse-server
+$ sudo service clickhouse-server restart
+```
+
+If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method.
+
+:::note
+You can update multiple servers at once as long as there is no moment when all replicas of one shard are offline.
+:::
+
+To upgrade an older version of ClickHouse to a specific version:
+
+As an example:
+
+`xx.yy.a.b` is a current stable version. The latest stable version can be found [here](https://github.com/ClickHouse/ClickHouse/releases)
+
+```bash
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b
+$ sudo service clickhouse-server restart
+```
diff --git a/docs/en/operations/utilities/clickhouse-benchmark.md b/docs/en/operations/utilities/clickhouse-benchmark.md index 1a250ea5481..faa7ac75c74 100644 --- a/docs/en/operations/utilities/clickhouse-benchmark.md +++ b/docs/en/operations/utilities/clickhouse-benchmark.md @@ -109,56 +109,38 @@ In the report you can find:
`clickhouse-benchmark` can compare performance for two running ClickHouse servers.
-To use the comparison mode, specify endpoints of both servers by two pairs of `--host`, `--port` keys. Keys matched together by position in arguments list, the first `--host` is matched with the first `--port` and so on. `clickhouse-benchmark` establishes connections to both servers, then sends queries. Each query addressed to a randomly selected server. The results are shown for each server separately.
+To use the comparison mode, specify endpoints of both servers by two pairs of `--host`, `--port` keys. Keys are matched together by position in the argument list: the first `--host` is matched with the first `--port`, and so on. `clickhouse-benchmark` establishes connections to both servers, then sends queries. Each query is addressed to a randomly selected server. The results are shown in a table.
## Example {#clickhouse-benchmark-example}
``` bash
-$ echo "SELECT * FROM system.numbers LIMIT 10000000 OFFSET 10000000" | clickhouse-benchmark -i 10
+$ echo "SELECT * FROM system.numbers LIMIT 10000000 OFFSET 10000000" | clickhouse-benchmark --host=localhost --port=9001 --host=localhost --port=9000 -i 10
```
``` text
Loaded 1 queries.
-Queries executed: 6.
+Queries executed: 5.
-localhost:9000, queries 6, QPS: 6.153, RPS: 123398340.957, MiB/s: 941.455, result RPS: 61532982.200, result MiB/s: 469.459. +localhost:9001, queries 2, QPS: 3.764, RPS: 75446929.370, MiB/s: 575.614, result RPS: 37639659.982, result MiB/s: 287.168. +localhost:9000, queries 3, QPS: 3.815, RPS: 76466659.385, MiB/s: 583.394, result RPS: 38148392.297, result MiB/s: 291.049. -0.000% 0.159 sec. -10.000% 0.159 sec. -20.000% 0.159 sec. -30.000% 0.160 sec. -40.000% 0.160 sec. -50.000% 0.162 sec. -60.000% 0.164 sec. -70.000% 0.165 sec. -80.000% 0.166 sec. -90.000% 0.166 sec. -95.000% 0.167 sec. -99.000% 0.167 sec. -99.900% 0.167 sec. -99.990% 0.167 sec. +0.000% 0.258 sec. 0.250 sec. +10.000% 0.258 sec. 0.250 sec. +20.000% 0.258 sec. 0.250 sec. +30.000% 0.258 sec. 0.267 sec. +40.000% 0.258 sec. 0.267 sec. +50.000% 0.273 sec. 0.267 sec. +60.000% 0.273 sec. 0.267 sec. +70.000% 0.273 sec. 0.267 sec. +80.000% 0.273 sec. 0.269 sec. +90.000% 0.273 sec. 0.269 sec. +95.000% 0.273 sec. 0.269 sec. +99.000% 0.273 sec. 0.269 sec. +99.900% 0.273 sec. 0.269 sec. +99.990% 0.273 sec. 0.269 sec. - - -Queries executed: 10. - -localhost:9000, queries 10, QPS: 6.082, RPS: 121959604.568, MiB/s: 930.478, result RPS: 60815551.642, result MiB/s: 463.986. - -0.000% 0.159 sec. -10.000% 0.159 sec. -20.000% 0.160 sec. -30.000% 0.163 sec. -40.000% 0.164 sec. -50.000% 0.165 sec. -60.000% 0.166 sec. -70.000% 0.166 sec. -80.000% 0.167 sec. -90.000% 0.167 sec. -95.000% 0.170 sec. -99.000% 0.172 sec. -99.900% 0.172 sec. -99.990% 0.172 sec. +No difference proven at 99.5% confidence ``` [Original article](https://clickhouse.com/docs/en/operations/utilities/clickhouse-benchmark.md) diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index cb1b8b9a8e6..b98c7ed9dda 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -117,3 +117,8 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ``` [Original article](https://clickhouse.com/docs/en/operations/utils/clickhouse-local/) + +## Related Content + +- [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) +- [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data) diff --git a/docs/en/operations/utilities/index.md b/docs/en/operations/utilities/index.md index df4af30768c..9de68923ea4 100644 --- a/docs/en/operations/utilities/index.md +++ b/docs/en/operations/utilities/index.md @@ -1,10 +1,11 @@ --- slug: /en/operations/utilities/ sidebar_position: 56 -sidebar_label: Utilities +sidebar_label: Overview +pagination_next: 'en/operations/utilities/clickhouse-copier' --- -# ClickHouse Utility +# ClickHouse Utilities - [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without starting the ClickHouse server, similar to how `awk` does this. - [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. 
diff --git a/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md b/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md index 7b1709e6d5c..2587bc5533f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md +++ b/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md @@ -1,6 +1,7 @@
---
slug: /en/sql-reference/aggregate-functions/reference/exponentialmovingaverage
sidebar_position: 108
+sidebar_title: exponentialMovingAverage
---
## exponentialMovingAverage
diff --git a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md index 34f875e2138..1e0b1d88c6e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md @@ -32,8 +32,8 @@ The null hypothesis is that means of populations are equal. Normal distribution
- calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md).
- calculated p-value. [Float64](../../../sql-reference/data-types/float.md).
-- [calculated confidence-interval-low.] [Float64](../../../sql-reference/data-types/float.md).
-- [calculated confidence-interval-high.] [Float64](../../../sql-reference/data-types/float.md).
+- calculated confidence-interval-low. [Float64](../../../sql-reference/data-types/float.md).
+- calculated confidence-interval-high. [Float64](../../../sql-reference/data-types/float.md).
**Example**
diff --git a/docs/en/sql-reference/data-types/date32.md b/docs/en/sql-reference/data-types/date32.md index ff1a745785b..db41fdf2fc3 100644 --- a/docs/en/sql-reference/data-types/date32.md +++ b/docs/en/sql-reference/data-types/date32.md @@ -6,7 +6,7 @@ sidebar_label: Date32
# Date32
-A date. Supports the date range same with [Datetime64](../../sql-reference/data-types/datetime64.md). Stored in four bytes as the number of days since 1900-01-01. Allows storing values till 2299-12-31.
+A date. Supports the same date range as [DateTime64](../../sql-reference/data-types/datetime64.md). Stored as a signed 32-bit integer in native byte order with the value representing the days since 1970-01-01 (0 represents 1970-01-01 and negative values represent the days before 1970).
**Examples**
diff --git a/docs/en/sql-reference/data-types/datetime.md b/docs/en/sql-reference/data-types/datetime.md index 85587882e01..7f7f21ded54 100644 --- a/docs/en/sql-reference/data-types/datetime.md +++ b/docs/en/sql-reference/data-types/datetime.md @@ -4,7 +4,7 @@ sidebar_position: 48 sidebar_label: DateTime ---
-# Datetime
+# DateTime
Allows storing an instant in time that can be expressed as a calendar date and a time of day.
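As a quick, illustrative sketch (not from the original page), a `DateTime` value can be produced and inspected like this; the explicit `'UTC'` timezone argument is just an example:

``` sql
SELECT
    now() AS t,                                        -- current date and time
    toTypeName(t) AS type,                             -- shows 'DateTime'
    toDateTime('2023-01-01 12:00:00', 'UTC') AS fixed  -- parse with an explicit timezone
```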
diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index c7372e4b064..fa3a1eecd46 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -4,7 +4,7 @@ sidebar_position: 49 sidebar_label: DateTime64 ---
-# Datetime64
+# DateTime64
Allows storing an instant in time that can be expressed as a calendar date and a time of day, with defined sub-second precision
diff --git a/docs/en/sql-reference/data-types/geo.md b/docs/en/sql-reference/data-types/geo.md index 48dce40986e..3b2787008d2 100644 --- a/docs/en/sql-reference/data-types/geo.md +++ b/docs/en/sql-reference/data-types/geo.md @@ -95,3 +95,6 @@ Result:
└─────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘
```
+## Related Content
+
+- [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data)
diff --git a/docs/en/sql-reference/data-types/json.md b/docs/en/sql-reference/data-types/json.md index ab0f6115a41..ab1596b1760 100644 --- a/docs/en/sql-reference/data-types/json.md +++ b/docs/en/sql-reference/data-types/json.md @@ -75,3 +75,7 @@ SELECT * FROM json FORMAT JSONEachRow
```text
{"o":{"a":1,"b":{"c":2,"d":[1,2,3]}}}
```
+
+## Related Content
+
+- [Getting Data Into ClickHouse - Part 2 - A JSON detour](https://clickhouse.com/blog/getting-data-into-clickhouse-part-2-json)
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml index 1f98223c54c..af79ff9af23 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml @@ -1,8 +1,8 @@
position: 37
-label: 'External Dictionaries'
+label: 'Dictionaries'
collapsible: true
collapsed: true
link:
type: generated-index
- title: External Dictionaries
+ title: Dictionaries
slug: /en/sql-reference/dictionaries/external-dictionaries
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md b/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md new file mode 100644 index 00000000000..a409dab31f4 --- /dev/null +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md @@ -0,0 +1,4 @@
+:::tip
+If you are using a dictionary with ClickHouse Cloud, please use the DDL query option to create your dictionaries, and create your dictionary as user `default`.
+Also, verify the list of supported dictionary sources in the [Cloud Compatibility guide](/docs/en/cloud/reference/cloud-compatibility.md).
+::: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index 02a4ad57a3b..aac0db208c6 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -3,6 +3,7 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-l sidebar_position: 41 sidebar_label: Storing Dictionaries in Memory --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; # Storing Dictionaries in Memory @@ -22,7 +23,9 @@ ClickHouse generates an exception for errors with dictionaries. Examples of erro - The dictionary being accessed could not be loaded. - Error querying a `cached` dictionary. -You can view the list of external dictionaries and their statuses in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. +You can view the list of dictionaries and their statuses in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. + + The configuration looks like this: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md index 6e4c8c4b94e..e4edad4d9a1 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md @@ -3,6 +3,7 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-l sidebar_position: 42 sidebar_label: Dictionary Updates --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; # Dictionary Updates @@ -12,6 +13,8 @@ Dictionary updates (other than loading for first use) do not block queries. Duri Example of settings: + + ``` xml ... diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md index e5ee48c9166..8ef19a181e7 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md @@ -4,12 +4,15 @@ sidebar_position: 46 sidebar_label: Polygon Dictionaries With Grids title: "Polygon dictionaries" --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; Polygon dictionaries allow you to efficiently search for the polygon containing specified points. For example: defining a city area by geographical coordinates. Example of a polygon dictionary configuration: + + ``` xml @@ -78,7 +81,7 @@ To respond to the query, there is a corresponding cell, and the index for the po - `POLYGON`. Synonym to `POLYGON_INDEX_CELL`. -Dictionary queries are carried out using standard [functions](../../../sql-reference/functions/ext-dict-functions.md) for working with external dictionaries. +Dictionary queries are carried out using standard [functions](../../../sql-reference/functions/ext-dict-functions.md) for working with dictionaries. 
An important difference is that here the keys will be the points for which you want to find the polygon containing them. **Example** @@ -131,3 +134,7 @@ Result: │ [[[(3,1),(0,1),(0,-1),(3,-1)]]] │ Value │ └─────────────────────────────────┴───────┘ ``` + +## Related Content + +- [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index d457f327e7a..4eb96fe80a2 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -1,12 +1,15 @@ --- slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources sidebar_position: 43 -sidebar_label: Sources of External Dictionaries +sidebar_label: Dictionary Sources --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; -# Sources of External Dictionaries +# Dictionary Sources -An external dictionary can be connected to ClickHouse from many different sources. + + +A dictionary can be connected to ClickHouse from many different sources. If the dictionary is configured using an xml-file, the configuration looks like this: @@ -65,13 +68,13 @@ Types of sources (`source_type`): - [Executable Pool](#dicts-external_dicts_dict_sources-executable_pool) - [HTTP(s)](#dicts-external_dicts_dict_sources-http) - DBMS - - [ODBC](#dicts-external_dicts_dict_sources-odbc) - - [MySQL](#dicts-external_dicts_dict_sources-mysql) - - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - - [Redis](#dicts-external_dicts_dict_sources-redis) - - [Cassandra](#dicts-external_dicts_dict_sources-cassandra) - - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) + - [ODBC](#odbc) + - [MySQL](#mysql) + - [ClickHouse](#clickhouse) + - [MongoDB](#mongodb) + - [Redis](#redis) + - [Cassandra](#cassandra) + - [PostgreSQL](#postgresql) ## Local File diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md index 895743c3b50..881630167e3 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md @@ -3,9 +3,12 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-s sidebar_position: 44 sidebar_label: Dictionary Key and Fields --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; # Dictionary Key and Fields + + The `structure` clause describes the dictionary key and fields available for queries. XML description: @@ -171,5 +174,5 @@ Configuration fields: **See Also** -- [Functions for working with external dictionaries](../../../sql-reference/functions/ext-dict-functions.md). +- [Functions for working with dictionaries](../../../sql-reference/functions/ext-dict-functions.md). 
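To make the key-and-fields idea concrete, here is a minimal hypothetical DDL sketch; `my_dict` and `source_table` are placeholder names, not part of the original page:

``` sql
CREATE DICTIONARY my_dict
(
    id UInt64,                      -- the dictionary key
    value String DEFAULT 'unknown'  -- an attribute with a default for missing keys
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(TABLE 'source_table'))
LAYOUT(FLAT())
LIFETIME(300)
```

A value could then be read with `dictGet('my_dict', 'value', toUInt64(1))`.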
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md index 5c237eea8c7..76ca3ac978f 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md @@ -1,10 +1,13 @@
---
slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict
sidebar_position: 40
-sidebar_label: Configuring an External Dictionary
+sidebar_label: Configuring a Dictionary
---
+import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md';
-# Configuring an External Dictionary
+# Configuring a Dictionary
+
+
If a dictionary is configured using an xml file, then the dictionary configuration has the following structure:
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md index 095fb6360cd..9f922a2cccb 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md @@ -3,18 +3,23 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts sidebar_position: 39 sidebar_label: General Description ---
+import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md';
-# External Dictionaries
+# Dictionaries
-You can add your own dictionaries from various data sources. The data source for a dictionary can be a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Sources for external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)”.
+:::tip Tutorial
+If you are getting started with Dictionaries in ClickHouse, we have a tutorial that covers that topic. Take a look [here](/docs/en/tutorial.md).
+:::
+
+You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)”.
ClickHouse:
- Fully or partially stores dictionaries in RAM.
- Periodically updates dictionaries and dynamically loads missing values. In other words, dictionaries can be loaded dynamically.
-- Allows to create external dictionaries with xml files or [DDL queries](../../../sql-reference/statements/create/dictionary.md).
+- Allows creating dictionaries with xml files or [DDL queries](../../../sql-reference/statements/create/dictionary.md).
-The configuration of external dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries_config](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_config) parameter.
+The configuration of dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries_config](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_config) parameter.
Dictionaries can be loaded at server startup or at first use, depending on the [dictionaries_lazy_load](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_lazy_load) setting.
@@ -24,6 +29,22 @@ The [dictionaries](../../../operations/system-tables/dictionaries.md#system_tabl
- Configuration parameters.
- Metrics like the amount of RAM allocated for the dictionary or the number of queries since the dictionary was successfully loaded.
+
+
+## Creating a dictionary with a DDL query
+
+Dictionaries can be created with [DDL queries](../../../sql-reference/statements/create/dictionary.md), and this is the recommended method because with DDL-created dictionaries:
+- No additional records are added to server configuration files
+- The dictionaries can be worked with as first-class entities, like tables or views
+- Data can be read directly, using familiar SELECT rather than dictionary table functions
+- The dictionaries can be easily renamed
+
+## Creating a dictionary with a configuration file
+
+:::note
+Creating a dictionary with a configuration file is not applicable to ClickHouse Cloud. Please use DDL (see above), and create your dictionary as user `default`.
+:::
+
The dictionary configuration file has the following format:
``` xml
@@ -44,18 +65,17 @@ The dictionary configuration file
You can [configure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md) any number of dictionaries in the same file.
-[DDL queries for dictionaries](../../../sql-reference/statements/create/dictionary.md) does not require any additional records in server configuration. They allow to work with dictionaries as first-class entities, like tables or views.
:::note
-You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../../sql-reference/functions/other-functions.md) function). This functionality is not related to external dictionaries.
+You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries.
:::
## See Also
-- [Configuring an External Dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md)
+- [Configuring a Dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md)
- [Storing Dictionaries in Memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md)
- [Dictionary Updates](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md)
-- [Sources of External Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)
+- [Dictionary Sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)
- [Dictionary Key and Fields](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md)
-- [Functions for Working with External Dictionaries](../../../sql-reference/functions/ext-dict-functions.md)
+- [Functions for Working with Dictionaries](../../../sql-reference/functions/ext-dict-functions.md)
diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index eccd1215e30..b6aa62bdb47 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -12,6 +12,6 @@ ClickHouse supports special functions for working with dictionaries that can be
ClickHouse supports:
-- [Built-in dictionaries](../../sql-reference/dictionaries/internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md).
-- [Plug-in (external) dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md).
+- [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md).
+- [Embedded dictionaries](../../sql-reference/dictionaries/internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md).
diff --git a/docs/en/sql-reference/dictionaries/internal-dicts.md b/docs/en/sql-reference/dictionaries/internal-dicts.md index dbc12a576f7..f26c60880a4 100644 --- a/docs/en/sql-reference/dictionaries/internal-dicts.md +++ b/docs/en/sql-reference/dictionaries/internal-dicts.md @@ -1,10 +1,13 @@
---
slug: /en/sql-reference/dictionaries/internal-dicts
sidebar_position: 39
-sidebar_label: Internal Dictionaries
+sidebar_label: Embedded Dictionaries
---
+import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md';
-# Internal Dictionaries
+# Embedded Dictionaries
+
+
ClickHouse contains a built-in feature for working with a geobase.
diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index 9059facb0c6..56f3a88b28b 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -65,6 +65,11 @@ An exception is thrown when dividing by zero or when dividing a minimal negative
Differs from [modulo](#modulo) in that it returns zero when the divisor is zero.
+## positive_modulo(a, b)
+Calculates the remainder when dividing `a` by `b`. Similar to the function `modulo`, except that `positive_modulo` always returns a non-negative number.
+
+Notice that `positive_modulo` is 4-5 times slower than `modulo`. You should not use `positive_modulo` unless you need a positive result and are not too concerned about performance.
+
## negate(a), -a operator
Calculates a number with the reverse sign. The result is always signed.
@@ -156,3 +161,140 @@ Result:
│ -1 │
└─────────────┘
```
+
+## multiplyDecimal(a, b[, result_scale])
+
+Performs multiplication on two decimals. The result value will be of type [Decimal256](../../sql-reference/data-types/decimal.md).
+The result scale can be explicitly specified by the `result_scale` argument (const Integer in range `[0, 76]`). If not specified, the result scale is the max scale of the given arguments.
+
+:::note
+These functions are significantly slower than the usual `multiply`.
+If you do not need controlled precision and/or need fast computation, consider using [multiply](#multiply).
+:::
+
+**Syntax**
+
+```sql
+multiplyDecimal(a, b[, result_scale])
+```
+
+**Arguments**
+
+- `a` — First value: [Decimal](../../sql-reference/data-types/decimal.md).
+- `b` — Second value: [Decimal](../../sql-reference/data-types/decimal.md).
+- `result_scale` — Scale of result: [Int/UInt](../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+- The result of multiplication with given scale.
+
+Type: [Decimal256](../../sql-reference/data-types/decimal.md).
+
+**Example**
+
+```text
+┌─multiplyDecimal(toDecimal256(-12, 0), toDecimal32(-2.1, 1), 1)─┐
+│ 25.2 │
+└────────────────────────────────────────────────────────────────┘
+```
+
+**Difference from regular multiplication:**
+```sql
+SELECT toDecimal64(-12.647, 3) * toDecimal32(2.1239, 4);
+SELECT toDecimal64(-12.647, 3) as a, toDecimal32(2.1239, 4) as b, multiplyDecimal(a, b);
+```
+
+```text
+┌─multiply(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐
+│ -26.8609633 │
+└───────────────────────────────────────────────────────────┘
+┌─multiplyDecimal(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐
+│ -26.8609 │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+```sql
+SELECT
+    toDecimal64(-12.647987876, 9) AS a,
+    toDecimal64(123.967645643, 9) AS b,
+    multiplyDecimal(a, b);
+
+SELECT
+    toDecimal64(-12.647987876, 9) AS a,
+    toDecimal64(123.967645643, 9) AS b,
+    a * b;
+```
+
+```text
+┌─────────────a─┬─────────────b─┬─multiplyDecimal(toDecimal64(-12.647987876, 9), toDecimal64(123.967645643, 9))─┐
+│ -12.647987876 │ 123.967645643 │ -1567.941279108 │
+└───────────────┴───────────────┴───────────────────────────────────────────────────────────────────────────────┘
+
+Received exception from server (version 22.11.1):
+Code: 407. DB::Exception: Received from localhost:9000. DB::Exception: Decimal math overflow: While processing toDecimal64(-12.647987876, 9) AS a, toDecimal64(123.967645643, 9) AS b, a * b. (DECIMAL_OVERFLOW)
+```
+
+## divideDecimal(a, b[, result_scale])
+
+Performs division on two decimals. The result value will be of type [Decimal256](../../sql-reference/data-types/decimal.md).
+The result scale can be explicitly specified by the `result_scale` argument (const Integer in range `[0, 76]`). If not specified, the result scale is the max scale of the given arguments.
+
+:::note
+These functions are significantly slower than the usual `divide`.
+If you do not need controlled precision and/or need fast computation, consider using [divide](#divide).
+:::
+
+**Syntax**
+
+```sql
+divideDecimal(a, b[, result_scale])
+```
+
+**Arguments**
+
+- `a` — First value: [Decimal](../../sql-reference/data-types/decimal.md).
+- `b` — Second value: [Decimal](../../sql-reference/data-types/decimal.md). +- `result_scale` — Scale of result: [Int/UInt](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- The result of division with given scale. + +Type: [Decimal256](../../sql-reference/data-types/decimal.md). + +**Example** + +```text +┌─divideDecimal(toDecimal256(-12, 0), toDecimal32(2.1, 1), 10)─┐ +│ -5.7142857142 │ +└──────────────────────────────────────────────────────────────┘ +``` + +**Difference from regular division:** +```sql +SELECT toDecimal64(-12, 1) / toDecimal32(2.1, 1); +SELECT toDecimal64(-12, 1) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5); +``` + +```text +┌─divide(toDecimal64(-12, 1), toDecimal32(2.1, 1))─┐ +│ -5.7 │ +└──────────────────────────────────────────────────┘ + +┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 5)─┐ +│ -12 │ 2.1 │ -5.7 │ -5.71428 │ +└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘ +``` + +```sql +SELECT toDecimal64(-12, 0) / toDecimal32(2.1, 1); +SELECT toDecimal64(-12, 0) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5); +``` + +```text +DB::Exception: Decimal result's scale is less than argument's one: While processing toDecimal64(-12, 0) / toDecimal32(2.1, 1). (ARGUMENT_OUT_OF_BOUND) + +┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 5)─┐ +│ -12 │ 2.1 │ -5.7 │ -5.71428 │ +└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index f7ea2690b21..6cecc3f01da 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -410,35 +410,35 @@ Converts a date with time to a certain fixed date, while preserving the time. ## toRelativeYearNum -Converts a date with time or date to the number of the year, starting from a certain fixed point in the past. +Converts a date or date with time to the number of the year, starting from a certain fixed point in the past. ## toRelativeQuarterNum -Converts a date with time or date to the number of the quarter, starting from a certain fixed point in the past. +Converts a date or date with time to the number of the quarter, starting from a certain fixed point in the past. ## toRelativeMonthNum -Converts a date with time or date to the number of the month, starting from a certain fixed point in the past. +Converts a date or date with time to the number of the month, starting from a certain fixed point in the past. ## toRelativeWeekNum -Converts a date with time or date to the number of the week, starting from a certain fixed point in the past. +Converts a date or date with time to the number of the week, starting from a certain fixed point in the past. ## toRelativeDayNum -Converts a date with time or date to the number of the day, starting from a certain fixed point in the past. +Converts a date or date with time to the number of the day, starting from a certain fixed point in the past. ## toRelativeHourNum -Converts a date with time or date to the number of the hour, starting from a certain fixed point in the past. 
+Converts a date or date with time to the number of the hour, starting from a certain fixed point in the past. ## toRelativeMinuteNum -Converts a date with time or date to the number of the minute, starting from a certain fixed point in the past. +Converts a date or date with time to the number of the minute, starting from a certain fixed point in the past. ## toRelativeSecondNum -Converts a date with time or date to the number of the second, starting from a certain fixed point in the past. +Converts a date or date with time to the number of the second, starting from a certain fixed point in the past. ## toISOYear @@ -517,6 +517,154 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d └────────────┴───────────┴───────────┴───────────┘ ``` +## age + +Returns the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated using a precision of 1 second. +E.g. the difference between `2021-12-29` and `2022-01-01` is 3 days for `day` unit, 0 months for `month` unit, 0 years for `year` unit. + + +**Syntax** + +``` sql +age('unit', startdate, enddate, [timezone]) +``` + +**Arguments** + +- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). + Possible values: + + - `second` (possible abbreviations: `ss`, `s`) + - `minute` (possible abbreviations: `mi`, `n`) + - `hour` (possible abbreviations: `hh`, `h`) + - `day` (possible abbreviations: `dd`, `d`) + - `week` (possible abbreviations: `wk`, `ww`) + - `month` (possible abbreviations: `mm`, `m`) + - `quarter` (possible abbreviations: `qq`, `q`) + - `year` (possible abbreviations: `yyyy`, `yy`) + +- `startdate` — The first time value to subtract (the subtrahend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). + +- `enddate` — The second time value to subtract from (the minuend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). + +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../../sql-reference/data-types/string.md). + +**Returned value** + +Difference between `enddate` and `startdate` expressed in `unit`. + +Type: [Int](../../sql-reference/data-types/int-uint.md). 
+ +**Example** + +Query: + +``` sql +SELECT age('hour', toDateTime('2018-01-01 22:30:00'), toDateTime('2018-01-02 23:00:00')); +``` + +Result: + +``` text +┌─age('hour', toDateTime('2018-01-01 22:30:00'), toDateTime('2018-01-02 23:00:00'))─┐ +│ 24 │ +└───────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT + toDate('2022-01-01') AS e, + toDate('2021-12-29') AS s, + age('day', s, e) AS day_age, + age('month', s, e) AS month__age, + age('year', s, e) AS year_age; +``` + +Result: + +``` text +┌──────────e─┬──────────s─┬─day_age─┬─month__age─┬─year_age─┐ +│ 2022-01-01 │ 2021-12-29 │ 3 │ 0 │ 0 │ +└────────────┴────────────┴─────────┴────────────┴──────────┘ +``` + + +## date\_diff + +Returns the count of the specified `unit` boundaries crossed between the `startdate` and `enddate`. +The difference is calculated using relative units, e.g. the difference between `2021-12-29` and `2022-01-01` is 3 days for day unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for month unit (see [toRelativeMonthNum](#torelativemonthnum)), 1 year for year unit (see [toRelativeYearNum](#torelativeyearnum)). + +**Syntax** + +``` sql +date_diff('unit', startdate, enddate, [timezone]) +``` + +Aliases: `dateDiff`, `DATE_DIFF`. + +**Arguments** + +- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). + Possible values: + + - `second` (possible abbreviations: `ss`, `s`) + - `minute` (possible abbreviations: `mi`, `n`) + - `hour` (possible abbreviations: `hh`, `h`) + - `day` (possible abbreviations: `dd`, `d`) + - `week` (possible abbreviations: `wk`, `ww`) + - `month` (possible abbreviations: `mm`, `m`) + - `quarter` (possible abbreviations: `qq`, `q`) + - `year` (possible abbreviations: `yyyy`, `yy`) + +- `startdate` — The first time value to subtract (the subtrahend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). + +- `enddate` — The second time value to subtract from (the minuend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). + +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../../sql-reference/data-types/string.md). + +**Returned value** + +Difference between `enddate` and `startdate` expressed in `unit`. + +Type: [Int](../../sql-reference/data-types/int-uint.md). 
+ +**Example** + +Query: + +``` sql +SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); +``` + +Result: + +``` text +┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ +│ 25 │ +└────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT + toDate('2022-01-01') AS e, + toDate('2021-12-29') AS s, + dateDiff('day', s, e) AS day_diff, + dateDiff('month', s, e) AS month__diff, + dateDiff('year', s, e) AS year_diff; +``` + +Result: + +``` text +┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐ +│ 2022-01-01 │ 2021-12-29 │ 3 │ 1 │ 1 │ +└────────────┴────────────┴──────────┴─────────────┴───────────┘ +``` + ## date\_trunc Truncates date and time data to the specified part of date. @@ -550,7 +698,7 @@ Alias: `dateTrunc`. - Value, truncated to the specified part of date. -Type: [Datetime](../../sql-reference/data-types/datetime.md). +Type: [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -637,80 +785,6 @@ Result: └───────────────────────────────────────────────┘ ``` -## date\_diff - -Returns the difference between two dates or dates with time values. -The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for month unit (see [toRelativeMonthNum](#torelativemonthnum)), 1 year for year unit (see [toRelativeYearNum](#torelativeyearnum)). - -**Syntax** - -``` sql -date_diff('unit', startdate, enddate, [timezone]) -``` - -Aliases: `dateDiff`, `DATE_DIFF`. - -**Arguments** - -- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). - Possible values: - - - `second` - - `minute` - - `hour` - - `day` - - `week` - - `month` - - `quarter` - - `year` - -- `startdate` — The first time value to subtract (the subtrahend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). - -- `enddate` — The second time value to subtract from (the minuend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). - -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../../sql-reference/data-types/string.md). - -**Returned value** - -Difference between `enddate` and `startdate` expressed in `unit`. - -Type: [Int](../../sql-reference/data-types/int-uint.md). 
- -**Example** - -Query: - -``` sql -SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); -``` - -Result: - -``` text -┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ -│ 25 │ -└────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -Query: - -``` sql -SELECT - toDate('2022-01-01') AS e, - toDate('2021-12-29') AS s, - dateDiff('day', s, e) AS day_diff, - dateDiff('month', s, e) AS month__diff, - dateDiff('year', s, e) AS year_diff; -``` - -Result: - -``` text -┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐ -│ 2022-01-01 │ 2021-12-29 │ 3 │ 1 │ 1 │ -└────────────┴────────────┴──────────┴─────────────┴───────────┘ -``` - ## date\_sub Subtracts the time interval or date interval from the provided date or date with time. @@ -881,7 +955,7 @@ now([timezone]) - Current date and time. -Type: [Datetime](../../sql-reference/data-types/datetime.md). +Type: [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -932,7 +1006,7 @@ now64([scale], [timezone]) - Current date and time with sub-second precision. -Type: [Datetime64](../../sql-reference/data-types/datetime64.md). +Type: [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -968,7 +1042,7 @@ nowInBlock([timezone]) - Current date and time at the moment of processing of each block of data. -Type: [Datetime](../../sql-reference/data-types/datetime.md). +Type: [DateTime](../../sql-reference/data-types/datetime.md). **Example** diff --git a/docs/en/sql-reference/functions/distance-functions.md b/docs/en/sql-reference/functions/distance-functions.md index 88d6c2f3e17..293e02f8a54 100644 --- a/docs/en/sql-reference/functions/distance-functions.md +++ b/docs/en/sql-reference/functions/distance-functions.md @@ -474,13 +474,13 @@ Calculates the cosine distance between two vectors (the values of the tuples are **Syntax** ```sql -cosineDistance(tuple1, tuple2) +cosineDistance(vector1, vector2) ``` **Arguments** -- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). -- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `vector1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). +- `vector2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). **Returned value** @@ -488,7 +488,7 @@ cosineDistance(tuple1, tuple2) Type: [Float](../../sql-reference/data-types/float.md). -**Example** +**Examples** Query: diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index 4a6e46e1759..cccc02c2553 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -185,7 +185,7 @@ unhex(arg) **Arguments** -- `arg` — A string containing any number of hexadecimal digits. Type: [String](../../sql-reference/data-types/string.md). +- `arg` — A string containing any number of hexadecimal digits. Type: [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md). Supports both uppercase and lowercase letters `A-F`. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the `00-0F` byte. 
 If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn’t thrown). For a numeric argument the inverse of hex(N) is not performed by unhex().
diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md
index 728e26d6958..d9e811a5703 100644
--- a/docs/en/sql-reference/functions/ext-dict-functions.md
+++ b/docs/en/sql-reference/functions/ext-dict-functions.md
@@ -1,20 +1,20 @@
 ---
 slug: /en/sql-reference/functions/ext-dict-functions
 sidebar_position: 58
-sidebar_label: External Dictionaries
+sidebar_label: Dictionaries
 ---
 
+# Functions for Working with Dictionaries
+
 :::note
 For dictionaries created with [DDL queries](../../sql-reference/statements/create/dictionary.md), the `dict_name` parameter must be fully specified, like `<database>.<dict_name>`. Otherwise, the current database is used.
 :::
 
-# Functions for Working with External Dictionaries
-
-For information on connecting and configuring external dictionaries, see [External dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md).
+For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md).
 
 ## dictGet, dictGetOrDefault, dictGetOrNull
 
-Retrieves values from an external dictionary.
+Retrieves values from a dictionary.
 
 ``` sql
 dictGet('dict_name', attr_names, id_expr)
@@ -52,7 +52,7 @@ Create a text file `ext-dict-test.csv` containing the following:
 
 The first column is `id`, the second column is `c1`.
 
-Configure the external dictionary:
+Configure the dictionary:
 
 ``` xml
@@ -112,7 +112,7 @@ Create a text file `ext-dict-mult.csv` containing the following:
 
 The first column is `id`, the second is `c1`, the third is `c2`.
-Configure the external dictionary: +Configure the dictionary: ``` xml @@ -151,7 +151,7 @@ Perform the query: ``` sql SELECT - dictGet('ext-dict-mult', ('c1','c2'), number) AS val, + dictGet('ext-dict-mult', ('c1','c2'), number + 1) AS val, toTypeName(val) AS type FROM system.numbers LIMIT 3; @@ -185,7 +185,7 @@ INSERT INTO range_key_dictionary_source_table VALUES(2, toDate('2019-05-20'), to INSERT INTO range_key_dictionary_source_table VALUES(3, toDate('2019-05-20'), toDate('2019-05-20'), 'Third', 'Third'); ``` -Create the external dictionary: +Create the dictionary: ```sql CREATE DICTIONARY range_key_dictionary @@ -226,7 +226,7 @@ Result: **See Also** -- [External Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) +- [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) ## dictHas diff --git a/docs/en/sql-reference/functions/geo/index.md b/docs/en/sql-reference/functions/geo/index.md index 64e23094105..8d659236d4c 100644 --- a/docs/en/sql-reference/functions/geo/index.md +++ b/docs/en/sql-reference/functions/geo/index.md @@ -8,70 +8,69 @@ title: "Geo Functions" ## Geographical Coordinates Functions -- [greatCircleDistance](./coordinates.md#greatCircleDistance) -- [geoDistance](./coordinates.md#geoDistance) -- [greatCircleAngle](./coordinates.md#greatCircleAngle) -- [pointInEllipses](./coordinates.md#pointInEllipses) -- [pointInPolygon](./coordinates.md#pointInPolygon) +- [greatCircleDistance](./coordinates.md#greatcircledistance) +- [geoDistance](./coordinates.md#geodistance) +- [greatCircleAngle](./coordinates.md#greatcircleangle) +- [pointInEllipses](./coordinates.md#pointinellipses) +- [pointInPolygon](./coordinates.md#pointinpolygon) ## Geohash Functions -- [geohashEncode](./geohash.md#geohashEncode) -- [geohashDecode](./geohash.md#geohashDecode) -- [geohashesInBox](./geohash.md#geohashesInBox) +- [geohashEncode](./geohash.md#geohashencode) +- [geohashDecode](./geohash.md#geohashdecode) +- [geohashesInBox](./geohash.md#geohashesinbox) ## H3 Indexes Functions -- [h3IsValid](./h3.md#h3IsValid) -- [h3GetResolution](./h3.md#h3GetResolution) -- [h3EdgeAngle](./h3.md#h3EdgeAngle) -- [h3EdgeLengthM​](./h3.md#h3EdgeLengthM​) -- [h3EdgeLengthKm](./h3.md#h3EdgeLengthKm) -- [geoToH3](./h3.md#geoToH3) -- [h3ToGeo](./h3.md#h3ToGeo) -- [h3ToGeoBoundary](./h3.md#h3ToGeoBoundary) -- [h3kRing](./h3.md#h3kRing) -- [h3GetBaseCell](./h3.md#h3GetBaseCell) -- [h3HexAreaM2](./h3.md#h3HexAreaM2) -- [h3HexAreaKm2](./h3.md#h3HexAreaKm2) -- [h3IndexesAreNeighbors](./h3.md#h3IndexesAreNeighbors) -- [h3ToChildren](./h3.md#h3ToChildren) -- [h3ToParent](./h3.md#h3ToParent) -- [h3ToString](./h3.md#h3ToString) -- [stringToH3](./h3.md#stringToH3) -- [h3GetResolution](./h3.md#h3GetResolution) -- [h3IsResClassIII](./h3.md#h3IsResClassIII) -- [h3IsPentagon](./h3.md#h3IsPentagon) -- [h3GetFaces](./h3.md#h3GetFaces) -- [h3CellAreaM2](./h3.md#h3CellAreaM2) -- [h3CellAreaRads2](./h3.md#h3CellAreaRads2) -- [h3ToCenterChild](./h3.md#h3ToCenterChild) -- [h3ExactEdgeLengthM](./h3.md#h3ExactEdgeLengthM) -- [h3ExactEdgeLengthKm](./h3.md#h3ExactEdgeLengthKm) -- [h3ExactEdgeLengthRads](./h3.md#h3ExactEdgeLengthRads) -- [h3NumHexagons](./h3.md#h3NumHexagons) -- [h3Line](./h3.md#h3Line) -- [h3Distance](./h3.md#h3Distance) -- [h3HexRing](./h3.md#h3HexRing) -- [h3GetUnidirectionalEdge](./h3.md#h3GetUnidirectionalEdge) -- [h3UnidirectionalEdgeIsValid](./h3.md#h3UnidirectionalEdgeIsValid) -- 
[h3GetOriginIndexFromUnidirectionalEdge](./h3.md#h3GetOriginIndexFromUnidirectionalEdge) -- [h3GetDestinationIndexFromUnidirectionalEdge](./h3.md#h3GetDestinationIndexFromUnidirectionalEdge) -- [h3GetIndexesFromUnidirectionalEdge](./h3.md#h3GetIndexesFromUnidirectionalEdge) -- [h3GetUnidirectionalEdgesFromHexagon](./h3.md#h3GetUnidirectionalEdgesFromHexagon) -- [h3GetUnidirectionalEdgeBoundary](./h3.md#h3GetUnidirectionalEdgeBoundary) +- [h3IsValid](./h3.md#h3isvalid) +- [h3GetResolution](./h3.md#h3getresolution) +- [h3EdgeAngle](./h3.md#h3edgeangle) +- [h3EdgeLengthM](./h3.md#h3edgelengthm) +- [h3EdgeLengthKm](./h3.md#h3edgelengthkm) +- [geoToH3](./h3.md#geotoh3) +- [h3ToGeo](./h3.md#h3togeo) +- [h3ToGeoBoundary](./h3.md#h3togeoboundary) +- [h3kRing](./h3.md#h3kring) +- [h3GetBaseCell](./h3.md#h3getbasecell) +- [h3HexAreaM2](./h3.md#h3hexaream2) +- [h3HexAreaKm2](./h3.md#h3hexareakm2) +- [h3IndexesAreNeighbors](./h3.md#h3indexesareneighbors) +- [h3ToChildren](./h3.md#h3tochildren) +- [h3ToParent](./h3.md#h3toparent) +- [h3ToString](./h3.md#h3tostring) +- [stringToH3](./h3.md#stringtoh3) +- [h3GetResolution](./h3.md#h3getresolution) +- [h3IsResClassIII](./h3.md#h3isresclassiii) +- [h3IsPentagon](./h3.md#h3ispentagon) +- [h3GetFaces](./h3.md#h3getfaces) +- [h3CellAreaM2](./h3.md#h3cellaream2) +- [h3CellAreaRads2](./h3.md#h3cellarearads2) +- [h3ToCenterChild](./h3.md#h3tocenterchild) +- [h3ExactEdgeLengthM](./h3.md#h3exactedgelengthm) +- [h3ExactEdgeLengthKm](./h3.md#h3exactedgelengthkm) +- [h3ExactEdgeLengthRads](./h3.md#h3exactedgelengthrads) +- [h3NumHexagons](./h3.md#h3numhexagons) +- [h3Line](./h3.md#h3line) +- [h3Distance](./h3.md#h3distance) +- [h3HexRing](./h3.md#h3hexring) +- [h3GetUnidirectionalEdge](./h3.md#h3getunidirectionaledge) +- [h3UnidirectionalEdgeIsValid](./h3.md#h3unidirectionaledgeisvalid) +- [h3GetOriginIndexFromUnidirectionalEdge](./h3.md#h3getoriginindexfromunidirectionaledge) +- [h3GetDestinationIndexFromUnidirectionalEdge](./h3.md#h3getdestinationindexfromunidirectionaledge) +- [h3GetIndexesFromUnidirectionalEdge](./h3.md#h3getindexesfromunidirectionaledge) +- [h3GetUnidirectionalEdgesFromHexagon](./h3.md#h3getunidirectionaledgesfromhexagon) +- [h3GetUnidirectionalEdgeBoundary](./h3.md#h3getunidirectionaledgeboundary) ## S2 Index Functions -- [geoToS2](./s2.md#geoToS2) -- [s2ToGeo](./s2.md#s2ToGeo) -- [s2GetNeighbors](./s2.md#s2GetNeighbors) -- [s2CellsIntersect](./s2.md#s2CellsIntersect) -- [s2CapContains](./s2.md#s2CapContains) -- [s2CapUnion](./s2.md#s2CapUnion) -- [s2RectAdd](./s2.md#s2RectAdd) -- [s2RectContains](./s2.md#s2RectContains) -- [s2RectUinion](./s2.md#s2RectUinion) -- [s2RectIntersection](./s2.md#s2RectIntersection) +- [geoToS2](./s2.md#geotos2) +- [s2ToGeo](./s2.md#s2togeo) +- [s2GetNeighbors](./s2.md#s2getneighbors) +- [s2CellsIntersect](./s2.md#s2cellsintersect) +- [s2CapContains](./s2.md#s2capcontains) +- [s2CapUnion](./s2.md#s2capunion) +- [s2RectAdd](./s2.md#s2rectadd) +- [s2RectContains](./s2.md#s2rectcontains) +- [s2RectUnion](./s2.md#s2rectunion) +- [s2RectIntersection](./s2.md#s2rectintersection) -[Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 430762a1885..bcd118ce0be 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -549,3 +549,33 @@ Result: │ 3.141592653589793 │ └───────────────────┘ ``` + + +## factorial(n) + 
+Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is UInt64.
+
+The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20; a value of 21 or greater causes an exception to be thrown.
+
+**Syntax**
+
+``` sql
+factorial(n)
+```
+
+**Example**
+
+Query:
+
+``` sql
+SELECT factorial(10);
+```
+
+Result:
+
+``` text
+┌─factorial(10)─┐
+│       3628800 │
+└───────────────┘
+```
diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md
index 6490d4c2272..536249626e5 100644
--- a/docs/en/sql-reference/functions/other-functions.md
+++ b/docs/en/sql-reference/functions/other-functions.md
@@ -593,6 +593,27 @@ LIMIT 10
 └────────────────┴─────────┘
 ```
 
+## formatReadableDecimalSize(x)
+
+Accepts the size (number of bytes). Returns a rounded size with a suffix (KB, MB, etc.) as a string.
+
+Example:
+
+``` sql
+SELECT
+    arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes,
+    formatReadableDecimalSize(filesize_bytes) AS filesize
+```
+
+``` text
+┌─filesize_bytes─┬─filesize───┐
+│              1 │ 1.00 B     │
+│           1024 │ 1.02 KB    │
+│        1048576 │ 1.05 MB    │
+│      192851925 │ 192.85 MB  │
+└────────────────┴────────────┘
+```
+
 ## formatReadableSize(x)
 
 Accepts the size (number of bytes). Returns a rounded size with a suffix (KiB, MiB, etc.) as a string.
@@ -1844,6 +1865,17 @@ Next, specify the path to `libcatboostmodel.<so|dylib>` in the clickhouse config
 ```
 
+For security and isolation reasons, the model evaluation does not run in the server process but in the clickhouse-library-bridge process.
+At the first execution of `catboostEvaluate()`, the server starts the library bridge process if it is not running already. Both processes
+communicate using an HTTP interface. By default, port `9012` is used. A different port can be specified as follows - this is useful if port
+`9012` is already assigned to a different service.
+
+``` xml
+<library_bridge>
+    <port>9019</port>
+</library_bridge>
+```
+
 2. Train a catboost model using libcatboost
 
 See [Training and applying models](https://catboost.ai/docs/features/training.html#training) for how to train catboost models from a training data set.
diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md
index d77cc55e5eb..4efa2131eb6 100644
--- a/docs/en/sql-reference/functions/random-functions.md
+++ b/docs/en/sql-reference/functions/random-functions.md
@@ -24,6 +24,11 @@ Returns a pseudo-random UInt64 number, evenly distributed among all UInt64-type
 
 Uses a linear congruential generator.
 
+## randCanonical
+The function generates pseudo-random results that are independent and identically distributed, uniformly in [0, 1).
+
+Non-deterministic. The return type is Float64.
+
 ## randConstant
 
 Produces a constant column with a random value.
diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md
index 70a1f10083b..7cad6b2fbbf 100644
--- a/docs/en/sql-reference/functions/splitting-merging-functions.md
+++ b/docs/en/sql-reference/functions/splitting-merging-functions.md
@@ -6,21 +6,22 @@ sidebar_label: Splitting and Merging Strings and Arrays
 
 # Functions for Splitting and Merging Strings and Arrays
 
-## splitByChar(separator, s)
+## splitByChar(separator, s[, max_substrings])
 
-Splits a string into substrings separated by a specified character. It uses a constant string `separator` which consisting of exactly one character.
+Splits a string into substrings separated by a specified character. It uses a constant string `separator` which consists of exactly one character.
 Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators.
 
 **Syntax**
 
 ``` sql
-splitByChar(separator, s)
+splitByChar(separator, s[, max_substrings])
 ```
 
 **Arguments**
 
 - `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md).
 - `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise, the function returns as many substrings as possible.
 
 **Returned value(s)**
 
@@ -44,20 +45,22 @@ SELECT splitByChar(',', '1,2,3,abcde');
 └─────────────────────────────────┘
 ```
 
-## splitByString(separator, s)
+## splitByString(separator, s[, max_substrings])
 
 Splits a string into substrings separated by a string. It uses a constant string `separator` of multiple characters as the separator. If the string `separator` is empty, it will split the string `s` into an array of single characters.
 
 **Syntax**
 
 ``` sql
-splitByString(separator, s)
+splitByString(separator, s[, max_substrings])
 ```
 
 **Arguments**
 
 - `separator` — The separator. [String](../../sql-reference/data-types/string.md).
 - `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise, the function returns as many substrings as possible.
+
 
 **Returned value(s)**
 
@@ -91,20 +94,22 @@ SELECT splitByString('', 'abcde');
 └────────────────────────────┘
 ```
 
-## splitByRegexp(regexp, s)
+## splitByRegexp(regexp, s[, max_substrings])
 
 Splits a string into substrings separated by a regular expression. It uses a regular expression string `regexp` as the separator. If the `regexp` is empty, it will split the string `s` into an array of single characters. If no match is found for this regular expression, the string `s` won't be split.
 
 **Syntax**
 
 ``` sql
-splitByRegexp(regexp, s)
+splitByRegexp(regexp, s[, max_substrings])
 ```
 
 **Arguments**
 
 - `regexp` — Regular expression. Constant. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
 - `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise, the function returns as many substrings as possible.
+
 
 **Returned value(s)**
 
@@ -146,7 +151,7 @@ Result:
 └────────────────────────────┘
 ```
 
-## splitByWhitespace(s)
+## splitByWhitespace(s[, max_substrings])
 
 Splits a string into substrings separated by whitespace characters.
 Returns an array of selected substrings.
 
@@ -154,12 +159,14 @@ Returns an array of selected substrings.
 **Syntax**
 
 ``` sql
-splitByWhitespace(s)
+splitByWhitespace(s[, max_substrings])
 ```
 
 **Arguments**
 
 - `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise, the function returns as many substrings as possible.
+
 
 **Returned value(s)**
 
@@ -179,7 +186,7 @@ SELECT splitByWhitespace('  1!  a,  b.  ');
 └─────────────────────────────────────┘
 ```
 
-## splitByNonAlpha(s)
+## splitByNonAlpha(s[, max_substrings])
 
 Splits a string into substrings separated by whitespace and punctuation characters.
 Returns an array of selected substrings.
 
@@ -187,12 +194,14 @@ Returns an array of selected substrings.
 **Syntax**
 
 ``` sql
-splitByNonAlpha(s)
+splitByNonAlpha(s[, max_substrings])
 ```
 
 **Arguments**
 
 - `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise, the function returns as many substrings as possible.
+
 
 **Returned value(s)**
 
@@ -217,10 +226,28 @@ SELECT splitByNonAlpha('  1!  a,  b.  ');
 
 Concatenates string representations of values listed in the array with the separator. `separator` is an optional parameter: a constant string, set to an empty string by default.
 Returns the string.
 
-## alphaTokens(s)
+## alphaTokens(s[, max_substrings]), splitByAlpha(s[, max_substrings])
 
 Selects substrings of consecutive bytes from the ranges a-z and A-Z. Returns an array of substrings.
 
+**Syntax**
+
+``` sql
+alphaTokens(s[, max_substrings])
+splitByAlpha(s[, max_substrings])
+```
+
+**Arguments**
+
+- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise, the function returns as many substrings as possible.
+
+**Returned value(s)**
+
+Returns an array of selected substrings.
+
+Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
 **Example**
 
 ``` sql
diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md
index a8ba4843279..cdbf29f3e6d 100644
--- a/docs/en/sql-reference/functions/string-functions.md
+++ b/docs/en/sql-reference/functions/string-functions.md
@@ -571,13 +571,13 @@ Similar to base58Decode, but returns an empty string in case of error.
 
 ## base64Encode(s)
 
-Encodes ‘s’ string into base64
+Encodes ‘s’ FixedString or String into base64.
 
 Alias: `TO_BASE64`.
 
 ## base64Decode(s)
 
-Decode base64-encoded string ‘s’ into original string. In case of failure raises an exception.
+Decodes a base64-encoded FixedString or String ‘s’ into the original string. In case of failure, raises an exception.
 
 Alias: `FROM_BASE64`.
 
@@ -1150,3 +1150,13 @@ A text with tags .
 The content within CDATA
 Do Nothing for 2 Minutes 2:00  
 ```
+
+## ascii(s) {#ascii}
+
+Returns the ASCII code point of the first character of `s`. The result type is Int32.
+
+If `s` is empty, the result is 0. If the first character is not an ASCII character or not part of the Latin-1 Supplement range of UTF-16, the result is undefined.
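+
+A quick illustration (example added for clarity; the result simply reflects that the first character, '2', has ASCII code 50):
+
+``` sql
+SELECT ascii('234');
+```
+
+``` text
+┌─ascii('234')─┐
+│           50 │
+└──────────────┘
+```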
+
+
+
+
diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md
index adf2a07b732..d1f0e44f6b4 100644
--- a/docs/en/sql-reference/functions/string-replace-functions.md
+++ b/docs/en/sql-reference/functions/string-replace-functions.md
@@ -6,28 +6,29 @@ sidebar_label: For Replacing in Strings
 
 # Functions for Searching and Replacing in Strings
 
-:::note 
+:::note
 Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately.
 :::
 
 ## replaceOne(haystack, pattern, replacement)
 
-Replaces the first occurrence, if it exists, of the ‘pattern’ substring in ‘haystack’ with the ‘replacement’ substring.
-Hereafter, ‘pattern’ and ‘replacement’ must be constants.
+Replaces the first occurrence of the substring ‘pattern’ (if it exists) in ‘haystack’ by the ‘replacement’ string.
+‘pattern’ and ‘replacement’ must be constants.
 
 ## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement)
 
-Replaces all occurrences of the ‘pattern’ substring in ‘haystack’ with the ‘replacement’ substring.
+Replaces all occurrences of the substring ‘pattern’ in ‘haystack’ by the ‘replacement’ string.
 
 ## replaceRegexpOne(haystack, pattern, replacement)
 
-Replacement using the ‘pattern’ regular expression. A re2 regular expression.
-Replaces only the first occurrence, if it exists.
-A pattern can be specified as ‘replacement’. This pattern can include substitutions `\0-\9`.
-The substitution `\0` includes the entire regular expression. Substitutions `\1-\9` correspond to the subpattern numbers.To use the `\` character in a template, escape it using `\`.
-Also keep in mind that a string literal requires an extra escape.
+Replaces the first occurrence of the substring matching the regular expression ‘pattern’ in ‘haystack’ by the ‘replacement’ string.
+‘pattern’ must be a constant [re2 regular expression](https://github.com/google/re2/wiki/Syntax).
+‘replacement’ must be a plain constant string or a constant string containing substitutions `\0-\9`.
+Substitutions `\1-\9` correspond to the 1st to 9th capturing group (submatch), substitution `\0` corresponds to the entire match.
+To use a verbatim `\` character in the ‘pattern’ or ‘replacement’ string, escape it using `\`.
+Also keep in mind that string literals require extra escaping.
 
-Example 1. Converting the date to American format:
+Example 1. Converting ISO dates to American format:
 
 ``` sql
 SELECT DISTINCT
@@ -62,7 +63,7 @@ SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0')
 
 ## replaceRegexpAll(haystack, pattern, replacement)
 
-This does the same thing, but replaces all the occurrences. Example:
+Like ‘replaceRegexpOne’, but replaces all occurrences of the pattern. Example:
 
 ``` sql
 SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res
diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md
index b03ca88fc61..b515f6ad518 100644
--- a/docs/en/sql-reference/functions/url-functions.md
+++ b/docs/en/sql-reference/functions/url-functions.md
@@ -464,5 +464,39 @@ Removes the query string and fragment identifier. The question mark and number s
 
 ### cutURLParameter(URL, name)
 
-Removes the ‘name’ URL parameter, if present. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument.
+Removes the `name` parameter from the URL, if present. This function does not encode or decode characters in parameter names, e.g. `Client ID` and `Client%20ID` are treated as different parameter names.
+
+**Syntax**
+
+``` sql
+cutURLParameter(URL, name)
+```
+
+**Arguments**
+
+- `url` — URL. [String](../../sql-reference/data-types/string.md).
+- `name` — Name of the URL parameter. [String](../../sql-reference/data-types/string.md) or [Array](../../sql-reference/data-types/array.md) of Strings.
+
+**Returned value**
+
+- URL with the `name` URL parameter removed.
+
+Type: `String`.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT
+    cutURLParameter('http://bigmir.net/?a=b&c=d&e=f#g', 'a') as url_without_a,
+    cutURLParameter('http://bigmir.net/?a=b&c=d&e=f#g', ['c', 'e']) as url_without_c_and_e;
+```
+
+Result:
+
+``` text
+┌─url_without_a────────────────┬─url_without_c_and_e──────┐
+│ http://bigmir.net/?c=d&e=f#g │ http://bigmir.net/?a=b#g │
+└──────────────────────────────┴──────────────────────────┘
+```
diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md
index 04df3db571e..f92ad5db2ad 100644
--- a/docs/en/sql-reference/functions/ym-dict-functions.md
+++ b/docs/en/sql-reference/functions/ym-dict-functions.md
@@ -131,7 +131,7 @@ Type: `UInt32`.
 ### regionToPopulation(id\[, geobase\])
 
 Gets the population for a region.
-The population can be recorded in files with the geobase. See the section “External dictionaries”.
+The population can be recorded in files with the geobase. See the section “Dictionaries”.
 If the population is not recorded for the region, it returns 0.
 In the geobase, the population might be recorded for child regions, but not for parent regions.
diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md
index cc278465437..ae8671ffa9d 100644
--- a/docs/en/sql-reference/statements/alter/column.md
+++ b/docs/en/sql-reference/statements/alter/column.md
@@ -35,11 +35,11 @@ These actions are described in detail below.
 ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after | FIRST]
 ```
 
-Adds a new column to the table with the specified `name`, `type`, [`codec`](../create/table.md#codecs) and `default_expr` (see the section [Default expressions](../../../sql-reference/statements/create/table.md#create-default-values)).
+Adds a new column to the table with the specified `name`, `type`, [`codec`](../create/table.md/#codecs) and `default_expr` (see the section [Default expressions](/docs/en/sql-reference/statements/create/table.md/#create-default-values)).
 
 If the `IF NOT EXISTS` clause is included, the query won’t return an error if the column already exists. If you specify `AFTER name_after` (the name of another column), the column is added after the specified one in the list of table columns. If you want to add a column to the beginning of the table use the `FIRST` clause. Otherwise, the column is added to the end of the table. For a chain of actions, `name_after` can be the name of a column that is added in one of the previous actions.
 
-Adding a column just changes the table structure, without performing any actions with data. The data does not appear on the disk after `ALTER`. If the data is missing for a column when reading from the table, it is filled in with default values (by performing the default expression if there is one, or using zeros or empty strings). The column appears on the disk after merging data parts (see [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md)).
+Adding a column just changes the table structure, without performing any actions with data. The data does not appear on the disk after `ALTER`. If the data is missing for a column when reading from the table, it is filled in with default values (by performing the default expression if there is one, or using zeros or empty strings). The column appears on the disk after merging data parts (see [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md)).
 
 This approach allows us to complete the `ALTER` query instantly, without increasing the volume of old data.
 
@@ -76,7 +76,7 @@ Deletes the column with the name `name`. If the `IF EXISTS` clause is specified,
 
 Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly.
 
 :::warning
-You can’t delete a column if it is referenced by [materialized view](../../../sql-reference/statements/create/view.md#materialized). Otherwise, it returns an error.
+You can’t delete a column if it is referenced by [materialized view](/docs/en/sql-reference/statements/create/view.md/#materialized). Otherwise, it returns an error.
 :::
 
 Example:
@@ -107,7 +107,7 @@ ALTER TABLE visits RENAME COLUMN webBrowser TO browser
 CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name
 ```
 
-Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](partition.md#how-to-set-partition-expression).
+Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](partition.md/#how-to-set-partition-expression).
 
 If the `IF EXISTS` clause is specified, the query won’t return an error if the column does not exist.
 
@@ -127,7 +127,7 @@ Adds a comment to the column. If the `IF EXISTS` clause is specified, the query
 
 Each column can have one comment. If a comment already exists for the column, a new comment overwrites the previous comment.
 
-Comments are stored in the `comment_expression` column returned by the [DESCRIBE TABLE](../../../sql-reference/statements/describe-table.md) query.
+Comments are stored in the `comment_expression` column returned by the [DESCRIBE TABLE](/docs/en/sql-reference/statements/describe-table.md) query.
 
 Example:
 
@@ -152,15 +152,15 @@ This query changes the `name` column properties:
 
 - TTL
 
-For examples of columns compression CODECS modifying, see [Column Compression Codecs](../create/table.md#codecs).
+For examples of modifying column compression codecs, see [Column Compression Codecs](../create/table.md/#codecs).
 
-For examples of columns TTL modifying, see [Column TTL](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl).
+For examples of modifying column TTL, see [Column TTL](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#mergetree-column-ttl).
 
 If the `IF EXISTS` clause is specified, the query won’t return an error if the column does not exist.
 
 The query also can change the order of the columns using `FIRST | AFTER` clause, see [ADD COLUMN](#alter_add-column) description.
 
-When changing the type, values are converted as if the [toType](../../../sql-reference/functions/type-conversion-functions.md) functions were applied to them. If only the default expression is changed, the query does not do anything complex, and is completed almost instantly.
+When changing the type, values are converted as if the [toType](/docs/en/sql-reference/functions/type-conversion-functions.md) functions were applied to them. If only the default expression is changed, the query does not do anything complex, and is completed almost instantly. Example: @@ -246,7 +246,7 @@ SELECT groupArray(x), groupArray(s) FROM tmp; **See Also** -- [MATERIALIZED](../../statements/create/table.md#materialized). +- [MATERIALIZED](/docs/en/sql-reference/statements/create/table.md/#materialized). ## Limitations @@ -254,8 +254,8 @@ The `ALTER` query lets you create and delete separate elements (columns) in nest There is no support for deleting columns in the primary key or the sampling key (columns that are used in the `ENGINE` expression). Changing the type for columns that are included in the primary key is only possible if this change does not cause the data to be modified (for example, you are allowed to add values to an Enum or to change a type from `DateTime` to `UInt32`). -If the `ALTER` query is not sufficient to make the table changes you need, you can create a new table, copy the data to it using the [INSERT SELECT](../../../sql-reference/statements/insert-into.md#insert_query_insert-select) query, then switch the tables using the [RENAME](../../../sql-reference/statements/rename.md#rename-table) query and delete the old table. You can use the [clickhouse-copier](../../../operations/utilities/clickhouse-copier.md) as an alternative to the `INSERT SELECT` query. +If the `ALTER` query is not sufficient to make the table changes you need, you can create a new table, copy the data to it using the [INSERT SELECT](/docs/en/sql-reference/statements/insert-into.md/#inserting-the-results-of-select) query, then switch the tables using the [RENAME](/docs/en/sql-reference/statements/rename.md/#rename-table) query and delete the old table. You can use the [clickhouse-copier](/docs/en/operations/utilities/clickhouse-copier.md) as an alternative to the `INSERT SELECT` query. The `ALTER` query blocks all reads and writes for the table. In other words, if a long `SELECT` is running at the time of the `ALTER` query, the `ALTER` query will wait for it to complete. At the same time, all new queries to the same table will wait while this `ALTER` is running. -For tables that do not store data themselves (such as [Merge](../../../sql-reference/statements/alter/index.md) and [Distributed](../../../sql-reference/statements/alter/index.md)), `ALTER` just changes the table structure, and does not change the structure of subordinate tables. For example, when running ALTER for a `Distributed` table, you will also need to run `ALTER` for the tables on all remote servers. +For tables that do not store data themselves (such as [Merge](/docs/en/sql-reference/statements/alter/index.md) and [Distributed](/docs/en/sql-reference/statements/alter/index.md)), `ALTER` just changes the table structure, and does not change the structure of subordinate tables. For example, when running ALTER for a `Distributed` table, you will also need to run `ALTER` for the tables on all remote servers. 
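+
+A sketch of the copy-and-rename workaround described in the Limitations above (table names are hypothetical):
+
+``` sql
+-- create a new table with the required structure (the structure changes themselves are omitted here)
+CREATE TABLE hits_new AS hits;
+-- copy the data, then swap the tables and drop the old one
+INSERT INTO hits_new SELECT * FROM hits;
+RENAME TABLE hits TO hits_old, hits_new TO hits;
+DROP TABLE hits_old;
+```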
diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index ba5d01d9b4d..30ed96c0b9c 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -10,21 +10,21 @@ sidebar_label: DELETE ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr ``` -Deletes data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). +Deletes data matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). :::note -The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. `ALTER TABLE` is considered a heavyweight operation that requires the underlying data to be merged before it is deleted. For MergeTree tables, consider using the [`DELETE FROM` query](../delete.md), which performs a lightweight delete and can be considerably faster. +The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. `ALTER TABLE` is considered a heavyweight operation that requires the underlying data to be merged before it is deleted. For MergeTree tables, consider using the [`DELETE FROM` query](/docs/en/sql-reference/statements/delete.md), which performs a lightweight delete and can be considerably faster. ::: The `filter_expr` must be of type `UInt8`. The query deletes rows in the table for which this expression takes a non-zero value. One query can contain several commands separated by commas. -The synchronicity of the query processing is defined by the [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting. By default, it is asynchronous. +The synchronicity of the query processing is defined by the [mutations_sync](/docs/en/operations/settings/settings.md/#mutations_sync) setting. By default, it is asynchronous. 
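+
+Example (a minimal sketch; the table and filter are hypothetical):
+
+``` sql
+-- delete matching rows; with mutations_sync = 1 the query waits for the mutation to finish on the current server
+ALTER TABLE visits DELETE WHERE duration < 5 SETTINGS mutations_sync = 1;
+```
+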
**See also** -- [Mutations](../../../sql-reference/statements/alter/index.md#mutations) -- [Synchronicity of ALTER Queries](../../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) -- [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting +- [Mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) +- [Synchronicity of ALTER Queries](/docs/en/sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) +- [mutations_sync](/docs/en/operations/settings/settings.md/#mutations_sync) setting diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md index 4027429cf0d..1c4d62f3190 100644 --- a/docs/en/sql-reference/statements/alter/index.md +++ b/docs/en/sql-reference/statements/alter/index.md @@ -8,43 +8,43 @@ sidebar_label: ALTER Most `ALTER TABLE` queries modify table settings or data: -- [COLUMN](../../../sql-reference/statements/alter/column.md) -- [PARTITION](../../../sql-reference/statements/alter/partition.md) -- [DELETE](../../../sql-reference/statements/alter/delete.md) -- [UPDATE](../../../sql-reference/statements/alter/update.md) -- [ORDER BY](../../../sql-reference/statements/alter/order-by.md) -- [INDEX](../../../sql-reference/statements/alter/index/index.md) -- [CONSTRAINT](../../../sql-reference/statements/alter/constraint.md) -- [TTL](../../../sql-reference/statements/alter/ttl.md) +- [COLUMN](/docs/en/sql-reference/statements/alter/column.md) +- [PARTITION](/docs/en/sql-reference/statements/alter/partition.md) +- [DELETE](/docs/en/sql-reference/statements/alter/delete.md) +- [UPDATE](/docs/en/sql-reference/statements/alter/update.md) +- [ORDER BY](/docs/en/sql-reference/statements/alter/order-by.md) +- [INDEX](/docs/en/sql-reference/statements/alter/skipping-index.md) +- [CONSTRAINT](/docs/en/sql-reference/statements/alter/constraint.md) +- [TTL](/docs/en/sql-reference/statements/alter/ttl.md) :::note -Most `ALTER TABLE` queries are supported only for [\*MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](../../../engines/table-engines/special/merge.md) and [Distributed](../../../engines/table-engines/special/distributed.md). +Most `ALTER TABLE` queries are supported only for [\*MergeTree](/docs/en/engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](/docs/en/engines/table-engines/special/merge.md) and [Distributed](/docs/en/engines/table-engines/special/distributed.md). ::: These `ALTER` statements manipulate views: -- [ALTER TABLE ... MODIFY QUERY](../../../sql-reference/statements/alter/view.md) — Modifies a [Materialized view](../create/view.md#materialized) structure. -- [ALTER LIVE VIEW](../../../sql-reference/statements/alter/view.md#alter-live-view) — Refreshes a [Live view](../create/view.md#live-view). +- [ALTER TABLE ... MODIFY QUERY](/docs/en/sql-reference/statements/alter/view.md) — Modifies a [Materialized view](/docs/en/sql-reference/statements/create/view.md/#materialized) structure. +- [ALTER LIVE VIEW](/docs/en/sql-reference/statements/alter/view.md/#alter-live-view) — Refreshes a [Live view](/docs/en/sql-reference/statements/create/view.md/#live-view). 
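+
+For example, a materialized view's stored query can be changed in place (a sketch; the view `mv`, the table `source`, and the query are hypothetical):
+
+``` sql
+ALTER TABLE mv MODIFY QUERY SELECT id, count() AS cnt FROM source GROUP BY id;
+```
+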
 These `ALTER` statements modify entities related to role-based access control:
 
-- [USER](../../../sql-reference/statements/alter/user.md)
-- [ROLE](../../../sql-reference/statements/alter/role.md)
-- [QUOTA](../../../sql-reference/statements/alter/quota.md)
-- [ROW POLICY](../../../sql-reference/statements/alter/row-policy.md)
-- [SETTINGS PROFILE](../../../sql-reference/statements/alter/settings-profile.md)
+- [USER](/docs/en/sql-reference/statements/alter/user.md)
+- [ROLE](/docs/en/sql-reference/statements/alter/role.md)
+- [QUOTA](/docs/en/sql-reference/statements/alter/quota.md)
+- [ROW POLICY](/docs/en/sql-reference/statements/alter/row-policy.md)
+- [SETTINGS PROFILE](/docs/en/sql-reference/statements/alter/settings-profile.md)
 
-[ALTER TABLE ... MODIFY COMMENT](../../../sql-reference/statements/alter/comment.md) statement adds, modifies, or removes comments to the table, regardless if it was set before or not.
+The [ALTER TABLE ... MODIFY COMMENT](/docs/en/sql-reference/statements/alter/comment.md) statement adds, modifies, or removes a table comment, regardless of whether it was set before.
 
 ## Mutations
 
-`ALTER` queries that are intended to manipulate table data are implemented with a mechanism called “mutations”, most notably [ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md) and [ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md). They are asynchronous background processes similar to merges in [MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables that to produce new “mutated” versions of parts.
+`ALTER` queries that are intended to manipulate table data are implemented with a mechanism called “mutations”, most notably [ALTER TABLE … DELETE](/docs/en/sql-reference/statements/alter/delete.md) and [ALTER TABLE … UPDATE](/docs/en/sql-reference/statements/alter/update.md). They are asynchronous background processes similar to merges in [MergeTree](/docs/en/engines/table-engines/mergetree-family/index.md) tables that produce new “mutated” versions of parts.
 
 For `*MergeTree` tables mutations execute by **rewriting whole data parts**. There is no atomicity - parts are substituted for mutated parts as soon as they are ready and a `SELECT` query that started executing during a mutation will see data from parts that have already been mutated along with data from parts that have not been mutated yet.
 
 Mutations are totally ordered by their creation order and are applied to each part in that order. Mutations are also partially ordered with `INSERT INTO` queries: data that was inserted into the table before the mutation was submitted will be mutated and data that was inserted after that will not be mutated. Note that mutations do not block inserts in any way.
 
-A mutation query returns immediately after the mutation entry is added (in case of replicated tables to ZooKeeper, for non-replicated tables - to the filesystem). The mutation itself executes asynchronously using the system profile settings. To track the progress of mutations you can use the [`system.mutations`](../../../operations/system-tables/mutations.md#system_tables-mutations) table. A mutation that was successfully submitted will continue to execute even if ClickHouse servers are restarted. There is no way to roll back the mutation once it is submitted, but if the mutation is stuck for some reason it can be cancelled with the [`KILL MUTATION`](../../../sql-reference/statements/kill.md#kill-mutation) query.
+A mutation query returns immediately after the mutation entry is added (in case of replicated tables to ZooKeeper, for non-replicated tables - to the filesystem). The mutation itself executes asynchronously using the system profile settings. To track the progress of mutations you can use the [`system.mutations`](/docs/en/operations/system-tables/mutations.md/#system_tables-mutations) table. A mutation that was successfully submitted will continue to execute even if ClickHouse servers are restarted. There is no way to roll back the mutation once it is submitted, but if the mutation is stuck for some reason it can be cancelled with the [`KILL MUTATION`](/docs/en/sql-reference/statements/kill.md/#kill-mutation) query. Entries for finished mutations are not deleted right away (the number of preserved entries is determined by the `finished_mutations_to_keep` storage engine parameter). Older mutation entries are deleted. @@ -52,12 +52,12 @@ Entries for finished mutations are not deleted right away (the number of preserv For non-replicated tables, all `ALTER` queries are performed synchronously. For replicated tables, the query just adds instructions for the appropriate actions to `ZooKeeper`, and the actions themselves are performed as soon as possible. However, the query can wait for these actions to be completed on all the replicas. -For all `ALTER` queries, you can use the [replication_alter_partitions_sync](../../../operations/settings/settings.md#replication-alter-partitions-sync) setting to set up waiting. +For all `ALTER` queries, you can use the [replication_alter_partitions_sync](/docs/en/operations/settings/settings.md/#replication-alter-partitions-sync) setting to set up waiting. -You can specify how long (in seconds) to wait for inactive replicas to execute all `ALTER` queries with the [replication_wait_for_inactive_replica_timeout](../../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting. +You can specify how long (in seconds) to wait for inactive replicas to execute all `ALTER` queries with the [replication_wait_for_inactive_replica_timeout](/docs/en/operations/settings/settings.md/#replication-wait-for-inactive-replica-timeout) setting. :::note For all `ALTER` queries, if `replication_alter_partitions_sync = 2` and some replicas are not active for more than the time, specified in the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. ::: -For `ALTER TABLE ... UPDATE|DELETE` queries the synchronicity is defined by the [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting. +For `ALTER TABLE ... UPDATE|DELETE` queries the synchronicity is defined by the [mutations_sync](/docs/en/operations/settings/settings.md/#mutations_sync) setting. 
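+
+A sketch of tracking and cancelling a mutation with the statements mentioned above (database, table, and mutation id are hypothetical):
+
+``` sql
+-- inspect the progress of mutations for a table
+SELECT mutation_id, command, is_done
+FROM system.mutations
+WHERE database = 'default' AND table = 'visits';
+
+-- cancel a stuck mutation by its id
+KILL MUTATION WHERE database = 'default' AND table = 'visits' AND mutation_id = 'mutation_3.txt';
+```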
diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md
index 2d89c1d5d18..aad52efb39d 100644
--- a/docs/en/sql-reference/statements/alter/partition.md
+++ b/docs/en/sql-reference/statements/alter/partition.md
@@ -5,7 +5,7 @@ sidebar_label: PARTITION
 title: "Manipulating Partitions and Parts"
 ---
 
-The following operations with [partitions](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md) are available:
+The following operations with [partitions](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md) are available:
 
 - [DETACH PARTITION\|PART](#detach-partitionpart) — Moves a partition or part to the `detached` directory and forget it.
 - [DROP PARTITION\|PART](#drop-partitionpart) — Deletes a partition or part.
@@ -43,7 +43,7 @@ Read about setting the partition expression in a section [How to set the partiti
 
 After the query is executed, you can do whatever you want with the data in the `detached` directory — delete it from the file system, or just leave it.
 
-This query is replicated – it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replicas (as multiple leaders are allowed).
+This query is replicated – it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](/docs/en/operations/system-tables/replicas.md/#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replicas (as multiple leaders are allowed).
 
 ## DROP PARTITION\|PART
 
@@ -162,7 +162,7 @@ ALTER TABLE table_name [ON CLUSTER cluster] FREEZE [PARTITION partition_expr] [W
 
 This query creates a local backup of a specified partition. If the `PARTITION` clause is omitted, the query creates the backup of all partitions at once.
 
-:::note 
+:::note
 The entire backup process is performed without stopping the server.
 :::
 
@@ -172,10 +172,10 @@ At the time of execution, for a data snapshot, the query creates hardlinks to a
 
 - `/var/lib/clickhouse/` is the working ClickHouse directory specified in the config.
 - `N` is the incremental number of the backup.
-- if the `WITH NAME` parameter is specified, then the value of the `'backup_name'` parameter is used instead of the incremental number. 
+- if the `WITH NAME` parameter is specified, then the value of the `'backup_name'` parameter is used instead of the incremental number.
 
-:::note 
-If you use [a set of disks for data storage in a table](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes), the `shadow/N` directory appears on every disk, storing data parts that matched by the `PARTITION` expression.
+:::note
+If you use [a set of disks for data storage in a table](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-multiple-volumes), the `shadow/N` directory appears on every disk, storing data parts matched by the `PARTITION` expression.
 :::
 
 The same structure of directories is created inside the backup as inside `/var/lib/clickhouse/`. The query performs `chmod` for all files, forbidding writing into them.
 
@@ -194,7 +194,7 @@ To restore data from a backup, do the following:
 
 Restoring from a backup does not require stopping the server.
 
-For more information about backups and restoring data, see the [Data Backup](/docs/en/manage/backups.mdx) section.
+For more information about backups and restoring data, see the [Data Backup](/docs/en/operations/backup.md) section.
 
 ## UNFREEZE PARTITION
 
@@ -249,7 +249,7 @@ Although the query is called `ALTER TABLE`, it does not change the table structu
 
 ## MOVE PARTITION\|PART
 
-Moves partitions or data parts to another volume or disk for `MergeTree`-engine tables. See [Using Multiple Block Devices for Data Storage](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes).
+Moves partitions or data parts to another volume or disk for `MergeTree`-engine tables. See [Using Multiple Block Devices for Data Storage](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-multiple-volumes).
 
 ``` sql
 ALTER TABLE table_name [ON CLUSTER cluster] MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name'
@@ -270,7 +270,7 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd'
 
 ## UPDATE IN PARTITION
 
-Manipulates data in the specifies partition matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
+Manipulates data in the specified partition matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations).
 
 Syntax:
 
@@ -286,11 +286,11 @@ ALTER TABLE mt UPDATE x = x + 1 IN PARTITION 2 WHERE p = 2;
 
 ### See Also
 
-- [UPDATE](../../../sql-reference/statements/alter/update.md#alter-table-update-statements)
+- [UPDATE](/docs/en/sql-reference/statements/alter/update.md/#alter-table-update-statements)
 
 ## DELETE IN PARTITION
 
-Deletes data in the specifies partition matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
+Deletes data in the specified partition matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations).
 
 Syntax:
 
@@ -306,7 +306,7 @@ ALTER TABLE mt DELETE IN PARTITION 2 WHERE p = 2;
 
 ### See Also
 
-- [DELETE](../../../sql-reference/statements/alter/delete.md#alter-mutations)
+- [DELETE](/docs/en/sql-reference/statements/alter/delete.md/#alter-mutations)
 
 ## How to Set Partition Expression
 
@@ -315,16 +315,16 @@ You can specify the partition expression in `ALTER ... PARTITION` queries in dif
 
 - As a value from the `partition` column of the `system.parts` table. For example, `ALTER TABLE visits DETACH PARTITION 201901`.
 - As a tuple of expressions or constants that matches (in types) the table partitioning keys tuple. In the case of a single element partitioning key, the expression should be wrapped in the `tuple (...)` function. For example, `ALTER TABLE visits DETACH PARTITION tuple(toYYYYMM(toDate('2019-01-25')))`.
 - Using the partition ID. Partition ID is a string identifier of the partition (human-readable, if possible) that is used as the names of partitions in the file system and in ZooKeeper. The partition ID must be specified in the `PARTITION ID` clause, in single quotes. For example, `ALTER TABLE visits DETACH PARTITION ID '201901'`.
-- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](../../../operations/system-tables/detached_parts.md#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`.
+- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use a string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`.
 
 Usage of quotes when specifying the partition depends on the type of partition expression. For example, for the `String` type, you have to specify its name in quotes (`'`). For the `Date` and `Int*` types no quotes are needed.
 
-All the rules above are also true for the [OPTIMIZE](../../../sql-reference/statements/optimize.md) query. If you need to specify the only partition when optimizing a non-partitioned table, set the expression `PARTITION tuple()`. For example:
+All the rules above are also true for the [OPTIMIZE](/docs/en/sql-reference/statements/optimize.md) query. If you need to specify the only partition when optimizing a non-partitioned table, set the expression `PARTITION tuple()`. For example:
 
 ``` sql
 OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL;
 ```
 
-`IN PARTITION` specifies the partition to which the [UPDATE](../../../sql-reference/statements/alter/update.md#alter-table-update-statements) or [DELETE](../../../sql-reference/statements/alter/delete.md#alter-mutations) expressions are applied as a result of the `ALTER TABLE` query. New parts are created only from the specified partition. In this way, `IN PARTITION` helps to reduce the load when the table is divided into many partitions, and you only need to update the data point-by-point.
+`IN PARTITION` specifies the partition to which the [UPDATE](/docs/en/sql-reference/statements/alter/update.md/#alter-table-update-statements) or [DELETE](/docs/en/sql-reference/statements/alter/delete.md/#alter-mutations) expressions are applied as a result of the `ALTER TABLE` query. New parts are created only from the specified partition. In this way, `IN PARTITION` helps to reduce the load when the table is divided into many partitions, and you only need to update the data point-by-point.
 
 The examples of `ALTER ... PARTITION` queries are demonstrated in the tests [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_local.sql) and [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql).
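+
+The three main forms of the partition expression side by side (a sketch; the table and partition values are hypothetical):
+
+``` sql
+ALTER TABLE visits DETACH PARTITION 201901;                                 -- by partition value
+ALTER TABLE visits DETACH PARTITION tuple(toYYYYMM(toDate('2019-01-25'))); -- by expression tuple
+ALTER TABLE visits DETACH PARTITION ID '201901';                           -- by partition ID
+```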
diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index ff8ecf3a77f..908d28d7ab1 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -2,10 +2,143 @@ slug: /en/sql-reference/statements/alter/projection sidebar_position: 49 sidebar_label: PROJECTION -title: "Manipulating Projections" +title: "Projections" --- -The following operations with [projections](../../../engines/table-engines/mergetree-family/mergetree.md#projections) are available: +Projections store data in a format that optimizes query execution, this feature is useful for: +- Running queries on a column that is not a part of the primary key +- Pre-aggregating columns, it will reduce both computation and IO + +You can define one or more projections for a table, and during the query analysis the projection with the least data to scan will be selected by ClickHouse without modifying the query provided by the user. + +:::note Disk usage + +Projections will create internally a new hidden table, this means that more IO and space on disk will be required. +Example, If the projection has defined a different primary key, all the data from the original table will be duplicated. +::: + +You can see more technical details about how projections work internally on this [page](/docs/en/guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-multiple.md/#option-3-projections). + +## Example filtering without using primary keys + +Creating the table: +``` +CREATE TABLE visits_order +( + `user_id` UInt64, + `user_name` String, + `pages_visited` Nullable(Float64), + `user_agent` String +) +ENGINE = MergeTree() +PRIMARY KEY user_agent +``` +Using `ALTER TABLE`, we could add the Projection to an existing table: +``` +ALTER TABLE visits_order ADD PROJECTION user_name_projection ( +SELECT +* +ORDER BY user_name +) + +ALTER TABLE visits_order MATERIALIZE PROJECTION user_name_projection +``` +Inserting the data: +``` +INSERT INTO visits_order SELECT + number, + 'test', + 1.5 * (number / 2), + 'Android' +FROM numbers(1, 100); +``` + +The Projection will allow us to filter by `user_name` fast even if in the original Table `user_name` was not defined as a `PRIMARY_KEY`. +At query time ClickHouse determined that less data will be processed if the projection is used, as the data is ordered by `user_name`. +``` +SELECT + * +FROM visits_order +WHERE user_name='test' +LIMIT 2 +``` + +To verify that a query is using the projection, we could review the `system.query_log` table. On the `projections` field we have the name of the projection used or empty if none has been used: +``` +SELECT query, projections FROM system.query_log WHERE query_id='' +``` + +## Example pre-aggregation query + +Creating the table with the Projection: +``` +CREATE TABLE visits +( + `user_id` UInt64, + `user_name` String, + `pages_visited` Nullable(Float64), + `user_agent` String, + PROJECTION projection_visits_by_user + ( + SELECT + user_agent, + sum(pages_visited) + GROUP BY user_id, user_agent + ) +) +ENGINE = MergeTree() +ORDER BY user_agent +``` +Inserting the data: +``` +INSERT INTO visits SELECT + number, + 'test', + 1.5 * (number / 2), + 'Android' +FROM numbers(1, 100); +``` +``` +INSERT INTO visits SELECT + number, + 'test', + 1. 
* (number / 2), + 'IOS' +FROM numbers(100, 500); +``` +First, we execute a query using `GROUP BY` on the field `user_agent`. This query will not use the projection defined, as the pre-aggregation does not match. +``` +SELECT + user_agent, + count(DISTINCT user_id) +FROM visits +GROUP BY user_agent +``` + +To use the projection, we can execute queries that select some or all of the pre-aggregation and `GROUP BY` fields. +``` +SELECT + user_agent +FROM visits +WHERE user_id > 50 AND user_id < 150 +GROUP BY user_agent +``` +``` +SELECT + user_agent, + sum(pages_visited) +FROM visits +GROUP BY user_id +``` + +As mentioned before, we can review the `system.query_log` table. In the `projections` field we see the name of the projection used, or an empty value if none was used: +``` +SELECT query, projections FROM system.query_log WHERE query_id='' +``` + +# Manipulating Projections + +The following operations with [projections](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#projections) are available: ## ADD PROJECTION @@ -13,15 +146,15 @@ The following operations with [projections](../../../engines/table-engines/merge ## DROP PROJECTION -`ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). +`ALTER TABLE [db].name DROP PROJECTION name` - Removes the projection description from the table's metadata and deletes projection files from disk. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). ## MATERIALIZE PROJECTION -`ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). +`ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). ## CLEAR PROJECTION -`ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). +`ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing the description. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only change metadata or remove files. @@ -29,5 +162,5 @@ The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only Also, they are replicated, syncing projections metadata via ClickHouse Keeper or ZooKeeper. :::note -Projection manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants). +Projection manipulation is supported only for tables with [`*MergeTree`](/docs/en/engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](/docs/en/engines/table-engines/mergetree-family/replication.md) variants). 
::: diff --git a/docs/en/sql-reference/statements/alter/index/index.md b/docs/en/sql-reference/statements/alter/skipping-index.md similarity index 70% rename from docs/en/sql-reference/statements/alter/index/index.md rename to docs/en/sql-reference/statements/alter/skipping-index.md index 03d4bd47e71..037e4bc38c5 100644 --- a/docs/en/sql-reference/statements/alter/index/index.md +++ b/docs/en/sql-reference/statements/alter/skipping-index.md @@ -1,5 +1,6 @@ --- -slug: /en/sql-reference/statements/alter/index +slug: /en/sql-reference/statements/alter/skipping-index + toc_hidden_folder: true sidebar_position: 42 sidebar_label: INDEX @@ -13,12 +14,12 @@ The following operations are available: - `ALTER TABLE [db].table_name [ON CLUSTER cluster] DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk. -- `ALTER TABLE [db.]table_name [ON CLUSTER cluster] MATERIALIZE INDEX name [IN PARTITION partition_name]` - Rebuilds the secondary index `name` for the specified `partition_name`. Implemented as a [mutation](../../../../sql-reference/statements/alter/index.md#mutations). If `IN PARTITION` part is omitted then it rebuilds the index for the whole table data. +- `ALTER TABLE [db.]table_name [ON CLUSTER cluster] MATERIALIZE INDEX name [IN PARTITION partition_name]` - Rebuilds the secondary index `name` for the specified `partition_name`. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). If `IN PARTITION` part is omitted then it rebuilds the index for the whole table data. The first two commands are lightweight in a sense that they only change metadata or remove files. Also, they are replicated, syncing indices metadata via ZooKeeper. :::note -Index manipulation is supported only for tables with [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../../engines/table-engines/mergetree-family/replication.md) variants). +Index manipulation is supported only for tables with [`*MergeTree`](/docs/en/engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](/docs/en/engines/table-engines/mergetree-family/replication.md) variants). ::: diff --git a/docs/en/sql-reference/statements/alter/update.md b/docs/en/sql-reference/statements/alter/update.md index e4fb872ae24..234812f6ed4 100644 --- a/docs/en/sql-reference/statements/alter/update.md +++ b/docs/en/sql-reference/statements/alter/update.md @@ -7,10 +7,10 @@ sidebar_label: UPDATE # ALTER TABLE … UPDATE Statements ``` sql -ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] WHERE filter_expr +ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr ``` -Manipulates data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). +Manipulates data matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). :::note The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. @@ -20,11 +20,11 @@ The `filter_expr` must be of type `UInt8`. This query updates values of specifie One query can contain several commands separated by commas. 
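As a sketch of several comma-separated commands in one query (the table and columns are hypothetical), two mutations can be submitted together:

``` sql
-- Two ALTER commands, separated by a comma, issued as a single query.
ALTER TABLE visits
    UPDATE pages_visited = 0 WHERE user_id = 42,
    UPDATE user_agent = 'unknown' WHERE user_agent = '';
```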
-The synchronicity of the query processing is defined by the [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting. By default, it is asynchronous. +The synchronicity of the query processing is defined by the [mutations_sync](/docs/en/operations/settings/settings.md/#mutations_sync) setting. By default, it is asynchronous. **See also** -- [Mutations](../../../sql-reference/statements/alter/index.md#mutations) -- [Synchronicity of ALTER Queries](../../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) -- [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting +- [Mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) +- [Synchronicity of ALTER Queries](/docs/en/sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) +- [mutations_sync](/docs/en/operations/settings/settings.md/#mutations_sync) setting diff --git a/docs/en/sql-reference/statements/alter/user.md b/docs/en/sql-reference/statements/alter/user.md index 0a68885842a..31db89164d7 100644 --- a/docs/en/sql-reference/statements/alter/user.md +++ b/docs/en/sql-reference/statements/alter/user.md @@ -12,7 +12,7 @@ Syntax: ``` sql ALTER USER [IF EXISTS] name1 [ON CLUSTER cluster_name1] [RENAME TO new_name1] [, name2 [ON CLUSTER cluster_name2] [RENAME TO new_name2] ...] - [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] + [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']} | {WITH ssl_certificate CN 'common_name'}] [[ADD | DROP] HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...] | ALL | ALL EXCEPT role [,...] ] [GRANTEES {user | role | ANY | NONE} [,...] [EXCEPT {user | role} [,...]]] diff --git a/docs/en/sql-reference/statements/check-table.md b/docs/en/sql-reference/statements/check-table.md index f9b428b74a1..8c4b8ab90a2 100644 --- a/docs/en/sql-reference/statements/check-table.md +++ b/docs/en/sql-reference/statements/check-table.md @@ -8,7 +8,7 @@ title: "CHECK TABLE Statement" Checks if the data in the table is corrupted. ``` sql -CHECK TABLE [db.]name +CHECK TABLE [db.]name [PARTITION partition_expr] ``` The `CHECK TABLE` query compares actual file sizes with the expected values which are stored on the server. If the file sizes do not match the stored values, it means the data is corrupted. This can be caused, for example, by a system crash during query execution. 
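A minimal sketch of the `PARTITION` clause shown above; the table name and partition value are hypothetical:

``` sql
-- Check only the 201901 partition instead of the whole table.
CHECK TABLE visits PARTITION '201901';
```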
diff --git a/docs/en/sql-reference/statements/create/dictionary.md b/docs/en/sql-reference/statements/create/dictionary.md index b24ff480c2d..a470b071971 100644 --- a/docs/en/sql-reference/statements/create/dictionary.md +++ b/docs/en/sql-reference/statements/create/dictionary.md @@ -5,9 +5,9 @@ sidebar_label: DICTIONARY title: "CREATE DICTIONARY" --- -Creates a new [external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) with given [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md), [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) and [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). +Creates a new [dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) with given [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md), [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) and [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). -**Syntax** +## Syntax ``` sql CREATE [OR REPLACE] DICTIONARY [IF NOT EXISTS] [db.]dictionary_name [ON CLUSTER cluster] @@ -25,17 +25,21 @@ SETTINGS(setting_name = setting_value, setting_name = setting_value, ...) COMMENT 'Comment' ``` -External dictionary structure consists of attributes. Dictionary attributes are specified similarly to table columns. The only required attribute property is its type, all other properties may have default values. +The dictionary structure consists of attributes. Dictionary attributes are specified similarly to table columns. The only required attribute property is its type, all other properties may have default values. `ON CLUSTER` clause allows creating dictionary on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). Depending on dictionary [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) one or more attributes can be specified as dictionary keys. -For more information, see [External Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. +## SOURCE -You can add a comment to the dictionary when you creating it using `COMMENT` clause. 
+The source for a dictionary can be: +- a table in the current ClickHouse service +- a table in a remote ClickHouse service +- a file available by HTTP(S) +- another database -**Example** +### Create a dictionary from a table in the current ClickHouse service Input table `source_table`: @@ -49,51 +53,81 @@ Input table `source_table`: Creating the dictionary: ``` sql -CREATE DICTIONARY dictionary_with_comment +CREATE DICTIONARY id_value_dictionary ( id UInt64, value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'source_table')) +SOURCE(CLICKHOUSE(TABLE 'source_table')) LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000) -COMMENT 'The temporary dictionary'; ``` Output the dictionary: ``` sql -SHOW CREATE DICTIONARY dictionary_with_comment; +SHOW CREATE DICTIONARY id_value_dictionary; ``` -```text -┌─statement───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ CREATE DICTIONARY default.dictionary_with_comment +```response +CREATE DICTIONARY default.id_value_dictionary ( `id` UInt64, `value` String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'source_table')) +SOURCE(CLICKHOUSE(TABLE 'source_table')) LIFETIME(MIN 0 MAX 1000) LAYOUT(FLAT()) -COMMENT 'The temporary dictionary' │ -└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -Output the comment to dictionary: +### Create a dictionary from a table in a remote ClickHouse service + +Input table (in the remote ClickHouse service) `source_table`: + +``` text +┌─id─┬─value──┐ +│ 1 │ First │ +│ 2 │ Second │ +└────┴────────┘ +``` + +Creating the dictionary: ``` sql -SELECT comment FROM system.dictionaries WHERE name == 'dictionary_with_comment' AND database == currentDatabase(); +CREATE DICTIONARY id_value_dictionary +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'HOSTNAME' PORT 9000 USER 'default' PASSWORD 'PASSWORD' TABLE 'source_table' DB 'default')) +LAYOUT(FLAT()) +LIFETIME(MIN 0 MAX 1000) ``` -```text -┌─comment──────────────────┐ -│ The temporary dictionary │ -└──────────────────────────┘ +### Create a dictionary from a file available by HTTP(S) + +```sql +CREATE DICTIONARY default.taxi_zone_dictionary +( + `LocationID` UInt16 DEFAULT 0, + `Borough` String, + `Zone` String, + `service_zone` String +) +PRIMARY KEY LocationID +SOURCE(HTTP(URL 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/taxi_zone_lookup.csv' FORMAT 'CSVWithNames')) +LIFETIME(MIN 0 MAX 0) +LAYOUT(HASHED()) ``` +### Create a dictionary from another database + +Please see the details in [Dictionary sources](/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md/#dbms). + **See Also** -- [system.dictionaries](../../../operations/system-tables/dictionaries.md) — This table contains information about [external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +- For more information, see the [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. 
+- [system.dictionaries](../../../operations/system-tables/dictionaries.md) — This table contains information about [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). diff --git a/docs/en/sql-reference/statements/create/settings-profile.md b/docs/en/sql-reference/statements/create/settings-profile.md index 8883b22896b..c4ca89f3284 100644 --- a/docs/en/sql-reference/statements/create/settings-profile.md +++ b/docs/en/sql-reference/statements/create/settings-profile.md @@ -10,7 +10,7 @@ Creates [settings profiles](../../../operations/access-rights.md#settings-profil Syntax: ``` sql -CREATE SETTINGS PROFILE [IF NOT EXISTS | OR REPLACE] TO name1 [ON CLUSTER cluster_name1] +CREATE SETTINGS PROFILE [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [, name2 [ON CLUSTER cluster_name2] ...] [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [CONST|READONLY|WRITABLE|CHANGEABLE_IN_READONLY] | INHERIT 'profile_name'] [,...] ``` diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 6dbd6bf8136..68fb968c609 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -59,6 +59,28 @@ If the table already exists and `IF NOT EXISTS` is specified, the query won’t There can be other clauses after the `ENGINE` clause in the query. See detailed documentation on how to create tables in the descriptions of [table engines](../../../engines/table-engines/index.md#table_engines). +:::tip +In ClickHouse Cloud please split this into two steps: +1. Create the table structure + + ```sql + CREATE TABLE t1 + ENGINE = MergeTree + ORDER BY ... + # highlight-next-line + EMPTY AS + SELECT ... + ``` + +2. Populate the table + + ```sql + INSERT INTO t1 + SELECT ... + ``` + +::: + **Example** Query: @@ -159,7 +181,7 @@ ENGINE = engine PRIMARY KEY(expr1[, expr2,...]); ``` -:::warning +:::warning You can't combine both ways in one query. ::: @@ -215,7 +237,7 @@ ALTER TABLE codec_example MODIFY COLUMN float_value CODEC(Default); Codecs can be combined in a pipeline, for example, `CODEC(Delta, Default)`. -:::warning +:::warning You can’t decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/programs/compressor) utility. ::: @@ -301,44 +323,44 @@ Encryption codecs: #### AES_128_GCM_SIV -`CODEC('AES-128-GCM-SIV')` — Encrypts data with AES-128 in [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV mode. +`CODEC('AES-128-GCM-SIV')` — Encrypts data with AES-128 in [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV mode. #### AES-256-GCM-SIV -`CODEC('AES-256-GCM-SIV')` — Encrypts data with AES-256 in GCM-SIV mode. +`CODEC('AES-256-GCM-SIV')` — Encrypts data with AES-256 in GCM-SIV mode. These codecs use a fixed nonce and encryption is therefore deterministic. This makes it compatible with deduplicating engines such as [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) but has a weakness: when the same data block is encrypted twice, the resulting ciphertext will be exactly the same so an adversary who can read the disk can see this equivalence (although only the equivalence, without getting its content). -:::warning +:::warning Most engines including the "\*MergeTree" family create index files on disk without applying codecs. 
This means plaintext will appear on disk if an encrypted column is indexed. ::: -:::warning +:::warning If you perform a SELECT query mentioning a specific value in an encrypted column (such as in its WHERE clause), the value may appear in [system.query_log](../../../operations/system-tables/query_log.md). You may want to disable the logging. ::: **Example** ```sql -CREATE TABLE mytable +CREATE TABLE mytable ( x String Codec(AES_128_GCM_SIV) -) +) ENGINE = MergeTree ORDER BY x; ``` -:::note +:::note If compression needs to be applied, it must be explicitly specified. Otherwise, only encryption will be applied to data. ::: **Example** ```sql -CREATE TABLE mytable +CREATE TABLE mytable ( x String Codec(Delta, LZ4, AES_128_GCM_SIV) -) +) ENGINE = MergeTree ORDER BY x; ``` @@ -372,7 +394,7 @@ It’s possible to use tables with [ENGINE = Memory](../../../engines/table-engi 'REPLACE' query allows you to update the table atomically. -:::note +:::note This query is supported only for [Atomic](../../../engines/database-engines/atomic.md) database engine. ::: @@ -388,7 +410,7 @@ RENAME TABLE myNewTable TO myOldTable; Instead of above, you can use the following: ```sql -REPLACE TABLE myOldTable SELECT * FROM myOldTable WHERE CounterID <12345; +REPLACE TABLE myOldTable ENGINE = MergeTree() ORDER BY CounterID AS SELECT * FROM myOldTable WHERE CounterID <12345; ``` ### Syntax @@ -448,7 +470,7 @@ SELECT * FROM base.t1; You can add a comment to the table when creating it. -:::note +:::note The comment is supported for all table engines except [Kafka](../../../engines/table-engines/integrations/kafka.md), [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) and [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md). ::: diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index 56a0560e57e..a756b3d4a0d 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -12,7 +12,7 @@ Syntax: ``` sql CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [, name2 [ON CLUSTER cluster_name2] ...] - [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] + [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']} | {WITH ssl_certificate CN 'common_name'}] [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...]] [DEFAULT DATABASE database | NONE] @@ -34,6 +34,7 @@ There are multiple ways of user identification: - `IDENTIFIED WITH double_sha1_hash BY 'hash'` - `IDENTIFIED WITH ldap SERVER 'server_name'` - `IDENTIFIED WITH kerberos` or `IDENTIFIED WITH kerberos REALM 'realm'` +- `IDENTIFIED WITH ssl_certificate CN 'mysite.com:user'` For identification with sha256_hash using `SALT` - hash must be calculated from concatenation of 'password' and 'salt'. @@ -54,7 +55,7 @@ Another way of specifying host is to use `@` syntax following the username. Exam - `CREATE USER mira@'localhost'` — Equivalent to the `HOST LOCAL` syntax. - `CREATE USER mira@'192.168.%.%'` — Equivalent to the `HOST LIKE` syntax. 
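A minimal sketch of the `ssl_certificate` identification added above; the user name and the CN value are hypothetical:

``` sql
-- The server matches the Common Name of the client's certificate against the configured CN.
CREATE USER alice IDENTIFIED WITH ssl_certificate CN 'alice.example.com';
```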
-:::warning +:::warning ClickHouse treats `user_name@'address'` as a username as a whole. Thus, technically you can create multiple users with the same `user_name` and different constructions after `@`. However, we do not recommend doing so. ::: diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 5833c43f55d..85741117d2a 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -60,7 +60,7 @@ If you specify `POPULATE`, the existing table data is inserted into the view whe A `SELECT` query can contain `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`. Note that the corresponding conversions are performed independently on each block of inserted data. For example, if `GROUP BY` is set, data is aggregated during insertion, but only within a single packet of inserted data. The data won’t be further aggregated. The exception is when using an `ENGINE` that independently performs data aggregation, such as `SummingMergeTree`. -The execution of [ALTER](../../../sql-reference/statements/alter/view.md) queries on materialized views has limitations, so they might be inconvenient. If the materialized view uses the construction `TO [db.]name`, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view. +The execution of [ALTER](/docs/en/sql-reference/statements/alter/view.md) queries on materialized views has limitations, for example, you cannot update the `SELECT` query, so this might be inconvenient. If the materialized view uses the construction `TO [db.]name`, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view. Note that materialized view is influenced by [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert) setting. The data is merged before the insertion into a view. diff --git a/docs/en/sql-reference/statements/explain.md b/docs/en/sql-reference/statements/explain.md index f4a6ccb0c7d..5649486905e 100644 --- a/docs/en/sql-reference/statements/explain.md +++ b/docs/en/sql-reference/statements/explain.md @@ -10,7 +10,7 @@ Shows the execution plan of a statement. Syntax: ```sql -EXPLAIN [AST | SYNTAX | PLAN | PIPELINE | ESTIMATE | TABLE OVERRIDE] [setting = value, ...] +EXPLAIN [AST | SYNTAX | QUERY TREE | PLAN | PIPELINE | ESTIMATE | TABLE OVERRIDE] [setting = value, ...] [ SELECT ... | tableFunction(...) [COLUMNS (...)] [ORDER BY ...] [PARTITION BY ...] [PRIMARY KEY] [SAMPLE BY ...] [TTL ...] @@ -47,6 +47,7 @@ Union - `AST` — Abstract syntax tree. - `SYNTAX` — Query text after AST-level optimizations. +- `QUERY TREE` — Query tree after Query Tree level optimizations. - `PLAN` — Query execution plan. - `PIPELINE` — Query execution pipeline. @@ -110,6 +111,32 @@ FROM CROSS JOIN system.numbers AS c ``` +### EXPLAIN QUERY TREE + +Settings: + +- `run_passes` — Run all query tree passes before dumping the query tree. Default: `1`. +- `dump_passes` — Dump information about used passes before dumping the query tree. Default: `0`. +- `passes` — Specifies how many passes to run. If set to `-1`, runs all the passes. Default: `-1`. 
+ +Example: +```sql +EXPLAIN QUERY TREE SELECT id, value FROM test_table; +``` + +``` +QUERY id: 0 + PROJECTION COLUMNS + id UInt64 + value String + PROJECTION + LIST id: 1, nodes: 2 + COLUMN id: 2, column_name: id, result_type: UInt64, source_id: 3 + COLUMN id: 4, column_name: value, result_type: String, source_id: 3 + JOIN TREE + TABLE id: 3, table_name: default.test_table +``` + ### EXPLAIN PLAN Dump query plan steps. diff --git a/docs/en/sql-reference/statements/index.md b/docs/en/sql-reference/statements/index.md index bfb90f4a89f..b286d8c932d 100644 --- a/docs/en/sql-reference/statements/index.md +++ b/docs/en/sql-reference/statements/index.md @@ -8,25 +8,25 @@ sidebar_label: Statements Statements represent various kinds of action you can perform using SQL queries. Each kind of statement has it’s own syntax and usage details that are described separately: -- [SELECT](../../sql-reference/statements/select/index.md) -- [INSERT INTO](../../sql-reference/statements/insert-into.md) -- [CREATE](../../sql-reference/statements/create/index.md) -- [ALTER](../../sql-reference/statements/alter/index.md) -- [SYSTEM](../../sql-reference/statements/system.md) -- [SHOW](../../sql-reference/statements/show.md) -- [GRANT](../../sql-reference/statements/grant.md) -- [REVOKE](../../sql-reference/statements/revoke.md) -- [ATTACH](../../sql-reference/statements/attach.md) -- [CHECK TABLE](../../sql-reference/statements/check-table.md) -- [DESCRIBE TABLE](../../sql-reference/statements/describe-table.md) -- [DETACH](../../sql-reference/statements/detach.md) -- [DROP](../../sql-reference/statements/drop.md) -- [EXISTS](../../sql-reference/statements/exists.md) -- [KILL](../../sql-reference/statements/kill.md) -- [OPTIMIZE](../../sql-reference/statements/optimize.md) -- [RENAME](../../sql-reference/statements/rename.md) -- [SET](../../sql-reference/statements/set.md) -- [SET ROLE](../../sql-reference/statements/set-role.md) -- [TRUNCATE](../../sql-reference/statements/truncate.md) -- [USE](../../sql-reference/statements/use.md) -- [EXPLAIN](../../sql-reference/statements/explain.md) +- [SELECT](/docs/en/sql-reference/statements/select/index.md) +- [INSERT INTO](/docs/en/sql-reference/statements/insert-into.md) +- [CREATE](/docs/en/sql-reference/statements/create/index.md) +- [ALTER](/docs/en/sql-reference/statements/alter/index.md) +- [SYSTEM](/docs/en/sql-reference/statements/system.md) +- [SHOW](/docs/en/sql-reference/statements/show.md) +- [GRANT](/docs/en/sql-reference/statements/grant.md) +- [REVOKE](/docs/en/sql-reference/statements/revoke.md) +- [ATTACH](/docs/en/sql-reference/statements/attach.md) +- [CHECK TABLE](/docs/en/sql-reference/statements/check-table.md) +- [DESCRIBE TABLE](/docs/en/sql-reference/statements/describe-table.md) +- [DETACH](/docs/en/sql-reference/statements/detach.md) +- [DROP](/docs/en/sql-reference/statements/drop.md) +- [EXISTS](/docs/en/sql-reference/statements/exists.md) +- [KILL](/docs/en/sql-reference/statements/kill.md) +- [OPTIMIZE](/docs/en/sql-reference/statements/optimize.md) +- [RENAME](/docs/en/sql-reference/statements/rename.md) +- [SET](/docs/en/sql-reference/statements/set.md) +- [SET ROLE](/docs/en/sql-reference/statements/set-role.md) +- [TRUNCATE](/docs/en/sql-reference/statements/truncate.md) +- [USE](/docs/en/sql-reference/statements/use.md) +- [EXPLAIN](/docs/en/sql-reference/statements/explain.md) diff --git a/docs/en/sql-reference/statements/select/group-by.md b/docs/en/sql-reference/statements/select/group-by.md index ac02e9ab5a1..2df8581c447 100644 
--- a/docs/en/sql-reference/statements/select/group-by.md +++ b/docs/en/sql-reference/statements/select/group-by.md @@ -243,6 +243,54 @@ If `max_rows_to_group_by` and `group_by_overflow_mode = 'any'` are not used, all You can use `WITH TOTALS` in subqueries, including subqueries in the [JOIN](../../../sql-reference/statements/select/join.md) clause (in this case, the respective total values are combined). +## GROUP BY ALL + +`GROUP BY ALL` is equivalent to listing all the SELECT-ed expressions that are not aggregate functions. + +For example: + +``` sql +SELECT + a * 2, + b, + count(c) +FROM t +GROUP BY ALL +``` + +is the same as + +``` sql +SELECT + a * 2, + b, + count(c) +FROM t +GROUP BY a * 2, b +``` + +As a special case, if a function has both aggregate functions and other fields as its arguments, the `GROUP BY` keys will contain the maximum non-aggregate fields we can extract from it. + +For example: + +``` sql +SELECT + substring(a, 4, 2), + substring(substring(a, 1, 2), 1, count(b)) +FROM t +GROUP BY ALL +``` + +is the same as + +``` sql +SELECT + substring(a, 4, 2), + substring(substring(a, 1, 2), 1, count(b)) +FROM t +GROUP BY substring(a, 4, 2), substring(a, 1, 2) +``` + ## Examples Example: diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index 1890ff081d8..62d3e9fd69a 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -282,7 +282,7 @@ Each time a query is run with the same `JOIN`, the subquery is run again because In some cases, it is more efficient to use [IN](../../../sql-reference/operators/in.md) instead of `JOIN`. -If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is an “external dictionaries” feature that you should use instead of `JOIN`. For more information, see the [External dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. +If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is a “dictionaries” feature that you should use instead of `JOIN`. For more information, see the [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. 
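A sketch of the dictionary approach; the dictionary `campaigns_dict`, its attribute `name`, and the table `events` are hypothetical:

``` sql
-- dictGet looks up the 'name' attribute by key in a dictionary kept in memory,
-- avoiding re-reading the dimension table on every query.
SELECT
    campaign_id,
    dictGet('campaigns_dict', 'name', campaign_id) AS campaign_name
FROM events;
```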
### Memory Limitations diff --git a/docs/en/sql-reference/statements/set-role.md b/docs/en/sql-reference/statements/set-role.md index bf998d7841e..e017160623e 100644 --- a/docs/en/sql-reference/statements/set-role.md +++ b/docs/en/sql-reference/statements/set-role.md @@ -41,7 +41,7 @@ Purge default roles from a user: SET DEFAULT ROLE NONE TO user ``` -Set all the granted roles as default excepting some of them: +Set all the granted roles as default except for specific roles `role1` and `role2`: ``` sql SET DEFAULT ROLE ALL EXCEPT role1, role2 TO user diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 87248bb115b..0efad3d460f 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -198,7 +198,7 @@ Result: ## SHOW DICTIONARIES -Displays a list of [external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +Displays a list of [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). ``` sql SHOW DICTIONARIES [FROM <db>] [LIKE '<pattern>'] [LIMIT <N>] [INTO OUTFILE <filename>] [FORMAT <format>] diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index e9ff4d45c79..c8b104ea91f 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -281,8 +281,8 @@ After running this statement the `[db.]replicated_merge_tree_family_table_name` ### RESTART REPLICA -Provides possibility to reinitialize Zookeeper sessions state for `ReplicatedMergeTree` table, will compare current state with Zookeeper as source of true and add tasks to Zookeeper queue if needed. -Initialization replication queue based on ZooKeeper date happens in the same way as `ATTACH TABLE` statement. For a short time the table will be unavailable for any operations. +Provides the possibility to reinitialize the ZooKeeper session state for a `ReplicatedMergeTree` table. It compares the current state with ZooKeeper as the source of truth and adds tasks to the ZooKeeper queue if needed. +Initialization of the replication queue based on ZooKeeper data happens in the same way as for the `ATTACH TABLE` statement. For a short time, the table will be unavailable for any operations. ``` sql SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name diff --git a/docs/en/sql-reference/table-functions/format.md b/docs/en/sql-reference/table-functions/format.md new file mode 100644 index 00000000000..4d1488ea640 --- /dev/null +++ b/docs/en/sql-reference/table-functions/format.md @@ -0,0 +1,75 @@ +--- +slug: /en/sql-reference/table-functions/format +sidebar_position: 56 +sidebar_label: format +--- + +# format + +Extracts the table structure from data and parses it according to the specified input format. + +**Syntax** + +``` sql +format(format_name, data) +``` + +**Parameters** + +- `format_name` — The [format](../../interfaces/formats.md#formats) of the data. +- `data` — String literal or constant expression that returns a string containing data in the specified format. + +**Returned value** + +A table with data parsed from the `data` argument according to the specified format, with the schema extracted from the data. 
+ +**Examples** + +**Query:** +``` sql +:) select * from format(JSONEachRow, +$$ +{"a": "Hello", "b": 111} +{"a": "World", "b": 123} +{"a": "Hello", "b": 112} +{"a": "World", "b": 124} +$$) +``` + +**Result:** + +```text +┌───b─┬─a─────┐ +│ 111 │ Hello │ +│ 123 │ World │ +│ 112 │ Hello │ +│ 124 │ World │ +└─────┴───────┘ +``` + +**Query:** +```sql + +:) desc format(JSONEachRow, +$$ +{"a": "Hello", "b": 111} +{"a": "World", "b": 123} +{"a": "Hello", "b": 112} +{"a": "World", "b": 124} +$$) +``` + +**Result:** + +```text +┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ +│ b │ Nullable(Float64) │ │ │ │ │ │ +│ a │ Nullable(String) │ │ │ │ │ │ +└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ +``` + +**See Also** + +- [Formats](../../interfaces/formats.md) + +[Original article](https://clickhouse.com/docs/en/sql-reference/table-functions/format) diff --git a/docs/en/sql-reference/table-functions/index.md b/docs/en/sql-reference/table-functions/index.md index d09adcd13d6..94b23bc695c 100644 --- a/docs/en/sql-reference/table-functions/index.md +++ b/docs/en/sql-reference/table-functions/index.md @@ -39,3 +39,7 @@ You can’t use table functions if the [allow_ddl](../../operations/settings/per | [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. | | [sqlite](../../sql-reference/table-functions/sqlite.md) | Creates a [sqlite](../../engines/table-engines/integrations/sqlite.md)-engine table. | +:::note +Only these table functions are enabled in readonly mode : +null, view, viewIfPermitted, numbers, numbers_mt, generateRandom, values, cluster, clusterAllReplicas +::: \ No newline at end of file diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index f867cda45bd..de1567c052e 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -110,5 +110,5 @@ SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); **See Also** - [The ‘MySQL’ table engine](../../engines/table-engines/integrations/mysql.md) -- [Using MySQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) +- [Using MySQL as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) diff --git a/docs/en/sql-reference/table-functions/odbc.md b/docs/en/sql-reference/table-functions/odbc.md index f8c46fe44d8..7e13424bc8a 100644 --- a/docs/en/sql-reference/table-functions/odbc.md +++ b/docs/en/sql-reference/table-functions/odbc.md @@ -101,5 +101,5 @@ SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') ## See Also -- [ODBC external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) - [ODBC table engine](../../engines/table-engines/integrations/odbc.md). 
diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md index 367edbe9a00..e98869de739 100644 --- a/docs/en/sql-reference/table-functions/postgresql.md +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -130,6 +130,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32) **See Also** - [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md) -- [Using PostgreSQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +- [Using PostgreSQL as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.com/docs/en/sql-reference/table-functions/postgresql/) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 4da5f4cc420..a545fb630c9 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -587,3 +587,8 @@ ORDER BY │ ambient_temp │ 2020-03-01 12:00:00 │ 16 │ 16 │ └──────────────┴─────────────────────┴───────┴─────────────────────────┘ ``` + +## Related Content + +- [Window and array functions for Git commit sequences](https://clickhouse.com/blog/clickhouse-window-array-functions-git-commits) +- [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) diff --git a/docs/ru/operations/external-authenticators/kerberos.md b/docs/ru/operations/external-authenticators/kerberos.md index 7b0702b2132..865ea639c89 100644 --- a/docs/ru/operations/external-authenticators/kerberos.md +++ b/docs/ru/operations/external-authenticators/kerberos.md @@ -98,7 +98,7 @@ ClickHouse предоставляет возможность аутентифи :::danger "Важно" - Если пользователь настроен для Kerberos-аутентификации, другие виды уатентификации будут для него недоступны. Если наряду с `kerberos` в определении пользователя будет указан какой-либо другой способ аутентификации, ClickHouse завершит работу. + Если пользователь настроен для Kerberos-аутентификации, другие виды аутентификации будут для него недоступны. Если наряду с `kerberos` в определении пользователя будет указан какой-либо другой способ аутентификации, ClickHouse завершит работу. :::info "" Ещё раз отметим, что кроме `users.xml`, необходимо также включить Kerberos в `config.xml`. diff --git a/docs/ru/operations/settings/index.md b/docs/ru/operations/settings/index.md index 4e055405847..6806aea5135 100644 --- a/docs/ru/operations/settings/index.md +++ b/docs/ru/operations/settings/index.md @@ -24,7 +24,7 @@ slug: /ru/operations/settings/ - При запуске консольного клиента ClickHouse в не интерактивном режиме установите параметр запуска `--setting=value`. - При использовании HTTP API передавайте cgi-параметры (`URL?setting_1=value&setting_2=value...`). - - Укажите необходимые настройки в секции [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select) запроса SELECT. Эти настройки действуют только в рамках данного запроса, а после его выполнения сбрасываются до предыдущего значения или значения по умолчанию. + - Укажите необходимые настройки в секции [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select-query) запроса SELECT. 
Эти настройки действуют только в рамках данного запроса, а после его выполнения сбрасываются до предыдущего значения или значения по умолчанию. Настройки, которые можно задать только в конфигурационном файле сервера, в разделе не рассматриваются. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index a070dbd5e10..58894611386 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -479,7 +479,7 @@ SELECT * FROM table_with_enum_column_for_tsv_insert; Включает или отключает вставку [значений по умолчанию](../../sql-reference/statements/create/table.md#create-default-values) вместо [NULL](../../sql-reference/syntax.md#null-literal) в столбцы, которые не позволяют [хранить NULL](../../sql-reference/data-types/nullable.md#data_type-nullable). Если столбец не позволяет хранить `NULL` и эта настройка отключена, то вставка `NULL` приведет к возникновению исключения. Если столбец позволяет хранить `NULL`, то значения `NULL` вставляются независимо от этой настройки. -Эта настройка используется для запросов [INSERT ... SELECT](../../sql-reference/statements/insert-into.md#insert_query_insert-select). При этом подзапросы `SELECT` могут объединяться с помощью `UNION ALL`. +Эта настройка используется для запросов [INSERT ... SELECT](../../sql-reference/statements/insert-into.md#inserting-the-results-of-select). При этом подзапросы `SELECT` могут объединяться с помощью `UNION ALL`. Возможные значения: diff --git a/docs/ru/operations/system-tables/crash-log.md b/docs/ru/operations/system-tables/crash-log.md index 4ca8be5a199..68148fec6bd 100644 --- a/docs/ru/operations/system-tables/crash-log.md +++ b/docs/ru/operations/system-tables/crash-log.md @@ -7,8 +7,8 @@ slug: /ru/operations/system-tables/crash-log Колонки: -- `event_date` ([Datetime](../../sql-reference/data-types/datetime.md)) — Дата события. -- `event_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Время события. +- `event_date` ([DateTime](../../sql-reference/data-types/datetime.md)) — Дата события. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Время события. - `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Время события с наносекундами. - `signal` ([Int32](../../sql-reference/data-types/int-uint.md)) — Номер сигнала, пришедшего в поток. - `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Идентификатор треда. diff --git a/docs/ru/operations/system-tables/mutations.md b/docs/ru/operations/system-tables/mutations.md index 20e4ebfdaf1..bb0bd44ed7a 100644 --- a/docs/ru/operations/system-tables/mutations.md +++ b/docs/ru/operations/system-tables/mutations.md @@ -15,7 +15,7 @@ slug: /ru/operations/system-tables/mutations - `command` ([String](../../sql-reference/data-types/string.md)) — команда мутации (часть запроса после `ALTER TABLE [db.]table`). -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время создания мутации. +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время создания мутации. - `block_numbers.partition_id` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Для мутаций реплицированных таблиц массив содержит содержит номера партиций (по одной записи для каждой партиции). Для мутаций нереплицированных таблиц массив пустой. 
@@ -39,7 +39,7 @@ slug: /ru/operations/system-tables/mutations - `latest_failed_part` ([String](../../sql-reference/data-types/string.md)) — имя последнего куска, мутация которого не удалась. -- `latest_fail_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время последней ошибки мутации. +- `latest_fail_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время последней ошибки мутации. - `latest_fail_reason` ([String](../../sql-reference/data-types/string.md)) — причина последней ошибки мутации. diff --git a/docs/ru/operations/system-tables/replication_queue.md b/docs/ru/operations/system-tables/replication_queue.md index 25de174e98f..60d42133153 100644 --- a/docs/ru/operations/system-tables/replication_queue.md +++ b/docs/ru/operations/system-tables/replication_queue.md @@ -29,7 +29,7 @@ slug: /ru/operations/system-tables/replication_queue - `MUTATE_PART` — применить одну или несколько мутаций к куску. - `ALTER_METADATA` — применить изменения структуры таблицы в результате запросов с выражением `ALTER`. -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время отправки задачи на выполнение. +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время отправки задачи на выполнение. - `required_quorum` ([UInt32](../../sql-reference/data-types/int-uint.md)) — количество реплик, ожидающих завершения задачи, с подтверждением о завершении. Этот столбец актуален только для задачи `GET_PARTS`. @@ -47,13 +47,13 @@ slug: /ru/operations/system-tables/replication_queue - `last_exception` ([String](../../sql-reference/data-types/string.md)) — текст сообщения о последней возникшей ошибке, если таковые имеются. -- `last_attempt_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время последней попытки выполнить задачу. +- `last_attempt_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время последней попытки выполнить задачу. - `num_postponed` ([UInt32](../../sql-reference/data-types/int-uint.md)) — количество отложенных задач. - `postpone_reason` ([String](../../sql-reference/data-types/string.md)) — причина, по которой была отложена задача. -- `last_postpone_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время, когда была отложена задача в последний раз. +- `last_postpone_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время, когда была отложена задача в последний раз. - `merge_type` ([String](../../sql-reference/data-types/string.md)) — тип текущего слияния. Пусто, если это мутация. diff --git a/docs/ru/sql-reference/data-types/date32.md b/docs/ru/sql-reference/data-types/date32.md index fcb7d688c20..958b8e9763e 100644 --- a/docs/ru/sql-reference/data-types/date32.md +++ b/docs/ru/sql-reference/data-types/date32.md @@ -6,7 +6,7 @@ sidebar_label: Date32 # Date32 {#data_type-datetime32} -Дата. Поддерживается такой же диапазон дат, как для типа [Datetime64](../../sql-reference/data-types/datetime64.md). Значение хранится в четырех байтах и соответствует числу дней с 1900-01-01 по 2299-12-31. +Дата. Поддерживается такой же диапазон дат, как для типа [DateTime64](../../sql-reference/data-types/datetime64.md). Значение хранится в четырех байтах и соответствует числу дней с 1900-01-01 по 2299-12-31. 
**Пример** diff --git a/docs/ru/sql-reference/functions/arithmetic-functions.md b/docs/ru/sql-reference/functions/arithmetic-functions.md index bc1d0a55128..4e040edcc70 100644 --- a/docs/ru/sql-reference/functions/arithmetic-functions.md +++ b/docs/ru/sql-reference/functions/arithmetic-functions.md @@ -159,3 +159,150 @@ SELECT min2(-1, 2); └─────────────┘ ``` +## multiplyDecimal(a, b[, result_scale]) + +Совершает умножение двух Decimal. Результат будет иметь тип [Decimal256](../../sql-reference/data-types/decimal.md). +Scale (размер дробной части) результат можно явно задать аргументом `result_scale` (целочисленная константа из интервала `[0, 76]`). +Если этот аргумент не задан, то scale результата будет равен наибольшему из scale обоих аргументов. + +**Синтаксис** + +```sql +multiplyDecimal(a, b[, result_scale]) +``` + +:::note +Эта функция работают гораздо медленнее обычной `multiply`. +В случае, если нет необходимости иметь фиксированную точность и/или нужны быстрые вычисления, следует использовать [multiply](#multiply). +::: + +**Аргументы** + +- `a` — Первый сомножитель/делимое: [Decimal](../../sql-reference/data-types/decimal.md). +- `b` — Второй сомножитель/делитель: [Decimal](../../sql-reference/data-types/decimal.md). +- `result_scale` — Scale результата: [Int/UInt](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Результат умножения с заданным scale. + +Тип: [Decimal256](../../sql-reference/data-types/decimal.md). + +**Примеры** + +```sql +SELECT multiplyDecimal(toDecimal256(-12, 0), toDecimal32(-2.1, 1), 1); +``` + +```text +┌─multiplyDecimal(toDecimal256(-12, 0), toDecimal32(-2.1, 1), 1)─┐ +│ 25.2 │ +└────────────────────────────────────────────────────────────────┘ +``` + +**Отличие от стандартных функций** +```sql +SELECT toDecimal64(-12.647, 3) * toDecimal32(2.1239, 4); +SELECT toDecimal64(-12.647, 3) as a, toDecimal32(2.1239, 4) as b, multiplyDecimal(a, b); +``` + +```text +┌─multiply(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐ +│ -26.8609633 │ +└───────────────────────────────────────────────────────────┘ +┌─multiplyDecimal(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐ +│ -26.8609 │ +└──────────────────────────────────────────────────────────────────┘ +``` + +```sql +SELECT + toDecimal64(-12.647987876, 9) AS a, + toDecimal64(123.967645643, 9) AS b, + multiplyDecimal(a, b); + +SELECT + toDecimal64(-12.647987876, 9) AS a, + toDecimal64(123.967645643, 9) AS b, + a * b; +``` + +```text +┌─────────────a─┬─────────────b─┬─multiplyDecimal(toDecimal64(-12.647987876, 9), toDecimal64(123.967645643, 9))─┐ +│ -12.647987876 │ 123.967645643 │ -1567.941279108 │ +└───────────────┴───────────────┴───────────────────────────────────────────────────────────────────────────────┘ + +Received exception from server (version 22.11.1): +Code: 407. DB::Exception: Received from localhost:9000. DB::Exception: Decimal math overflow: While processing toDecimal64(-12.647987876, 9) AS a, toDecimal64(123.967645643, 9) AS b, a * b. (DECIMAL_OVERFLOW) +``` + +## divideDecimal(a, b[, result_scale]) + +Совершает деление двух Decimal. Результат будет иметь тип [Decimal256](../../sql-reference/data-types/decimal.md). +Scale (размер дробной части) результат можно явно задать аргументом `result_scale` (целочисленная константа из интервала `[0, 76]`). +Если этот аргумент не задан, то scale результата будет равен наибольшему из scale обоих аргументов. 
+ +**Синтаксис** + +```sql +divideDecimal(a, b[, result_scale]) +``` + +:::note +Эта функция работает гораздо медленнее обычной `divide`. +В случае, если нет необходимости иметь фиксированную точность и/или нужны быстрые вычисления, следует использовать [divide](#divide). +::: + +**Аргументы** + +- `a` — Первый сомножитель/делимое: [Decimal](../../sql-reference/data-types/decimal.md). +- `b` — Второй сомножитель/делитель: [Decimal](../../sql-reference/data-types/decimal.md). +- `result_scale` — Scale результата: [Int/UInt](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Результат деления с заданным scale. + +Тип: [Decimal256](../../sql-reference/data-types/decimal.md). + +**Примеры** + +```sql +SELECT divideDecimal(toDecimal256(-12, 0), toDecimal32(2.1, 1), 10); +``` + +```text +┌─divideDecimal(toDecimal256(-12, 0), toDecimal32(2.1, 1), 10)─┐ +│ -5.7142857142 │ +└──────────────────────────────────────────────────────────────┘ +``` + +**Отличие от стандартных функций** +```sql +SELECT toDecimal64(-12, 1) / toDecimal32(2.1, 1); +SELECT toDecimal64(-12, 1) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5); +``` + +```text +┌─divide(toDecimal64(-12, 1), toDecimal32(2.1, 1))─┐ +│ -5.7 │ +└──────────────────────────────────────────────────┘ + +┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 5)─┐ +│ -12 │ 2.1 │ -5.7 │ -5.71428 │ +└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘ +``` + +```sql +SELECT toDecimal64(-12, 0) / toDecimal32(2.1, 1); +SELECT toDecimal64(-12, 0) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5); +``` + +```text +DB::Exception: Decimal result's scale is less than argument's one: While processing toDecimal64(-12, 0) / toDecimal32(2.1, 1). (ARGUMENT_OUT_OF_BOUND) + +┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 5)─┐ +│ -12 │ 2.1 │ -5.7 │ -5.71428 │ +└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘ +``` + diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index f18c2ea258a..8fbcaf9568b 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -424,23 +424,23 @@ WITH toDateTime64('2020-01-01 10:20:30.999', 3) AS dt64 SELECT toStartOfSecond(d ## toRelativeYearNum {#torelativeyearnum} -Переводит дату-с-временем или дату в номер года, начиная с некоторого фиксированного момента в прошлом. +Переводит дату или дату-с-временем в номер года, начиная с некоторого фиксированного момента в прошлом. ## toRelativeQuarterNum {#torelativequarternum} -Переводит дату-с-временем или дату в номер квартала, начиная с некоторого фиксированного момента в прошлом. +Переводит дату или дату-с-временем в номер квартала, начиная с некоторого фиксированного момента в прошлом. ## toRelativeMonthNum {#torelativemonthnum} -Переводит дату-с-временем или дату в номер месяца, начиная с некоторого фиксированного момента в прошлом. +Переводит дату или дату-с-временем в номер месяца, начиная с некоторого фиксированного момента в прошлом. 
## toRelativeWeekNum {#torelativeweeknum} -Переводит дату-с-временем или дату в номер недели, начиная с некоторого фиксированного момента в прошлом. +Переводит дату или дату-с-временем в номер недели, начиная с некоторого фиксированного момента в прошлом. ## toRelativeDayNum {#torelativedaynum} -Переводит дату-с-временем или дату в номер дня, начиная с некоторого фиксированного момента в прошлом. +Переводит дату или дату-с-временем в номер дня, начиная с некоторого фиксированного момента в прошлом. ## toRelativeHourNum {#torelativehournum} @@ -456,7 +456,7 @@ WITH toDateTime64('2020-01-01 10:20:30.999', 3) AS dt64 SELECT toStartOfSecond(d ## toISOYear {#toisoyear} -Переводит дату-с-временем или дату в число типа UInt16, содержащее номер ISO года. ISO год отличается от обычного года, потому что в соответствии с [ISO 8601:1988](https://en.wikipedia.org/wiki/ISO_8601) ISO год начинается необязательно первого января. +Переводит дату или дату-с-временем в число типа UInt16, содержащее номер ISO года. ISO год отличается от обычного года, потому что в соответствии с [ISO 8601:1988](https://en.wikipedia.org/wiki/ISO_8601) ISO год начинается необязательно первого января. **Пример** @@ -479,7 +479,7 @@ SELECT ## toISOWeek {#toisoweek} -Переводит дату-с-временем или дату в число типа UInt8, содержащее номер ISO недели. +Переводит дату или дату-с-временем в число типа UInt8, содержащее номер ISO недели. Начало ISO года отличается от начала обычного года, потому что в соответствии с [ISO 8601:1988](https://en.wikipedia.org/wiki/ISO_8601) первая неделя года - это неделя с четырьмя или более днями в этом году. 1 Января 2017 г. - воскресение, т.е. первая ISO неделя 2017 года началась в понедельник 2 января, поэтому 1 января 2017 это последняя неделя 2016 года. @@ -503,7 +503,7 @@ SELECT ``` ## toWeek(date\[, mode\]\[, timezone\]) {#toweek} -Переводит дату-с-временем или дату в число UInt8, содержащее номер недели. Второй аргументам mode задает режим, начинается ли неделя с воскресенья или с понедельника и должно ли возвращаемое значение находиться в диапазоне от 0 до 53 или от 1 до 53. Если аргумент mode опущен, то используется режим 0. +Переводит дату или дату-с-временем в число UInt8, содержащее номер недели. Второй аргументам mode задает режим, начинается ли неделя с воскресенья или с понедельника и должно ли возвращаемое значение находиться в диапазоне от 0 до 53 или от 1 до 53. Если аргумент mode опущен, то используется режим 0. `toISOWeek() ` эквивалентно `toWeek(date,3)`. @@ -569,6 +569,132 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d └────────────┴───────────┴───────────┴───────────┘ ``` +## age + +Вычисляет компонент `unit` разницы между `startdate` и `enddate`. Разница вычисляется с точностью в 1 секунду. +Например, разница между `2021-12-29` и `2022-01-01` 3 дня для единицы `day`, 0 месяцев для единицы `month`, 0 лет для единицы `year`. + +**Синтаксис** + +``` sql +age('unit', startdate, enddate, [timezone]) +``` + +**Аргументы** + +- `unit` — единица измерения времени, в которой будет выражено возвращаемое значение функции. [String](../../sql-reference/data-types/string.md). 
+    Возможные значения:
+
+    - `second` (возможные сокращения: `ss`, `s`)
+    - `minute` (возможные сокращения: `mi`, `n`)
+    - `hour` (возможные сокращения: `hh`, `h`)
+    - `day` (возможные сокращения: `dd`, `d`)
+    - `week` (возможные сокращения: `wk`, `ww`)
+    - `month` (возможные сокращения: `mm`, `m`)
+    - `quarter` (возможные сокращения: `qq`, `q`)
+    - `year` (возможные сокращения: `yyyy`, `yy`)
+
+- `startdate` — первая дата или дата со временем, которая вычитается из `enddate`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
+
+- `enddate` — вторая дата или дата со временем, из которой вычитается `startdate`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
+
+- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (необязательно). Если этот аргумент указан, то он применяется как для `startdate`, так и для `enddate`. Если этот аргумент не указан, то используются часовые пояса аргументов `startdate` и `enddate`. Если часовые пояса аргументов `startdate` и `enddate` не совпадают, то результат не определен. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+Разница между `enddate` и `startdate`, выраженная в `unit`.
+
+Тип: [Int](../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT age('hour', toDateTime('2018-01-01 22:30:00'), toDateTime('2018-01-02 23:00:00'));
+```
+
+Результат:
+
+``` text
+┌─age('hour', toDateTime('2018-01-01 22:30:00'), toDateTime('2018-01-02 23:00:00'))─┐
+│                                                                                 24 │
+└────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+Запрос:
+
+``` sql
+SELECT
+    toDate('2022-01-01') AS e,
+    toDate('2021-12-29') AS s,
+    age('day', s, e) AS day_age,
+    age('month', s, e) AS month_age,
+    age('year', s, e) AS year_age;
+```
+
+Результат:
+
+``` text
+┌──────────e─┬──────────s─┬─day_age─┬─month_age─┬─year_age─┐
+│ 2022-01-01 │ 2021-12-29 │       3 │         0 │        0 │
+└────────────┴────────────┴─────────┴───────────┴──────────┘
+```
+
+## date\_diff {#date_diff}
+
+Вычисляет количество пересечённых границ `unit` между `startdate` и `enddate`.
+
+**Синтаксис**
+
+``` sql
+date_diff('unit', startdate, enddate, [timezone])
+```
+
+Синонимы: `dateDiff`, `DATE_DIFF`.
+
+**Аргументы**
+
+- `unit` — единица измерения времени, в которой будет выражено возвращаемое значение функции. [String](../../sql-reference/data-types/string.md).
+    Возможные значения:
+
+    - `second` (возможные сокращения: `ss`, `s`)
+    - `minute` (возможные сокращения: `mi`, `n`)
+    - `hour` (возможные сокращения: `hh`, `h`)
+    - `day` (возможные сокращения: `dd`, `d`)
+    - `week` (возможные сокращения: `wk`, `ww`)
+    - `month` (возможные сокращения: `mm`, `m`)
+    - `quarter` (возможные сокращения: `qq`, `q`)
+    - `year` (возможные сокращения: `yyyy`, `yy`)
+
+- `startdate` — первая дата или дата со временем, которая вычитается из `enddate`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
+ +- `enddate` — вторая дата или дата со временем, из которой вычитается `startdate`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md). + +- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (необязательно). Если этот аргумент указан, то он применяется как для `startdate`, так и для `enddate`. Если этот аргумент не указан, то используются часовые пояса аргументов `startdate` и `enddate`. Если часовые пояса аргументов `startdate` и `enddate` не совпадают, то результат не определен. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +Разница между `enddate` и `startdate`, выраженная в `unit`. + +Тип: [Int](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); +``` + +Результат: + +``` text +┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ +│ 25 │ +└────────────────────────────────────────────────────────────────────────────────────────┘ +``` + ## date_trunc {#date_trunc} Отсекает от даты и времени части, меньшие чем указанная часть. @@ -602,7 +728,7 @@ date_trunc(unit, value[, timezone]) - Дата и время, отсеченные до указанной части. -Тип: [Datetime](../../sql-reference/data-types/datetime.md). +Тип: [DateTime](../../sql-reference/data-types/datetime.md). **Примеры** @@ -689,60 +815,6 @@ SELECT date_add(YEAR, 3, toDate('2018-01-01')); └───────────────────────────────────────────────┘ ``` -## date\_diff {#date_diff} - -Вычисляет разницу между двумя значениями дат или дат со временем. - -**Синтаксис** - -``` sql -date_diff('unit', startdate, enddate, [timezone]) -``` - -Синонимы: `dateDiff`, `DATE_DIFF`. - -**Аргументы** - -- `unit` — единица измерения времени, в которой будет выражено возвращаемое значение функции. [String](../../sql-reference/data-types/string.md). - Возможные значения: - - - `second` - - `minute` - - `hour` - - `day` - - `week` - - `month` - - `quarter` - - `year` - -- `startdate` — первая дата или дата со временем, которая вычитается из `enddate`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md). - -- `enddate` — вторая дата или дата со временем, из которой вычитается `startdate`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md). - -- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (необязательно). Если этот аргумент указан, то он применяется как для `startdate`, так и для `enddate`. Если этот аргумент не указан, то используются часовые пояса аргументов `startdate` и `enddate`. Если часовые пояса аргументов `startdate` и `enddate` не совпадают, то результат не определен. [String](../../sql-reference/data-types/string.md). - -**Возвращаемое значение** - -Разница между `enddate` и `startdate`, выраженная в `unit`. - -Тип: [Int](../../sql-reference/data-types/int-uint.md). 
- 
-**Пример**
-
-Запрос:
-
-``` sql
-SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'));
-```
-
-Результат:
-
-``` text
-┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐
-│                                                                                      25 │
-└──────────────────────────────────────────────────────────────────────────────────────────┘
-```
-
## date\_sub {#date_sub}

Вычитает интервал времени или даты из указанной даты или даты со временем.

@@ -913,7 +985,7 @@ now([timezone])

- Текущие дата и время.

-Тип: [Datetime](../../sql-reference/data-types/datetime.md).
+Тип: [DateTime](../../sql-reference/data-types/datetime.md).

**Пример**

diff --git a/docs/ru/sql-reference/functions/ext-dict-functions.md b/docs/ru/sql-reference/functions/ext-dict-functions.md
index 9651ad52a76..e6cb878d1c7 100644
--- a/docs/ru/sql-reference/functions/ext-dict-functions.md
+++ b/docs/ru/sql-reference/functions/ext-dict-functions.md
@@ -151,7 +151,7 @@ LIMIT 3;

``` sql
SELECT
-    dictGet('ext-dict-mult', ('c1','c2'), number) AS val,
+    dictGet('ext-dict-mult', ('c1','c2'), number + 1) AS val,
    toTypeName(val) AS type
FROM system.numbers
LIMIT 3;

diff --git a/docs/ru/sql-reference/functions/url-functions.md b/docs/ru/sql-reference/functions/url-functions.md
index 34bb88f4991..3c6e6151ef8 100644
--- a/docs/ru/sql-reference/functions/url-functions.md
+++ b/docs/ru/sql-reference/functions/url-functions.md
@@ -404,5 +404,39 @@ SELECT netloc('http://paul@www.example.com:80/');

### cutURLParameter(URL, name) {#cuturlparameterurl-name}

-Удаляет параметр URL с именем name, если такой есть. Функция работает при допущении, что имя параметра закодировано в URL в точности таким же образом, что и в переданном аргументе.
+Удаляет параметр с именем `name` из URL, если такой есть. Функция не кодирует и не декодирует символы в именах параметров. Например, `Client ID` и `Client%20ID` обрабатываются как разные имена параметров.
+
+**Синтаксис**
+
+``` sql
+cutURLParameter(URL, name)
+```
+
+**Аргументы**
+
+- `url` — URL. [String](../../sql-reference/data-types/string.md).
+- `name` — имя параметра URL. [String](../../sql-reference/data-types/string.md) или [Array](../../sql-reference/data-types/array.md), состоящий из строк.
+
+**Возвращаемое значение**
+
+- URL с удалённым параметром `name`.
+
+Тип: `String`.
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT
+    cutURLParameter('http://bigmir.net/?a=b&c=d&e=f#g', 'a') as url_without_a,
+    cutURLParameter('http://bigmir.net/?a=b&c=d&e=f#g', ['c', 'e']) as url_without_c_and_e;
+```
+
+Результат:
+
+``` text
+┌─url_without_a────────────────┬─url_without_c_and_e──────┐
+│ http://bigmir.net/?c=d&e=f#g │ http://bigmir.net/?a=b#g │
+└──────────────────────────────┴──────────────────────────┘
+```

diff --git a/docs/ru/sql-reference/statements/alter/column.md b/docs/ru/sql-reference/statements/alter/column.md
index 11ec72596c4..a8ace213075 100644
--- a/docs/ru/sql-reference/statements/alter/column.md
+++ b/docs/ru/sql-reference/statements/alter/column.md
@@ -254,7 +254,7 @@ SELECT groupArray(x), groupArray(s) FROM tmp;

Отсутствует возможность удалять столбцы, входящие в первичный ключ или ключ для сэмплирования (в общем, входящие в выражение `ENGINE`). Изменение типа у столбцов, входящих в первичный ключ, возможно только в том случае, если это изменение не приводит к изменению данных (например, разрешено добавление значения в Enum или изменение типа с `DateTime` на `UInt32`).
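
Небольшой набросок, иллюстрирующий это ограничение (имена объектов условны, точный текст ошибки может отличаться):

```sql
CREATE TABLE t_alter_demo
(
    d DateTime,
    v String
)
ENGINE = MergeTree
ORDER BY d;

-- Разрешено: преобразование DateTime -> UInt32 не меняет хранимые данные ключа.
ALTER TABLE t_alter_demo MODIFY COLUMN d UInt32;

-- Не разрешено: столбец входит в первичный ключ, запрос завершится ошибкой.
ALTER TABLE t_alter_demo DROP COLUMN d;
```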
-Если возможностей запроса `ALTER` не хватает для нужного изменения таблицы, вы можете создать новую таблицу, скопировать туда данные с помощью запроса [INSERT SELECT](../insert-into.md#insert_query_insert-select), затем поменять таблицы местами с помощью запроса [RENAME](../rename.md#rename-table), и удалить старую таблицу. В качестве альтернативы для запроса `INSERT SELECT`, можно использовать инструмент [clickhouse-copier](../../../sql-reference/statements/alter/index.md).
+Если возможностей запроса `ALTER` не хватает для нужного изменения таблицы, вы можете создать новую таблицу, скопировать туда данные с помощью запроса [INSERT SELECT](../insert-into.md#inserting-the-results-of-select), затем поменять таблицы местами с помощью запроса [RENAME](../rename.md#rename-table) и удалить старую таблицу. В качестве альтернативы запросу `INSERT SELECT` можно использовать инструмент [clickhouse-copier](../../../sql-reference/statements/alter/index.md).

Запрос `ALTER` блокирует все чтения и записи для таблицы. То есть если на момент запроса `ALTER` выполнялся долгий `SELECT`, то запрос `ALTER` сначала дождётся его выполнения. И в это время все новые запросы к той же таблице будут ждать, пока завершится этот `ALTER`.

diff --git a/docs/ru/sql-reference/statements/create/settings-profile.md b/docs/ru/sql-reference/statements/create/settings-profile.md
index 9aa77e4c241..d37b975e096 100644
--- a/docs/ru/sql-reference/statements/create/settings-profile.md
+++ b/docs/ru/sql-reference/statements/create/settings-profile.md
@@ -11,7 +11,7 @@ sidebar_label: "Профиль настроек"

Синтаксис:

``` sql
-CREATE SETTINGS PROFILE [IF NOT EXISTS | OR REPLACE] TO name1 [ON CLUSTER cluster_name1]
+CREATE SETTINGS PROFILE [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1]
    [, name2 [ON CLUSTER cluster_name2] ...]
    [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [CONST|READONLY|WRITABLE|CHANGEABLE_IN_READONLY] | INHERIT 'profile_name'] [,...]
```

@@ -26,4 +26,4 @@ CREATE SETTINGS PROFILE [IF NOT EXISTS | OR REPLACE] TO name1 [ON CLUSTER cluste
CREATE SETTINGS PROFILE max_memory_usage_profile SETTINGS max_memory_usage = 100000001 MIN 90000000 MAX 110000000 TO robin
```

- 
\ No newline at end of file
+ 

diff --git a/docs/ru/sql-reference/statements/insert-into.md b/docs/ru/sql-reference/statements/insert-into.md
index 573b8d39926..4fa6ac4ce66 100644
--- a/docs/ru/sql-reference/statements/insert-into.md
+++ b/docs/ru/sql-reference/statements/insert-into.md
@@ -95,7 +95,7 @@ INSERT INTO t FORMAT TabSeparated

Если в таблице объявлены [ограничения](../../sql-reference/statements/create/table.md#constraints), то их выполнимость будет проверена для каждой вставляемой строки. Если хотя бы для одной строки ограничения не будут выполнены, запрос будет остановлен.
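
Схематичный пример проверки ограничений при вставке (имена таблицы и ограничения условны):

```sql
CREATE TABLE t_constraints
(
    x UInt32,
    CONSTRAINT x_positive CHECK x > 0
)
ENGINE = MergeTree
ORDER BY x;

-- Строка удовлетворяет ограничению — вставка выполнится.
INSERT INTO t_constraints VALUES (1);

-- Ограничение нарушено — запрос будет остановлен с ошибкой.
INSERT INTO t_constraints VALUES (0);
```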
-### Вставка результатов `SELECT` {#insert_query_insert-select}
+### Вставка результатов `SELECT` {#inserting-the-results-of-select}

**Синтаксис**

diff --git a/docs/ru/sql-reference/statements/select/index.md b/docs/ru/sql-reference/statements/select/index.md
index 4479e24000b..f360a09eb10 100644
--- a/docs/ru/sql-reference/statements/select/index.md
+++ b/docs/ru/sql-reference/statements/select/index.md
@@ -270,7 +270,7 @@ SELECT * REPLACE(i + 1 AS i) EXCEPT (j) APPLY(sum) from columns_transformers;
└─────────────────┴────────┘
```

-## SETTINGS в запросе SELECT {#settings-in-select}
+## SETTINGS в запросе SELECT {#settings-in-select-query}

Вы можете задать значения необходимых настроек непосредственно в запросе `SELECT` в секции `SETTINGS`. Эти настройки действуют только в рамках данного запроса, а после его выполнения сбрасываются до предыдущего значения или значения по умолчанию.

diff --git a/docs/ru/sql-reference/table-functions/format.md b/docs/ru/sql-reference/table-functions/format.md
new file mode 100644
index 00000000000..7b1516bc173
--- /dev/null
+++ b/docs/ru/sql-reference/table-functions/format.md
@@ -0,0 +1,75 @@
+---
+slug: /ru/sql-reference/table-functions/format
+sidebar_position: 56
+sidebar_label: format
+---
+
+# format
+
+Extracts a table structure from data and parses the data according to the specified input format.
+
+**Syntax**
+
+``` sql
+format(format_name, data)
+```
+
+**Parameters**
+
+- `format_name` — The [format](../../interfaces/formats.md#formats) of the data.
+- `data` — String literal or constant expression that returns a string containing data in the specified format.
+
+**Returned value**
+
+A table with data parsed from the `data` argument according to the specified format and the extracted schema.
+
+**Examples**
+
+**Query:**
+``` sql
+:) select * from format(JSONEachRow,
+$$
+{"a": "Hello", "b": 111}
+{"a": "World", "b": 123}
+{"a": "Hello", "b": 112}
+{"a": "World", "b": 124}
+$$)
+```
+
+**Result:**
+
+```text
+┌───b─┬─a─────┐
+│ 111 │ Hello │
+│ 123 │ World │
+│ 112 │ Hello │
+│ 124 │ World │
+└─────┴───────┘
+```
+
+**Query:**
+```sql
+
+:) desc format(JSONEachRow,
+$$
+{"a": "Hello", "b": 111}
+{"a": "World", "b": 123}
+{"a": "Hello", "b": 112}
+{"a": "World", "b": 124}
+$$)
+```
+
+**Result:**
+
+```text
+┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ b    │ Nullable(Float64) │              │                    │         │                  │                │
+│ a    │ Nullable(String)  │              │                    │         │                  │                │
+└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+```
+
+**See Also**
+
+- [Formats](../../interfaces/formats.md)
+
+[Original article](https://clickhouse.com/docs/en/sql-reference/table-functions/format)

diff --git a/docs/tools/release.sh b/docs/tools/release.sh
index 1d344457bf1..67499631baa 100755
--- a/docs/tools/release.sh
+++ b/docs/tools/release.sh
@@ -19,7 +19,7 @@ then
    # Will make a repository with website content as the only commit.
    git init
    git remote add origin "${GIT_PROD_URI}"
-    git config user.email "robot-clickhouse@clickhouse.com"
+    git config user.email "robot-clickhouse@users.noreply.github.com"
    git config user.name "robot-clickhouse"

    # Add files.
diff --git a/docs/zh/engines/table-engines/integrations/kafka.md b/docs/zh/engines/table-engines/integrations/kafka.md index 707ee962ace..c6f11d9efce 100644 --- a/docs/zh/engines/table-engines/integrations/kafka.md +++ b/docs/zh/engines/table-engines/integrations/kafka.md @@ -74,7 +74,7 @@ Kafka 特性: 消费的消息会被自动追踪,因此每个消息在不同的消费组里只会记录一次。如果希望获得两次数据,则使用另一个组名创建副本。 -消费组可以灵活配置并且在集群之间同步。例如,如果群集中有10个主题和5个表副本,则每个副本将获得2个主题。 如果副本数量发生变化,主题将自动在副本中重新分配。了解更多信息请访问 http://kafka.apache.org/intro。 +消费组可以灵活配置并且在集群之间同步。例如,如果群集中有10个主题和5个表副本,则每个副本将获得2个主题。 如果副本数量发生变化,主题将自动在副本中重新分配。了解更多信息请访问 [http://kafka.apache.org/intro](http://kafka.apache.org/intro)。 `SELECT` 查询对于读取消息并不是很有用(调试除外),因为每条消息只能被读取一次。使用物化视图创建实时线程更实用。您可以这样做: diff --git a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md index 13b4c368a96..e773a02fbc3 100644 --- a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md +++ b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md @@ -164,7 +164,7 @@ SETTINGS index_granularity = 8192, index_granularity_bytes = 0;
  • index_granularity: 显式设置为其默认值8192。这意味着对于每一组8192行,主索引将有一个索引条目,例如,如果表包含16384行,那么索引将有两个索引条目。

  •
-  • index_granularity_bytes: 设置为0表示禁止字适应索引粒度。自适应索引粒度意味着ClickHouse自动为一组n行创建一个索引条目
+  • index_granularity_bytes: 设置为0表示禁止自适应索引粒度。自适应索引粒度意味着ClickHouse自动为一组n行创建一个索引条目
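
作为补充,下面给出一个简要示意(表名 `hits_example` 为假设的示例名),展示如何通过 `system.parts` 的 `marks` 列大致验证主索引条目的数量:

```sql
CREATE TABLE hits_example
(
    UserID UInt32,
    URL String
)
ENGINE = MergeTree
ORDER BY (UserID, URL)
SETTINGS index_granularity = 8192, index_granularity_bytes = 0; -- 禁用自适应索引粒度

INSERT INTO hits_example
SELECT number, concat('https://example.com/', toString(number))
FROM numbers(16384);

-- 每 8192 行对应一个索引条目:16384 行大约对应 2 个标记(mark)
SELECT table, marks
FROM system.parts
WHERE table = 'hits_example' AND active;
```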