From ee772aaf0f7a05fafb3fa286419148211362b0d6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 27 Feb 2024 23:17:11 +0100 Subject: [PATCH 001/289] Use clang-18 in CI --- .github/workflows/nightly.yml | 4 +- docker/packager/README.md | 8 ++-- docker/packager/packager | 24 +++++------ docker/test/fuzzer/run-fuzzer.sh | 2 +- docker/test/keeper-jepsen/run.sh | 2 +- docker/test/server-jepsen/run.sh | 2 +- docker/test/sqltest/run.sh | 2 +- docker/test/util/Dockerfile | 2 +- docs/en/development/build-cross-osx.md | 6 +-- docs/en/development/build-cross-riscv.md | 2 +- docs/en/development/build.md | 6 +-- docs/en/development/continuous-integration.md | 4 +- docs/en/development/developer-instruction.md | 2 +- tests/ci/ci_config.py | 42 +++++++++---------- 14 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 93ac2be19b4..4f7915acd28 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -51,8 +51,8 @@ jobs: SONAR_SCANNER_VERSION: 4.8.0.2856 SONAR_SERVER_URL: "https://sonarcloud.io" BUILD_WRAPPER_OUT_DIR: build_wrapper_output_directory # Directory where build-wrapper output will be placed - CC: clang-17 - CXX: clang++-17 + CC: clang-18 + CXX: clang++-18 steps: - name: Check out repository code uses: ClickHouse/checkout@v1 diff --git a/docker/packager/README.md b/docker/packager/README.md index e0b7f38ea58..efc79f2e131 100644 --- a/docker/packager/README.md +++ b/docker/packager/README.md @@ -3,10 +3,10 @@ compilers and build settings. Correctly configured Docker daemon is single depen Usage: -Build deb package with `clang-17` in `debug` mode: +Build deb package with `clang-18` in `debug` mode: ``` $ mkdir deb/test_output -$ ./packager --output-dir deb/test_output/ --package-type deb --compiler=clang-17 --debug-build +$ ./packager --output-dir deb/test_output/ --package-type deb --compiler=clang-18 --debug-build $ ls -l deb/test_output -rw-r--r-- 1 root root 3730 clickhouse-client_22.2.2+debug_all.deb -rw-r--r-- 1 root root 84221888 clickhouse-common-static_22.2.2+debug_amd64.deb @@ -17,11 +17,11 @@ $ ls -l deb/test_output ``` -Build ClickHouse binary with `clang-17` and `address` sanitizer in `relwithdebuginfo` +Build ClickHouse binary with `clang-18` and `address` sanitizer in `relwithdebuginfo` mode: ``` $ mkdir $HOME/some_clickhouse -$ ./packager --output-dir=$HOME/some_clickhouse --package-type binary --compiler=clang-17 --sanitizer=address +$ ./packager --output-dir=$HOME/some_clickhouse --package-type binary --compiler=clang-18 --sanitizer=address $ ls -l $HOME/some_clickhouse -rwxr-xr-x 1 root root 787061952 clickhouse lrwxrwxrwx 1 root root 10 clickhouse-benchmark -> clickhouse diff --git a/docker/packager/packager b/docker/packager/packager index ca0ae8358f3..43391af8660 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -403,19 +403,19 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--compiler", choices=( - "clang-17", - "clang-17-darwin", - "clang-17-darwin-aarch64", - "clang-17-aarch64", - "clang-17-aarch64-v80compat", - "clang-17-ppc64le", - "clang-17-riscv64", - "clang-17-s390x", - "clang-17-amd64-compat", - "clang-17-amd64-musl", - "clang-17-freebsd", + "clang-18", + "clang-18-darwin", + "clang-18-darwin-aarch64", + "clang-18-aarch64", + "clang-18-aarch64-v80compat", + "clang-18-ppc64le", + "clang-18-riscv64", + "clang-18-s390x", + "clang-18-amd64-compat", + "clang-18-amd64-musl", + "clang-18-freebsd", ), - default="clang-17", + 
default="clang-18", help="a compiler to use", ) parser.add_argument( diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index b4376fe2409..9e950668b00 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -17,7 +17,7 @@ stage=${stage:-} script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" echo "$script_dir" repo_dir=ch -BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-17_debug_none_unsplitted_disable_False_binary"} +BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-18_debug_none_unsplitted_disable_False_binary"} BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"} function git_clone_with_retry diff --git a/docker/test/keeper-jepsen/run.sh b/docker/test/keeper-jepsen/run.sh index 576a0f0ef8e..444f3cd0de7 100644 --- a/docker/test/keeper-jepsen/run.sh +++ b/docker/test/keeper-jepsen/run.sh @@ -2,7 +2,7 @@ set -euo pipefail -CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-17_relwithdebuginfo_none_unsplitted_disable_False_binary/clickhouse"} +CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-18_relwithdebuginfo_none_unsplitted_disable_False_binary/clickhouse"} CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""} diff --git a/docker/test/server-jepsen/run.sh b/docker/test/server-jepsen/run.sh index 81e442e65b6..6ea9c03b954 100644 --- a/docker/test/server-jepsen/run.sh +++ b/docker/test/server-jepsen/run.sh @@ -2,7 +2,7 @@ set -euo pipefail -CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-17_relwithdebuginfo_none_unsplitted_disable_False_binary/clickhouse"} +CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-18_relwithdebuginfo_none_unsplitted_disable_False_binary/clickhouse"} CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""} diff --git a/docker/test/sqltest/run.sh b/docker/test/sqltest/run.sh index 1d939805c7b..7edc1341d7d 100755 --- a/docker/test/sqltest/run.sh +++ b/docker/test/sqltest/run.sh @@ -6,7 +6,7 @@ set -e set -u set -o pipefail -BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-17_debug_none_unsplitted_disable_False_binary"} +BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-18_debug_none_unsplitted_disable_False_binary"} BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"} function wget_with_retry diff --git a/docker/test/util/Dockerfile b/docker/test/util/Dockerfile index 396d5801be9..a5057f024b8 100644 --- a/docker/test/util/Dockerfile +++ b/docker/test/util/Dockerfile @@ -5,7 +5,7 @@ FROM ubuntu:22.04 ARG apt_archive="http://archive.ubuntu.com" RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=17 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=18 RUN apt-get update \ && apt-get install \ diff --git a/docs/en/development/build-cross-osx.md b/docs/en/development/build-cross-osx.md index eddf24448c1..66c6e2c6912 100644 --- a/docs/en/development/build-cross-osx.md +++ b/docs/en/development/build-cross-osx.md @@ -13,14 +13,14 @@ The 
cross-build for macOS is based on the [Build instructions](../development/bu The following sections provide a walk-through for building ClickHouse for `x86_64` macOS. If you’re targeting ARM architecture, simply substitute all occurrences of `x86_64` with `aarch64`. For example, replace `x86_64-apple-darwin` with `aarch64-apple-darwin` throughout the steps. -## Install Clang-17 +## Install clang-18 Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup. For example the commands for Bionic are like: ``` bash sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-17 main" >> /etc/apt/sources.list -sudo apt-get install clang-17 +sudo apt-get install clang-18 ``` ## Install Cross-Compilation Toolset {#install-cross-compilation-toolset} @@ -59,7 +59,7 @@ curl -L 'https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacOSX11 cd ClickHouse mkdir build-darwin cd build-darwin -CC=clang-17 CXX=clang++-17 cmake -DCMAKE_AR:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ar -DCMAKE_INSTALL_NAME_TOOL=${CCTOOLS}/bin/x86_64-apple-darwin-install_name_tool -DCMAKE_RANLIB:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ranlib -DLINKER_NAME=${CCTOOLS}/bin/x86_64-apple-darwin-ld -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake .. +CC=clang-18 CXX=clang++-18 cmake -DCMAKE_AR:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ar -DCMAKE_INSTALL_NAME_TOOL=${CCTOOLS}/bin/x86_64-apple-darwin-install_name_tool -DCMAKE_RANLIB:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ranlib -DLINKER_NAME=${CCTOOLS}/bin/x86_64-apple-darwin-ld -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake .. ninja ``` diff --git a/docs/en/development/build-cross-riscv.md b/docs/en/development/build-cross-riscv.md index 9ee5346f258..759d97823e2 100644 --- a/docs/en/development/build-cross-riscv.md +++ b/docs/en/development/build-cross-riscv.md @@ -23,7 +23,7 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ``` bash cd ClickHouse mkdir build-riscv64 -CC=clang-17 CXX=clang++-17 cmake . -Bbuild-riscv64 -G Ninja -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-riscv64.cmake -DGLIBC_COMPATIBILITY=OFF -DENABLE_LDAP=OFF -DOPENSSL_NO_ASM=ON -DENABLE_JEMALLOC=ON -DENABLE_PARQUET=OFF -DENABLE_GRPC=OFF -DENABLE_HDFS=OFF -DENABLE_MYSQL=OFF +CC=clang-18 CXX=clang++-18 cmake . -Bbuild-riscv64 -G Ninja -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-riscv64.cmake -DGLIBC_COMPATIBILITY=OFF -DENABLE_LDAP=OFF -DOPENSSL_NO_ASM=ON -DENABLE_JEMALLOC=ON -DENABLE_PARQUET=OFF -DENABLE_GRPC=OFF -DENABLE_HDFS=OFF -DENABLE_MYSQL=OFF ninja -C build-riscv64 ``` diff --git a/docs/en/development/build.md b/docs/en/development/build.md index b474c445604..e4d0fb146ef 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -67,8 +67,8 @@ to see what version you have installed before setting this environment variable. 
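A quick way to act on that tip, sketched under the assumption that the apt.llvm.org packages are already installed and on `PATH` (these commands are an illustration, not part of the patch):

``` bash
clang-18 --version     # should print "clang version 18.x"
clang++-18 --version
```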
::: ``` bash -export CC=clang-17 -export CXX=clang++-17 +export CC=clang-18 +export CXX=clang++-18 ``` ### Checkout ClickHouse Sources {#checkout-clickhouse-sources} @@ -105,7 +105,7 @@ The build requires the following components: - Git (used to checkout the sources, not needed for the build) - CMake 3.20 or newer -- Compiler: clang-17 or newer +- Compiler: clang-18 or newer - Linker: lld-17 or newer - Ninja - Yasm diff --git a/docs/en/development/continuous-integration.md b/docs/en/development/continuous-integration.md index 46a30f56f11..91253ca5e44 100644 --- a/docs/en/development/continuous-integration.md +++ b/docs/en/development/continuous-integration.md @@ -153,7 +153,7 @@ Builds ClickHouse in various configurations for use in further steps. You have t ### Report Details -- **Compiler**: `clang-17`, optionally with the name of a target platform +- **Compiler**: `clang-18`, optionally with the name of a target platform - **Build type**: `Debug` or `RelWithDebInfo` (cmake). - **Sanitizer**: `none` (without sanitizers), `address` (ASan), `memory` (MSan), `undefined` (UBSan), or `thread` (TSan). - **Status**: `success` or `fail` @@ -177,7 +177,7 @@ Performs static analysis and code style checks using `clang-tidy`. The report is There is a convenience `packager` script that runs the clang-tidy build in docker ```sh mkdir build_tidy -./docker/packager/packager --output-dir=./build_tidy --package-type=binary --compiler=clang-17 --debug-build --clang-tidy +./docker/packager/packager --output-dir=./build_tidy --package-type=binary --compiler=clang-18 --debug-build --clang-tidy ``` diff --git a/docs/en/development/developer-instruction.md b/docs/en/development/developer-instruction.md index e08096d8042..44a5af5911a 100644 --- a/docs/en/development/developer-instruction.md +++ b/docs/en/development/developer-instruction.md @@ -115,7 +115,7 @@ While inside the `build` directory, configure your build by running CMake. Befor export CC=clang CXX=clang++ cmake .. -If you installed clang using the automatic installation script above, also specify the version of clang installed in the first command, e.g. `export CC=clang-17 CXX=clang++-17`. The clang version will be in the script output. +If you installed clang using the automatic installation script above, also specify the version of clang installed in the first command, e.g. `export CC=clang-18 CXX=clang++-18`. The clang version will be in the script output. The `CC` variable specifies the compiler for C (short for C Compiler), and `CXX` variable instructs which C++ compiler is to be used for building. 
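A toolchain bump like this one is a mechanical sweep across workflows, docker scripts, docs and CI config, so it is worth confirming that no reference was left behind. A minimal sketch of such a check, assuming a POSIX shell at the repository root; the path list is illustrative, and hits on `clang-tidy-17` are expected because the clang-tidy lookup keeps older versions as fallbacks:

``` bash
grep -rn 'clang-17' .github/ docker/ docs/ tests/ci/ cmake/
```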
diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 80994f71280..4757341ce7c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -670,63 +670,63 @@ CI_CONFIG = CIConfig( build_config={ Build.PACKAGE_RELEASE: BuildConfig( name=Build.PACKAGE_RELEASE, - compiler="clang-17", + compiler="clang-18", package_type="deb", static_binary_name="amd64", additional_pkgs=True, ), Build.PACKAGE_AARCH64: BuildConfig( name=Build.PACKAGE_AARCH64, - compiler="clang-17-aarch64", + compiler="clang-18-aarch64", package_type="deb", static_binary_name="aarch64", additional_pkgs=True, ), Build.PACKAGE_ASAN: BuildConfig( name=Build.PACKAGE_ASAN, - compiler="clang-17", + compiler="clang-18", sanitizer="address", package_type="deb", ), Build.PACKAGE_UBSAN: BuildConfig( name=Build.PACKAGE_UBSAN, - compiler="clang-17", + compiler="clang-18", sanitizer="undefined", package_type="deb", ), Build.PACKAGE_TSAN: BuildConfig( name=Build.PACKAGE_TSAN, - compiler="clang-17", + compiler="clang-18", sanitizer="thread", package_type="deb", ), Build.PACKAGE_MSAN: BuildConfig( name=Build.PACKAGE_MSAN, - compiler="clang-17", + compiler="clang-18", sanitizer="memory", package_type="deb", ), Build.PACKAGE_DEBUG: BuildConfig( name=Build.PACKAGE_DEBUG, - compiler="clang-17", + compiler="clang-18", debug_build=True, package_type="deb", sparse_checkout=True, # Check that it works with at least one build, see also update-submodules.sh ), Build.PACKAGE_RELEASE_COVERAGE: BuildConfig( name=Build.PACKAGE_RELEASE_COVERAGE, - compiler="clang-17", + compiler="clang-18", coverage=True, package_type="deb", ), Build.BINARY_RELEASE: BuildConfig( name=Build.BINARY_RELEASE, - compiler="clang-17", + compiler="clang-18", package_type="binary", ), Build.BINARY_TIDY: BuildConfig( name=Build.BINARY_TIDY, - compiler="clang-17", + compiler="clang-18", debug_build=True, package_type="binary", static_binary_name="debug-amd64", @@ -735,69 +735,69 @@ CI_CONFIG = CIConfig( ), Build.BINARY_DARWIN: BuildConfig( name=Build.BINARY_DARWIN, - compiler="clang-17-darwin", + compiler="clang-18-darwin", package_type="binary", static_binary_name="macos", ), Build.BINARY_AARCH64: BuildConfig( name=Build.BINARY_AARCH64, - compiler="clang-17-aarch64", + compiler="clang-18-aarch64", package_type="binary", ), Build.BINARY_AARCH64_V80COMPAT: BuildConfig( name=Build.BINARY_AARCH64_V80COMPAT, - compiler="clang-17-aarch64-v80compat", + compiler="clang-18-aarch64-v80compat", package_type="binary", static_binary_name="aarch64v80compat", comment="For ARMv8.1 and older", ), Build.BINARY_FREEBSD: BuildConfig( name=Build.BINARY_FREEBSD, - compiler="clang-17-freebsd", + compiler="clang-18-freebsd", package_type="binary", static_binary_name="freebsd", ), Build.BINARY_DARWIN_AARCH64: BuildConfig( name=Build.BINARY_DARWIN_AARCH64, - compiler="clang-17-darwin-aarch64", + compiler="clang-18-darwin-aarch64", package_type="binary", static_binary_name="macos-aarch64", ), Build.BINARY_PPC64LE: BuildConfig( name=Build.BINARY_PPC64LE, - compiler="clang-17-ppc64le", + compiler="clang-18-ppc64le", package_type="binary", static_binary_name="powerpc64le", ), Build.BINARY_AMD64_COMPAT: BuildConfig( name=Build.BINARY_AMD64_COMPAT, - compiler="clang-17-amd64-compat", + compiler="clang-18-amd64-compat", package_type="binary", static_binary_name="amd64compat", comment="SSE2-only build", ), Build.BINARY_AMD64_MUSL: BuildConfig( name=Build.BINARY_AMD64_MUSL, - compiler="clang-17-amd64-musl", + compiler="clang-18-amd64-musl", package_type="binary", static_binary_name="amd64musl", 
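            # Aside, not original config text: the musl variant links statically
            # against musl libc, so the binary also runs on glibc-free distributions.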
comment="Build with Musl", ), Build.BINARY_RISCV64: BuildConfig( name=Build.BINARY_RISCV64, - compiler="clang-17-riscv64", + compiler="clang-18-riscv64", package_type="binary", static_binary_name="riscv64", ), Build.BINARY_S390X: BuildConfig( name=Build.BINARY_S390X, - compiler="clang-17-s390x", + compiler="clang-18-s390x", package_type="binary", static_binary_name="s390x", ), Build.FUZZERS: BuildConfig( name=Build.FUZZERS, - compiler="clang-17", + compiler="clang-18", package_type="fuzzers", job_config=JobConfig(run_by_label=Labels.libFuzzer), ), From aaec92f860c92f403c2f79fb9f724102487ed903 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 23:02:55 +0100 Subject: [PATCH 002/289] Address review comments --- cmake/tools.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 1ba3007b0f3..789dc76e6fa 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -13,7 +13,7 @@ execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE COMPILER message (STATUS "Using compiler:\n${COMPILER_SELF_IDENTIFICATION}") # Require minimum compiler versions -set (CLANG_MINIMUM_VERSION 16) +set (CLANG_MINIMUM_VERSION 17) set (XCODE_MINIMUM_VERSION 12.0) set (APPLE_CLANG_MINIMUM_VERSION 12.0.0) From 8d25f27b1901d4d4999558b987a415c4c289e2ba Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 23:07:05 +0100 Subject: [PATCH 003/289] Fix UBSan report --- src/Compression/CompressionCodecDoubleDelta.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index 99089ed6770..fe18578eb2a 100644 --- a/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -343,7 +343,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) const auto sign = signed_dd < 0; // -1 shrinks dd down to fit into number of bits, and there can't be 0, so it is OK. - const auto abs_value = static_cast(std::abs(signed_dd) - 1); + const auto abs_value = (sign ? -static_cast(signed_dd) : static_cast(signed_dd)) - 1; const auto write_spec = getDeltaWriteSpec(signed_dd); writer.writeBits(write_spec.prefix_bits, write_spec.prefix); From b3a43346b110ecbdbbc9d8c025fee09b4b35e2b6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 06:59:42 +0100 Subject: [PATCH 004/289] Revert "Fix UBSan report" This reverts commit 8d25f27b1901d4d4999558b987a415c4c289e2ba. --- src/Compression/CompressionCodecDoubleDelta.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index fe18578eb2a..99089ed6770 100644 --- a/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -343,7 +343,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) const auto sign = signed_dd < 0; // -1 shrinks dd down to fit into number of bits, and there can't be 0, so it is OK. - const auto abs_value = (sign ? 
-static_cast(signed_dd) : static_cast(signed_dd)) - 1; + const auto abs_value = static_cast(std::abs(signed_dd) - 1); const auto write_spec = getDeltaWriteSpec(signed_dd); writer.writeBits(write_spec.prefix_bits, write_spec.prefix); From 4e8a363b34380a914a326cadff5f9f796837eb70 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 07:32:47 +0100 Subject: [PATCH 005/289] Fix something entirely wrong in DoubleDelta --- src/Compression/CompressionCodecDoubleDelta.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index 99089ed6770..e6e8db4c699 100644 --- a/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -343,7 +343,10 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) const auto sign = signed_dd < 0; // -1 shrinks dd down to fit into number of bits, and there can't be 0, so it is OK. - const auto abs_value = static_cast(std::abs(signed_dd) - 1); + const auto abs_value = + signed_dd == std::numeric_limits::min() + ? (static_cast(-1) >> 1) + : static_cast(std::abs(signed_dd) - 1); const auto write_spec = getDeltaWriteSpec(signed_dd); writer.writeBits(write_spec.prefix_bits, write_spec.prefix); From 9bb32dc78fa7872cb965d087e64050c091688777 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 09:22:14 +0100 Subject: [PATCH 006/289] Loosen --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b55e9810361..9ffb4789dc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,8 +61,8 @@ if (ENABLE_CHECK_HEAVY_BUILDS) # set CPU time limit to 1000 seconds set (RLIMIT_CPU 1000) - # -fsanitize=memory is too heavy - if (SANITIZE STREQUAL "memory") + # -fsanitize=memory and address are too heavy + if (SANITIZE) set (RLIMIT_DATA 10000000000) # 10G endif() From 2ff1368678504b3100e7c0f4dc8496e4bfdc9539 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 Mar 2024 09:37:17 +0100 Subject: [PATCH 007/289] Fix FreeBSD --- src/Common/waitForPid.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/waitForPid.cpp b/src/Common/waitForPid.cpp index 0ec10811354..894406f69b5 100644 --- a/src/Common/waitForPid.cpp +++ b/src/Common/waitForPid.cpp @@ -144,7 +144,7 @@ static PollPidResult pollPid(pid_t pid, int timeout_in_ms) return PollPidResult::FAILED; } - struct kevent event = {.ident = 0}; + struct kevent event{}; struct timespec remaining_timespec = {.tv_sec = timeout_in_ms / 1000, .tv_nsec = (timeout_in_ms % 1000) * 1000000}; int ready = HANDLE_EINTR(kevent(kq, nullptr, 0, &event, 1, &remaining_timespec)); PollPidResult result = ready < 0 ? 
PollPidResult::FAILED : PollPidResult::RESTART; From 480ae2cd4982fda4fa3f24e494faa8b1848eff59 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 Mar 2024 09:39:03 +0100 Subject: [PATCH 008/289] Fix clang-tidy --- cmake/clang_tidy.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/clang_tidy.cmake b/cmake/clang_tidy.cmake index 4323c20463a..4c9331f6283 100644 --- a/cmake/clang_tidy.cmake +++ b/cmake/clang_tidy.cmake @@ -5,14 +5,14 @@ if (ENABLE_CLANG_TIDY) find_program (CLANG_TIDY_CACHE_PATH NAMES "clang-tidy-cache") if (CLANG_TIDY_CACHE_PATH) - find_program (_CLANG_TIDY_PATH NAMES "clang-tidy-17" "clang-tidy-16" "clang-tidy") + find_program (_CLANG_TIDY_PATH NAMES "clang-tidy-18" "clang-tidy-17" "clang-tidy-16" "clang-tidy") # Why do we use ';' here? # It's a cmake black magic: https://cmake.org/cmake/help/latest/prop_tgt/LANG_CLANG_TIDY.html#prop_tgt:%3CLANG%3E_CLANG_TIDY # The CLANG_TIDY_PATH is passed to CMAKE_CXX_CLANG_TIDY, which follows CXX_CLANG_TIDY syntax. set (CLANG_TIDY_PATH "${CLANG_TIDY_CACHE_PATH};${_CLANG_TIDY_PATH}" CACHE STRING "A combined command to run clang-tidy with caching wrapper") else () - find_program (CLANG_TIDY_PATH NAMES "clang-tidy-17" "clang-tidy-16" "clang-tidy") + find_program (CLANG_TIDY_PATH NAMES "clang-tidy-18" "clang-tidy-17" "clang-tidy-16" "clang-tidy") endif () if (CLANG_TIDY_PATH) From 6a41dbca0bbaae258ee034e271293479a793df9b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 Mar 2024 13:08:01 +0100 Subject: [PATCH 009/289] Fix Apple's Macintosh OS X 'darwin' --- src/Common/waitForPid.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/waitForPid.cpp b/src/Common/waitForPid.cpp index 894406f69b5..5fb2c62415b 100644 --- a/src/Common/waitForPid.cpp +++ b/src/Common/waitForPid.cpp @@ -132,7 +132,7 @@ static PollPidResult pollPid(pid_t pid, int timeout_in_ms) if (kq == -1) return PollPidResult::FAILED; - struct kevent change = {.ident = 0}; + struct kevent change{}; EV_SET(&change, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL); int event_add_result = HANDLE_EINTR(kevent(kq, &change, 1, NULL, 0, NULL)); From d03ae0655abaef54117f6455063009aa7aed790b Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 20 Mar 2024 16:39:37 +0800 Subject: [PATCH 010/289] finish opt utf8 version of substring/substringindex/reverse --- src/Common/UTF8Helpers.cpp | 75 +++++++++++++++++++++++++++++++- src/Common/UTF8Helpers.h | 5 ++- src/Functions/reverse.cpp | 38 +--------------- src/Functions/reverse.h | 42 ++++++++++++++++++ src/Functions/reverseUTF8.cpp | 12 ++++- src/Functions/substring.cpp | 18 +++++++- src/Functions/substringIndex.cpp | 20 +++++++-- 7 files changed, 164 insertions(+), 46 deletions(-) create mode 100644 src/Functions/reverse.h diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index 0af31726f40..78d645994e5 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -1,9 +1,13 @@ -#include #include +#include +#include #include #include +#if USE_MULTITARGET_CODE +#include +#endif namespace DB { @@ -205,5 +209,74 @@ size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, s return computeWidthImpl(data, size, prefix, limit); } + +DECLARE_DEFAULT_CODE( +bool isAllASCII(const UInt8 * data, size_t size) +{ + UInt8 mask = 0; + for (size_t i = 0; i < size; ++i) + mask |= data[i]; + + return !(mask & 0x80); +}) + +DECLARE_SSE42_SPECIFIC_CODE( +/// Copy from 
https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h +bool isAllASCII(const UInt8 * data, size_t size) { + size_t i = 0; + __m128i masks = _mm_setzero_si128(); + if (size >= 16) + { + for (; i <= size - 16; i += 16) + { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(data + i)); + masks = _mm_or_si128(masks, bytes); + } + } + int mask = _mm_movemask_epi8(masks); + + UInt8 tail_mask = 0; + for (; i < size; i++) + tail_mask |= data[i]; + + mask |= (tail_mask & 0x80); + return !mask; +}) + +DECLARE_AVX2_SPECIFIC_CODE( +bool isAllASCII(const UInt8 * data, size_t size) +{ + size_t i = 0; + __m256i masks = _mm256_setzero_si256(); + if (size >= 32) + { + for (; i <= size - 32; i += 32) + { + __m256i bytes = _mm256_loadu_si256(reinterpret_cast(data + i)); + masks = _mm256_or_si256(masks, bytes); + } + } + int mask = _mm256_movemask_epi8(masks); + + UInt8 tail_mask = 0; + for (; i < size; i++) + tail_mask |= data[i]; + + mask |= (tail_mask & 0x80); + return !mask; +}) + +bool isAllASCII(const UInt8* data, size_t size) +{ +#if USE_MULTITARGET_CODE + if (isArchSupported(TargetArch::AVX2)) + return TargetSpecific::AVX2::isAllASCII(data, size); + if (isArchSupported(TargetArch::SSE42)) + return TargetSpecific::SSE42::isAllASCII(data, size); +#endif + return TargetSpecific::Default::isAllASCII(data, size); +} + + } } diff --git a/src/Common/UTF8Helpers.h b/src/Common/UTF8Helpers.h index a4dd88921b7..933b62c7b63 100644 --- a/src/Common/UTF8Helpers.h +++ b/src/Common/UTF8Helpers.h @@ -136,7 +136,10 @@ size_t computeWidth(const UInt8 * data, size_t size, size_t prefix = 0) noexcept */ size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept; -} +/// If all the characters in the string are ASCII, return true. +bool isAllASCII(const UInt8* data, size_t size); + +} } diff --git a/src/Functions/reverse.cpp b/src/Functions/reverse.cpp index 32b998523c7..39608b77997 100644 --- a/src/Functions/reverse.cpp +++ b/src/Functions/reverse.cpp @@ -1,10 +1,10 @@ #include -#include #include #include #include #include #include +#include "reverse.h" namespace DB @@ -17,42 +17,6 @@ namespace ErrorCodes namespace { - -/** Reverse the string as a sequence of bytes. - */ -struct ReverseImpl -{ - static void vector(const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, - ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.resize(data.size()); - res_offsets.assign(offsets); - size_t size = offsets.size(); - - ColumnString::Offset prev_offset = 0; - for (size_t i = 0; i < size; ++i) - { - for (size_t j = prev_offset; j < offsets[i] - 1; ++j) - res_data[j] = data[offsets[i] + prev_offset - 2 - j]; - res_data[offsets[i] - 1] = 0; - prev_offset = offsets[i]; - } - } - - static void vectorFixed(const ColumnString::Chars & data, size_t n, ColumnString::Chars & res_data) - { - res_data.resize(data.size()); - size_t size = data.size() / n; - - for (size_t i = 0; i < size; ++i) - for (size_t j = i * n; j < (i + 1) * n; ++j) - res_data[j] = data[(i * 2 + 1) * n - j - 1]; - } -}; - - class FunctionReverse : public IFunction { public: diff --git a/src/Functions/reverse.h b/src/Functions/reverse.h new file mode 100644 index 00000000000..128a897c041 --- /dev/null +++ b/src/Functions/reverse.h @@ -0,0 +1,42 @@ +#pragma once + +#include + +namespace DB +{ + +/** Reverse the string as a sequence of bytes. 
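+  * Note: byte-wise reversal is only safe when every character is one byte wide;
+  * a multi-byte UTF-8 sequence would come out corrupted. That is why the UTF-8
+  * variant (reverseUTF8.cpp, later in this series) dispatches here only after
+  * UTF8::isAllASCII() has confirmed the whole column is plain ASCII.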
+ */ +struct ReverseImpl +{ + static void vector(const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.resize(data.size()); + res_offsets.assign(offsets); + size_t size = offsets.size(); + + ColumnString::Offset prev_offset = 0; + for (size_t i = 0; i < size; ++i) + { + for (size_t j = prev_offset; j < offsets[i] - 1; ++j) + res_data[j] = data[offsets[i] + prev_offset - 2 - j]; + res_data[offsets[i] - 1] = 0; + prev_offset = offsets[i]; + } + } + + static void vectorFixed(const ColumnString::Chars & data, size_t n, ColumnString::Chars & res_data) + { + res_data.resize(data.size()); + size_t size = data.size() / n; + + for (size_t i = 0; i < size; ++i) + for (size_t j = i * n; j < (i + 1) * n; ++j) + res_data[j] = data[(i * 2 + 1) * n - j - 1]; + } +}; + +} diff --git a/src/Functions/reverseUTF8.cpp b/src/Functions/reverseUTF8.cpp index 8a76af05d86..4ea861919a1 100644 --- a/src/Functions/reverseUTF8.cpp +++ b/src/Functions/reverseUTF8.cpp @@ -1,7 +1,9 @@ -#include #include +#include #include #include +#include +#include "reverse.h" namespace DB @@ -25,10 +27,18 @@ struct ReverseUTF8Impl ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { + bool all_ascii = UTF8::isAllASCII(data.data(), data.size()); + if (all_ascii) + { + ReverseImpl::vector(data, offsets, res_data, res_offsets); + return; + } + res_data.resize(data.size()); res_offsets.assign(offsets); size_t size = offsets.size(); + ColumnString::Offset prev_offset = 0; for (size_t i = 0; i < size; ++i) { diff --git a/src/Functions/substring.cpp b/src/Functions/substring.cpp index e809914f5f0..759d41e2ab8 100644 --- a/src/Functions/substring.cpp +++ b/src/Functions/substring.cpp @@ -148,9 +148,23 @@ public: if constexpr (is_utf8) { if (const ColumnString * col = checkAndGetColumn(column_string.get())) - return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, UTF8StringSource(*col), input_rows_count); + { + bool all_ascii = UTF8::isAllASCII(col->getChars().data(), col->getChars().size()); + if (all_ascii) + return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, StringSource(*col), input_rows_count); + else + return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, UTF8StringSource(*col), input_rows_count); + } + if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) - return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); + { + StringRef str_ref = col_const->getDataAt(0); + bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str_ref.data), str_ref.size); + if (all_ascii) + return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); + else + return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); + } throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName()); } else diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 5f3f054b624..462941b90ec 100644 --- a/src/Functions/substringIndex.cpp +++ 
b/src/Functions/substringIndex.cpp @@ -129,8 +129,10 @@ namespace res_data.reserve(str_column->getChars().size() / 2); res_offsets.reserve(rows); + bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size()) + && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher - = !is_utf8 ? nullptr : std::make_unique(delim.data(), delim.size()); + = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size()); for (size_t i = 0; i < rows; ++i) { @@ -140,6 +142,8 @@ namespace StringRef res_ref; if constexpr (!is_utf8) res_ref = substringIndex(str_ref, delim[0], count); + else if (all_ascii) + res_ref = substringIndex(str_ref, delim[0], count); else res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, count); @@ -158,8 +162,10 @@ namespace res_data.reserve(str_column->getChars().size() / 2); res_offsets.reserve(rows); + bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size()) + && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher - = !is_utf8 ? nullptr : std::make_unique(delim.data(), delim.size()); + = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size()); for (size_t i = 0; i < rows; ++i) { @@ -168,6 +174,8 @@ namespace StringRef res_ref; if constexpr (!is_utf8) res_ref = substringIndex(str_ref, delim[0], count); + else if (all_ascii) + res_ref = substringIndex(str_ref, delim[0], count); else res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, count); @@ -186,8 +194,10 @@ namespace res_data.reserve(str.size() * rows / 2); res_offsets.reserve(rows); + bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str.data()), str.size()) + && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher - = !is_utf8 ? nullptr : std::make_unique(delim.data(), delim.size()); + = !is_utf8 || all_ascii ? 
nullptr : std::make_unique(delim.data(), delim.size()); StringRef str_ref{str.data(), str.size()}; for (size_t i = 0; i < rows; ++i) @@ -197,6 +207,8 @@ namespace StringRef res_ref; if constexpr (!is_utf8) res_ref = substringIndex(str_ref, delim[0], count); + else if (all_ascii) + res_ref = substringIndex(str_ref, delim[0], count); else res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, count); @@ -208,7 +220,7 @@ namespace { size_t res_offset = res_data.size(); res_data.resize(res_offset + res_ref.size + 1); - memcpy(&res_data[res_offset], res_ref.data, res_ref.size); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], res_ref.data, res_ref.size); res_offset += res_ref.size; res_data[res_offset] = 0; ++res_offset; From 75d4cebf7c35ea744816161322f99753bdd65a8c Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 20 Mar 2024 17:20:46 +0800 Subject: [PATCH 011/289] opt lowerUTF8/upperUTF8 --- src/Functions/LowerUpperImpl.h | 4 ++-- src/Functions/LowerUpperUTF8Impl.h | 13 +++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/Functions/LowerUpperImpl.h b/src/Functions/LowerUpperImpl.h index f093e00f7ab..72b3ce1ca34 100644 --- a/src/Functions/LowerUpperImpl.h +++ b/src/Functions/LowerUpperImpl.h @@ -13,14 +13,14 @@ struct LowerUpperImpl ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - res_data.resize(data.size()); + res_data.resize_exact(data.size()); res_offsets.assign(offsets); array(data.data(), data.data() + data.size(), res_data.data()); } static void vectorFixed(const ColumnString::Chars & data, size_t /*n*/, ColumnString::Chars & res_data) { - res_data.resize(data.size()); + res_data.resize_exact(data.size()); array(data.data(), data.data() + data.size(), res_data.data()); } diff --git a/src/Functions/LowerUpperUTF8Impl.h b/src/Functions/LowerUpperUTF8Impl.h index 460f75f9bde..f2a1351c048 100644 --- a/src/Functions/LowerUpperUTF8Impl.h +++ b/src/Functions/LowerUpperUTF8Impl.h @@ -1,8 +1,9 @@ #pragma once #include +#include +#include #include #include -#include #ifdef __SSE2__ #include @@ -92,7 +93,15 @@ struct LowerUpperUTF8Impl { if (data.empty()) return; - res_data.resize(data.size()); + + bool all_ascii = UTF8::isAllASCII(data.data(), data.size()); + if (all_ascii) + { + LowerUpperImpl::vector(data, offsets, res_data, res_offsets); + return; + } + + res_data.resize_exact(data.size()); res_offsets.assign(offsets); array(data.data(), data.data() + data.size(), offsets, res_data.data()); } From a4466496488d357f6f0049c5906c23893336cad8 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 20 Mar 2024 18:48:51 +0800 Subject: [PATCH 012/289] opt pad string for utf8 --- src/Functions/GatherUtils/Sinks.h | 16 +++++----- src/Functions/GatherUtils/Sources.h | 9 ++++-- src/Functions/padString.cpp | 49 +++++++++++++++++++---------- src/Functions/reverse.h | 4 +-- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/src/Functions/GatherUtils/Sinks.h b/src/Functions/GatherUtils/Sinks.h index a8054da1159..2aa7c147136 100644 --- a/src/Functions/GatherUtils/Sinks.h +++ b/src/Functions/GatherUtils/Sinks.h @@ -48,7 +48,7 @@ struct NumericArraySink : public ArraySinkImpl> NumericArraySink(IColumn & elements_, ColumnArray::Offsets & offsets_, size_t column_size) : elements(assert_cast(elements_).getData()), offsets(offsets_) { - offsets.resize(column_size); + offsets.resize_exact(column_size); } void next() @@ -69,7 +69,7 @@ struct NumericArraySink : public ArraySinkImpl> void 
reserve(size_t num_elements) { - elements.reserve(num_elements); + elements.reserve_exact(num_elements); } }; @@ -85,7 +85,7 @@ struct StringSink StringSink(ColumnString & col, size_t column_size) : elements(col.getChars()), offsets(col.getOffsets()) { - offsets.resize(column_size); + offsets.resize_exact(column_size); } void ALWAYS_INLINE next() @@ -108,7 +108,7 @@ struct StringSink void reserve(size_t num_elements) { - elements.reserve(num_elements); + elements.reserve_exact(num_elements); } }; @@ -125,7 +125,7 @@ struct FixedStringSink FixedStringSink(ColumnFixedString & col, size_t column_size) : elements(col.getChars()), string_size(col.getN()), total_rows(column_size) { - elements.resize(column_size * string_size); + elements.resize_exact(column_size * string_size); } void next() @@ -146,7 +146,7 @@ struct FixedStringSink void reserve(size_t num_elements) { - elements.reserve(num_elements); + elements.reserve_exact(num_elements); } }; @@ -165,7 +165,7 @@ struct GenericArraySink : public ArraySinkImpl GenericArraySink(IColumn & elements_, ColumnArray::Offsets & offsets_, size_t column_size) : elements(elements_), offsets(offsets_) { - offsets.resize(column_size); + offsets.resize_exact(column_size); } void next() @@ -210,7 +210,7 @@ struct NullableArraySink : public ArraySink void reserve(size_t num_elements) { ArraySink::reserve(num_elements); - null_map.reserve(num_elements); + null_map.reserve_exact(num_elements); } }; diff --git a/src/Functions/GatherUtils/Sources.h b/src/Functions/GatherUtils/Sources.h index 222f9f19168..41e38e6fa79 100644 --- a/src/Functions/GatherUtils/Sources.h +++ b/src/Functions/GatherUtils/Sources.h @@ -319,6 +319,8 @@ struct StringSource return {&elements[prev_offset], length + elem_size > offset ? std::min(elem_size, length + elem_size - offset) : 0}; return {&elements[prev_offset + elem_size - offset], std::min(length, offset)}; } + + const ColumnString::Chars & getElements() const { return elements; } }; /// Treats Enum values as Strings, modeled after StringSource @@ -513,11 +515,12 @@ struct FixedStringSource const UInt8 * pos; const UInt8 * end; size_t string_size; + const typename ColumnString::Chars & elements; + size_t row_num = 0; size_t column_size = 0; - explicit FixedStringSource(const ColumnFixedString & col) - : string_size(col.getN()) + explicit FixedStringSource(const ColumnFixedString & col) : string_size(col.getN()), elements(col.getChars()) { const auto & chars = col.getChars(); pos = chars.data(); @@ -588,6 +591,8 @@ struct FixedStringSource return {pos, length + string_size > offset ? 
std::min(string_size, length + string_size - offset) : 0}; return {pos + string_size - offset, std::min(length, offset)}; } + + const ColumnString::Chars & getElements() const { return elements; } }; diff --git a/src/Functions/padString.cpp b/src/Functions/padString.cpp index b26a4ec3d6a..7a424bb1198 100644 --- a/src/Functions/padString.cpp +++ b/src/Functions/padString.cpp @@ -211,19 +211,18 @@ namespace pad_string = column_pad_const->getValue(); } - PaddingChars padding_chars{pad_string}; auto col_res = ColumnString::create(); StringSink res_sink{*col_res, input_rows_count}; if (const ColumnString * col = checkAndGetColumn(column_string.get())) - executeForSource(StringSource{*col}, column_length, padding_chars, res_sink); + executeForSource(StringSource{*col}, column_length, pad_string, res_sink); else if (const ColumnFixedString * col_fixed = checkAndGetColumn(column_string.get())) - executeForSource(FixedStringSource{*col_fixed}, column_length, padding_chars, res_sink); + executeForSource(FixedStringSource{*col_fixed}, column_length, pad_string, res_sink); else if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) - executeForSource(ConstSource{*col_const}, column_length, padding_chars, res_sink); + executeForSource(ConstSource{*col_const}, column_length, pad_string, res_sink); else if (const ColumnConst * col_const_fixed = checkAndGetColumnConst(column_string.get())) - executeForSource(ConstSource{*col_const_fixed}, column_length, padding_chars, res_sink); + executeForSource(ConstSource{*col_const_fixed}, column_length, pad_string, res_sink); else throw Exception( ErrorCodes::ILLEGAL_COLUMN, @@ -236,23 +235,39 @@ namespace private: template - void executeForSource( - SourceStrings && strings, - const ColumnPtr & column_length, - const PaddingChars & padding_chars, - StringSink & res_sink) const + void executeForSource(SourceStrings && strings, const ColumnPtr & column_length, const String & pad_string, StringSink & res_sink) const { - if (const auto * col_const = checkAndGetColumn(column_length.get())) - executeForSourceAndLength(std::forward(strings), ConstSource{*col_const}, padding_chars, res_sink); + const auto & chars = strings.getElements(); + bool all_ascii = UTF8::isAllASCII(reinterpret_cast(pad_string.data()), pad_string.size()) + && UTF8::isAllASCII(chars.data(), chars.size()); + + if (all_ascii) + { + PaddingChars padding_chars{pad_string}; + if (const auto * col_const = checkAndGetColumn(column_length.get())) + executeForSourceAndLength( + std::forward(strings), ConstSource{*col_const}, padding_chars, res_sink); + else + executeForSourceAndLength( + std::forward(strings), GenericValueSource{*column_length}, padding_chars, res_sink); + } else - executeForSourceAndLength(std::forward(strings), GenericValueSource{*column_length}, padding_chars, res_sink); + { + PaddingChars padding_chars{pad_string}; + if (const auto * col_const = checkAndGetColumn(column_length.get())) + executeForSourceAndLength( + std::forward(strings), ConstSource{*col_const}, padding_chars, res_sink); + else + executeForSourceAndLength( + std::forward(strings), GenericValueSource{*column_length}, padding_chars, res_sink); + } } - template + template void executeForSourceAndLength( SourceStrings && strings, SourceLengths && lengths, - const PaddingChars & padding_chars, + const PaddingChars & padding_chars, StringSink & res_sink) const { bool is_const_new_length = lengths.isConst(); @@ -264,7 +279,7 @@ namespace for (; !res_sink.isEnd(); res_sink.next(), strings.next(), 
lengths.next()) { auto str = strings.getWhole(); - ssize_t current_length = getLengthOfSlice(str); + ssize_t current_length = getLengthOfSlice(str); if (!res_sink.rowNum() || !is_const_new_length) { @@ -294,7 +309,7 @@ namespace } else if (new_length < current_length) { - str = removeSuffixFromSlice(str, current_length - new_length); + str = removeSuffixFromSlice(str, current_length - new_length); writeSlice(str, res_sink); } else if (new_length > current_length) diff --git a/src/Functions/reverse.h b/src/Functions/reverse.h index 128a897c041..5f999af4297 100644 --- a/src/Functions/reverse.h +++ b/src/Functions/reverse.h @@ -14,7 +14,7 @@ struct ReverseImpl ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - res_data.resize(data.size()); + res_data.resize_exact(data.size()); res_offsets.assign(offsets); size_t size = offsets.size(); @@ -30,7 +30,7 @@ struct ReverseImpl static void vectorFixed(const ColumnString::Chars & data, size_t n, ColumnString::Chars & res_data) { - res_data.resize(data.size()); + res_data.resize_exact(data.size()); size_t size = data.size() / n; for (size_t i = 0; i < size; ++i) From 49422debafbf773c887c60f2affb2e038edea911 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 20 Mar 2024 18:59:22 +0800 Subject: [PATCH 013/289] add perf tests --- src/Common/UTF8Helpers.cpp | 3 ++- tests/performance/ascii.xml | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/performance/ascii.xml diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index 78d645994e5..6e1c3f5397b 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -222,7 +222,8 @@ bool isAllASCII(const UInt8 * data, size_t size) DECLARE_SSE42_SPECIFIC_CODE( /// Copy from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h -bool isAllASCII(const UInt8 * data, size_t size) { +bool isAllASCII(const UInt8 * data, size_t size) +{ size_t i = 0; __m128i masks = _mm_setzero_si128(); if (size >= 16) diff --git a/tests/performance/ascii.xml b/tests/performance/ascii.xml new file mode 100644 index 00000000000..89c6dd57e07 --- /dev/null +++ b/tests/performance/ascii.xml @@ -0,0 +1,9 @@ + + select substringUTF8(materialize('hello world'), 2, 5) from numbers(10000000) + select substringIndexUTF8(materialize('www.clickhouse.com'), '.', 2) from numbers(10000000) + select reverseUTF8(materialize('hello world')) from numbers(10000000) + select lowerUTF8(materialize('hello world')) from numbers(10000000) + select upperUTF8(materialize('hello world')) from numbers(10000000) + select leftPadUTF8(materialize('hello '), 10, ',') from numbers(10000000) + select rightPadUTF8(materialize('hello '), 10, ',') from numbers(10000000) + From 83645350fb9f3db299061f939659595cefe771ee Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 20 Mar 2024 22:08:15 +0800 Subject: [PATCH 014/289] fix failed uts --- src/Functions/padString.cpp | 2 +- src/Functions/substring.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/padString.cpp b/src/Functions/padString.cpp index 7a424bb1198..119baaec723 100644 --- a/src/Functions/padString.cpp +++ b/src/Functions/padString.cpp @@ -241,7 +241,7 @@ namespace bool all_ascii = UTF8::isAllASCII(reinterpret_cast(pad_string.data()), pad_string.size()) && UTF8::isAllASCII(chars.data(), chars.size()); - if (all_ascii) + if (!is_utf8 || all_ascii) { PaddingChars padding_chars{pad_string}; if (const auto * col_const = 
checkAndGetColumn(column_length.get())) diff --git a/src/Functions/substring.cpp b/src/Functions/substring.cpp index 759d41e2ab8..122f83d758b 100644 --- a/src/Functions/substring.cpp +++ b/src/Functions/substring.cpp @@ -161,9 +161,9 @@ public: StringRef str_ref = col_const->getDataAt(0); bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str_ref.data), str_ref.size); if (all_ascii) - return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); - else return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); + else + return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); } throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName()); } From c350991005253183c83fac4d380b368b343f2763 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 21 Mar 2024 10:29:35 +0800 Subject: [PATCH 015/289] add more tests --- tests/performance/ascii.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/performance/ascii.xml b/tests/performance/ascii.xml index 89c6dd57e07..83440437d2c 100644 --- a/tests/performance/ascii.xml +++ b/tests/performance/ascii.xml @@ -1,9 +1,22 @@ select substringUTF8(materialize('hello world'), 2, 5) from numbers(10000000) + select substringUTF8(materialize('hello 世界'), 2, 5) from numbers(10000000) + select substringIndexUTF8(materialize('www.clickhouse.com'), '.', 2) from numbers(10000000) + select substringIndexUTF8(materialize('官网www.clickhouse.com'), '.', 2) from numbers(10000000) + select reverseUTF8(materialize('hello world')) from numbers(10000000) + select reverseUTF8(materialize('hello 世界')) from numbers(10000000) + select lowerUTF8(materialize('hello world')) from numbers(10000000) + select lowerUTF8(materialize('hello 世界')) from numbers(10000000) + select upperUTF8(materialize('hello world')) from numbers(10000000) + select upperUTF8(materialize('hello 世界')) from numbers(10000000) + select leftPadUTF8(materialize('hello '), 10, ',') from numbers(10000000) + select leftPadUTF8(materialize('hello '), 10, '世界') from numbers(10000000) + select rightPadUTF8(materialize('hello '), 10, ',') from numbers(10000000) + select rightPadUTF8(materialize('hello '), 10, '世界') from numbers(10000000) From 490a8bc7e48d2d87e67ba8d713a01d0fc78a901d Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 21 Mar 2024 10:31:45 +0800 Subject: [PATCH 016/289] apply resize_extact for trim --- src/Functions/trim.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/trim.cpp b/src/Functions/trim.cpp index dd51c606ff7..8b3d2870260 100644 --- a/src/Functions/trim.cpp +++ b/src/Functions/trim.cpp @@ -46,8 +46,8 @@ public: ColumnString::Offsets & res_offsets) { size_t size = offsets.size(); - res_offsets.resize(size); - res_data.reserve(data.size()); + res_offsets.resize_exact(size); + res_data.reserve_exact(data.size()); size_t prev_offset = 0; size_t res_offset = 0; @@ -59,7 +59,7 @@ public: { execute(reinterpret_cast(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length); - res_data.resize(res_data.size() + length + 1); + res_data.resize_exact(res_data.size() + length + 1); memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], 
start, length); res_offset += length + 1; res_data[res_offset - 1] = '\0'; From ea2fd57cca95d45e4d86260f273abaca67012896 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 24 Mar 2024 00:42:59 +0100 Subject: [PATCH 017/289] Set total_rows_approx for trivial queries with LIMIT from system.zeros and generateRandom --- src/Interpreters/InterpreterSelectQuery.cpp | 3 ++ .../QueryPlan/ReadFromSystemNumbersStep.cpp | 6 +-- .../QueryPlan/ReadFromSystemNumbersStep.h | 2 + src/Storages/StorageGenerateRandom.cpp | 15 ++++-- src/Storages/System/StorageSystemNumbers.cpp | 5 +- src/Storages/System/StorageSystemZeros.cpp | 13 +++-- ...rate_random_with_limit_progress_bar.expect | 49 +++++++++++++++++++ ...e_random_with_limit_progress_bar.reference | 0 ...system_zeros_and_generate_random.reference | 0 ...t_for_system_zeros_and_generate_random.sql | 9 ++++ 10 files changed, 86 insertions(+), 16 deletions(-) create mode 100755 tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect create mode 100644 tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.reference create mode 100644 tests/queries/0_stateless/03024_total_rows_approx_is_set_for_system_zeros_and_generate_random.reference create mode 100644 tests/queries/0_stateless/03024_total_rows_approx_is_set_for_system_zeros_and_generate_random.sql diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 7c87dadfce6..fa46b115979 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2501,10 +2501,13 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc max_block_size = std::max(1, max_block_limited); max_threads_execute_query = max_streams = 1; } + if (local_limits.local_limits.size_limits.max_rows != 0) { if (max_block_limited < local_limits.local_limits.size_limits.max_rows) query_info.limit = max_block_limited; + else /// Ask to read just enough rows to make the max_rows limit effective (so it has a chance to be triggered). 
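+                /// Reading exactly max_rows would stop at the boundary and report success;
+                /// max_rows + 1 guarantees the TOO_MANY_ROWS size check can actually fire
+                /// (the 03024 test added in this patch expects exactly that error).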
+ query_info.limit = 1 + local_limits.local_limits.size_limits.max_rows; } else { diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 7a61d09bdd2..11371578c79 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -12,8 +12,8 @@ #include #include #include -#include "Core/Types.h" -#include "base/types.h" +#include + namespace DB { @@ -443,7 +443,6 @@ Pipe ReadFromSystemNumbersStep::makePipe() /// Build rpn of query filters KeyCondition condition(filter_actions_dag, context, column_names, key_expression); - if (condition.extractPlainRanges(ranges)) { /// Intersect ranges with table range @@ -505,7 +504,6 @@ Pipe ReadFromSystemNumbersStep::makePipe() } } - /// ranges is blank, return a source who has no data if (intersected_ranges.empty()) { diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.h b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.h index cab0686474b..bc84e31be62 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.h +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.h @@ -8,6 +8,7 @@ #include #include + namespace DB { @@ -43,4 +44,5 @@ private: UInt64 limit; std::shared_ptr storage_limits; }; + } diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index fbce6c2bb7d..cdbade51695 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -30,12 +31,9 @@ #include #include #include -#include #include -#include - namespace DB { @@ -639,7 +637,7 @@ void registerStorageGenerateRandom(StorageFactory & factory) Pipe StorageGenerateRandom::read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & /*query_info*/, + SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, @@ -682,7 +680,14 @@ Pipe StorageGenerateRandom::read( pcg64 generate(random_seed); for (UInt64 i = 0; i < num_streams; ++i) - pipes.emplace_back(std::make_shared(max_block_size, max_array_length, max_string_length, generate(), block_header, context)); + { + auto source = std::make_shared(max_block_size, max_array_length, max_string_length, generate(), block_header, context); + + if (i == 0 && query_info.limit) + source->addTotalRowsApprox(query_info.limit); + + pipes.emplace_back(std::move(source)); + } return Pipe::unitePipes(std::move(pipes)); } diff --git a/src/Storages/System/StorageSystemNumbers.cpp b/src/Storages/System/StorageSystemNumbers.cpp index 629b11ee7f1..da700a7a4e9 100644 --- a/src/Storages/System/StorageSystemNumbers.cpp +++ b/src/Storages/System/StorageSystemNumbers.cpp @@ -1,17 +1,14 @@ #include #include -#include #include #include -#include #include #include #include -#include -#include #include + namespace DB { diff --git a/src/Storages/System/StorageSystemZeros.cpp b/src/Storages/System/StorageSystemZeros.cpp index ce2e153ea66..a48b109fbbe 100644 --- a/src/Storages/System/StorageSystemZeros.cpp +++ b/src/Storages/System/StorageSystemZeros.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -6,6 +7,7 @@ #include #include + namespace DB { @@ -93,7 +95,7 @@ StorageSystemZeros::StorageSystemZeros(const StorageID & table_id_, bool multith Pipe StorageSystemZeros::read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, - 
SelectQueryInfo &, + SelectQueryInfo & query_info, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, @@ -123,8 +125,13 @@ Pipe StorageSystemZeros::read( { auto source = std::make_shared(max_block_size, limit ? *limit : 0, state); - if (limit && i == 0) - source->addTotalRowsApprox(*limit); + if (i == 0) + { + if (limit) + source->addTotalRowsApprox(*limit); + else if (query_info.limit) + source->addTotalRowsApprox(query_info.limit); + } res.addSource(std::move(source)); } diff --git a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect new file mode 100755 index 00000000000..272dc0fdfef --- /dev/null +++ b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect @@ -0,0 +1,49 @@ +#!/usr/bin/expect -f + +set basedir [file dirname $argv0] +set basename [file tail $argv0] +if {[info exists env(CLICKHOUSE_TMP)]} { + set CLICKHOUSE_TMP $env(CLICKHOUSE_TMP) +} else { + set CLICKHOUSE_TMP "." +} +exp_internal -f $CLICKHOUSE_TMP/$basename.debuglog 0 + +log_user 0 +set timeout 60 +match_max 100000 +set stty_init "rows 25 cols 120" + +expect_after { + -i $any_spawn_id eof { exp_continue } + -i $any_spawn_id timeout { exit 1 } +} + +spawn clickhouse-local +expect ":) " + +# Trivial SELECT with LIMIT from system.zeros shows progress bar. +send "SELECT * FROM system.zeros LIMIT 1e11 FORMAT Null\r" +expect "Progress: " +expect "█" +send "\3" +expect "Query was cancelled." +expect ":) " + +send "SELECT * FROM system.zeros_mt LIMIT 1e11 FORMAT Null\r" +expect "Progress: " +expect "█" +send "\3" +expect "Query was cancelled." +expect ":) " + +# As well as from generateRandom +send "SELECT * FROM generateRandom() LIMIT 1e9 FORMAT Null\r" +expect "Progress: " +expect "█" +send "\3" +expect "Query was cancelled." 
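As an aside before the test script continues: the storage changes above are what make a progress bar possible here — the first stream announces an approximate row total. A toy standalone model of that logic (the types and driver are invented for illustration; only `addTotalRowsApprox` mirrors the real method name):

```cpp
#include <cassert>
#include <cstdint>

// Toy model, not ClickHouse code: a source that knows an approximate row total
// lets the client render progress as rows_read / total.
struct ToySource
{
    uint64_t total_rows_approx = 0;
    void addTotalRowsApprox(uint64_t rows) { total_rows_approx += rows; }
};

int main()
{
    uint64_t storage_limit = 0;       // system.zeros itself is unbounded
    uint64_t query_limit = 10000000;  // pushed down from "LIMIT 10000000"

    ToySource first_stream;
    // Same priority as in StorageSystemZeros::read above: a storage-level limit
    // wins; otherwise fall back to the limit pushed down from the query.
    if (storage_limit)
        first_stream.addTotalRowsApprox(storage_limit);
    else if (query_limit)
        first_stream.addTotalRowsApprox(query_limit);

    uint64_t rows_read = 2500000;
    double progress = double(rows_read) / double(first_stream.total_rows_approx);
    assert(progress == 0.25);  // enough information to draw a progress bar
}
```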
+expect ":) " + +send "exit\r" +expect eof diff --git a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.reference b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03024_total_rows_approx_is_set_for_system_zeros_and_generate_random.reference b/tests/queries/0_stateless/03024_total_rows_approx_is_set_for_system_zeros_and_generate_random.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03024_total_rows_approx_is_set_for_system_zeros_and_generate_random.sql b/tests/queries/0_stateless/03024_total_rows_approx_is_set_for_system_zeros_and_generate_random.sql new file mode 100644 index 00000000000..0db09ead2cd --- /dev/null +++ b/tests/queries/0_stateless/03024_total_rows_approx_is_set_for_system_zeros_and_generate_random.sql @@ -0,0 +1,9 @@ +SET max_rows_to_read = 1e11; + +SELECT * FROM system.numbers LIMIT 1e12 FORMAT Null; -- { serverError TOO_MANY_ROWS } +SELECT * FROM system.numbers_mt LIMIT 1e12 FORMAT Null; -- { serverError TOO_MANY_ROWS } + +SELECT * FROM system.zeros LIMIT 1e12 FORMAT Null; -- { serverError TOO_MANY_ROWS } +SELECT * FROM system.zeros_mt LIMIT 1e12 FORMAT Null; -- { serverError TOO_MANY_ROWS } + +SELECT * FROM generateRandom() LIMIT 1e12 FORMAT Null; -- { serverError TOO_MANY_ROWS } From 86039802e3327c0a527135233f3589d6bcea2348 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 24 Mar 2024 00:46:48 +0100 Subject: [PATCH 018/289] Set total_rows_approx for trivial queries with LIMIT from system.zeros and generateRandom --- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index fa46b115979..65beef27d16 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2506,7 +2506,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc { if (max_block_limited < local_limits.local_limits.size_limits.max_rows) query_info.limit = max_block_limited; - else /// Ask to read just enough rows to make the max_rows limit effective (so it has a chance to be triggered). + else if (local_limits.local_limits.size_limits.max_rows < std::numeric_limits::max()) /// Ask to read just enough rows to make the max_rows limit effective (so it has a chance to be triggered). query_info.limit = 1 + local_limits.local_limits.size_limits.max_rows; } else From c55e45bff6212e8e4828198fd13a56bc174d2062 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 24 Mar 2024 18:58:57 +0100 Subject: [PATCH 019/289] Test robustness --- ...023_zeros_generate_random_with_limit_progress_bar.expect | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect index 272dc0fdfef..de15a199132 100755 --- a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect +++ b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect @@ -23,14 +23,14 @@ spawn clickhouse-local expect ":) " # Trivial SELECT with LIMIT from system.zeros shows progress bar. 
-send "SELECT * FROM system.zeros LIMIT 1e11 FORMAT Null\r" +send "SELECT * FROM system.zeros LIMIT 10000000 FORMAT Null SETTINGS max_execution_speed = 1000000, timeout_before_checking_execution_speed = 0, max_block_size = 128\r" expect "Progress: " expect "█" send "\3" expect "Query was cancelled." expect ":) " -send "SELECT * FROM system.zeros_mt LIMIT 1e11 FORMAT Null\r" +send "SELECT * FROM system.zeros_mt LIMIT 10000000 FORMAT Null SETTINGS max_execution_speed = 1000000, timeout_before_checking_execution_speed = 0, max_block_size = 128\r" expect "Progress: " expect "█" send "\3" @@ -38,7 +38,7 @@ expect "Query was cancelled." expect ":) " # As well as from generateRandom -send "SELECT * FROM generateRandom() LIMIT 1e9 FORMAT Null\r" +send "SELECT * FROM generateRandom() LIMIT 10000000 FORMAT Null SETTINGS max_execution_speed = 1000000, timeout_before_checking_execution_speed = 0, max_block_size = 128\r" expect "Progress: " expect "█" send "\3" From 98c2048d30d0a6e324875c98a4f138e7ee92734a Mon Sep 17 00:00:00 2001 From: zvonand Date: Sun, 31 Mar 2024 22:12:03 +0200 Subject: [PATCH 020/289] try to improve Storage S3 selection glob performance --- src/Storages/StorageS3.cpp | 122 +++++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 24 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2d3aef312bf..cee9f11af95 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -210,32 +210,36 @@ public: if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Expression can not have wildcards inside bucket name"); - const String key_prefix = globbed_uri.key.substr(0, globbed_uri.key.find_first_of("*?{")); - - /// We don't have to list bucket, because there is no asterisks. - if (key_prefix.size() == globbed_uri.key.size()) + for (const auto & key : expandSelectionGlob(globbed_uri.key)) { - buffer.emplace_back(std::make_shared(globbed_uri.key, std::nullopt)); - buffer_iter = buffer.begin(); - is_finished = true; - return; + const String key_prefix = key.substr(0, key.find_first_of("*?{")); + + /// We don't have to list bucket, because there is no asterisks. + if (key_prefix.size() == key.size()) + { + buffer.emplace_back(std::make_shared(key, std::nullopt)); + buffer_iter = buffer.begin(); + is_finished = true; + return; + } + + request.SetBucket(globbed_uri.bucket); + request.SetPrefix(key_prefix); + request.SetMaxKeys(static_cast(request_settings.list_object_keys_size)); + + outcome_future = listObjectsAsync(); + + matcher = std::make_unique(makeRegexpPatternFromGlobs(key)); + if (!matcher->ok()) + throw Exception( + ErrorCodes::CANNOT_COMPILE_REGEXP, "Cannot compile regex from glob ({}): {}", key, matcher->error()); + + recursive = key == "/**"; + + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + updateInternalBufferAssumeLocked(); } - - request.SetBucket(globbed_uri.bucket); - request.SetPrefix(key_prefix); - request.SetMaxKeys(static_cast(request_settings.list_object_keys_size)); - - outcome_future = listObjectsAsync(); - - matcher = std::make_unique(makeRegexpPatternFromGlobs(globbed_uri.key)); - if (!matcher->ok()) - throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, - "Cannot compile regex from glob ({}): {}", globbed_uri.key, matcher->error()); - - recursive = globbed_uri.key == "/**" ? 
true : false; - - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - fillInternalBufferAssumeLocked(); + buffer_iter = buffer.begin(); } KeyWithInfoPtr next(size_t) @@ -301,6 +305,76 @@ private: } while (true); } + void updateInternalBufferAssumeLocked() + { + assert(outcome_future.valid()); + auto outcome = outcome_future.get(); + + if (!outcome.IsSuccess()) + { + throw S3Exception(outcome.GetError().GetErrorType(), "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}", + quoteString(request.GetBucket()), quoteString(request.GetPrefix()), + backQuote(outcome.GetError().GetExceptionName()), quoteString(outcome.GetError().GetMessage())); + } + + const auto & result_batch = outcome.GetResult().GetContents(); + + /// It returns false when all objects were returned + is_finished = !outcome.GetResult().GetIsTruncated(); + + if (!is_finished) + { + /// Even if task is finished the thread may be not freed in pool. + /// So wait until it will be freed before scheduling a new task. + list_objects_pool.wait(); + outcome_future = listObjectsAsync(); + } + + if (request_settings.throw_on_zero_files_match && result_batch.empty()) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files using prefix {}", request.GetPrefix()); + + KeysWithInfo temp_buffer; + temp_buffer.reserve(result_batch.size()); + + for (const auto & row : result_batch) + { + String key = row.GetKey(); + if (recursive || re2::RE2::FullMatch(key, *matcher)) + { + S3::ObjectInfo info = + { + .size = size_t(row.GetSize()), + .last_modification_time = row.GetLastModified().Millis() / 1000, + }; + temp_buffer.emplace_back(std::make_shared(std::move(key), std::move(info))); + } + } + + if (temp_buffer.empty()) + return; + + if (filter_dag) + { + std::vector paths; + paths.reserve(temp_buffer.size()); + for (const auto & key_with_info : temp_buffer) + paths.push_back(fs::path(globbed_uri.bucket) / key_with_info->key); + + VirtualColumnUtils::filterByPathOrFile(temp_buffer, paths, filter_dag, virtual_columns, getContext()); + } + + buffer.insert(buffer.end(), temp_buffer.begin(), temp_buffer.end()); + + if (read_keys) + read_keys->insert(read_keys->end(), temp_buffer.begin(), temp_buffer.end()); + + if (file_progress_callback) + { + for (const auto & key_with_info : buffer) + file_progress_callback(FileProgress(0, key_with_info->info->size)); + } + } + void fillInternalBufferAssumeLocked() { buffer.clear(); From 73b9ef99f4315e89d6f184d836e01da4345151ba Mon Sep 17 00:00:00 2001 From: zvonand Date: Mon, 1 Apr 2024 17:40:40 +0200 Subject: [PATCH 021/289] Revert "try to improve Storage S3 selection glob performance" This reverts commit 9c9421b6897bf4a95346cef52171839ef67bd522. --- src/Storages/StorageS3.cpp | 122 ++++++++----------------------------- 1 file changed, 24 insertions(+), 98 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index cee9f11af95..2d3aef312bf 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -210,36 +210,32 @@ public: if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Expression can not have wildcards inside bucket name"); - for (const auto & key : expandSelectionGlob(globbed_uri.key)) + const String key_prefix = globbed_uri.key.substr(0, globbed_uri.key.find_first_of("*?{")); + + /// We don't have to list bucket, because there is no asterisks. 
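As an aside, the restored constructor above follows a simple rule: everything before the first glob character becomes a literal S3 listing prefix, and a key with no glob characters needs no listing at all. A minimal standalone sketch of that rule (the `ListingPlan` type is invented for illustration; only the `find_first_of("*?{")` logic comes from the code above):

```cpp
#include <cassert>
#include <string>

struct ListingPlan
{
    std::string prefix;       // would be passed to ListObjectsV2 as the Prefix
    bool single_key = false;  // no listing needed at all
};

// Everything before the first glob character can be used as a literal prefix;
// if there is no glob character, the key itself is the single object to read.
ListingPlan planListing(const std::string & key)
{
    const size_t first_glob = key.find_first_of("*?{");
    if (first_glob == std::string::npos)
        return {key, true};
    return {key.substr(0, first_glob), false};
}

int main()
{
    assert(planListing("data/2024/part.csv").single_key);
    assert(planListing("data/2024/*.csv").prefix == "data/2024/");
}
```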
+ if (key_prefix.size() == globbed_uri.key.size()) { - const String key_prefix = key.substr(0, key.find_first_of("*?{")); - - /// We don't have to list bucket, because there is no asterisks. - if (key_prefix.size() == key.size()) - { - buffer.emplace_back(std::make_shared(key, std::nullopt)); - buffer_iter = buffer.begin(); - is_finished = true; - return; - } - - request.SetBucket(globbed_uri.bucket); - request.SetPrefix(key_prefix); - request.SetMaxKeys(static_cast(request_settings.list_object_keys_size)); - - outcome_future = listObjectsAsync(); - - matcher = std::make_unique(makeRegexpPatternFromGlobs(key)); - if (!matcher->ok()) - throw Exception( - ErrorCodes::CANNOT_COMPILE_REGEXP, "Cannot compile regex from glob ({}): {}", key, matcher->error()); - - recursive = key == "/**"; - - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - updateInternalBufferAssumeLocked(); + buffer.emplace_back(std::make_shared(globbed_uri.key, std::nullopt)); + buffer_iter = buffer.begin(); + is_finished = true; + return; } - buffer_iter = buffer.begin(); + + request.SetBucket(globbed_uri.bucket); + request.SetPrefix(key_prefix); + request.SetMaxKeys(static_cast(request_settings.list_object_keys_size)); + + outcome_future = listObjectsAsync(); + + matcher = std::make_unique(makeRegexpPatternFromGlobs(globbed_uri.key)); + if (!matcher->ok()) + throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, + "Cannot compile regex from glob ({}): {}", globbed_uri.key, matcher->error()); + + recursive = globbed_uri.key == "/**" ? true : false; + + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + fillInternalBufferAssumeLocked(); } KeyWithInfoPtr next(size_t) @@ -305,76 +301,6 @@ private: } while (true); } - void updateInternalBufferAssumeLocked() - { - assert(outcome_future.valid()); - auto outcome = outcome_future.get(); - - if (!outcome.IsSuccess()) - { - throw S3Exception(outcome.GetError().GetErrorType(), "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}", - quoteString(request.GetBucket()), quoteString(request.GetPrefix()), - backQuote(outcome.GetError().GetExceptionName()), quoteString(outcome.GetError().GetMessage())); - } - - const auto & result_batch = outcome.GetResult().GetContents(); - - /// It returns false when all objects were returned - is_finished = !outcome.GetResult().GetIsTruncated(); - - if (!is_finished) - { - /// Even if task is finished the thread may be not freed in pool. - /// So wait until it will be freed before scheduling a new task. 
- list_objects_pool.wait(); - outcome_future = listObjectsAsync(); - } - - if (request_settings.throw_on_zero_files_match && result_batch.empty()) - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files using prefix {}", request.GetPrefix()); - - KeysWithInfo temp_buffer; - temp_buffer.reserve(result_batch.size()); - - for (const auto & row : result_batch) - { - String key = row.GetKey(); - if (recursive || re2::RE2::FullMatch(key, *matcher)) - { - S3::ObjectInfo info = - { - .size = size_t(row.GetSize()), - .last_modification_time = row.GetLastModified().Millis() / 1000, - }; - temp_buffer.emplace_back(std::make_shared(std::move(key), std::move(info))); - } - } - - if (temp_buffer.empty()) - return; - - if (filter_dag) - { - std::vector paths; - paths.reserve(temp_buffer.size()); - for (const auto & key_with_info : temp_buffer) - paths.push_back(fs::path(globbed_uri.bucket) / key_with_info->key); - - VirtualColumnUtils::filterByPathOrFile(temp_buffer, paths, filter_dag, virtual_columns, getContext()); - } - - buffer.insert(buffer.end(), temp_buffer.begin(), temp_buffer.end()); - - if (read_keys) - read_keys->insert(read_keys->end(), temp_buffer.begin(), temp_buffer.end()); - - if (file_progress_callback) - { - for (const auto & key_with_info : buffer) - file_progress_callback(FileProgress(0, key_with_info->info->size)); - } - } - void fillInternalBufferAssumeLocked() { buffer.clear(); From 70da13b9b01d0e9a86b313bdcf165b6c54a4b985 Mon Sep 17 00:00:00 2001 From: zvonand Date: Mon, 1 Apr 2024 22:34:54 +0200 Subject: [PATCH 022/289] simpler way --- src/Storages/StorageS3.cpp | 76 +++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2d3aef312bf..844f5362ec2 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -191,7 +191,7 @@ public: Impl( const S3::Client & client_, const S3::URI & globbed_uri_, - const ActionsDAG::Node * predicate, + const ActionsDAG::Node * predicate_, const NamesAndTypesList & virtual_columns_, ContextPtr context_, KeysWithInfo * read_keys_, @@ -200,6 +200,7 @@ public: : WithContext(context_) , client(client_.clone()) , globbed_uri(globbed_uri_) + , predicate(predicate_) , virtual_columns(virtual_columns_) , read_keys(read_keys_) , request_settings(request_settings_) @@ -210,32 +211,13 @@ public: if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Expression can not have wildcards inside bucket name"); - const String key_prefix = globbed_uri.key.substr(0, globbed_uri.key.find_first_of("*?{")); + expanded_keys = expandSelectionGlob(globbed_uri.key); + expanded_keys_iter = expanded_keys.begin(); - /// We don't have to list bucket, because there is no asterisks. 
-        if (key_prefix.size() == globbed_uri.key.size())
-        {
-            buffer.emplace_back(std::make_shared(globbed_uri.key, std::nullopt));
-            buffer_iter = buffer.begin();
+        bool no_globs_in_key = fillBufferForKey(*expanded_keys_iter);
+        expanded_keys_iter++;
+        if (expanded_keys_iter == expanded_keys.end() && no_globs_in_key)
             is_finished = true;
-            return;
-        }
-
-        request.SetBucket(globbed_uri.bucket);
-        request.SetPrefix(key_prefix);
-        request.SetMaxKeys(static_cast(request_settings.list_object_keys_size));
-
-        outcome_future = listObjectsAsync();
-
-        matcher = std::make_unique(makeRegexpPatternFromGlobs(globbed_uri.key));
-        if (!matcher->ok())
-            throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP,
-                "Cannot compile regex from glob ({}): {}", globbed_uri.key, matcher->error());
-
-        recursive = globbed_uri.key == "/**" ? true : false;
-
-        filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns);
-        fillInternalBufferAssumeLocked();
     }
 
     KeyWithInfoPtr next(size_t)
@@ -257,6 +239,37 @@ public:
 private:
     using ListObjectsOutcome = Aws::S3::Model::ListObjectsV2Outcome;
 
+    bool fillBufferForKey(const std::string & uri_key)
+    {
+        const String key_prefix = uri_key.substr(0, uri_key.find_first_of("*?{"));
+
+        /// We don't have to list bucket, because there is no asterisks.
+        if (key_prefix.size() == uri_key.size())
+        {
+            buffer.clear();
+            buffer.emplace_back(std::make_shared(uri_key, std::nullopt));
+            buffer_iter = buffer.begin();
+            return true;
+        }
+
+        request.SetBucket(globbed_uri.bucket);
+        request.SetPrefix(key_prefix);
+        request.SetMaxKeys(static_cast(request_settings.list_object_keys_size));
+
+        outcome_future = listObjectsAsync();
+
+        matcher = std::make_unique(makeRegexpPatternFromGlobs(uri_key));
+        if (!matcher->ok())
+            throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP,
+                "Cannot compile regex from glob ({}): {}", uri_key, matcher->error());
+
+        recursive = globbed_uri.key == "/**";
+
+        filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns);
+        fillInternalBufferAssumeLocked();
+        return false;
+    }
+
     KeyWithInfoPtr nextAssumeLocked()
     {
         do
@@ -278,6 +291,15 @@ private:
                 return answer;
             }
 
+            if (expanded_keys_iter != expanded_keys.end())
+            {
+                bool no_globs_in_key = fillBufferForKey(*expanded_keys_iter);
+                expanded_keys_iter++;
+                if (expanded_keys_iter == expanded_keys.end() && no_globs_in_key)
+                    is_finished = true;
+                continue;
+            }
+
             if (is_finished)
                 return {};
 
@@ -399,8 +421,12 @@ private:
     KeysWithInfo buffer;
     KeysWithInfo::iterator buffer_iter;
 
+    std::vector expanded_keys;
+    std::vector::iterator expanded_keys_iter;
+
    std::unique_ptr client;
     S3::URI globbed_uri;
+    const ActionsDAG::Node * predicate;
     ASTPtr query;
     NamesAndTypesList virtual_columns;
     ActionsDAGPtr filter_dag;

From a177fbfd8cb52c64f096797a6c65fdc4dfeb828e Mon Sep 17 00:00:00 2001
From: zvonand
Date: Tue, 2 Apr 2024 00:05:53 +0200
Subject: [PATCH 023/289] ignore error when one of the selection options does
 not exist

---
 src/Storages/StorageS3.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index 844f5362ec2..09a5ffc86a5 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -283,7 +283,18 @@ private:
         /// So we get object info lazily here on 'next()' request.
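Before the hunk body continues, an aside on the helper the new code path iterates over: only the call site of `expandSelectionGlob(...)` appears in these patches, so the following is a from-scratch sketch of what such an expansion does — a simplified stand-in that handles a single, non-nested `{a,b}` group, not the actual ClickHouse helper:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Standalone illustration of "selection glob" expansion: "a{x,y}b" -> {"axb", "ayb"}.
// Simplified: expands only the first brace group and ignores nesting.
std::vector<std::string> expandSelectionGlob(const std::string & key)
{
    const auto open = key.find('{');
    const auto close = key.find('}', open);
    if (open == std::string::npos || close == std::string::npos)
        return {key};

    std::vector<std::string> result;
    const std::string prefix = key.substr(0, open);
    const std::string suffix = key.substr(close + 1);
    const std::string body = key.substr(open + 1, close - open - 1);

    size_t start = 0;
    while (true)
    {
        const size_t comma = body.find(',', start);
        result.push_back(prefix + body.substr(start, comma - start) + suffix);
        if (comma == std::string::npos)
            break;
        start = comma + 1;
    }
    return result;
}

int main()
{
    auto keys = expandSelectionGlob("dir/{a,b}/*.csv");
    assert(keys.size() == 2 && keys[0] == "dir/a/*.csv" && keys[1] == "dir/b/*.csv");
}
```

With that expansion in mind, the per-key error handling below makes sense: a missing file is fatal only when there was no `{}` group, i.e. when exactly one key was expanded.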
if (!answer->info) { - answer->info = S3::getObjectInfo(*client, globbed_uri.bucket, answer->key, globbed_uri.version_id, request_settings); + try + { + answer->info = S3::getObjectInfo(*client, globbed_uri.bucket, answer->key, globbed_uri.version_id, request_settings); + } + catch (...) + { + /// if no such file AND there was no `{}` glob -- this is an exception + /// otherwise ignore it, this is acceptable + if (expanded_keys.size() == 1) + throw; + continue; + } if (file_progress_callback) file_progress_callback(FileProgress(0, answer->info->size)); } From 5f190e6774aa1a38e6aae740b97f51882355cd25 Mon Sep 17 00:00:00 2001 From: serxa Date: Tue, 2 Apr 2024 11:12:34 +0000 Subject: [PATCH 024/289] Use intrusive lists for `ResourceRequest` instead of deque --- src/Common/Scheduler/Nodes/FifoQueue.h | 39 +++++++++++++++----------- src/Common/Scheduler/ResourceRequest.h | 4 ++- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/Common/Scheduler/Nodes/FifoQueue.h b/src/Common/Scheduler/Nodes/FifoQueue.h index 45ed32343ff..9ec997c06d2 100644 --- a/src/Common/Scheduler/Nodes/FifoQueue.h +++ b/src/Common/Scheduler/Nodes/FifoQueue.h @@ -6,7 +6,8 @@ #include -#include +#include + #include @@ -15,6 +16,7 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int INVALID_SCHEDULER_NODE; } @@ -42,7 +44,7 @@ public: std::lock_guard lock(mutex); queue_cost += request->cost; bool was_empty = requests.empty(); - requests.push_back(request); + requests.push_back(*request); if (was_empty) scheduleActivation(); } @@ -52,7 +54,7 @@ public: std::lock_guard lock(mutex); if (requests.empty()) return {nullptr, false}; - ResourceRequest * result = requests.front(); + ResourceRequest * result = &requests.front(); requests.pop_front(); if (requests.empty()) busy_periods++; @@ -65,19 +67,24 @@ public: bool cancelRequest(ResourceRequest * request) override { std::lock_guard lock(mutex); - // TODO(serxa): reimplement queue as intrusive list of ResourceRequest to make this O(1) instead of O(N) - for (auto i = requests.begin(), e = requests.end(); i != e; ++i) + if (request->is_linked()) { - if (*i == request) - { - requests.erase(i); - if (requests.empty()) - busy_periods++; - queue_cost -= request->cost; - canceled_requests++; - canceled_cost += request->cost; - return true; - } + // It's impossible to check that `request` is indeed inserted to this queue and not another queue. + // It's up to caller to make sure this is the case. Otherwise, list sizes will be corrupted. + // Not tracking list sizes is not an option, because another problem appears: removing from list w/o locking. 
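For readers unfamiliar with Boost.Intrusive, a minimal standalone sketch of the O(1) unlink this hunk switches to (`Request` and the `main` driver are illustrative stand-ins, not the real `ResourceRequest` or queue):

```cpp
#include <boost/intrusive/list.hpp>
#include <cassert>

// An intrusive list lets an element remove itself in O(1), because the hook
// stored inside the element knows its own position in the list.
struct Request : boost::intrusive::list_base_hook<>
{
    int cost = 0;
};

int main()
{
    boost::intrusive::list<Request> queue;
    Request a, b, c;
    queue.push_back(a);
    queue.push_back(b);
    queue.push_back(c);

    // Cancel `b` without scanning: iterator_to() recovers its position directly.
    if (b.is_linked())
        queue.erase(queue.iterator_to(b));
    assert(queue.size() == 2);

    queue.clear();  // unlink the remaining elements before their hooks are destroyed
}
```

Note the trade-off the hunk's comment describes: `is_linked()` only says the request sits in *some* list, which is why the caller must guarantee it is this queue's list.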
+ // Another possible solution - keep track if request `is_cancelable` guarded by `mutex` + // Simple check for list size corruption + if (requests.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "trying to cancel request (linked into another queue) from empty queue: {}", getPath()); + + requests.erase(requests.iterator_to(*request)); + + if (requests.empty()) + busy_periods++; + queue_cost -= request->cost; + canceled_requests++; + canceled_cost += request->cost; + return true; } return false; } @@ -124,7 +131,7 @@ public: private: std::mutex mutex; Int64 queue_cost = 0; - std::deque requests; // TODO(serxa): reimplement it using intrusive list to avoid allocations/deallocations and O(N) during cancel + boost::intrusive::list requests; }; } diff --git a/src/Common/Scheduler/ResourceRequest.h b/src/Common/Scheduler/ResourceRequest.h index f3153ad382c..d64f624cec5 100644 --- a/src/Common/Scheduler/ResourceRequest.h +++ b/src/Common/Scheduler/ResourceRequest.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -41,7 +42,7 @@ constexpr ResourceCost ResourceCostMax = std::numeric_limits::max(); * Returning true means successful cancel and therefore steps (4) and (5) are not going to happen * and step (6) MUST be omitted. */ -class ResourceRequest +class ResourceRequest : public boost::intrusive::list_base_hook<> { public: /// Cost of request execution; should be filled before request enqueueing. @@ -62,6 +63,7 @@ public: { cost = cost_; constraint = nullptr; + // Note that list_base_hook should be reset independently (by intrusive list) } virtual ~ResourceRequest() = default; From 7232bf45768f56c768ac03ed4b34c085bc6f060a Mon Sep 17 00:00:00 2001 From: zvonand Date: Tue, 2 Apr 2024 16:12:11 +0200 Subject: [PATCH 025/289] no reuse request --- src/Storages/StorageS3.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 09a5ffc86a5..28bfa3c32a9 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -252,6 +252,7 @@ private: return true; } + request = S3::ListObjectsV2Request{}; request.SetBucket(globbed_uri.bucket); request.SetPrefix(key_prefix); request.SetMaxKeys(static_cast(request_settings.list_object_keys_size)); From 25cab6f0713221e32b2c2fef844e2c2fde77e985 Mon Sep 17 00:00:00 2001 From: zvonand Date: Wed, 3 Apr 2024 20:57:10 +0200 Subject: [PATCH 026/289] fix schema inference cache (1) --- src/Storages/StorageS3.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 28bfa3c32a9..b19e61762d1 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -217,7 +217,7 @@ public: bool no_globs_in_key = fillBufferForKey(*expanded_keys_iter); expanded_keys_iter++; if (expanded_keys_iter == expanded_keys.end() && no_globs_in_key) - is_finished = true; + is_finished_for_key = true; } KeyWithInfoPtr next(size_t) @@ -241,6 +241,7 @@ private: bool fillBufferForKey(const std::string & uri_key) { + is_finished_for_key = false; const String key_prefix = uri_key.substr(0, uri_key.find_first_of("*?{")); /// We don't have to list bucket, because there is no asterisks. 
@@ -249,10 +250,12 @@ private: buffer.clear(); buffer.emplace_back(std::make_shared(uri_key, std::nullopt)); buffer_iter = buffer.begin(); + if (read_keys) + read_keys->insert(read_keys->end(), buffer.begin(), buffer.end()); return true; } - request = S3::ListObjectsV2Request{}; + request = {}; request.SetBucket(globbed_uri.bucket); request.SetPrefix(key_prefix); request.SetMaxKeys(static_cast(request_settings.list_object_keys_size)); @@ -308,11 +311,11 @@ private: bool no_globs_in_key = fillBufferForKey(*expanded_keys_iter); expanded_keys_iter++; if (expanded_keys_iter == expanded_keys.end() && no_globs_in_key) - is_finished = true; + is_finished_for_key = true; continue; } - if (is_finished) + if (is_finished_for_key) return {}; try @@ -327,7 +330,7 @@ private: /// it may take some time for threads to stop processors and they /// may still use this iterator after exception is thrown. /// To avoid this UB, reset the buffer and return defaults for further calls. - is_finished = true; + is_finished_for_key = true; buffer.clear(); buffer_iter = buffer.begin(); throw; @@ -351,9 +354,9 @@ private: const auto & result_batch = outcome.GetResult().GetContents(); /// It returns false when all objects were returned - is_finished = !outcome.GetResult().GetIsTruncated(); + is_finished_for_key = !outcome.GetResult().GetIsTruncated(); - if (!is_finished) + if (!is_finished_for_key) { /// Even if task is finished the thread may be not freed in pool. /// So wait until it will be freed before scheduling a new task. @@ -444,7 +447,7 @@ private: ActionsDAGPtr filter_dag; std::unique_ptr matcher; bool recursive{false}; - bool is_finished{false}; + bool is_finished_for_key{false}; KeysWithInfo * read_keys; S3::ListObjectsV2Request request; From ce3969e25d7ef8bd661fa6047ac0882735fd567a Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Thu, 4 Apr 2024 19:47:34 +0000 Subject: [PATCH 027/289] adapt test to new behavior --- tests/integration/test_storage_s3/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 6d5b84a8143..9d275575f8a 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1768,13 +1768,13 @@ def test_schema_inference_cache(started_cluster): check_cache(instance, []) run_describe_query(instance, files, storage_name, started_cluster, bucket) - check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4) + check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4 if storage_name == "url" else 1) instance.query("system drop schema cache") check_cache(instance, []) run_describe_query(instance, files, storage_name, started_cluster, bucket) - check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4) + check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4 if storage_name == "url" else 1) instance.query("system drop schema cache") From e3858107969d6f6363de343197608bf65693dd59 Mon Sep 17 00:00:00 2001 From: zvonand Date: Thu, 4 Apr 2024 22:18:41 +0200 Subject: [PATCH 028/289] fix black --- tests/integration/test_storage_s3/test.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 9d275575f8a..a4ed94c815b 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1768,13 +1768,27 @@ def 
test_schema_inference_cache(started_cluster): check_cache(instance, []) run_describe_query(instance, files, storage_name, started_cluster, bucket) - check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4 if storage_name == "url" else 1) + check_cache_misses( + instance, + files, + storage_name, + started_cluster, + bucket, + 4 if storage_name == "url" else 1, + ) instance.query("system drop schema cache") check_cache(instance, []) run_describe_query(instance, files, storage_name, started_cluster, bucket) - check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4 if storage_name == "url" else 1) + check_cache_misses( + instance, + files, + storage_name, + started_cluster, + bucket, + 4 if storage_name == "url" else 1, + ) instance.query("system drop schema cache") From 307f69380efbd7af9a3e897df04cc9307558ad91 Mon Sep 17 00:00:00 2001 From: skyoct Date: Sun, 7 Apr 2024 09:02:43 +0000 Subject: [PATCH 029/289] support clamp function --- .../functions/conditional-functions.md | 31 +++++++++ src/Functions/clamp.cpp | 68 +++++++++++++++++++ .../queries/0_stateless/03036_clamp.reference | 4 ++ tests/queries/0_stateless/03036_clamp.sql | 4 ++ 4 files changed, 107 insertions(+) create mode 100644 src/Functions/clamp.cpp create mode 100644 tests/queries/0_stateless/03036_clamp.reference create mode 100644 tests/queries/0_stateless/03036_clamp.sql diff --git a/docs/en/sql-reference/functions/conditional-functions.md b/docs/en/sql-reference/functions/conditional-functions.md index eb4e98961f1..c89de4d9958 100644 --- a/docs/en/sql-reference/functions/conditional-functions.md +++ b/docs/en/sql-reference/functions/conditional-functions.md @@ -234,3 +234,34 @@ SELECT least(toDateTime32(now() + toIntervalDay(1)), toDateTime64(now(), 3)) :::note The type returned is a DateTime64 as the DataTime32 must be promoted to 64 bit for the comparison. ::: + +## clamp + +Constrain the return value between A and B. + +**Syntax** + +``` sql +if(x, min, max) +``` + +**Arguments** + +- `x` – Input value. +- `min` – Limit the lower bound. +- `max` – Limit the upper bound. + +**Returned values** + +If the value is less than the minimum value, return the minimum value; if it is greater than the maximum value, return the maximum value; otherwise, return the current value. 
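For comparison, the documented behaviour matches C++17's `std::clamp`; a standalone check of the same semantics (illustration only, not part of the patch — the three assertions mirror the expected results in the patch's own `03036_clamp.reference`):

```cpp
#include <algorithm>
#include <cassert>

int main()
{
    // Below the range -> min, above the range -> max, inside the range -> unchanged.
    assert(std::clamp(1, 10, 20) == 10);
    assert(std::clamp(30, 10, 20) == 20);
    assert(std::clamp(15, 10, 20) == 15);
}
```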
+
+Examples:
+
+```sql
+SELECT least(1, 2, 3) result, toTypeName(result) type;
+```
+```response
+┌─result─┬─type────┐
+│      2 │ Float64 │
+└────────┴─────────┘
+```
\ No newline at end of file
diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp
new file mode 100644
index 00000000000..38be5a28f28
--- /dev/null
+++ b/src/Functions/clamp.cpp
@@ -0,0 +1,68 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+}
+
+
+class FunctionClamp : public IFunction
+{
+
+public:
+    static constexpr auto name = "clamp";
+
+    String getName() const override { return name; }
+    size_t getNumberOfArguments() const override { return 0; }
+    bool isVariadic() const override { return true; }
+    bool useDefaultImplementationForConstants() const override { return true; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
+    static FunctionPtr create(ContextPtr) { return std::make_shared(); }
+
+    DataTypePtr getReturnTypeImpl(const DataTypes & types) const override
+    {
+        if (types.size() != 3)
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} must be 3 arguments", getName());
+
+        return getLeastSupertype(types);
+    }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
+    {
+
+        size_t arg_size = arguments.size();
+        Columns converted_columns(arg_size);
+        for (size_t arg = 0; arg < arg_size; ++arg)
+            converted_columns[arg] = castColumn(arguments[arg], result_type)->convertToFullColumnIfConst();
+
+        auto result_column = result_type->createColumn();
+        for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
+        {
+            size_t best_arg = 0;
+
+            if (converted_columns[1]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) > 0)
+                best_arg = 1;
+            else if (converted_columns[2]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) < 0)
+                best_arg = 2;
+
+            result_column->insertFrom(*converted_columns[best_arg], row_num);
+        }
+
+        return result_column;
+    }
+
+};
+
+REGISTER_FUNCTION(Clamp)
+{
+    factory.registerFunction();
+}
+}
diff --git a/tests/queries/0_stateless/03036_clamp.reference b/tests/queries/0_stateless/03036_clamp.reference
new file mode 100644
index 00000000000..bd0d34dabea
--- /dev/null
+++ b/tests/queries/0_stateless/03036_clamp.reference
@@ -0,0 +1,4 @@
+10
+20
+15
+b
diff --git a/tests/queries/0_stateless/03036_clamp.sql b/tests/queries/0_stateless/03036_clamp.sql
new file mode 100644
index 00000000000..8250325661a
--- /dev/null
+++ b/tests/queries/0_stateless/03036_clamp.sql
@@ -0,0 +1,4 @@
+SELECT clamp(1, 10, 20);
+SELECT clamp(30, 10, 20);
+SELECT clamp(15, 10, 20);
+SELECT clamp('a', 'b', 'c');
\ No newline at end of file

From 8faa1487549e1dae0ca27d0550d7c90dd3102668 Mon Sep 17 00:00:00 2001
From: skyoct
Date: Mon, 8 Apr 2024 06:44:11 +0000
Subject: [PATCH 030/289] better

---
 docs/en/sql-reference/functions/conditional-functions.md | 4 ++--
 src/Functions/clamp.cpp                                  | 1 -
 tests/queries/0_stateless/03036_clamp.reference          | 4 ++++
 tests/queries/0_stateless/03036_clamp.sql                | 6 +++++-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/docs/en/sql-reference/functions/conditional-functions.md b/docs/en/sql-reference/functions/conditional-functions.md
index c89de4d9958..f19386af9f1 100644
--- a/docs/en/sql-reference/functions/conditional-functions.md
+++ b/docs/en/sql-reference/functions/conditional-functions.md
@@ -242,7 +242,7 @@ Constrain the return value between A and B.
 **Syntax**
 
 ``` sql
-if(x, min, max)
+clamp(x, min, max)
 ```
 
 **Arguments**
 
@@ -258,7 +258,7 @@ If the value is less than the minimum value, return the minimum value; if it is
 
 Examples:
 
 ```sql
-SELECT least(1, 2, 3) result, toTypeName(result) type;
+SELECT clamp(1, 2, 3) result, toTypeName(result) type;
 ```
 ```response
 ┌─result─┬─type────┐
diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp
index 38be5a28f28..0210d135235 100644
--- a/src/Functions/clamp.cpp
+++ b/src/Functions/clamp.cpp
@@ -37,7 +37,6 @@ public:
 
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
     {
-
         size_t arg_size = arguments.size();
         Columns converted_columns(arg_size);
         for (size_t arg = 0; arg < arg_size; ++arg)
             converted_columns[arg] = castColumn(arguments[arg], result_type)->convertToFullColumnIfConst();
diff --git a/tests/queries/0_stateless/03036_clamp.reference b/tests/queries/0_stateless/03036_clamp.reference
index bd0d34dabea..bea85d8ccdd 100644
--- a/tests/queries/0_stateless/03036_clamp.reference
+++ b/tests/queries/0_stateless/03036_clamp.reference
@@ -2,3 +2,7 @@
 20
 15
 b
+0
+['hello']
+-1
+234
diff --git a/tests/queries/0_stateless/03036_clamp.sql b/tests/queries/0_stateless/03036_clamp.sql
index 8250325661a..8d4f2112830 100644
--- a/tests/queries/0_stateless/03036_clamp.sql
+++ b/tests/queries/0_stateless/03036_clamp.sql
@@ -1,4 +1,8 @@
 SELECT clamp(1, 10, 20);
 SELECT clamp(30, 10, 20);
 SELECT clamp(15, 10, 20);
-SELECT clamp('a', 'b', 'c');
\ No newline at end of file
+SELECT clamp('a', 'b', 'c');
+SELECT clamp(today(), yesterday() - 10, yesterday() + 10) - today()
+SELECT clamp([], ['hello'], ['world']);
+SELECT clamp(-1., -1000., 18446744073709551615.);
+SELECT clamp(toNullable(123), 234, 456);

From a6d088bed8ff87c1901077b7c8960034445aa9d4 Mon Sep 17 00:00:00 2001
From: skyoct
Date: Mon, 8 Apr 2024 10:35:53 +0000
Subject: [PATCH 031/289] better

---
 src/Functions/clamp.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp
index 0210d135235..dc0ddffbd82 100644
--- a/src/Functions/clamp.cpp
+++ b/src/Functions/clamp.cpp
@@ -11,6 +11,7 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int BAD_ARGUMENTS;
 }
 
 
@@ -30,7 +31,7 @@ public:
     DataTypePtr getReturnTypeImpl(const DataTypes & types) const override
     {
         if (types.size() != 3)
-            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} must be 3 arguments", getName());
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires 3 arguments", getName());
 
         return getLeastSupertype(types);
     }
@@ -41,18 +42,20 @@ public:
         Columns converted_columns(arg_size);
         for (size_t arg = 0; arg < arg_size; ++arg)
             converted_columns[arg] = castColumn(arguments[arg], result_type)->convertToFullColumnIfConst();
+        if (converted_columns[1]->compareAt(0, 0, *converted_columns[2], 1) > 0)
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} the minimum value cannot be greater than the maximum value", getName());
 
         auto result_column = result_type->createColumn();
         for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
         {
             size_t best_arg = 0;
+            if (converted_columns[1]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) > 0)
+                best_arg = 1;
+            else if (converted_columns[2]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) < 0)
+                best_arg = 2;
 
-            result_column->insertFrom(*converted_columns[best_arg], row_num);
+            result_column->insertFrom(*converted_columns[best_arg], row_num);
         }
 
         return result_column;

From 0d6426d781538b899acfcfbff736f16422097466 Mon Sep 17 00:00:00 2001
From: skyoct
Date: Mon, 8 Apr 2024 13:34:54 +0000
Subject: [PATCH 032/289] better

---
 tests/queries/0_stateless/03036_clamp.reference | 2 ++
 tests/queries/0_stateless/03036_clamp.sql       | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/queries/0_stateless/03036_clamp.reference b/tests/queries/0_stateless/03036_clamp.reference
index bea85d8ccdd..5c8de57794d 100644
--- a/tests/queries/0_stateless/03036_clamp.reference
+++ b/tests/queries/0_stateless/03036_clamp.reference
@@ -6,3 +6,5 @@ b
 ['hello']
 -1
 234
+null
+null
\ No newline at end of file
diff --git a/tests/queries/0_stateless/03036_clamp.sql b/tests/queries/0_stateless/03036_clamp.sql
index 8d4f2112830..e075096f8ef 100644
--- a/tests/queries/0_stateless/03036_clamp.sql
+++ b/tests/queries/0_stateless/03036_clamp.sql
@@ -6,3 +6,5 @@ SELECT clamp(today(), yesterday() - 10, yesterday() + 10) - today()
 SELECT clamp([], ['hello'], ['world']);
 SELECT clamp(-1., -1000., 18446744073709551615.);
 SELECT clamp(toNullable(123), 234, 456);
+select clamp(1, null, 5);
+select clamp(1, 6, null);
\ No newline at end of file

From 5d36d1aa4efad0843d4a9389003c8389f7b2e48a Mon Sep 17 00:00:00 2001
From: skyoct
Date: Mon, 8 Apr 2024 13:53:33 +0000
Subject: [PATCH 033/289] style

---
 src/Functions/clamp.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp
index dc0ddffbd82..0593c6c6b16 100644
--- a/src/Functions/clamp.cpp
+++ b/src/Functions/clamp.cpp
@@ -49,7 +49,6 @@ public:
         for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
         {
             size_t best_arg = 0;
-
             if (converted_columns[1]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) > 0)
                 best_arg = 1;
             else if (converted_columns[2]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) < 0)

From 88548eb19eb07ea29c8d603d77a43039844a351c Mon Sep 17 00:00:00 2001
From: avogar
Date: Mon, 8 Apr 2024 18:59:04 +0000
Subject: [PATCH 034/289] Fix exception message during writing to partitioned
 s3 path with globs

---
 src/Storages/StorageS3.cpp                    | 22 +++++++++++++------
 src/Storages/StorageS3.h                      |  6 +++--
 ...rite_to_globbed_partitioned_path.reference |  0
 ...7_s3_write_to_globbed_partitioned_path.sql |  2 ++
 4 files changed, 21 insertions(+), 9 deletions(-)
 create mode 100644 tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.reference
 create mode 100644 tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql

diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index 6cda0fca60b..c0802d02120 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -207,7 +207,7 @@ public:
         , list_objects_scheduler(threadPoolCallbackRunner(list_objects_pool, "ListObjects"))
         , file_progress_callback(file_progress_callback_)
     {
-        if (globbed_uri.bucket.find_first_of("*?{") != std::string::npos)
+        if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos)
             throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Expression can not have wildcards inside bucket name");
 
         const String key_prefix = globbed_uri.key.substr(0,
globbed_uri.key.find_first_of("*?{")); @@ -1194,7 +1194,7 @@ void ReadFromStorageS3Step::createIterator(const ActionsDAG::Node * predicate) void ReadFromStorageS3Step::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { - if (storage.partition_by && query_configuration.withWildcard()) + if (storage.partition_by && query_configuration.withPartitionWildcard()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned S3 storage is not implemented yet"); createIterator(nullptr); @@ -1249,12 +1249,16 @@ SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr { auto query_configuration = updateConfigurationAndGetCopy(local_context); + if (query_configuration.withGlobsIgnorePartitionWildcard()) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, + "S3 key '{}' contains globs, so the table is in readonly mode", query_configuration.url.key); + auto sample_block = metadata_snapshot->getSampleBlock(); auto chosen_compression_method = chooseCompressionMethod(query_configuration.keys.back(), query_configuration.compression_method); auto insert_query = std::dynamic_pointer_cast(query); auto partition_by_ast = insert_query ? (insert_query->partition_by ? insert_query->partition_by : partition_by) : nullptr; - bool is_partitioned_implementation = partition_by_ast && query_configuration.withWildcard(); + bool is_partitioned_implementation = partition_by_ast && query_configuration.withPartitionWildcard(); if (is_partitioned_implementation) { @@ -1271,10 +1275,6 @@ SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr } else { - if (query_configuration.withGlobs()) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, - "S3 key '{}' contains globs, so the table is in readonly mode", query_configuration.url.key); - bool truncate_in_insert = local_context->getSettingsRef().s3_truncate_on_insert; if (!truncate_in_insert && S3::objectExists(*query_configuration.client, query_configuration.url.bucket, query_configuration.keys.back(), query_configuration.url.version_id, query_configuration.request_settings)) @@ -1460,6 +1460,14 @@ void StorageS3::Configuration::connect(const ContextPtr & context) credentials.GetSessionToken()); } +bool StorageS3::Configuration::withGlobsIgnorePartitionWildcard() const +{ + if (!withPartitionWildcard()) + return withGlobs(); + + return PartitionedSink::replaceWildcards(getPath(), "").find_first_of("*?{") != std::string::npos; +} + void StorageS3::processNamedCollectionResult(StorageS3::Configuration & configuration, const NamedCollection & collection) { validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 19cbfaa6f08..e657db7bd35 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -274,7 +274,7 @@ public: { Configuration() = default; - String getPath() const { return url.key; } + const String & getPath() const { return url.key; } bool update(const ContextPtr & context); @@ -282,13 +282,15 @@ public: bool withGlobs() const { return url.key.find_first_of("*?{") != std::string::npos; } - bool withWildcard() const + bool withPartitionWildcard() const { static const String PARTITION_ID_WILDCARD = "{_partition_id}"; return url.bucket.find(PARTITION_ID_WILDCARD) != String::npos || keys.back().find(PARTITION_ID_WILDCARD) != String::npos; } + bool withGlobsIgnorePartitionWildcard() const; + S3::URI url; S3::AuthSettings auth_settings; 
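As an aside before the header's remaining members: the key predicate patch 034 introduces is `withGlobsIgnorePartitionWildcard` — strip the `{_partition_id}` wildcard first, then look for glob characters. A standalone sketch of the same check, with a simplified stand-in for `PartitionedSink::replaceWildcards` (only the `find_first_of("*?{")` test mirrors the patch verbatim):

```cpp
#include <cassert>
#include <string>

// Simplified stand-in: substitute every occurrence of the partition wildcard.
std::string replaceWildcards(std::string path, const std::string & partition_id)
{
    static const std::string wildcard = "{_partition_id}";
    for (size_t pos; (pos = path.find(wildcard)) != std::string::npos;)
        path.replace(pos, wildcard.size(), partition_id);
    return path;
}

// Globs remain "real" only if they survive removal of the partition wildcard.
bool withGlobsIgnorePartitionWildcard(const std::string & path)
{
    return replaceWildcards(path, "").find_first_of("*?{") != std::string::npos;
}

int main()
{
    assert(!withGlobsIgnorePartitionWildcard("data_{_partition_id}.csv"));   // writable
    assert(withGlobsIgnorePartitionWildcard("data_*_{_partition_id}.csv"));  // readonly
}
```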
S3Settings::RequestSettings request_settings; diff --git a/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.reference b/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql b/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql new file mode 100644 index 00000000000..400bc790f0b --- /dev/null +++ b/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql @@ -0,0 +1,2 @@ +insert into function s3('http://localhost:11111/test/data_*_{_partition_id}.csv') partition by number % 3 select * from numbers(10); -- {serverError DATABASE_ACCESS_DENIED} + From 21ab0e4ea27b389ab697cb5f045a9600bd0fa7c2 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 Apr 2024 19:01:00 +0000 Subject: [PATCH 035/289] Remove bad change --- src/Storages/StorageS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index c0802d02120..8de139ce366 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -207,7 +207,7 @@ public: , list_objects_scheduler(threadPoolCallbackRunner(list_objects_pool, "ListObjects")) , file_progress_callback(file_progress_callback_) { - if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) + if (globbed_uri.bucket.find_first_of("*?{") != std::string::npos) throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Expression can not have wildcards inside bucket name"); const String key_prefix = globbed_uri.key.substr(0, globbed_uri.key.find_first_of("*?{")); From 78dd23fd83e541c73b2fec0bc6bdf19d8d5b9297 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 8 Apr 2024 21:23:51 +0200 Subject: [PATCH 036/289] Don't run new test in fasttest --- .../0_stateless/03037_s3_write_to_globbed_partitioned_path.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql b/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql index 400bc790f0b..1de89a593b0 100644 --- a/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql +++ b/tests/queries/0_stateless/03037_s3_write_to_globbed_partitioned_path.sql @@ -1,2 +1,4 @@ +-- Tags: no-fasttest + insert into function s3('http://localhost:11111/test/data_*_{_partition_id}.csv') partition by number % 3 select * from numbers(10); -- {serverError DATABASE_ACCESS_DENIED} From 8accf395fb9cb3f2b896151ab09a902c674b9809 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 Apr 2024 20:37:06 +0000 Subject: [PATCH 037/289] Fix for HDFS and Azure --- src/Storages/HDFS/StorageHDFS.cpp | 4 ++++ src/Storages/StorageAzureBlob.cpp | 19 +++++++++++++------ src/Storages/StorageAzureBlob.h | 4 +++- .../test_storage_azure_blob_storage/test.py | 14 ++++++++++++++ tests/integration/test_storage_hdfs/test.py | 10 ++++++++++ 5 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 93cf64f30a0..fe5d590770e 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -1065,6 +1065,10 @@ SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataP if (is_partitioned_implementation) { + String path = current_uri.substr(current_uri.find('/', current_uri.find("//") + 2)); + if 
(PartitionedSink::replaceWildcards(path, "").find_first_of("*?{") != std::string::npos) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "URI '{}' contains globs, so the table is in readonly mode", uris.back()); + return std::make_shared( partition_by_ast, current_uri, diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 306a5eac8e5..1eb18021653 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -462,6 +462,13 @@ Poco::URI StorageAzureBlob::Configuration::getConnectionURL() const return Poco::URI(parsed_connection_string.BlobServiceUrl.GetAbsoluteUrl()); } +bool StorageAzureBlob::Configuration::withGlobsIgnorePartitionWildcard() const +{ + if (!withPartitionWildcard()) + return withGlobs(); + + return PartitionedSink::replaceWildcards(getPath(), "").find_first_of("*?{") != std::string::npos; +} StorageAzureBlob::StorageAzureBlob( const Configuration & configuration_, @@ -749,7 +756,7 @@ void StorageAzureBlob::read( size_t max_block_size, size_t num_streams) { - if (partition_by && configuration.withWildcard()) + if (partition_by && configuration.withPartitionWildcard()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned Azure storage is not implemented yet"); auto this_ptr = std::static_pointer_cast(shared_from_this()); @@ -836,12 +843,16 @@ void ReadFromAzureBlob::initializePipeline(QueryPipelineBuilder & pipeline, cons SinkToStoragePtr StorageAzureBlob::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { + if (configuration.withGlobsIgnorePartitionWildcard()) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, + "AzureBlobStorage key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); + auto sample_block = metadata_snapshot->getSampleBlock(); auto chosen_compression_method = chooseCompressionMethod(configuration.blobs_paths.back(), configuration.compression_method); auto insert_query = std::dynamic_pointer_cast(query); auto partition_by_ast = insert_query ? (insert_query->partition_by ? 
insert_query->partition_by : partition_by) : nullptr; - bool is_partitioned_implementation = partition_by_ast && configuration.withWildcard(); + bool is_partitioned_implementation = partition_by_ast && configuration.withPartitionWildcard(); if (is_partitioned_implementation) { @@ -857,10 +868,6 @@ SinkToStoragePtr StorageAzureBlob::write(const ASTPtr & query, const StorageMeta } else { - if (configuration.withGlobs()) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, - "AzureBlobStorage key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); - bool truncate_in_insert = local_context->getSettingsRef().azure_truncate_on_insert; if (!truncate_in_insert && object_storage->exists(StoredObject(configuration.blob_path))) diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 3f1ba33f636..928a8204bb2 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -37,12 +37,14 @@ public: bool withGlobs() const { return blob_path.find_first_of("*?{") != std::string::npos; } - bool withWildcard() const + bool withPartitionWildcard() const { static const String PARTITION_ID_WILDCARD = "{_partition_id}"; return blobs_paths.back().find(PARTITION_ID_WILDCARD) != String::npos; } + bool withGlobsIgnorePartitionWildcard() const; + Poco::URI getConnectionURL() const; std::string connection_url; diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 7d30265e4f8..1c87feee297 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -1323,6 +1323,20 @@ def test_format_detection(cluster): assert result == expected_result +def test_write_to_globbed_partitioned_path(cluster): + node = cluster.instances["node"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + account_name = "devstoreaccount1" + account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + error = azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_data_*_{{_partition_id}}', '{account_name}', '{account_key}', 'CSV', 'auto', 'x UInt64') partition by 42 select 42", + expect_error="true", + ) + + assert "DATABASE_ACCESS_DENIED" in error + + def test_parallel_read(cluster): node = cluster.instances["node"] connection_string = cluster.env_variables["AZURITE_CONNECTION_STRING"] diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 9dec1954406..5daf8618036 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -1116,6 +1116,16 @@ def test_format_detection(started_cluster): assert expected_result == result +def test_write_to_globbed_partitioned_path(started_cluster): + node = started_cluster.instances["node1"] + + error = node.query_and_get_error( + "insert into function hdfs('hdfs://hdfs1:9000/test_data_*_{_partition_id}.csv') partition by 42 select 42" + ) + + assert "DATABASE_ACCESS_DENIED" in error + + if __name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") From 08f700118f266234ad3eba15ea95f91a85d81db8 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Tue, 9 Apr 2024 13:24:37 +0800 Subject: [PATCH 038/289] [doc] update npy format data types --- docs/en/interfaces/formats.md | 33 ++++++++++++++++----------------- 1 file 
changed, 16 insertions(+), 17 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 03cf345349e..cc3ea467ab1 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -2465,23 +2465,22 @@ Result: ## Npy {#data-format-npy} -This function is designed to load a NumPy array from a .npy file into ClickHouse. The NumPy file format is a binary format used for efficiently storing arrays of numerical data. During import, ClickHouse treats top level dimension as an array of rows with single column. Supported Npy data types and their corresponding type in ClickHouse: -| Npy type | ClickHouse type | -|:--------:|:---------------:| -| b1 | UInt8 | -| i1 | Int8 | -| i2 | Int16 | -| i4 | Int32 | -| i8 | Int64 | -| u1 | UInt8 | -| u2 | UInt16 | -| u4 | UInt32 | -| u8 | UInt64 | -| f2 | Float32 | -| f4 | Float32 | -| f8 | Float64 | -| S | String | -| U | String | +This function is designed to load a NumPy array from a .npy file into ClickHouse. The NumPy file format is a binary format used for efficiently storing arrays of numerical data. During import, ClickHouse treats top level dimension as an array of rows with single column. Supported Npy data types and their corresponding type in ClickHouse: + +| Npy data type (`INSERT`) | ClickHouse data type | Npy data type (`SELECT`) | +|--------------------------|-----------------------------------------------------------------|--------------------------| +| `i1` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `i1` | +| `i2` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `i2` | +| `i4` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `i4` | +| `i8` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `i8` | +| `u1`, `b1` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `u1` | +| `u2` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `u2` | +| `u4` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `u4` | +| `u8` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `u8` | +| `f2`, `f4` | [Float32](/docs/en/sql-reference/data-types/float.md) | `f4` | +| `f8` | [Float64](/docs/en/sql-reference/data-types/float.md) | `f8` | +| `S`, `U` | [String](/docs/en/sql-reference/data-types/string.md) | `S` | +| | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `S` | **Example of saving an array in .npy format using Python** From c0174fa17efc766bb49db0be67b6b5f7383429fc Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Tue, 9 Apr 2024 14:30:14 +0800 Subject: [PATCH 039/289] [feature] add npy output format --- docs/en/interfaces/formats.md | 8 + src/Formats/registerFormats.cpp | 6 +- .../Formats/Impl/NpyOutputFormat.cpp | 223 ++++++++++++++++++ src/Processors/Formats/Impl/NpyOutputFormat.h | 66 ++++++ .../02895_npy_output_format.reference | 60 +++++ .../0_stateless/02895_npy_output_format.sh | 110 +++++++++ 6 files changed, 471 insertions(+), 2 deletions(-) create mode 100644 src/Processors/Formats/Impl/NpyOutputFormat.cpp create mode 100644 src/Processors/Formats/Impl/NpyOutputFormat.h create mode 100644 tests/queries/0_stateless/02895_npy_output_format.reference create mode 100755 tests/queries/0_stateless/02895_npy_output_format.sh diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index cc3ea467ab1..f4b082c57ab 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -2507,6 +2507,14 @@ Result: └───────────────┘ ``` +**Selecting Data** + +You can select data 
from a ClickHouse table and save it into a file in the Npy format with the following command:
+
+```bash
+$ clickhouse-client --query="SELECT {column} FROM {some_table} FORMAT Npy" > {filename.npy}
+```
+
 ## LineAsString {#lineasstring}
 
 In this format, every line of input data is interpreted as a single string value. This format can only be parsed for table with a single field of type [String](/docs/en/sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](/docs/en/sql-reference/statements/create/table.md/#default) or [MATERIALIZED](/docs/en/sql-reference/statements/create/table.md/#materialized), or omitted.

diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp
index cc9cf380693..2070956883c 100644
--- a/src/Formats/registerFormats.cpp
+++ b/src/Formats/registerFormats.cpp
@@ -76,6 +76,8 @@ void registerInputFormatCustomSeparated(FormatFactory & factory);
 void registerOutputFormatCustomSeparated(FormatFactory & factory);
 void registerInputFormatCapnProto(FormatFactory & factory);
 void registerOutputFormatCapnProto(FormatFactory & factory);
+void registerInputFormatNpy(FormatFactory & factory);
+void registerOutputFormatNpy(FormatFactory & factory);
 
 /// Output only (presentational) formats.
 
@@ -103,7 +105,6 @@ void registerInputFormatMySQLDump(FormatFactory & factory);
 void registerInputFormatParquetMetadata(FormatFactory & factory);
 void registerInputFormatDWARF(FormatFactory & factory);
 void registerInputFormatOne(FormatFactory & factory);
-void registerInputFormatNpy(FormatFactory & factory);
 
 #if USE_HIVE
 void registerInputFormatHiveText(FormatFactory & factory);
@@ -221,6 +222,8 @@ void registerFormats()
     registerOutputFormatAvro(factory);
     registerInputFormatArrow(factory);
     registerOutputFormatArrow(factory);
+    registerInputFormatNpy(factory);
+    registerOutputFormatNpy(factory);
 
     registerOutputFormatPretty(factory);
     registerOutputFormatPrettyCompact(factory);
@@ -251,7 +254,6 @@ void registerFormats()
     registerInputFormatParquetMetadata(factory);
     registerInputFormatDWARF(factory);
     registerInputFormatOne(factory);
-    registerInputFormatNpy(factory);
 
     registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory);
     registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory);

diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
new file mode 100644
index 00000000000..4a4af67b07e
--- /dev/null
+++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
@@ -0,0 +1,223 @@
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int TOO_MANY_COLUMNS;
+    extern const int BAD_ARGUMENTS;
+    extern const int ILLEGAL_COLUMN;
+}
+
+namespace
+{
+
+template <typename ColumnType, typename ValueType>
+void writeNumpyNumbers(const ColumnPtr & column, WriteBuffer & buf)
+{
+    const auto * number_column = assert_cast<const ColumnType *>(column.get());
+    for (size_t i = 0; i < number_column->size(); ++i)
+        writeBinaryLittleEndian(ValueType(number_column->getElement(i)), buf);
+}
+
+template <typename ColumnType>
+void writeNumpyStrings(const ColumnPtr & column, size_t length, WriteBuffer & buf)
+{
+    const auto * string_column = assert_cast<const ColumnType *>(column.get());
+    for (size_t i = 0; i < string_column->size(); ++i)
+        buf.write(string_column->getDataAt(i).data, length);
+}
+
+}
+
+String NpyOutputFormat::NumpyDataType::str()
+{
+    std::ostringstream dtype;
+    dtype << endianness << type << std::to_string(size);
+    return dtype.str();
+}
+
+NpyOutputFormat::NpyOutputFormat(WriteBuffer
& out_, const Block & header_) : IOutputFormat(header_, out_)
+{
+    const auto & header = getPort(PortKind::Main).getHeader();
+    auto data_types = header.getDataTypes();
+    if (data_types.size() > 1)
+        throw Exception(ErrorCodes::TOO_MANY_COLUMNS, "Expected single column for Npy output format, got {}", data_types.size());
+    data_type = data_types[0];
+}
+
+void NpyOutputFormat::initialize(const ColumnPtr & column)
+{
+    auto type = data_type;
+    ColumnPtr nested_column = column;
+    while (type->getTypeId() == TypeIndex::Array)
+    {
+        const auto * array_column = assert_cast<const ColumnArray *>(nested_column.get());
+        numpy_shape.push_back(array_column->getOffsets()[0]);
+        type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
+        nested_column = array_column->getDataPtr();
+    }
+
+    switch (type->getTypeId())
+    {
+        case TypeIndex::Int8: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int8)); break;
+        case TypeIndex::Int16: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int16)); break;
+        case TypeIndex::Int32: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int32)); break;
+        case TypeIndex::Int64: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int64)); break;
+        case TypeIndex::UInt8: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt8)); break;
+        case TypeIndex::UInt16: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt16)); break;
+        case TypeIndex::UInt32: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt32)); break;
+        case TypeIndex::UInt64: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt64)); break;
+        case TypeIndex::Float32: numpy_data_type = NumpyDataType('<', 'f', sizeof(Float32)); break;
+        case TypeIndex::Float64: numpy_data_type = NumpyDataType('<', 'f', sizeof(Float64)); break;
+        case TypeIndex::FixedString: numpy_data_type = NumpyDataType('|', 'S', assert_cast<const DataTypeFixedString *>(type.get())->getN()); break;
+        case TypeIndex::String: numpy_data_type = NumpyDataType('|', 'S', 0); break;
+        default:
+            has_exception = true;
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type {} is not supported for Npy output format", type->getName());
+    }
+    nested_data_type = type;
+}
+
+void NpyOutputFormat::consume(Chunk chunk)
+{
+    if (!has_exception)
+    {
+        num_rows += chunk.getNumRows();
+        auto column = chunk.getColumns()[0];
+
+        if (!is_initialized)
+        {
+            initialize(column);
+            is_initialized = true;
+        }
+
+        /// check shape
+        auto type = data_type;
+        ColumnPtr nested_column = column;
+        int dim = 0;
+        while (type->getTypeId() == TypeIndex::Array)
+        {
+            const auto * array_column = assert_cast<const ColumnArray *>(nested_column.get());
+            const auto & array_offset = array_column->getOffsets();
+            for (size_t i = 1; i < array_offset.size(); ++i)
+            {
+                if (array_offset[i] - array_offset[i - 1] != numpy_shape[dim])
+                {
+                    has_exception = true;
+                    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse doesn't support object types, cannot format ragged nested sequences (which is a list of arrays with different shapes)");
+                }
+            }
+            type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
+            nested_column = array_column->getDataPtr();
+            dim++;
+        }
+
+        /// for type String, get maximum string length
+        if (type->getTypeId() == TypeIndex::String)
+        {
+            const auto & string_offsets = assert_cast<const ColumnString *>(nested_column.get())->getOffsets();
+            for (size_t i = 0; i < string_offsets.size(); ++i)
+            {
+                size_t string_length = static_cast<size_t>(string_offsets[i] - 1 - string_offsets[i - 1]);
+                numpy_data_type.size = numpy_data_type.size > string_length ?
numpy_data_type.size : string_length;
+            }
+        }
+
+        columns.push_back(nested_column);
+    }
+}
+
+void NpyOutputFormat::finalizeImpl()
+{
+    if (!has_exception)
+    {
+        writeHeader();
+        writeColumns();
+    }
+}
+
+void NpyOutputFormat::writeHeader()
+{
+    std::ostringstream static_header;
+    static_header << MAGIC_STRING << MAJOR_VERSION << MINOR_VERSION;
+    String static_header_str = static_header.str();
+
+    std::ostringstream shape;
+    shape << '(' << std::to_string(num_rows) << ',';
+    for (auto dim : numpy_shape)
+        shape << std::to_string(dim) << ',';
+    shape << ')';
+
+    std::ostringstream dict;
+    dict << "{'descr':'" << numpy_data_type.str() << "','fortran_order':False,'shape':" << shape.str() << ",}";
+    String dict_str = dict.str();
+    String padding_str = "\n";
+
+    /// completes the length of the header, which is divisible by 64.
+    size_t dict_length = dict_str.length() + 1;
+    size_t header_length = static_header_str.length() + sizeof(UInt32) + dict_length;
+    if (header_length % 64)
+    {
+        header_length = ((header_length / 64) + 1) * 64;
+        dict_length = header_length - static_header_str.length() - sizeof(UInt32);
+        padding_str = std::string(dict_length - dict_str.length(), '\x20');
+        padding_str.back() = '\n';
+    }
+
+    out.write(static_header_str.data(), static_header_str.length());
+    writeBinaryLittleEndian(assert_cast<UInt32>(dict_length), out);
+    out.write(dict_str.data(), dict_str.length());
+    out.write(padding_str.data(), padding_str.length());
+}
+
+void NpyOutputFormat::writeColumns()
+{
+    for (auto column : columns)
+    {
+        switch (nested_data_type->getTypeId())
+        {
+            case TypeIndex::Int8: writeNumpyNumbers<ColumnInt8, Int8>(column, out); break;
+            case TypeIndex::Int16: writeNumpyNumbers<ColumnInt16, Int16>(column, out); break;
+            case TypeIndex::Int32: writeNumpyNumbers<ColumnInt32, Int32>(column, out); break;
+            case TypeIndex::Int64: writeNumpyNumbers<ColumnInt64, Int64>(column, out); break;
+            case TypeIndex::UInt8: writeNumpyNumbers<ColumnUInt8, UInt8>(column, out); break;
+            case TypeIndex::UInt16: writeNumpyNumbers<ColumnUInt16, UInt16>(column, out); break;
+            case TypeIndex::UInt32: writeNumpyNumbers<ColumnUInt32, UInt32>(column, out); break;
+            case TypeIndex::UInt64: writeNumpyNumbers<ColumnUInt64, UInt64>(column, out); break;
+            case TypeIndex::Float32: writeNumpyNumbers<ColumnFloat32, Float32>(column, out); break;
+            case TypeIndex::Float64: writeNumpyNumbers<ColumnFloat64, Float64>(column, out); break;
+            case TypeIndex::FixedString: writeNumpyStrings<ColumnFixedString>(column, numpy_data_type.size, out); break;
+            case TypeIndex::String: writeNumpyStrings<ColumnString>(column, numpy_data_type.size, out); break;
+            default: break;
+        }
+    }
+}
+
+void registerOutputFormatNpy(FormatFactory & factory)
+{
+    factory.registerOutputFormat("Npy",[](
+        WriteBuffer & buf,
+        const Block & sample,
+        const FormatSettings &)
+    {
+        return std::make_shared<NpyOutputFormat>(buf, sample);
+    });
+    factory.markFormatHasNoAppendSupport("Npy");
+}
+
+}

diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.h b/src/Processors/Formats/Impl/NpyOutputFormat.h
new file mode 100644
index 00000000000..083aa928b7c
--- /dev/null
+++ b/src/Processors/Formats/Impl/NpyOutputFormat.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+/** Stream for output data in Npy format.
+ * https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html
+ */
+class NpyOutputFormat : public IOutputFormat
+{
+public:
+    NpyOutputFormat(WriteBuffer & out_, const Block & header_);
+
+    String getName() const override { return "NpyOutputFormat"; }
+
+    String getContentType() const override { return "application/octet-stream"; }
+
+private:
+    struct NumpyDataType
+    {
+        char endianness;
+        char type;
+        size_t size;
+
+        NumpyDataType() = default;
+        NumpyDataType(char endianness_, char type_, size_t size_)
+            : endianness(endianness_), type(type_), size(size_) {}
+        String str();
+    };
+
+    void initialize(const ColumnPtr & column);
+    void consume(Chunk) override;
+    void finalizeImpl() override;
+    void writeHeader();
+    void writeColumns();
+
+    bool is_initialized = false;
+    bool has_exception = false;
+
+    DataTypePtr data_type;
+    DataTypePtr nested_data_type;
+    NumpyDataType numpy_data_type;
+    UInt64 num_rows = 0;
+    std::vector<UInt64> numpy_shape;
+    Columns columns;
+
+    /// static header (version 3.0)
+    constexpr static auto MAGIC_STRING = "\x93NUMPY";
+    constexpr static auto MAJOR_VERSION = '\x03';
+    constexpr static auto MINOR_VERSION = '\x00';
+};
+
+}

diff --git a/tests/queries/0_stateless/02895_npy_output_format.reference b/tests/queries/0_stateless/02895_npy_output_format.reference
new file mode 100644
index 00000000000..b599f1dceea
--- /dev/null
+++ b/tests/queries/0_stateless/02895_npy_output_format.reference
@@ -0,0 +1,60 @@
+-1
+-1
+-1
+-1
+-1
+-1
+-1
+-1
+-1
+-1
+-1
+-1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+0.1
+0.1
+0.1
+0.01
+0.01
+0.01
+npy
+npy
+npy
+npy
+npy
+npy
+array Int8
+array Int16
+array Int32
+array Int64
+array UInt8
+array UInt16
+array UInt32
+array UInt64
+array Float32
+array Float64
+array String
+array String
+[[[1],[2]],[[3],[4]]]
+[[[1],[2]],[[3],[4]]]
+[[[1],[2]],[[3],[4]]]
+[[0.1],[0.2]]
+[[0.1],[0.2]]
+[[0.1],[0.2]]
+[['abb','bbc'],['ccc','dddd']]
+[['abb','bbc'],['ccc','dddd']]
+[['abb','bbc'],['ccc','dddd']]
+array Array(Array(Array(Int8)))
+array Array(Array(Float64))
+array Array(Array(String))

diff --git a/tests/queries/0_stateless/02895_npy_output_format.sh b/tests/queries/0_stateless/02895_npy_output_format.sh
new file mode 100755
index 00000000000..e5226e88a8d
--- /dev/null
+++ b/tests/queries/0_stateless/02895_npy_output_format.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# Tags: no-parallel
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. 
"$CURDIR"/../shell_config.sh + +user_files_path=$($CLICKHOUSE_CLIENT_BINARY -q "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir -p ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ +rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/* +chmod 777 ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ + +${CLICKHOUSE_CLIENT} -q "DROP DATABASE IF EXISTS npy_output_02895;" +${CLICKHOUSE_CLIENT} -q "CREATE DATABASE IF NOT EXISTS npy_output_02895;" + +### test common type +${CLICKHOUSE_CLIENT} -q "CREATE TABLE IF NOT EXISTS npy_output_02895.common +( + i1 Int8, + i2 Int16, + i4 Int32, + i8 Int64, + u1 UInt8, + u2 UInt16, + u4 UInt32, + u8 UInt64, + f4 Float32, + f8 Float64, + fs FixedString(10), + s String, + unknow Int128 +) Engine = MergeTree ORDER BY i1;" + +${CLICKHOUSE_CLIENT} -q "INSERT INTO npy_output_02895.common VALUES (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1), (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1), (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1);" + +${CLICKHOUSE_CLIENT} -n -q "SELECT * FROM npy_output_02895.common FORMAT Npy; -- { clientError TOO_MANY_COLUMNS }" +${CLICKHOUSE_CLIENT} -n -q "SELECT unknow FROM npy_output_02895.common FORMAT Npy; -- { clientError BAD_ARGUMENTS }" + +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy') SELECT i1 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy') SELECT i2 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy') SELECT i4 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy') SELECT i8 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy') SELECT u1 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy') SELECT u2 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy') SELECT u4 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy') SELECT u8 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy') SELECT f4 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy') SELECT f8 FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy') SELECT fs FROM npy_output_02895.common;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy') SELECT s FROM npy_output_02895.common;" + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy');" 
+${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy');" + +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy');" + +### test nested type +${CLICKHOUSE_CLIENT} -q "CREATE TABLE IF NOT EXISTS npy_output_02895.nested +( + i4 Array(Array(Array(Int8))), + f8 Array(Array(Float64)), + s Array(Array(String)), + unknow Array(Int128), + ragged_1 Array(Array(Int32)), + ragged_2 Array(Array(Int32)) +) Engine = MergeTree ORDER BY i4;" + +${CLICKHOUSE_CLIENT} -q "INSERT INTO npy_output_02895.nested VALUES ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1, 2], [3, 4]], [[1, 2], [3]]), ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1, 2, 3], [4]], [[1, 2], [3]]), ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1], [2, 3, 4]], [[1, 2], [3]]);" + +${CLICKHOUSE_CLIENT} -n -q "SELECT * FROM npy_output_02895.nested 
FORMAT Npy; -- { clientError TOO_MANY_COLUMNS }" +${CLICKHOUSE_CLIENT} -n -q "SELECT unknow FROM npy_output_02895.nested FORMAT Npy; -- { clientError BAD_ARGUMENTS }" +${CLICKHOUSE_CLIENT} -n -q "SELECT ragged_1 FROM npy_output_02895.nested FORMAT Npy; -- { clientError ILLEGAL_COLUMN }" +${CLICKHOUSE_CLIENT} -n -q "SELECT ragged_2 FROM npy_output_02895.nested FORMAT Npy; -- { clientError ILLEGAL_COLUMN }" + +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy') SELECT i4 FROM npy_output_02895.nested;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy') SELECT f8 FROM npy_output_02895.nested;" +${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy') SELECT s FROM npy_output_02895.nested;" + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy');" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy');" + +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy');" +${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy');" + +${CLICKHOUSE_CLIENT} -q "DROP DATABASE IF EXISTS npy_output_02895;" + +rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?} From fe66d1b67a357155d35ca364a06c43cd44770056 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Tue, 9 Apr 2024 20:23:45 +0800 Subject: [PATCH 040/289] fix style --- .../Formats/Impl/NpyOutputFormat.cpp | 45 ++++++++++--------- src/Processors/Formats/Impl/NpyOutputFormat.h | 5 +-- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp index 4a4af67b07e..d04d095e84f 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -46,8 +47,11 @@ void writeNumpyStrings(const ColumnPtr & column, size_t length, WriteBuffer & bu String NpyOutputFormat::NumpyDataType::str() { - std::ostringstream dtype; - dtype << endianness << type << std::to_string(size); + WriteBufferFromOwnString dtype; + writeChar(endianness, dtype); + writeChar(type, dtype); + writeIntText(size, dtype); + return dtype.str(); } @@ -153,36 +157,33 @@ void NpyOutputFormat::finalizeImpl() void NpyOutputFormat::writeHeader() { - std::ostringstream static_header; - static_header << MAGIC_STRING << MAJOR_VERSION << MINOR_VERSION; - String static_header_str = static_header.str(); - - std::ostringstream shape; - shape << '(' << std::to_string(num_rows) << ','; + WriteBufferFromOwnString shape; + writeIntText(num_rows, shape); + writeChar(',', shape); for (auto dim : numpy_shape) - shape << std::to_string(dim) << ','; - shape << ')'; + { + writeIntText(dim, shape); + writeChar(',', shape); + } - std::ostringstream dict; - dict << "{'descr':'" << numpy_data_type.str() << "','fortran_order':False,'shape':" << shape.str() << ",}"; - 
String dict_str = dict.str();
-    String padding_str = "\n";
+    String dict = "{'descr':'" + numpy_data_type.str() + "','fortran_order':False,'shape':(" + shape.str() + "),}";
+    String padding = "\n";
 
     /// completes the length of the header, which is divisible by 64.
-    size_t dict_length = dict_str.length() + 1;
-    size_t header_length = static_header_str.length() + sizeof(UInt32) + dict_length;
+    size_t dict_length = dict.length() + 1;
+    size_t header_length = STATIC_HEADER_LENGTH + sizeof(UInt32) + dict_length;
     if (header_length % 64)
     {
         header_length = ((header_length / 64) + 1) * 64;
-        dict_length = header_length - static_header_str.length() - sizeof(UInt32);
-        padding_str = std::string(dict_length - dict_str.length(), '\x20');
-        padding_str.back() = '\n';
+        dict_length = header_length - STATIC_HEADER_LENGTH - sizeof(UInt32);
+        padding = std::string(dict_length - dict.length(), '\x20');
+        padding.back() = '\n';
     }
 
-    out.write(static_header_str.data(), static_header_str.length());
+    out.write(STATIC_HEADER, STATIC_HEADER_LENGTH);
     writeBinaryLittleEndian(assert_cast<UInt32>(dict_length), out);
-    out.write(dict_str.data(), dict_str.length());
-    out.write(padding_str.data(), padding_str.length());
+    out.write(dict.data(), dict.length());
+    out.write(padding.data(), padding.length());
 }
 
 void NpyOutputFormat::writeColumns()

diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.h b/src/Processors/Formats/Impl/NpyOutputFormat.h
index 083aa928b7c..a483420d2d6 100644
--- a/src/Processors/Formats/Impl/NpyOutputFormat.h
+++ b/src/Processors/Formats/Impl/NpyOutputFormat.h
@@ -58,9 +58,8 @@ private:
     Columns columns;
 
     /// static header (version 3.0)
-    constexpr static auto MAGIC_STRING = "\x93NUMPY";
-    constexpr static auto MAJOR_VERSION = '\x03';
-    constexpr static auto MINOR_VERSION = '\x00';
+    constexpr static auto STATIC_HEADER = "\x93NUMPY\x03\x00";
+    constexpr static size_t STATIC_HEADER_LENGTH = 8;
 };
 
 }

From 3c58e5873b24e0aec20a4c5b97c3ab6bb849c47e Mon Sep 17 00:00:00 2001
From: Andrey Zvonov
Date: Tue, 9 Apr 2024 19:06:14 +0000
Subject: [PATCH 041/289] fix reading of {} with more than 1000 objects under
 each

---
 src/Storages/StorageS3.cpp | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index 85d9b45291c..ffe3213a4bc 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -214,10 +214,8 @@ public:
         expanded_keys = expandSelectionGlob(globbed_uri.key);
         expanded_keys_iter = expanded_keys.begin();
 
-        bool no_globs_in_key = fillBufferForKey(*expanded_keys_iter);
+        fillBufferForKey(*expanded_keys_iter);
         expanded_keys_iter++;
-        if (expanded_keys_iter == expanded_keys.end() && no_globs_in_key)
-            is_finished_for_key = true;
     }
 
     KeyWithInfoPtr next(size_t)
@@ -252,6 +250,7 @@ private:
             buffer_iter = buffer.begin();
             if (read_keys)
                 read_keys->insert(read_keys->end(), buffer.begin(), buffer.end());
+            is_finished_for_key = true;
             return true;
         }
 
@@ -306,17 +305,17 @@ private:
                     return answer;
             }
 
-            if (expanded_keys_iter != expanded_keys.end())
-            {
-                bool no_globs_in_key = fillBufferForKey(*expanded_keys_iter);
-                expanded_keys_iter++;
-                if (expanded_keys_iter == expanded_keys.end() && no_globs_in_key)
-                    is_finished_for_key = true;
-                continue;
-            }
-
-            if (is_finished_for_key)
-                return {};
+            if (is_finished_for_key)
+            {
+                if (expanded_keys_iter != expanded_keys.end())
+                {
+                    fillBufferForKey(*expanded_keys_iter);
+                    expanded_keys_iter++;
+                    continue;
+                }
+                else
+                    return {};
+            }
 
             try
             {

From 093b71b8585161e91f571d9991e2d351effd10fa Mon Sep 17
00:00:00 2001 From: Andrey Zvonov Date: Tue, 9 Apr 2024 21:01:01 +0000 Subject: [PATCH 042/289] added test for selection globs with many files under --- tests/integration/test_storage_s3/test.py | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index a4ed94c815b..60b0e8792d7 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -678,6 +678,44 @@ def test_s3_glob_scheherazade(started_cluster): assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"] +# a bit modified version of scheherazade test +# checks e.g. `prefix{1,2}/file*.csv`, where there are more than 1000 files under each of prefix1, prefix2. +def test_s3_glob_many_objects_under_selection(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + values = "(1, 1, 1)" + jobs = [] + for file_num in range(1100): + + def create_files(file_num): + for folder_num in range(1, 3): + path = f"folder{folder_num}/file{file_num}.csv" + query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( + started_cluster.minio_ip, + MINIO_INTERNAL_PORT, + bucket, + path, + table_format, + values, + ) + run_query(instance, query) + + jobs.append(threading.Thread(target=create_files, args=(file_num,))) + jobs[-1].start() + + for job in jobs: + job.join() + + query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/folder{{1,2}}/file*.csv', 'CSV', '{}')".format( + started_cluster.minio_redirect_host, + started_cluster.minio_redirect_port, + bucket, + table_format, + ) + assert run_query(instance, query).splitlines() == ["2200\t2200\t2200\t2200"] + + def run_s3_mocks(started_cluster): script_dir = os.path.join(os.path.dirname(__file__), "s3_mocks") start_mock_servers( From d9de697305522a1c267debaf13792ab900b16a83 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Wed, 10 Apr 2024 17:48:23 +0800 Subject: [PATCH 043/289] refactor test --- .../0_stateless/02895_npy_output_format.sh | 165 +++++++++--------- 1 file changed, 82 insertions(+), 83 deletions(-) diff --git a/tests/queries/0_stateless/02895_npy_output_format.sh b/tests/queries/0_stateless/02895_npy_output_format.sh index e5226e88a8d..27274f6a925 100755 --- a/tests/queries/0_stateless/02895_npy_output_format.sh +++ b/tests/queries/0_stateless/02895_npy_output_format.sh @@ -10,101 +10,100 @@ mkdir -p ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/* chmod 777 ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ -${CLICKHOUSE_CLIENT} -q "DROP DATABASE IF EXISTS npy_output_02895;" -${CLICKHOUSE_CLIENT} -q "CREATE DATABASE IF NOT EXISTS npy_output_02895;" +${CLICKHOUSE_CLIENT} -n -q " + DROP DATABASE IF EXISTS npy_output_02895; + CREATE DATABASE IF NOT EXISTS npy_output_02895; -### test common type -${CLICKHOUSE_CLIENT} -q "CREATE TABLE IF NOT EXISTS npy_output_02895.common -( - i1 Int8, - i2 Int16, - i4 Int32, - i8 Int64, - u1 UInt8, - u2 UInt16, - u4 UInt32, - u8 UInt64, - f4 Float32, - f8 Float64, - fs FixedString(10), - s String, - unknow Int128 -) Engine = MergeTree ORDER BY i1;" + CREATE TABLE IF NOT EXISTS npy_output_02895.common + ( + i1 Int8, + i2 Int16, + i4 Int32, + i8 Int64, + u1 UInt8, + u2 UInt16, + u4 UInt32, + u8 UInt64, + f4 
Float32, + f8 Float64, + fs FixedString(10), + s String, + unknow Int128 + ) Engine = MergeTree ORDER BY i1; -${CLICKHOUSE_CLIENT} -q "INSERT INTO npy_output_02895.common VALUES (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1), (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1), (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1);" + INSERT INTO npy_output_02895.common VALUES (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1), (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1), (-1,-1,-1,-1,1,1,1,1,0.1,0.01,'npy','npy',1); -${CLICKHOUSE_CLIENT} -n -q "SELECT * FROM npy_output_02895.common FORMAT Npy; -- { clientError TOO_MANY_COLUMNS }" -${CLICKHOUSE_CLIENT} -n -q "SELECT unknow FROM npy_output_02895.common FORMAT Npy; -- { clientError BAD_ARGUMENTS }" + SELECT * FROM npy_output_02895.common FORMAT Npy; -- { clientError TOO_MANY_COLUMNS } + SELECT unknow FROM npy_output_02895.common FORMAT Npy; -- { clientError BAD_ARGUMENTS } -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy') SELECT i1 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy') SELECT i2 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy') SELECT i4 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy') SELECT i8 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy') SELECT u1 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy') SELECT u2 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy') SELECT u4 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy') SELECT u8 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy') SELECT f4 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy') SELECT f8 FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy') SELECT fs FROM npy_output_02895.common;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy') SELECT s FROM npy_output_02895.common;" + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy') SELECT i1 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy') SELECT i2 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy') SELECT i4 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION 
file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy') SELECT i8 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy') SELECT u1 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy') SELECT u2 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy') SELECT u4 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy') SELECT u8 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy') SELECT f4 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy') SELECT f8 FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy') SELECT fs FROM npy_output_02895.common; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy') SELECT s FROM npy_output_02895.common; -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy');" + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy'); + SELECT * FROM 
file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy'); -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy');" + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int8.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int16.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int32.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_int64.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint8.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint16.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint32.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_uint64.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float32.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_float64.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_fixedstring.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_common_string.npy'); -### test nested type -${CLICKHOUSE_CLIENT} -q "CREATE TABLE IF NOT EXISTS npy_output_02895.nested -( - i4 Array(Array(Array(Int8))), - f8 Array(Array(Float64)), - s Array(Array(String)), - unknow Array(Int128), - ragged_1 Array(Array(Int32)), - ragged_2 Array(Array(Int32)) -) Engine = MergeTree ORDER BY i4;" + CREATE TABLE IF NOT EXISTS npy_output_02895.nested + ( + i4 Array(Array(Array(Int8))), + f8 Array(Array(Float64)), + s Array(Array(String)), + unknow Array(Int128), + ragged_1 Array(Array(Int32)), + ragged_2 Array(Array(Int32)) + ) Engine = MergeTree ORDER BY i4; -${CLICKHOUSE_CLIENT} -q 
"INSERT INTO npy_output_02895.nested VALUES ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1, 2], [3, 4]], [[1, 2], [3]]), ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1, 2, 3], [4]], [[1, 2], [3]]), ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1], [2, 3, 4]], [[1, 2], [3]]);" + INSERT INTO npy_output_02895.nested VALUES ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1, 2], [3, 4]], [[1, 2], [3]]), ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1, 2, 3], [4]], [[1, 2], [3]]), ([[[1], [2]], [[3], [4]]], [[0.1], [0.2]], [['a', 'bb'], ['ccc', 'dddd']], [1, 2], [[1], [2, 3, 4]], [[1, 2], [3]]); -${CLICKHOUSE_CLIENT} -n -q "SELECT * FROM npy_output_02895.nested FORMAT Npy; -- { clientError TOO_MANY_COLUMNS }" -${CLICKHOUSE_CLIENT} -n -q "SELECT unknow FROM npy_output_02895.nested FORMAT Npy; -- { clientError BAD_ARGUMENTS }" -${CLICKHOUSE_CLIENT} -n -q "SELECT ragged_1 FROM npy_output_02895.nested FORMAT Npy; -- { clientError ILLEGAL_COLUMN }" -${CLICKHOUSE_CLIENT} -n -q "SELECT ragged_2 FROM npy_output_02895.nested FORMAT Npy; -- { clientError ILLEGAL_COLUMN }" + SELECT * FROM npy_output_02895.nested FORMAT Npy; -- { clientError TOO_MANY_COLUMNS } + SELECT unknow FROM npy_output_02895.nested FORMAT Npy; -- { clientError BAD_ARGUMENTS } + SELECT ragged_1 FROM npy_output_02895.nested FORMAT Npy; -- { clientError ILLEGAL_COLUMN } + SELECT ragged_2 FROM npy_output_02895.nested FORMAT Npy; -- { clientError ILLEGAL_COLUMN } -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy') SELECT i4 FROM npy_output_02895.nested;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy') SELECT f8 FROM npy_output_02895.nested;" -${CLICKHOUSE_CLIENT} -q "INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy') SELECT s FROM npy_output_02895.nested;" + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy') SELECT i4 FROM npy_output_02895.nested; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy') SELECT f8 FROM npy_output_02895.nested; + INSERT INTO TABLE FUNCTION file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy') SELECT s FROM npy_output_02895.nested; -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy');" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy');" + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy'); + SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy'); -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy');" -${CLICKHOUSE_CLIENT} -q "DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy');" -${CLICKHOUSE_CLIENT} -q "DESC 
file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy');" + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_int32.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_float64.npy'); + DESC file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/02895_nested_string.npy'); -${CLICKHOUSE_CLIENT} -q "DROP DATABASE IF EXISTS npy_output_02895;" + DROP DATABASE IF EXISTS npy_output_02895;" rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?} From c9b05eac022254c71323d2715a0dc1a32ae9c2f7 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 10 Apr 2024 12:02:01 +0000 Subject: [PATCH 044/289] fix test_s3_glob_many_objects_under_selection --- tests/integration/test_storage_s3/test.py | 26 ++++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 60b0e8792d7..28b70911b77 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -678,19 +678,19 @@ def test_s3_glob_scheherazade(started_cluster): assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"] -# a bit modified version of scheherazade test -# checks e.g. `prefix{1,2}/file*.csv`, where there are more than 1000 files under each of prefix1, prefix2. +# a bit simplified version of scheherazade test +# checks e.g. `prefix{1,2}/file*.csv`, where there are more than 1000 files under prefix1. def test_s3_glob_many_objects_under_selection(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" values = "(1, 1, 1)" jobs = [] - for file_num in range(1100): + for thread_num in range(16): - def create_files(file_num): - for folder_num in range(1, 3): - path = f"folder{folder_num}/file{file_num}.csv" + def create_files(thread_num): + for f_num in range(thread_num * 63, thread_num * 63 + 63): + path = f"folder1/file{f_num}.csv" query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( started_cluster.minio_ip, MINIO_INTERNAL_PORT, @@ -701,9 +701,19 @@ def test_s3_glob_many_objects_under_selection(started_cluster): ) run_query(instance, query) - jobs.append(threading.Thread(target=create_files, args=(file_num,))) + jobs.append(threading.Thread(target=create_files, args=(thread_num,))) jobs[-1].start() + query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( + started_cluster.minio_ip, + MINIO_INTERNAL_PORT, + bucket, + f"folder2/file0.csv", + table_format, + values, + ) + run_query(instance, query) + for job in jobs: job.join() @@ -713,7 +723,7 @@ def test_s3_glob_many_objects_under_selection(started_cluster): bucket, table_format, ) - assert run_query(instance, query).splitlines() == ["2200\t2200\t2200\t2200"] + assert run_query(instance, query).splitlines() == ["1009\t1009\t1009\t1009"] def run_s3_mocks(started_cluster): From 9975b6a0f5c313a2552a8169f5be555dd2f7f4ad Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Wed, 10 Apr 2024 20:18:30 +0800 Subject: [PATCH 045/289] fix build --- src/Processors/Formats/Impl/NpyOutputFormat.cpp | 4 ++-- src/Processors/Formats/Impl/NpyOutputFormat.h | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp index 
d04d095e84f..f43deb816e0 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp @@ -45,7 +45,7 @@ void writeNumpyStrings(const ColumnPtr & column, size_t length, WriteBuffer & bu } -String NpyOutputFormat::NumpyDataType::str() +String NpyOutputFormat::NumpyDataType::str() const { WriteBufferFromOwnString dtype; writeChar(endianness, dtype); @@ -188,7 +188,7 @@ void NpyOutputFormat::writeHeader() void NpyOutputFormat::writeColumns() { - for (auto column : columns) + for (const auto & column : columns) { switch (nested_data_type->getTypeId()) { diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.h b/src/Processors/Formats/Impl/NpyOutputFormat.h index a483420d2d6..f1d0216a8c3 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.h +++ b/src/Processors/Formats/Impl/NpyOutputFormat.h @@ -9,7 +9,6 @@ #include #include -#include #include @@ -38,7 +37,7 @@ private: NumpyDataType() = default; NumpyDataType(char endianness_, char type_, size_t size_) : endianness(endianness_), type(type_), size(size_) {} - String str(); + String str() const; }; void initialize(const ColumnPtr & column); From 9085816fcb25c34ffc4ce664f3f10b4c3aca160d Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Wed, 10 Apr 2024 23:06:51 +0800 Subject: [PATCH 046/289] remove from async test --- .../queries/0_stateless/02187_async_inserts_all_formats.python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.python b/tests/queries/0_stateless/02187_async_inserts_all_formats.python index fa555c78f8b..943e32d4cf2 100644 --- a/tests/queries/0_stateless/02187_async_inserts_all_formats.python +++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.python @@ -38,7 +38,7 @@ def run_test(data_format, gen_data_template, settings): formats = ( client.query( "SELECT name FROM system.formats WHERE is_input AND is_output \ - AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf', 'ProtobufList') ORDER BY name" + AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf', 'ProtobufList', 'Npy') ORDER BY name" ) .strip() .split("\n") From 0669591e35e0b6f19c148ff941c2497a0e38435c Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 10 Apr 2024 17:33:48 +0000 Subject: [PATCH 047/289] small code cleanup --- src/Storages/StorageS3.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index ffe3213a4bc..acef213c1f4 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -237,7 +237,7 @@ public: private: using ListObjectsOutcome = Aws::S3::Model::ListObjectsV2Outcome; - bool fillBufferForKey(const std::string & uri_key) + void fillBufferForKey(const std::string & uri_key) { is_finished_for_key = false; const String key_prefix = uri_key.substr(0, uri_key.find_first_of("*?{")); @@ -251,7 +251,7 @@ private: if (read_keys) read_keys->insert(read_keys->end(), buffer.begin(), buffer.end()); is_finished_for_key = true; - return true; + return; } request = {}; @@ -270,7 +270,7 @@ private: filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); fillInternalBufferAssumeLocked(); - return false; + return; } KeyWithInfoPtr nextAssumeLocked() From b464d16d5fbb67797677d63d05d8bd9802821a7c Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Thu, 11 Apr 2024 11:34:56 +0800 Subject: 
[PATCH 048/289] refactor and fix type conversion

---
 .../Formats/Impl/NpyOutputFormat.cpp          | 139 ++++++++++--------
 src/Processors/Formats/Impl/NpyOutputFormat.h |  10 +-
 2 files changed, 87 insertions(+), 62 deletions(-)

diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
index f43deb816e0..d54fc7e68f2 100644
--- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
@@ -55,6 +55,20 @@ String NpyOutputFormat::NumpyDataType::str() const
     return dtype.str();
 }
 
+String NpyOutputFormat::shapeStr() const
+{
+    WriteBufferFromOwnString shape;
+    writeIntText(num_rows, shape);
+    writeChar(',', shape);
+    for (UInt64 dim : numpy_shape)
+    {
+        writeIntText(dim, shape);
+        writeChar(',', shape);
+    }
+
+    return shape.str();
+}
+
 NpyOutputFormat::NpyOutputFormat(WriteBuffer & out_, const Block & header_) : IOutputFormat(header_, out_)
 {
     const auto & header = getPort(PortKind::Main).getHeader();
@@ -62,20 +76,13 @@ NpyOutputFormat::NpyOutputFormat(WriteBuffer & out_, const Block & header_) : IO
     if (data_types.size() > 1)
         throw Exception(ErrorCodes::TOO_MANY_COLUMNS, "Expected single column for Npy output format, got {}", data_types.size());
     data_type = data_types[0];
+
+    if (!getNumpyDataType(data_type))
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type {} is not supported for Npy output format", nested_data_type->getName());
 }
 
-void NpyOutputFormat::initialize(const ColumnPtr & column)
+bool NpyOutputFormat::getNumpyDataType(const DataTypePtr & type)
 {
-    auto type = data_type;
-    ColumnPtr nested_column = column;
-    while (type->getTypeId() == TypeIndex::Array)
-    {
-        const auto * array_column = assert_cast<const ColumnArray *>(nested_column.get());
-        numpy_shape.push_back(array_column->getOffsets()[0]);
-        type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
-        nested_column = array_column->getDataPtr();
-    }
-
     switch (type->getTypeId())
     {
         case TypeIndex::Int8: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int8)); break;
@@ -90,65 +97,86 @@ void NpyOutputFormat::initialize(const ColumnPtr & column)
         case TypeIndex::Float64: numpy_data_type = NumpyDataType('<', 'f', sizeof(Float64)); break;
         case TypeIndex::FixedString: numpy_data_type = NumpyDataType('|', 'S', assert_cast<const DataTypeFixedString *>(type.get())->getN()); break;
        case TypeIndex::String: numpy_data_type = NumpyDataType('|', 'S', 0); break;
-        default:
-            has_exception = true;
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type {} is not supported for Npy output format", type->getName());
+        case TypeIndex::Array: return getNumpyDataType(assert_cast<const DataTypeArray *>(type.get())->getNestedType());
+        default: nested_data_type = type; return false;
     }
+    nested_data_type = type;
+    return true;
 }
 
 void NpyOutputFormat::consume(Chunk chunk)
 {
-    if (!has_exception)
+    if (!invalid_shape)
     {
         num_rows += chunk.getNumRows();
-        auto column = chunk.getColumns()[0];
+        const auto & column = chunk.getColumns()[0];
 
         if (!is_initialized)
         {
-            initialize(column);
+            initShape(column);
             is_initialized = true;
         }
 
-        /// check shape
-        auto type = data_type;
-        ColumnPtr nested_column = column;
-        int dim = 0;
-        while (type->getTypeId() == TypeIndex::Array)
+        if (!checkShape(column))
         {
-            const auto * array_column = assert_cast<const ColumnArray *>(nested_column.get());
-            const auto & array_offset = array_column->getOffsets();
-            for (size_t i = 1; i < array_offset.size(); ++i)
-            {
-                if (array_offset[i] - array_offset[i - 1] != numpy_shape[dim])
-                {
-                    has_exception = true;
-                    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse doesn't support object types, cannot
format ragged nested sequences (which is a list of arrays with different shapes)");
-                }
-            }
-            type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
-            nested_column = array_column->getDataPtr();
-            dim++;
+            invalid_shape = true;
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse doesn't support object types, cannot format ragged nested sequences (which is a list of arrays with different shapes)");
         }
-
-        /// for type String, get maximum string length
-        if (type->getTypeId() == TypeIndex::String)
-        {
-            const auto & string_offsets = assert_cast<const ColumnString *>(nested_column.get())->getOffsets();
-            for (size_t i = 0; i < string_offsets.size(); ++i)
-            {
-                size_t string_length = static_cast<size_t>(string_offsets[i] - 1 - string_offsets[i - 1]);
-                numpy_data_type.size = numpy_data_type.size > string_length ? numpy_data_type.size : string_length;
-            }
-        }
-
-        columns.push_back(nested_column);
     }
 }
 
+void NpyOutputFormat::initShape(const ColumnPtr & column)
+{
+    auto type = data_type;
+    ColumnPtr nested_column = column;
+    while (type->getTypeId() == TypeIndex::Array)
+    {
+        const auto * array_column = assert_cast<const ColumnArray *>(nested_column.get());
+
+        numpy_shape.push_back(array_column->getOffsets()[0]);
+
+        type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
+        nested_column = array_column->getDataPtr();
+    }
+}
+
+bool NpyOutputFormat::checkShape(const ColumnPtr & column)
+{
+    auto type = data_type;
+    ColumnPtr nested_column = column;
+    int dim = 0;
+    while (type->getTypeId() == TypeIndex::Array)
+    {
+        const auto * array_column = assert_cast<const ColumnArray *>(nested_column.get());
+        const auto & array_offset = array_column->getOffsets();
+
+        for (size_t i = 1; i < array_offset.size(); ++i)
+            if (array_offset[i] - array_offset[i - 1] != numpy_shape[dim])
+                return false;
+
+        type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
+        nested_column = array_column->getDataPtr();
+        dim += 1;
+    }
+
+    if (type->getTypeId() == TypeIndex::String)
+    {
+        const auto & string_offsets = assert_cast<const ColumnString *>(nested_column.get())->getOffsets();
+        for (size_t i = 0; i < string_offsets.size(); ++i)
+        {
+            size_t string_length = static_cast<size_t>(string_offsets[i] - 1 - string_offsets[i - 1]);
+            numpy_data_type.size = numpy_data_type.size > string_length ? numpy_data_type.size : string_length;
+        }
+    }
+
+    columns.push_back(nested_column);
+    return true;
+}
+
 void NpyOutputFormat::finalizeImpl()
 {
-    if (!has_exception)
+    if (!invalid_shape)
     {
         writeHeader();
         writeColumns();
@@ -157,16 +185,7 @@ void NpyOutputFormat::finalizeImpl()
 
 void NpyOutputFormat::writeHeader()
 {
-    WriteBufferFromOwnString shape;
-    writeIntText(num_rows, shape);
-    writeChar(',', shape);
-    for (auto dim : numpy_shape)
-    {
-        writeIntText(dim, shape);
-        writeChar(',', shape);
-    }
-
-    String dict = "{'descr':'" + numpy_data_type.str() + "','fortran_order':False,'shape':(" + shape.str() + "),}";
+    String dict = "{'descr':'" + numpy_data_type.str() + "','fortran_order':False,'shape':(" + shapeStr() + "),}";
     String padding = "\n";
 
     /// completes the length of the header, which is divisible by 64.
@@ -181,7 +200,7 @@ void NpyOutputFormat::writeHeader()
     }
 
     out.write(STATIC_HEADER, STATIC_HEADER_LENGTH);
-    writeBinaryLittleEndian(assert_cast<UInt32>(dict_length), out);
+    writeBinaryLittleEndian(static_cast<UInt32>(dict_length), out);
     out.write(dict.data(), dict.length());
     out.write(padding.data(), padding.length());
 }
diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.h b/src/Processors/Formats/Impl/NpyOutputFormat.h
index f1d0216a8c3..83fad657b2e 100644
--- a/src/Processors/Formats/Impl/NpyOutputFormat.h
+++ b/src/Processors/Formats/Impl/NpyOutputFormat.h
@@ -40,14 +40,20 @@ private:
         String str() const;
     };
 
-    void initialize(const ColumnPtr & column);
+    String shapeStr() const;
+
+    bool getNumpyDataType(const DataTypePtr & type);
+
     void consume(Chunk) override;
+    void initShape(const ColumnPtr & column);
+    bool checkShape(const ColumnPtr & column);
+
     void finalizeImpl() override;
     void writeHeader();
     void writeColumns();
 
     bool is_initialized = false;
-    bool has_exception = false;
+    bool invalid_shape = false;
 
     DataTypePtr data_type;
     DataTypePtr nested_data_type;

From 2f6db2ac47d55a1d740463173d2613d895419256 Mon Sep 17 00:00:00 2001
From: skyoct
Date: Sun, 21 Apr 2024 12:01:31 +0000
Subject: [PATCH 049/289] fix: better

---
 src/Functions/clamp.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp
index 0593c6c6b16..3438377afbf 100644
--- a/src/Functions/clamp.cpp
+++ b/src/Functions/clamp.cpp
@@ -42,12 +42,13 @@ public:
         Columns converted_columns(arg_size);
         for (size_t arg = 0; arg < arg_size; ++arg)
             converted_columns[arg] = castColumn(arguments[arg], result_type)->convertToFullColumnIfConst();
-        if (converted_columns[1]->compareAt(0, 0, *converted_columns[2], 1) > 0)
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} the minimum value cannot be greater than the maximum value", getName());
 
         auto result_column = result_type->createColumn();
         for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
         {
+            if (converted_columns[1]->compareAt(row_num, row_num, *converted_columns[2], 1) > 0)
+                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} the minimum value cannot be greater than the maximum value", getName());
+
             size_t best_arg = 0;
             if (converted_columns[1]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) > 0)
                 best_arg = 1;

From ab49ab9172dba3c26351b9ab1e417545696a86e8 Mon Sep 17 00:00:00 2001
From: skyoct
Date: Sun, 21 Apr 2024 12:03:15 +0000
Subject: [PATCH 050/289] test

---
 tests/queries/0_stateless/03036_clamp.reference | 8 ++++++--
 tests/queries/0_stateless/03036_clamp.sql       | 9 +++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/queries/0_stateless/03036_clamp.reference b/tests/queries/0_stateless/03036_clamp.reference
index 5c8de57794d..b866caf2261 100644
--- a/tests/queries/0_stateless/03036_clamp.reference
+++ b/tests/queries/0_stateless/03036_clamp.reference
@@ -6,5 +6,9 @@ b
 ['hello']
 -1
 234
-null
-null
\ No newline at end of file
+\N
+\N
+5
+0
+1
+2
diff --git a/tests/queries/0_stateless/03036_clamp.sql b/tests/queries/0_stateless/03036_clamp.sql
index e075096f8ef..d225be63f46 100644
--- a/tests/queries/0_stateless/03036_clamp.sql
+++ b/tests/queries/0_stateless/03036_clamp.sql
@@ -2,9 +2,14 @@ SELECT clamp(1, 10, 20);
 SELECT clamp(30, 10, 20);
 SELECT clamp(15, 10, 20);
 SELECT clamp('a', 'b', 'c');
-SELECT clamp(today(), yesterday() - 10, yesterday() + 10) - today()
+SELECT clamp(today(), yesterday() - 10, yesterday() + 10) - today();
 SELECT clamp([], ['hello'], ['world']);
 SELECT clamp(-1., -1000., 18446744073709551615.);
 SELECT clamp(toNullable(123), 234, 456);
 select clamp(1, null, 5);
-select clamp(1, 6, null);
\ No newline at end of file
+select clamp(1, 6, null);
+select clamp(1, 5, nan);
+select clamp(toInt64(number), toInt64(number-1), toInt64(number+1)) from numbers(3);
+select clamp(number, number-1, number+1) from numbers(3); -- { serverError 386 }
+select clamp(1, 3, 2); -- { serverError 36 }
+select clamp(1, data[1], data[2]) from (select arrayJoin([[1, 2], [2,3], [3,2], [4, 4]]) as data); -- { serverError 36 }

From c7fcc5493cfe5b5b3e2ab884ad1273cbee25c688 Mon Sep 17 00:00:00 2001
From: HowePa <2873679104@qq.com>
Date: Tue, 23 Apr 2024 11:37:55 +0800
Subject: [PATCH 051/289] fix typo for part log

---
 docs/en/operations/system-tables/part_log.md | 2 +-
 src/Interpreters/PartLog.cpp                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/en/operations/system-tables/part_log.md b/docs/en/operations/system-tables/part_log.md
index af582646653..c0caea1ce5e 100644
--- a/docs/en/operations/system-tables/part_log.md
+++ b/docs/en/operations/system-tables/part_log.md
@@ -14,7 +14,7 @@ The `system.part_log` table contains the following columns:
 - `event_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of the event that occurred with the data part. Can have one of the following values:
     - `NewPart` — Inserting of a new data part.
     - `MergeParts` — Merging of data parts.
-    - `DownloadParts` — Downloading a data part.
+    - `DownloadPart` — Downloading a data part.
     - `RemovePart` — Removing or detaching a data part using [DETACH PARTITION](../../sql-reference/statements/alter/partition.md#alter_detach-partition).
     - `MutatePart` — Mutating of a data part.
    - `MovePart` — Moving the data part from the one disk to another one.
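Stepping back to the clamp() change above: the min > max check has to run per row because, after castColumn() and convertToFullColumnIfConst(), the bounds are full columns and every row may carry its own min and max (the new clamp(1, data[1], data[2]) test exercises exactly that). A standalone sketch of the per-row rule, using plain vectors instead of ClickHouse's IColumn:

    #include <stdexcept>
    #include <vector>

    // Clamp a column of values against per-row bounds, validating each row.
    std::vector<double> clampColumns(const std::vector<double> & values,
                                     const std::vector<double> & mins,
                                     const std::vector<double> & maxs)
    {
        std::vector<double> result;
        result.reserve(values.size());
        for (size_t row = 0; row < values.size(); ++row)
        {
            if (mins[row] > maxs[row])   // checked per row, not once against row 0
                throw std::invalid_argument("clamp: the minimum value cannot be greater than the maximum value");
            double v = values[row];
            if (v < mins[row]) v = mins[row];
            if (v > maxs[row]) v = maxs[row];
            result.push_back(v);
        }
        return result;
    }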
diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index 66f933f1afa..db339375231 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -101,7 +101,7 @@ ColumnsDescription PartLogElement::getColumnsDescription() "Can have one of the following values: " "NewPart — Inserting of a new data part, " "MergeParts — Merging of data parts, " - "DownloadParts — Downloading a data part, " + "DownloadPart — Downloading a data part, " "RemovePart — Removing or detaching a data part using DETACH PARTITION, " "MutatePart — Mutating of a data part, " "MovePart — Moving the data part from the one disk to another one."}, From 1f207de7a10dfc9c9b0ca76f8d62d55da6e04c61 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 24 Apr 2024 16:40:27 +0000 Subject: [PATCH 052/289] Add another case with GROUP BY USE NULLS --- ...3023_group_by_use_nulls_analyzer_crashes.reference | 10 ++++++++++ .../03023_group_by_use_nulls_analyzer_crashes.sql | 11 +++++++++++ 2 files changed, 21 insertions(+) diff --git a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference index 17a17484a0c..e2682487229 100644 --- a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference +++ b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference @@ -66,3 +66,13 @@ a a a a a a +0 0 +0 \N +1 2 +1 \N +2 4 +2 \N +\N 0 +\N 2 +\N 4 +\N \N diff --git a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql index 68710137542..f2d4ac8acee 100644 --- a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql +++ b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql @@ -21,3 +21,14 @@ SELECT tuple(number + 1) AS x FROM numbers(10) GROUP BY number + 1, toString(x) SELECT tuple(tuple(number)) AS x FROM numbers(10) WHERE toString(toUUID(tuple(number), NULL), x) GROUP BY number, (toString(x), number) WITH CUBE SETTINGS group_by_use_nulls = 1 FORMAT Null; SELECT materialize('a'), 'a' AS key GROUP BY key WITH CUBE WITH TOTALS SETTINGS group_by_use_nulls = 1; + +EXPLAIN QUERY TREE +SELECT a, b +FROM numbers(3) +GROUP BY number as a, (number + number) as b WITH CUBE +ORDER BY a, b format Null; + +SELECT a, b +FROM numbers(3) +GROUP BY number as a, (number + number) as b WITH CUBE +ORDER BY a, b; From c9a31599c08f7281acff09b86aa68d7da345efdc Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 24 Apr 2024 22:14:47 +0000 Subject: [PATCH 053/289] fix single-threading failsafe when number of files cannot be estimated --- src/Storages/StorageS3.cpp | 25 +++++++++++++++++++++++-- src/Storages/StorageS3.h | 1 + 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index acef213c1f4..daab457e46b 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -229,6 +229,14 @@ public: return buffer.size(); } + bool hasMore() + { + if (!buffer.size()) + return !(expanded_keys_iter == expanded_keys.end() && is_finished_for_key); + else + return true; + } + ~Impl() { list_objects_pool.wait(); @@ -481,6 +489,11 @@ size_t StorageS3Source::DisclosedGlobIterator::estimatedKeysCount() return pimpl->objectsCount(); } +bool StorageS3Source::DisclosedGlobIterator::hasMore() +{ + return pimpl->hasMore(); +} + class StorageS3Source::KeysIterator::Impl { public: @@ -1243,8 +1256,16 @@ 
void ReadFromStorageS3Step::initializePipeline(QueryPipelineBuilder & pipeline, if (estimated_keys_count > 1) num_streams = std::min(num_streams, estimated_keys_count); else - /// Disclosed glob iterator can underestimate the amount of keys in some cases. We will keep one stream for this particular case. - num_streams = 1; + { + const auto glob_iter = std::dynamic_pointer_cast(iterator_wrapper); + if (!(glob_iter && glob_iter->hasMore())) + { + /// Disclosed glob iterator can underestimate the amount of keys in some cases. We will keep one stream for this particular case. + num_streams = 1; + } + /// Otherwise, 1000 files were already listed, but none of them is actually what we are looking for. + /// We cannot estimate _how many_ there are left, but if there are more files to list, it's faster to do it in many streams. + } const size_t max_threads = context->getSettingsRef().max_threads; const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / std::max(num_streams, 1ul)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 19cbfaa6f08..8d21f1d8e8e 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -83,6 +83,7 @@ public: KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT size_t estimatedKeysCount() override; + bool hasMore(); private: class Impl; From ae17941e63e1f66520ef13616ff0370e83996a4e Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Thu, 25 Apr 2024 15:51:06 +0800 Subject: [PATCH 054/289] add docs --- docs/en/interfaces/formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index f4b082c57ab..80ca0109f0b 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -75,7 +75,7 @@ The supported formats are: | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | | [ORC](#data-format-orc) | ✔ | ✔ | | [One](#data-format-one) | ✔ | ✗ | -| [Npy](#data-format-npy) | ✔ | ✗ | +| [Npy](#data-format-npy) | ✔ | ✔ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNames](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | From 5e8bc4402ab4df42d228c0474ee01fbb83c97a71 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Thu, 25 Apr 2024 15:52:30 +0800 Subject: [PATCH 055/289] unified NumpyDataTypes --- src/Formats/NumpyDataTypes.h | 50 ++++++++-- .../Formats/Impl/NpyOutputFormat.cpp | 91 ++++++++++++------- src/Processors/Formats/Impl/NpyOutputFormat.h | 15 +-- 3 files changed, 99 insertions(+), 57 deletions(-) diff --git a/src/Formats/NumpyDataTypes.h b/src/Formats/NumpyDataTypes.h index 712797515c9..5cf2ebf5b40 100644 --- a/src/Formats/NumpyDataTypes.h +++ b/src/Formats/NumpyDataTypes.h @@ -1,10 +1,12 @@ #pragma once #include #include +#include namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int NOT_IMPLEMENTED; } enum class NumpyDataTypeIndex @@ -29,9 +31,9 @@ class NumpyDataType public: enum Endianness { - LITTLE, - BIG, - NONE, + LITTLE = '<', + BIG = '>', + NONE = '|', }; NumpyDataTypeIndex type_index; @@ -41,15 +43,18 @@ public: Endianness getEndianness() const { return endianness; } virtual NumpyDataTypeIndex getTypeIndex() const = 0; + virtual size_t getSize() const { throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Function getSize() is not implemented"); } + virtual void setSize(size_t) { throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Function setSize() is not implemented"); } + virtual String 
str() const { throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Function str() is not implemented"); } -private: +protected: Endianness endianness; }; class NumpyDataTypeInt : public NumpyDataType { public: - NumpyDataTypeInt(Endianness endianness, size_t size_, bool is_signed_) : NumpyDataType(endianness), size(size_), is_signed(is_signed_) + NumpyDataTypeInt(Endianness endianness_, size_t size_, bool is_signed_) : NumpyDataType(endianness_), size(size_), is_signed(is_signed_) { switch (size) { @@ -67,6 +72,14 @@ public: return type_index; } bool isSigned() const { return is_signed; } + String str() const override + { + DB::WriteBufferFromOwnString buf; + writeChar(static_cast(endianness), buf); + writeChar(is_signed ? 'i' : 'u', buf); + writeIntText(size, buf); + return buf.str(); + } private: size_t size; @@ -76,7 +89,7 @@ private: class NumpyDataTypeFloat : public NumpyDataType { public: - NumpyDataTypeFloat(Endianness endianness, size_t size_) : NumpyDataType(endianness), size(size_) + NumpyDataTypeFloat(Endianness endianness_, size_t size_) : NumpyDataType(endianness_), size(size_) { switch (size) { @@ -92,6 +105,14 @@ public: { return type_index; } + String str() const override + { + DB::WriteBufferFromOwnString buf; + writeChar(static_cast(endianness), buf); + writeChar('f', buf); + writeIntText(size, buf); + return buf.str(); + } private: size_t size; }; @@ -99,13 +120,22 @@ private: class NumpyDataTypeString : public NumpyDataType { public: - NumpyDataTypeString(Endianness endianness, size_t size_) : NumpyDataType(endianness), size(size_) + NumpyDataTypeString(Endianness endianness_, size_t size_) : NumpyDataType(endianness_), size(size_) { type_index = NumpyDataTypeIndex::String; } NumpyDataTypeIndex getTypeIndex() const override { return type_index; } - size_t getSize() const { return size; } + size_t getSize() const override { return size; } + void setSize(size_t size_) override { size = size_; } + String str() const override + { + DB::WriteBufferFromOwnString buf; + writeChar(static_cast(endianness), buf); + writeChar('S', buf); + writeIntText(size, buf); + return buf.str(); + } private: size_t size; }; @@ -113,13 +143,13 @@ private: class NumpyDataTypeUnicode : public NumpyDataType { public: - NumpyDataTypeUnicode(Endianness endianness, size_t size_) : NumpyDataType(endianness), size(size_) + NumpyDataTypeUnicode(Endianness endianness_, size_t size_) : NumpyDataType(endianness_), size(size_) { type_index = NumpyDataTypeIndex::Unicode; } NumpyDataTypeIndex getTypeIndex() const override { return type_index; } - size_t getSize() const { return size * 4; } + size_t getSize() const override { return size * 4; } private: size_t size; }; diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp index d54fc7e68f2..64272307e9d 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp @@ -45,16 +45,6 @@ void writeNumpyStrings(const ColumnPtr & column, size_t length, WriteBuffer & bu } -String NpyOutputFormat::NumpyDataType::str() const -{ - WriteBufferFromOwnString dtype; - writeChar(endianness, dtype); - writeChar(type, dtype); - writeIntText(size, dtype); - - return dtype.str(); -} - String NpyOutputFormat::shapeStr() const { WriteBufferFromOwnString shape; @@ -85,20 +75,48 @@ bool NpyOutputFormat::getNumpyDataType(const DataTypePtr & type) { switch (type->getTypeId()) { - case TypeIndex::Int8: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int8)); break; - case 
TypeIndex::Int16: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int16)); break; - case TypeIndex::Int32: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int32)); break; - case TypeIndex::Int64: numpy_data_type = NumpyDataType('<', 'i', sizeof(Int64)); break; - case TypeIndex::UInt8: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt8)); break; - case TypeIndex::UInt16: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt16)); break; - case TypeIndex::UInt32: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt32)); break; - case TypeIndex::UInt64: numpy_data_type = NumpyDataType('<', 'u', sizeof(UInt64)); break; - case TypeIndex::Float32: numpy_data_type = NumpyDataType('<', 'f', sizeof(Float32)); break; - case TypeIndex::Float64: numpy_data_type = NumpyDataType('<', 'f', sizeof(Float64)); break; - case TypeIndex::FixedString: numpy_data_type = NumpyDataType('|', 'S', assert_cast(type.get())->getN()); break; - case TypeIndex::String: numpy_data_type = NumpyDataType('|', 'S', 0); break; - case TypeIndex::Array: return getNumpyDataType(assert_cast(type.get())->getNestedType()); - default: nested_data_type = type; return false; + case TypeIndex::Int8: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(Int8), true); + break; + case TypeIndex::Int16: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(Int16), true); + break; + case TypeIndex::Int32: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(Int32), true); + break; + case TypeIndex::Int64: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(Int64), true); + break; + case TypeIndex::UInt8: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(UInt8), false); + break; + case TypeIndex::UInt16: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(UInt16), false); + break; + case TypeIndex::UInt32: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(UInt32), false); + break; + case TypeIndex::UInt64: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(UInt64), false); + break; + case TypeIndex::Float32: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(Float32)); + break; + case TypeIndex::Float64: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::LITTLE, sizeof(Float64)); + break; + case TypeIndex::FixedString: + numpy_data_type = std::make_shared( + NumpyDataType::Endianness::NONE, assert_cast(type.get())->getN()); + break; + case TypeIndex::String: + numpy_data_type = std::make_shared(NumpyDataType::Endianness::NONE, 0); + break; + case TypeIndex::Array: + return getNumpyDataType(assert_cast(type.get())->getNestedType()); + default: + nested_data_type = type; + return false; } nested_data_type = type; @@ -117,6 +135,9 @@ void NpyOutputFormat::consume(Chunk chunk) initShape(column); is_initialized = true; } + // ColumnPtr checkShape, if nullptr? 
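As a side note on the descriptor strings the str() overrides above produce: a NumPy 'descr' is one endianness character ('<', '>' or '|'), one kind character ('i', 'u', 'f' or 'S'), and the element size in bytes. A free-standing sketch of the same encoding, with hypothetical names rather than the classes from this patch:

    #include <cstddef>
    #include <string>

    enum class Endianness : char { Little = '<', Big = '>', None = '|' };

    // Build a NumPy dtype descriptor such as "<i8", "<f4" or "|S16".
    std::string numpyDescr(Endianness endianness, char kind, size_t size)
    {
        std::string descr;
        descr += static_cast<char>(endianness);
        descr += kind;
        descr += std::to_string(size);
        return descr;
    }

    // Usage: numpyDescr(Endianness::Little, 'i', 8) yields "<i8",
    //        numpyDescr(Endianness::None, 'S', 16) yields "|S16".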
+ // updateSizeIfTypeString + // columns.push_back() if (!checkShape(column)) { @@ -130,13 +151,9 @@ void NpyOutputFormat::initShape(const ColumnPtr & column) { auto type = data_type; ColumnPtr nested_column = column; - while (type->getTypeId() == TypeIndex::Array) + while (const auto * array_column = typeid_cast(nested_column.get())) { - const auto * array_column = assert_cast(nested_column.get()); - numpy_shape.push_back(array_column->getOffsets()[0]); - - type = assert_cast(type.get())->getNestedType(); nested_column = array_column->getDataPtr(); } } @@ -166,7 +183,8 @@ bool NpyOutputFormat::checkShape(const ColumnPtr & column) for (size_t i = 0; i < string_offsets.size(); ++i) { size_t string_length = static_cast(string_offsets[i] - 1 - string_offsets[i - 1]); - numpy_data_type.size = numpy_data_type.size > string_length ? numpy_data_type.size : string_length; + if (numpy_data_type->getSize() < string_length) + numpy_data_type->setSize(string_length); } } @@ -185,7 +203,7 @@ void NpyOutputFormat::finalizeImpl() void NpyOutputFormat::writeHeader() { - String dict = "{'descr':'" + numpy_data_type.str() + "','fortran_order':False,'shape':(" + shapeStr() + "),}"; + String dict = "{'descr':'" + numpy_data_type->str() + "','fortran_order':False,'shape':(" + shapeStr() + "),}"; String padding = "\n"; /// completes the length of the header, which is divisible by 64. @@ -221,9 +239,14 @@ void NpyOutputFormat::writeColumns() case TypeIndex::UInt64: writeNumpyNumbers(column, out); break; case TypeIndex::Float32: writeNumpyNumbers(column, out); break; case TypeIndex::Float64: writeNumpyNumbers(column, out); break; - case TypeIndex::FixedString: writeNumpyStrings(column, numpy_data_type.size, out); break; - case TypeIndex::String: writeNumpyStrings(column, numpy_data_type.size, out); break; - default: break; + case TypeIndex::FixedString: + writeNumpyStrings(column, numpy_data_type->getSize(), out); + break; + case TypeIndex::String: + writeNumpyStrings(column, numpy_data_type->getSize(), out); + break; + default: + break; } } } diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.h b/src/Processors/Formats/Impl/NpyOutputFormat.h index 83fad657b2e..6859cf10e69 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.h +++ b/src/Processors/Formats/Impl/NpyOutputFormat.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -28,18 +29,6 @@ public: String getContentType() const override { return "application/octet-stream"; } private: - struct NumpyDataType - { - char endianness; - char type; - size_t size; - - NumpyDataType() = default; - NumpyDataType(char endianness_, char type_, size_t size_) - : endianness(endianness_), type(type_), size(size_) {} - String str() const; - }; - String shapeStr() const; bool getNumpyDataType(const DataTypePtr & type); @@ -57,7 +46,7 @@ private: DataTypePtr data_type; DataTypePtr nested_data_type; - NumpyDataType numpy_data_type; + std::shared_ptr numpy_data_type; UInt64 num_rows = 0; std::vector numpy_shape; Columns columns; From 686ea6af9c3512c7b07345cabc785bb975311162 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Thu, 25 Apr 2024 09:06:49 +0000 Subject: [PATCH 056/289] fix style and logic of estimation --- src/Storages/StorageS3.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6ba41d21766..bdfd2b8b453 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -231,7 +231,7 @@ public: bool hasMore() { - if 
(!buffer.size())
+        if (buffer.empty())
             return !(expanded_keys_iter == expanded_keys.end() && is_finished_for_key);
         else
             return true;
@@ -1286,19 +1286,21 @@ void ReadFromStorageS3Step::initializePipeline(QueryPipelineBuilder & pipeline,
     createIterator(nullptr);
     size_t estimated_keys_count = iterator_wrapper->estimatedKeysCount();
-    if (estimated_keys_count > 1)
-        num_streams = std::min(num_streams, estimated_keys_count);
-    else
+    const auto glob_iter = std::dynamic_pointer_cast<StorageS3Source::DisclosedGlobIterator>(iterator_wrapper);
+
+    if (!(glob_iter && glob_iter->hasMore()))
     {
-        const auto glob_iter = std::dynamic_pointer_cast<StorageS3Source::DisclosedGlobIterator>(iterator_wrapper);
-        if (!(glob_iter && glob_iter->hasMore()))
+        if (estimated_keys_count > 1)
+            num_streams = std::min(num_streams, estimated_keys_count);
+        else
         {
-            /// Disclosed glob iterator can underestimate the amount of keys in some cases. We will keep one stream for this particular case.
+            /// The amount of keys (zero) was probably underestimated. We will keep one stream for this particular case.
             num_streams = 1;
         }
-        /// Otherwise, 1000 files were already listed, but none of them is actually what we are looking for.
-        /// We cannot estimate _how many_ there are left, but if there are more files to list, it's faster to do it in many streams.
     }
+    /// Otherwise, 1000 files were listed, but we cannot make any estimation of _how many_ there are (because we list the bucket lazily);
+    /// if there are more objects in the bucket, limiting the number of streams is the last thing we may want to do,
+    /// as it would lead to serious (up to several times) reading performance degradation.
 
     const size_t max_threads = context->getSettingsRef().max_threads;
     const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / std::max(num_streams, 1ul));

From d3d7d3575db1f0d76f4a910c1bc30faf911cf839 Mon Sep 17 00:00:00 2001
From: HowePa <2873679104@qq.com>
Date: Thu, 25 Apr 2024 17:16:11 +0800
Subject: [PATCH 057/289] fix empty array & refactor

---
 .../Formats/Impl/NpyOutputFormat.cpp          | 45 +++++++++---------
 src/Processors/Formats/Impl/NpyOutputFormat.h |  3 +-
 .../02895_npy_output_format.reference         | Bin 626 -> 1010 bytes
 .../0_stateless/02895_npy_output_format.sh    |  6 ++-
 4 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
index 64272307e9d..f031b776ff7 100644
--- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
@@ -135,51 +135,53 @@ void NpyOutputFormat::consume(Chunk chunk)
             initShape(column);
             is_initialized = true;
         }
-        // ColumnPtr checkShape, if nullptr?
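Before moving on from the StorageS3 hunk above, its decision rule is worth spelling out: S3 listing is lazy and paginated (up to 1000 keys per page), so while hasMore() reports unread pages the total key count is unknowable and the stream count should be left alone; only a finished listing justifies capping the parallel readers. A reduced sketch of that rule, with a hypothetical stand-in for the iterator:

    #include <algorithm>
    #include <cstddef>

    // Hypothetical stand-in for the glob iterator: how many keys were listed
    // so far, and whether the paginated listing may still return more.
    struct ListingState
    {
        size_t listed_keys = 0;
        bool may_have_more = false;   // true while further ListObjects pages remain
    };

    size_t chooseNumStreams(size_t requested_streams, const ListingState & state)
    {
        if (state.may_have_more)
            return requested_streams;           // total unknown: do not throttle readers
        if (state.listed_keys > 1)
            return std::min(requested_streams, state.listed_keys);
        return 1;                               // listing finished with zero or one key
    }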
- // updateSizeIfTypeString - // columns.push_back() - if (!checkShape(column)) - { - invalid_shape = true; - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse doesn't support object types, cannot format ragged nested sequences (which is a list of arrays with different shapes)"); - } + ColumnPtr nested_column = column; + checkShape(nested_column); + updateSizeIfTypeString(nested_column); + columns.push_back(nested_column); } } void NpyOutputFormat::initShape(const ColumnPtr & column) { - auto type = data_type; ColumnPtr nested_column = column; while (const auto * array_column = typeid_cast(nested_column.get())) { - numpy_shape.push_back(array_column->getOffsets()[0]); + auto dim = array_column->getOffsets()[0]; + invalid_shape = dim == 0; + numpy_shape.push_back(dim); nested_column = array_column->getDataPtr(); } + + if (invalid_shape) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Shape ({}) is an invalid shape, as dimension size cannot be 0", shapeStr()); } -bool NpyOutputFormat::checkShape(const ColumnPtr & column) +void NpyOutputFormat::checkShape(ColumnPtr & column) { - auto type = data_type; - ColumnPtr nested_column = column; int dim = 0; - while (type->getTypeId() == TypeIndex::Array) + while (const auto * array_column = typeid_cast(column.get())) { - const auto * array_column = assert_cast(nested_column.get()); const auto & array_offset = array_column->getOffsets(); for (size_t i = 1; i < array_offset.size(); ++i) if (array_offset[i] - array_offset[i - 1] != numpy_shape[dim]) - return false; + { + invalid_shape = true; + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse doesn't support object types, cannot format ragged nested sequences (which is a list of arrays with different shapes)"); + } - type = assert_cast(type.get())->getNestedType(); - nested_column = array_column->getDataPtr(); + column = array_column->getDataPtr(); dim += 1; } +} - if (type->getTypeId() == TypeIndex::String) +void NpyOutputFormat::updateSizeIfTypeString(const ColumnPtr & column) +{ + if (nested_data_type->getTypeId() == TypeIndex::String) { - const auto & string_offsets = assert_cast(nested_column.get())->getOffsets(); + const auto & string_offsets = assert_cast(column.get())->getOffsets(); for (size_t i = 0; i < string_offsets.size(); ++i) { size_t string_length = static_cast(string_offsets[i] - 1 - string_offsets[i - 1]); @@ -187,9 +189,6 @@ bool NpyOutputFormat::checkShape(const ColumnPtr & column) numpy_data_type->setSize(string_length); } } - - columns.push_back(nested_column); - return true; } void NpyOutputFormat::finalizeImpl() diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.h b/src/Processors/Formats/Impl/NpyOutputFormat.h index 6859cf10e69..5dd6552ac0c 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.h +++ b/src/Processors/Formats/Impl/NpyOutputFormat.h @@ -35,7 +35,8 @@ private: void consume(Chunk) override; void initShape(const ColumnPtr & column); - bool checkShape(const ColumnPtr & column); + void checkShape(ColumnPtr & column); + void updateSizeIfTypeString(const ColumnPtr & column); void finalizeImpl() override; void writeHeader(); diff --git a/tests/queries/0_stateless/02895_npy_output_format.reference b/tests/queries/0_stateless/02895_npy_output_format.reference index b599f1dceeaabbd60cb66d02ab02a5d1a30c97f2..590c0581ac112679dea10b0c431dd1b69fae4ab6 100644 GIT binary patch literal 1010 zcmdNj*ax)~q=R6{@_sHT8KP>pfR$xlR61QtS765RbshDz{GyVg#Ju?YqLfsis9R!AajK4baYkZ6s=Aeiv5pZCYUmJMli1i;F49DS$rz~7Ko4jLP$xc~dSX%%Fk+LE ylGTAa)RU8wfxHwTPzP(JL>iJA96?z`1DOORbxlo8 Date: 
Thu, 25 Apr 2024 17:17:35 +0800 Subject: [PATCH 058/289] unified array travel Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Processors/Formats/Impl/NpyOutputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp index d54fc7e68f2..64bc916da51 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp @@ -151,7 +151,7 @@ bool NpyOutputFormat::checkShape(const ColumnPtr & column) const auto * array_column = assert_cast(nested_column.get()); const auto & array_offset = array_column->getOffsets(); - for (size_t i = 1; i < array_offset.size(); ++i) + for (size_t i = 0; i < array_offset.size(); ++i) if (array_offset[i] - array_offset[i - 1] != numpy_shape[dim]) return false; From d85f6ae35d64cd8f40a5cfb05aa53d60af48955e Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Thu, 25 Apr 2024 17:31:18 +0800 Subject: [PATCH 059/289] reset test --- .../Formats/Impl/NpyOutputFormat.cpp | 2 +- .../02895_npy_output_format.reference | Bin 1010 -> 626 bytes 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp index f67c91540e4..b3d5042aa79 100644 --- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp @@ -155,7 +155,7 @@ void NpyOutputFormat::initShape(const ColumnPtr & column) } if (invalid_shape) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Shape ({}) is an invalid shape, as dimension size cannot be 0", shapeStr()); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Shape ({}) is invalid, as dimension size cannot be 0", shapeStr()); } void NpyOutputFormat::checkShape(ColumnPtr & column) diff --git a/tests/queries/0_stateless/02895_npy_output_format.reference b/tests/queries/0_stateless/02895_npy_output_format.reference index 590c0581ac112679dea10b0c431dd1b69fae4ab6..b599f1dceeaabbd60cb66d02ab02a5d1a30c97f2 100644 GIT binary patch delta 16 Ycmeyw{)uISGvnm*ax)~q=R6{@_sHT8KP>pfR$xlR61QtS765RbshDz{GyVg#Ju?YqLfsis9R!AajK4baYkZ6s=Aeiv5pZCYUmJMli1i;F49DS$rz~7Ko4jLP$xc~dSX%%Fk+LE ylGTAa)RU8wfxHwTPzP(JL>iJA96?z`1DOORbxlo8 Date: Wed, 24 Apr 2024 11:04:56 +0200 Subject: [PATCH 060/289] dx: Enhance error message when non-deterministic function is used with Replicated source --- src/Interpreters/MutationsInterpreter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 35fd549559b..667a3e2e7a6 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -1313,7 +1313,7 @@ void MutationsInterpreter::validate() if (nondeterministic_func_data.nondeterministic_function_name) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "ALTER UPDATE/ALTER DELETE statements must use only deterministic functions. " + "The source storage is replicated so ALTER UPDATE/ALTER DELETE statements must use only deterministic functions. 
" "Function '{}' is non-deterministic", *nondeterministic_func_data.nondeterministic_function_name); } } From b13c7d004c6f533a1931eb8ac5c529ca82914cd9 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Thu, 25 Apr 2024 14:51:44 +0000 Subject: [PATCH 061/289] fix tidy --- src/Storages/StorageS3.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index bdfd2b8b453..cb5734cfe0c 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -278,7 +278,6 @@ private: filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); fillInternalBufferAssumeLocked(); - return; } KeyWithInfoPtr nextAssumeLocked() From 34dd0a229f04f3b7f8b3181ced3be6430c0f1d2c Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 25 Apr 2024 15:46:29 +0000 Subject: [PATCH 062/289] Another one case of aliases with group_by_use_null --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 87 ++++++++++++------- ...up_by_use_nulls_analyzer_crashes.reference | 10 +++ ...23_group_by_use_nulls_analyzer_crashes.sql | 11 +++ 3 files changed, 76 insertions(+), 32 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index aedf860f5be..13ce3d7f0e2 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -474,7 +474,7 @@ struct TableExpressionData class ExpressionsStack { public: - void pushNode(const QueryTreeNodePtr & node) + void push(const QueryTreeNodePtr & node) { if (node->hasAlias()) { @@ -491,7 +491,7 @@ public: expressions.emplace_back(node); } - void popNode() + void pop() { const auto & top_expression = expressions.back(); const auto & top_expression_alias = top_expression->getAlias(); @@ -729,6 +729,8 @@ struct IdentifierResolveScope join_use_nulls = context->getSettingsRef().join_use_nulls; else if (parent_scope) join_use_nulls = parent_scope->join_use_nulls; + + alias_name_to_expression_node = &alias_name_to_expression_node_before_group_by; } QueryTreeNodePtr scope_node; @@ -744,7 +746,10 @@ struct IdentifierResolveScope std::unordered_map expression_argument_name_to_node; /// Alias name to query expression node - std::unordered_map alias_name_to_expression_node; + std::unordered_map alias_name_to_expression_node_before_group_by; + std::unordered_map alias_name_to_expression_node_after_group_by; + + std::unordered_map * alias_name_to_expression_node = nullptr; /// Alias name to lambda node std::unordered_map alias_name_to_lambda_node; @@ -877,6 +882,22 @@ struct IdentifierResolveScope return it->second; } + void pushExpressionNode(const QueryTreeNodePtr & node) + { + bool had_aggregate_function = expressions_in_resolve_process_stack.hasAggregateFunction(); + expressions_in_resolve_process_stack.push(node); + if (group_by_use_nulls && had_aggregate_function != expressions_in_resolve_process_stack.hasAggregateFunction()) + alias_name_to_expression_node = &alias_name_to_expression_node_before_group_by; + } + + void popExpressionNode() + { + bool had_aggregate_function = expressions_in_resolve_process_stack.hasAggregateFunction(); + expressions_in_resolve_process_stack.pop(); + if (group_by_use_nulls && had_aggregate_function != expressions_in_resolve_process_stack.hasAggregateFunction()) + alias_name_to_expression_node = &alias_name_to_expression_node_after_group_by; + } + /// Dump identifier resolve scope [[maybe_unused]] void dump(WriteBuffer & buffer) const { @@ -893,8 +914,8 @@ struct IdentifierResolveScope for (const 
auto & [alias_name, node] : expression_argument_name_to_node) buffer << "Alias name " << alias_name << " node " << node->formatASTForErrorMessage() << '\n'; - buffer << "Alias name to expression node table size " << alias_name_to_expression_node.size() << '\n'; - for (const auto & [alias_name, node] : alias_name_to_expression_node) + buffer << "Alias name to expression node table size " << alias_name_to_expression_node->size() << '\n'; + for (const auto & [alias_name, node] : *alias_name_to_expression_node) buffer << "Alias name " << alias_name << " expression node " << node->dumpTree() << '\n'; buffer << "Alias name to function node table size " << alias_name_to_lambda_node.size() << '\n'; @@ -1022,7 +1043,7 @@ private: if (is_lambda_node) { - if (scope.alias_name_to_expression_node.contains(alias)) + if (scope.alias_name_to_expression_node->contains(alias)) scope.nodes_with_duplicated_aliases.insert(node); auto [_, inserted] = scope.alias_name_to_lambda_node.insert(std::make_pair(alias, node)); @@ -1035,7 +1056,7 @@ private: if (scope.alias_name_to_lambda_node.contains(alias)) scope.nodes_with_duplicated_aliases.insert(node); - auto [_, inserted] = scope.alias_name_to_expression_node.insert(std::make_pair(alias, node)); + auto [_, inserted] = scope.alias_name_to_expression_node->insert(std::make_pair(alias, node)); if (!inserted) scope.nodes_with_duplicated_aliases.insert(node); @@ -1837,7 +1858,7 @@ void QueryAnalyzer::collectScopeValidIdentifiersForTypoCorrection( if (allow_expression_identifiers) { - for (const auto & [name, expression] : scope.alias_name_to_expression_node) + for (const auto & [name, expression] : *scope.alias_name_to_expression_node) { assert(expression); auto expression_identifier = Identifier(name); @@ -1867,7 +1888,7 @@ void QueryAnalyzer::collectScopeValidIdentifiersForTypoCorrection( { if (allow_function_identifiers) { - for (const auto & [name, _] : scope.alias_name_to_expression_node) + for (const auto & [name, _] : *scope.alias_name_to_expression_node) valid_identifiers_result.insert(Identifier(name)); } @@ -2767,7 +2788,7 @@ bool QueryAnalyzer::tryBindIdentifierToAliases(const IdentifierLookup & identifi auto get_alias_name_to_node_map = [&]() -> const std::unordered_map & { if (identifier_lookup.isExpressionLookup()) - return scope.alias_name_to_expression_node; + return *scope.alias_name_to_expression_node; else if (identifier_lookup.isFunctionLookup()) return scope.alias_name_to_lambda_node; @@ -2829,7 +2850,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier auto get_alias_name_to_node_map = [&]() -> std::unordered_map & { if (identifier_lookup.isExpressionLookup()) - return scope.alias_name_to_expression_node; + return *scope.alias_name_to_expression_node; else if (identifier_lookup.isFunctionLookup()) return scope.alias_name_to_lambda_node; @@ -2867,7 +2888,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier /// Resolve expression if necessary if (node_type == QueryTreeNodeType::IDENTIFIER) { - scope.expressions_in_resolve_process_stack.pushNode(it->second); + scope.pushExpressionNode(it->second); auto & alias_identifier_node = it->second->as(); auto identifier = alias_identifier_node.getIdentifier(); @@ -2898,9 +2919,9 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier if (identifier_lookup.isExpressionLookup()) scope.alias_name_to_lambda_node.erase(identifier_bind_part); else if (identifier_lookup.isFunctionLookup()) - 
scope.alias_name_to_expression_node.erase(identifier_bind_part); + scope.alias_name_to_expression_node->erase(identifier_bind_part); - scope.expressions_in_resolve_process_stack.popNode(); + scope.popExpressionNode(); } else if (node_type == QueryTreeNodeType::FUNCTION) { @@ -5195,7 +5216,7 @@ ProjectionNames QueryAnalyzer::resolveLambda(const QueryTreeNodePtr & lambda_nod auto & lambda_argument_node_typed = lambda_argument_node->as(); const auto & lambda_argument_name = lambda_argument_node_typed.getIdentifier().getFullName(); - bool has_expression_node = scope.alias_name_to_expression_node.contains(lambda_argument_name); + bool has_expression_node = scope.alias_name_to_expression_node->contains(lambda_argument_name); bool has_alias_node = scope.alias_name_to_lambda_node.contains(lambda_argument_name); if (has_expression_node || has_alias_node) @@ -6200,8 +6221,8 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id * * To resolve b we need to resolve a. */ - auto it = scope.alias_name_to_expression_node.find(node_alias); - if (it != scope.alias_name_to_expression_node.end()) + auto it = scope.alias_name_to_expression_node->find(node_alias); + if (it != scope.alias_name_to_expression_node->end()) node = it->second; if (allow_lambda_expression) @@ -6212,7 +6233,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id } } - scope.expressions_in_resolve_process_stack.pushNode(node); + scope.pushExpressionNode(node); auto node_type = node->getNodeType(); @@ -6241,7 +6262,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id resolved_identifier_node = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::FUNCTION}, scope).resolved_identifier; if (resolved_identifier_node && !node_alias.empty()) - scope.alias_name_to_expression_node.erase(node_alias); + scope.alias_name_to_expression_node->erase(node_alias); } if (!resolved_identifier_node && allow_table_expression) @@ -6472,8 +6493,8 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id */ if (!node_alias.empty() && use_alias_table && !scope.group_by_use_nulls) { - auto it = scope.alias_name_to_expression_node.find(node_alias); - if (it != scope.alias_name_to_expression_node.end()) + auto it = scope.alias_name_to_expression_node->find(node_alias); + if (it != scope.alias_name_to_expression_node->end()) it->second = node; if (allow_lambda_expression) @@ -6486,7 +6507,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id resolved_expressions.emplace(node, result_projection_names); - scope.expressions_in_resolve_process_stack.popNode(); + scope.popExpressionNode(); bool expression_was_root = scope.expressions_in_resolve_process_stack.empty(); if (expression_was_root) scope.non_cached_identifier_lookups_during_expression_resolve.clear(); @@ -6830,11 +6851,11 @@ void QueryAnalyzer::initializeQueryJoinTreeNode(QueryTreeNodePtr & join_tree_nod */ resolve_settings.allow_to_resolve_subquery_during_identifier_resolution = false; - scope.expressions_in_resolve_process_stack.pushNode(current_join_tree_node); + scope.pushExpressionNode(current_join_tree_node); auto table_identifier_resolve_result = tryResolveIdentifier(table_identifier_lookup, scope, resolve_settings); - scope.expressions_in_resolve_process_stack.popNode(); + scope.popExpressionNode(); bool expression_was_root = scope.expressions_in_resolve_process_stack.empty(); if (expression_was_root) 
scope.non_cached_identifier_lookups_during_expression_resolve.clear(); @@ -7418,7 +7439,7 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif for (auto & array_join_expression : array_join_nodes) { auto array_join_expression_alias = array_join_expression->getAlias(); - if (!array_join_expression_alias.empty() && scope.alias_name_to_expression_node.contains(array_join_expression_alias)) + if (!array_join_expression_alias.empty() && scope.alias_name_to_expression_node->contains(array_join_expression_alias)) throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS, "ARRAY JOIN expression {} with duplicate alias {}. In scope {}", array_join_expression->formatASTForErrorMessage(), @@ -7512,8 +7533,8 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif array_join_nodes = std::move(array_join_column_expressions); for (auto & array_join_column_expression : array_join_nodes) { - auto it = scope.alias_name_to_expression_node.find(array_join_column_expression->getAlias()); - if (it != scope.alias_name_to_expression_node.end()) + auto it = scope.alias_name_to_expression_node->find(array_join_column_expression->getAlias()); + if (it != scope.alias_name_to_expression_node->end()) { auto & array_join_column_expression_typed = array_join_column_expression->as(); auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), @@ -8037,8 +8058,10 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier /// Clone is needed cause aliases share subtrees. /// If not clone, the same (shared) subtree could be resolved again with different (Nullable) type /// See 03023_group_by_use_nulls_analyzer_crashes - for (auto & [_, node] : scope.alias_name_to_expression_node) - node = node->clone(); + for (auto & [key, node] : scope.alias_name_to_expression_node_before_group_by) + scope.alias_name_to_expression_node_after_group_by[key] = node->clone(); + + scope.alias_name_to_expression_node = &scope.alias_name_to_expression_node_after_group_by; } if (query_node_typed.hasHaving()) @@ -8115,8 +8138,8 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier bool has_node_in_alias_table = false; - auto it = scope.alias_name_to_expression_node.find(node_alias); - if (it != scope.alias_name_to_expression_node.end()) + auto it = scope.alias_name_to_expression_node->find(node_alias); + if (it != scope.alias_name_to_expression_node->end()) { has_node_in_alias_table = true; @@ -8175,7 +8198,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier /// Remove aliases from expression and lambda nodes - for (auto & [_, node] : scope.alias_name_to_expression_node) + for (auto & [_, node] : *scope.alias_name_to_expression_node) node->removeAlias(); for (auto & [_, node] : scope.alias_name_to_lambda_node) diff --git a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference index e2682487229..0eb9d94e85a 100644 --- a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference +++ b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference @@ -76,3 +76,13 @@ a a \N 2 \N 4 \N \N +0 0 nan +2 4 nan +1 2 nan +2 \N nan +0 \N nan +1 \N nan +\N 2 nan +\N 0 nan +\N 4 nan +\N \N nan diff --git a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql 
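To restate the mechanism this analyzer patch builds: with group_by_use_nulls the GROUP BY keys become Nullable after aggregation, so the same alias must resolve to the original node inside an aggregate function's arguments but to the Nullable post-GROUP-BY clone everywhere else; hence the two alias maps and the pointer that flips when the resolve stack gains or loses an aggregate function. A toy model of that bookkeeping, with stand-in types rather than the analyzer's:

    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Node { std::string name; bool is_aggregate = false; };
    using AliasMap = std::unordered_map<std::string, std::shared_ptr<Node>>;

    // Aliases resolve to pre-GROUP-BY nodes inside aggregate arguments and to
    // post-GROUP-BY (Nullable) clones outside them; `active` selects the map.
    struct AliasScope
    {
        AliasMap before_group_by;
        AliasMap after_group_by;
        AliasMap * active = &before_group_by;
        std::vector<std::shared_ptr<Node>> resolve_stack;
        size_t aggregates_on_stack = 0;

        void push(const std::shared_ptr<Node> & node)
        {
            bool had_aggregate = aggregates_on_stack > 0;
            resolve_stack.push_back(node);
            aggregates_on_stack += node->is_aggregate ? 1 : 0;
            if (!had_aggregate && aggregates_on_stack > 0)
                active = &before_group_by;   // entered an aggregate's arguments
        }

        void pop()
        {
            bool had_aggregate = aggregates_on_stack > 0;
            aggregates_on_stack -= resolve_stack.back()->is_aggregate ? 1 : 0;
            resolve_stack.pop_back();
            if (had_aggregate && aggregates_on_stack == 0)
                active = &after_group_by;    // left the last aggregate's subtree
        }
    };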
index f2d4ac8acee..7311ce54e39 100644 --- a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql +++ b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql @@ -32,3 +32,14 @@ SELECT a, b FROM numbers(3) GROUP BY number as a, (number + number) as b WITH CUBE ORDER BY a, b; + +SELECT + a, + b, + cramersVBiasCorrected(a, b) +FROM numbers(3) +GROUP BY + number AS a, + number + number AS b + WITH CUBE +SETTINGS group_by_use_nulls = 1; From e09530ab755964b6da12718279ef345bf2800d43 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 8 Dec 2023 16:51:35 +0100 Subject: [PATCH 063/289] Fix making backup when multiple shards are used. --- src/Backups/BackupCoordinationLocal.cpp | 24 ++++---- src/Backups/BackupCoordinationLocal.h | 12 ++-- src/Backups/BackupCoordinationRemote.cpp | 48 ++++++++-------- src/Backups/BackupCoordinationRemote.h | 12 ++-- .../BackupCoordinationReplicatedTables.cpp | 24 ++++---- .../BackupCoordinationReplicatedTables.h | 14 ++--- src/Backups/BackupEntriesCollector.cpp | 10 ++-- src/Backups/BackupEntriesCollector.h | 2 +- src/Backups/BackupUtils.cpp | 2 +- src/Backups/DDLAdjustingForBackupVisitor.cpp | 10 +--- src/Backups/DDLAdjustingForBackupVisitor.h | 5 +- src/Backups/IBackupCoordination.h | 12 ++-- src/Storages/StorageReplicatedMergeTree.cpp | 56 +++++-------------- src/Storages/StorageReplicatedMergeTree.h | 7 +-- 14 files changed, 100 insertions(+), 138 deletions(-) diff --git a/src/Backups/BackupCoordinationLocal.cpp b/src/Backups/BackupCoordinationLocal.cpp index 9964de2ad6e..efdc18cc29c 100644 --- a/src/Backups/BackupCoordinationLocal.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -33,42 +33,42 @@ Strings BackupCoordinationLocal::waitForStage(const String &, std::chrono::milli return {}; } -void BackupCoordinationLocal::addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) +void BackupCoordinationLocal::addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) { std::lock_guard lock{replicated_tables_mutex}; - replicated_tables.addPartNames({table_shared_id, table_name_for_logs, replica_name, part_names_and_checksums}); + replicated_tables.addPartNames({table_zk_path, table_name_for_logs, replica_name, part_names_and_checksums}); } -Strings BackupCoordinationLocal::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const +Strings BackupCoordinationLocal::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const { std::lock_guard lock{replicated_tables_mutex}; - return replicated_tables.getPartNames(table_shared_id, replica_name); + return replicated_tables.getPartNames(table_zk_path, replica_name); } -void BackupCoordinationLocal::addReplicatedMutations(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector & mutations) +void BackupCoordinationLocal::addReplicatedMutations(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & mutations) { std::lock_guard lock{replicated_tables_mutex}; - replicated_tables.addMutations({table_shared_id, table_name_for_logs, replica_name, mutations}); + replicated_tables.addMutations({table_zk_path, table_name_for_logs, replica_name, mutations}); } -std::vector 
BackupCoordinationLocal::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const +std::vector BackupCoordinationLocal::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const { std::lock_guard lock{replicated_tables_mutex}; - return replicated_tables.getMutations(table_shared_id, replica_name); + return replicated_tables.getMutations(table_zk_path, replica_name); } -void BackupCoordinationLocal::addReplicatedDataPath(const String & table_shared_id, const String & data_path) +void BackupCoordinationLocal::addReplicatedDataPath(const String & table_zk_path, const String & data_path) { std::lock_guard lock{replicated_tables_mutex}; - replicated_tables.addDataPath({table_shared_id, data_path}); + replicated_tables.addDataPath({table_zk_path, data_path}); } -Strings BackupCoordinationLocal::getReplicatedDataPaths(const String & table_shared_id) const +Strings BackupCoordinationLocal::getReplicatedDataPaths(const String & table_zk_path) const { std::lock_guard lock{replicated_tables_mutex}; - return replicated_tables.getDataPaths(table_shared_id); + return replicated_tables.getDataPaths(table_zk_path); } diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index e0aa5dc67a4..a7f15c79649 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -29,16 +29,16 @@ public: Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; - void addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, + void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) override; - Strings getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const override; + Strings getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const override; - void addReplicatedMutations(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, + void addReplicatedMutations(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & mutations) override; - std::vector getReplicatedMutations(const String & table_shared_id, const String & replica_name) const override; + std::vector getReplicatedMutations(const String & table_zk_path, const String & replica_name) const override; - void addReplicatedDataPath(const String & table_shared_id, const String & data_path) override; - Strings getReplicatedDataPaths(const String & table_shared_id) const override; + void addReplicatedDataPath(const String & table_zk_path, const String & data_path) override; + Strings getReplicatedDataPaths(const String & table_zk_path) const override; void addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path) override; Strings getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const override; diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index 455f45a7a77..f353062f628 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -358,7 +358,7 @@ String 
BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const Str void BackupCoordinationRemote::addReplicatedPartNames( - const String & table_shared_id, + const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) @@ -374,22 +374,22 @@ void BackupCoordinationRemote::addReplicatedPartNames( [&, &zk = holder.faulty_zookeeper]() { with_retries.renewZooKeeper(zk); - String path = zookeeper_path + "/repl_part_names/" + escapeForFileName(table_shared_id); + String path = zookeeper_path + "/repl_part_names/" + escapeForFileName(table_zk_path); zk->createIfNotExists(path, ""); path += "/" + escapeForFileName(replica_name); zk->createIfNotExists(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs)); }); } -Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const +Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const { std::lock_guard lock{replicated_tables_mutex}; prepareReplicatedTables(); - return replicated_tables->getPartNames(table_shared_id, replica_name); + return replicated_tables->getPartNames(table_zk_path, replica_name); } void BackupCoordinationRemote::addReplicatedMutations( - const String & table_shared_id, + const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & mutations) @@ -405,23 +405,23 @@ void BackupCoordinationRemote::addReplicatedMutations( [&, &zk = holder.faulty_zookeeper]() { with_retries.renewZooKeeper(zk); - String path = zookeeper_path + "/repl_mutations/" + escapeForFileName(table_shared_id); + String path = zookeeper_path + "/repl_mutations/" + escapeForFileName(table_zk_path); zk->createIfNotExists(path, ""); path += "/" + escapeForFileName(replica_name); zk->createIfNotExists(path, ReplicatedMutations::serialize(mutations, table_name_for_logs)); }); } -std::vector BackupCoordinationRemote::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const +std::vector BackupCoordinationRemote::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const { std::lock_guard lock{replicated_tables_mutex}; prepareReplicatedTables(); - return replicated_tables->getMutations(table_shared_id, replica_name); + return replicated_tables->getMutations(table_zk_path, replica_name); } void BackupCoordinationRemote::addReplicatedDataPath( - const String & table_shared_id, const String & data_path) + const String & table_zk_path, const String & data_path) { { std::lock_guard lock{replicated_tables_mutex}; @@ -434,18 +434,18 @@ void BackupCoordinationRemote::addReplicatedDataPath( [&, &zk = holder.faulty_zookeeper]() { with_retries.renewZooKeeper(zk); - String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_shared_id); + String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_zk_path); zk->createIfNotExists(path, ""); path += "/" + escapeForFileName(data_path); zk->createIfNotExists(path, ""); }); } -Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_shared_id) const +Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk_path) const { std::lock_guard lock{replicated_tables_mutex}; prepareReplicatedTables(); - return replicated_tables->getDataPaths(table_shared_id); + return replicated_tables->getDataPaths(table_zk_path); } @@ 
-464,16 +464,16 @@ void BackupCoordinationRemote::prepareReplicatedTables() const with_retries.renewZooKeeper(zk); String path = zookeeper_path + "/repl_part_names"; - for (const String & escaped_table_shared_id : zk->getChildren(path)) + for (const String & escaped_table_zk_path : zk->getChildren(path)) { - String table_shared_id = unescapeForFileName(escaped_table_shared_id); - String path2 = path + "/" + escaped_table_shared_id; + String table_zk_path = unescapeForFileName(escaped_table_zk_path); + String path2 = path + "/" + escaped_table_zk_path; for (const String & escaped_replica_name : zk->getChildren(path2)) { String replica_name = unescapeForFileName(escaped_replica_name); auto part_names = ReplicatedPartNames::deserialize(zk->get(path2 + "/" + escaped_replica_name)); part_names_for_replicated_tables.push_back( - {table_shared_id, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums}); + {table_zk_path, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums}); } } }); @@ -489,16 +489,16 @@ void BackupCoordinationRemote::prepareReplicatedTables() const with_retries.renewZooKeeper(zk); String path = zookeeper_path + "/repl_mutations"; - for (const String & escaped_table_shared_id : zk->getChildren(path)) + for (const String & escaped_table_zk_path : zk->getChildren(path)) { - String table_shared_id = unescapeForFileName(escaped_table_shared_id); - String path2 = path + "/" + escaped_table_shared_id; + String table_zk_path = unescapeForFileName(escaped_table_zk_path); + String path2 = path + "/" + escaped_table_zk_path; for (const String & escaped_replica_name : zk->getChildren(path2)) { String replica_name = unescapeForFileName(escaped_replica_name); auto mutations = ReplicatedMutations::deserialize(zk->get(path2 + "/" + escaped_replica_name)); mutations_for_replicated_tables.push_back( - {table_shared_id, mutations.table_name_for_logs, replica_name, mutations.mutations}); + {table_zk_path, mutations.table_name_for_logs, replica_name, mutations.mutations}); } } }); @@ -514,14 +514,14 @@ void BackupCoordinationRemote::prepareReplicatedTables() const with_retries.renewZooKeeper(zk); String path = zookeeper_path + "/repl_data_paths"; - for (const String & escaped_table_shared_id : zk->getChildren(path)) + for (const String & escaped_table_zk_path : zk->getChildren(path)) { - String table_shared_id = unescapeForFileName(escaped_table_shared_id); - String path2 = path + "/" + escaped_table_shared_id; + String table_zk_path = unescapeForFileName(escaped_table_zk_path); + String path2 = path + "/" + escaped_table_zk_path; for (const String & escaped_data_path : zk->getChildren(path2)) { String data_path = unescapeForFileName(escaped_data_path); - data_paths_for_replicated_tables.push_back({table_shared_id, data_path}); + data_paths_for_replicated_tables.push_back({table_zk_path, data_path}); } } }); diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationRemote.h index ce891699bd2..7a56b1a4eb8 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationRemote.h @@ -41,23 +41,23 @@ public: Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; void addReplicatedPartNames( - const String & table_shared_id, + const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) override; - Strings getReplicatedPartNames(const String & table_shared_id, const String & 
+    Strings getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const override;
 
     void addReplicatedMutations(
-        const String & table_shared_id,
+        const String & table_zk_path,
         const String & table_name_for_logs,
         const String & replica_name,
         const std::vector<MutationInfo> & mutations) override;
 
-    std::vector<MutationInfo> getReplicatedMutations(const String & table_shared_id, const String & replica_name) const override;
+    std::vector<MutationInfo> getReplicatedMutations(const String & table_zk_path, const String & replica_name) const override;
 
-    void addReplicatedDataPath(const String & table_shared_id, const String & data_path) override;
-    Strings getReplicatedDataPaths(const String & table_shared_id) const override;
+    void addReplicatedDataPath(const String & table_zk_path, const String & data_path) override;
+    Strings getReplicatedDataPaths(const String & table_zk_path) const override;
 
     void addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path) override;
     Strings getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const override;
diff --git a/src/Backups/BackupCoordinationReplicatedTables.cpp b/src/Backups/BackupCoordinationReplicatedTables.cpp
index 1cbb88acb82..a435667f79a 100644
--- a/src/Backups/BackupCoordinationReplicatedTables.cpp
+++ b/src/Backups/BackupCoordinationReplicatedTables.cpp
@@ -151,7 +151,7 @@ BackupCoordinationReplicatedTables::~BackupCoordinationReplicatedTables() = defa
 
 void BackupCoordinationReplicatedTables::addPartNames(PartNamesForTableReplica && part_names)
 {
-    const auto & table_shared_id = part_names.table_shared_id;
+    const auto & table_zk_path = part_names.table_zk_path;
     const auto & table_name_for_logs = part_names.table_name_for_logs;
     const auto & replica_name = part_names.replica_name;
     const auto & part_names_and_checksums = part_names.part_names_and_checksums;
@@ -159,7 +159,7 @@ void BackupCoordinationReplicatedTables::addPartNames(PartNamesForTableReplica &
     if (prepared)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "addPartNames() must not be called after preparing");
 
-    auto & table_info = table_infos[table_shared_id];
+    auto & table_info = table_infos[table_zk_path];
     table_info.table_name_for_logs = table_name_for_logs;
 
     if (!table_info.covered_parts_finder)
@@ -200,11 +200,11 @@ void BackupCoordinationReplicatedTables::addPartNames(PartNamesForTableReplica &
     }
 }
 
-Strings BackupCoordinationReplicatedTables::getPartNames(const String & table_shared_id, const String & replica_name) const
+Strings BackupCoordinationReplicatedTables::getPartNames(const String & table_zk_path, const String & replica_name) const
 {
     prepare();
 
-    auto it = table_infos.find(table_shared_id);
+    auto it = table_infos.find(table_zk_path);
     if (it == table_infos.end())
         return {};
 
@@ -218,7 +218,7 @@ Strings BackupCoordinationReplicatedTables::getPartNames(const String & table_sh
 
 void BackupCoordinationReplicatedTables::addMutations(MutationsForTableReplica && mutations_for_table_replica)
 {
-    const auto & table_shared_id = mutations_for_table_replica.table_shared_id;
+    const auto & table_zk_path = mutations_for_table_replica.table_zk_path;
     const auto & table_name_for_logs = mutations_for_table_replica.table_name_for_logs;
     const auto & replica_name = mutations_for_table_replica.replica_name;
     const auto & mutations = mutations_for_table_replica.mutations;
@@ -226,7 +226,7 @@ void BackupCoordinationReplicatedTables::addMutations(MutationsForTableReplica &
     if (prepared)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "addMutations() must not be called after preparing");
 
-    auto & table_info = table_infos[table_shared_id];
+    auto & table_info = table_infos[table_zk_path];
     table_info.table_name_for_logs = table_name_for_logs;
     for (const auto & [mutation_id, mutation_entry] : mutations)
         table_info.mutations.emplace(mutation_id, mutation_entry);
@@ -236,11 +236,11 @@ void BackupCoordinationReplicatedTables::addMutations(MutationsForTableReplica &
 }
 
 std::vector<MutationInfo>
-BackupCoordinationReplicatedTables::getMutations(const String & table_shared_id, const String & replica_name) const
+BackupCoordinationReplicatedTables::getMutations(const String & table_zk_path, const String & replica_name) const
 {
     prepare();
 
-    auto it = table_infos.find(table_shared_id);
+    auto it = table_infos.find(table_zk_path);
     if (it == table_infos.end())
         return {};
 
@@ -257,16 +257,16 @@ BackupCoordinationReplicatedTables::getMutations(const String & table_shared_id,
 
 void BackupCoordinationReplicatedTables::addDataPath(DataPathForTableReplica && data_path_for_table_replica)
 {
-    const auto & table_shared_id = data_path_for_table_replica.table_shared_id;
+    const auto & table_zk_path = data_path_for_table_replica.table_zk_path;
     const auto & data_path = data_path_for_table_replica.data_path;
 
-    auto & table_info = table_infos[table_shared_id];
+    auto & table_info = table_infos[table_zk_path];
     table_info.data_paths.emplace(data_path);
 }
 
-Strings BackupCoordinationReplicatedTables::getDataPaths(const String & table_shared_id) const
+Strings BackupCoordinationReplicatedTables::getDataPaths(const String & table_zk_path) const
 {
-    auto it = table_infos.find(table_shared_id);
+    auto it = table_infos.find(table_zk_path);
     if (it == table_infos.end())
         return {};
 
diff --git a/src/Backups/BackupCoordinationReplicatedTables.h b/src/Backups/BackupCoordinationReplicatedTables.h
index 74f21eb9c7c..50ab56aef75 100644
--- a/src/Backups/BackupCoordinationReplicatedTables.h
+++ b/src/Backups/BackupCoordinationReplicatedTables.h
@@ -40,7 +40,7 @@ public:
 
     struct PartNamesForTableReplica
     {
-        String table_shared_id;
+        String table_zk_path;
         String table_name_for_logs;
         String replica_name;
         std::vector<PartNameAndChecksum> part_names_and_checksums;
@@ -55,13 +55,13 @@ public:
     /// Returns the names of the parts which a specified replica of a replicated table should put to the backup.
     /// This is the same list as it was added by call of the function addPartNames() but without duplications and without
     /// parts covered by another parts.
-    Strings getPartNames(const String & table_shared_id, const String & replica_name) const;
+    Strings getPartNames(const String & table_zk_path, const String & replica_name) const;
 
     using MutationInfo = IBackupCoordination::MutationInfo;
 
     struct MutationsForTableReplica
     {
-        String table_shared_id;
+        String table_zk_path;
         String table_name_for_logs;
         String replica_name;
         std::vector<MutationInfo> mutations;
@@ -71,11 +71,11 @@ public:
     void addMutations(MutationsForTableReplica && mutations_for_table_replica);
 
     /// Returns all mutations of a replicated table which are not finished for some data parts added by addReplicatedPartNames().
-    std::vector<MutationInfo> getMutations(const String & table_shared_id, const String & replica_name) const;
+    std::vector<MutationInfo> getMutations(const String & table_zk_path, const String & replica_name) const;
 
     struct DataPathForTableReplica
     {
-        String table_shared_id;
+        String table_zk_path;
         String data_path;
     };
 
@@ -85,7 +85,7 @@ public:
     void addDataPath(DataPathForTableReplica && data_path_for_table_replica);
 
     /// Returns all the data paths in backup added for a replicated table (see also addReplicatedDataPath()).
-    Strings getDataPaths(const String & table_shared_id) const;
+    Strings getDataPaths(const String & table_zk_path) const;
 
 private:
     void prepare() const;
@@ -110,7 +110,7 @@ private:
         std::unordered_set<String> data_paths;
     };
 
-    std::map<String /* table_shared_id */, TableInfo> table_infos; /// Should be ordered because we need this map to be in the same order on every replica.
+    std::map<String /* table_zk_path */, TableInfo> table_infos; /// Should be ordered because we need this map to be in the same order on every replica.
     mutable bool prepared = false;
 };
 
diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp
index cc014c279cc..136e3c49321 100644
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <Storages/MergeTree/extractZkPathFromCreateQuery.h>
 #include
 #include
 #include
@@ -758,7 +759,7 @@ void BackupEntriesCollector::makeBackupEntriesForDatabasesDefs()
         checkIsQueryCancelled();
 
         ASTPtr new_create_query = database_info.create_database_query;
-        adjustCreateQueryForBackup(new_create_query, context->getGlobalContext(), nullptr);
+        adjustCreateQueryForBackup(new_create_query, context->getGlobalContext());
         renameDatabaseAndTableNameInCreateQuery(new_create_query, renaming_map, context->getGlobalContext());
 
         const String & metadata_path_in_backup = database_info.metadata_path_in_backup;
@@ -775,7 +776,8 @@ void BackupEntriesCollector::makeBackupEntriesForTablesDefs()
         checkIsQueryCancelled();
 
         ASTPtr new_create_query = table_info.create_table_query;
-        adjustCreateQueryForBackup(new_create_query, context->getGlobalContext(), &table_info.replicated_table_shared_id);
+        table_info.replicated_table_zk_path = tryExtractZkPathFromCreateQuery(*new_create_query, context->getGlobalContext());
+        adjustCreateQueryForBackup(new_create_query, context->getGlobalContext());
         renameDatabaseAndTableNameInCreateQuery(new_create_query, renaming_map, context->getGlobalContext());
 
         const String & metadata_path_in_backup = table_info.metadata_path_in_backup;
@@ -814,8 +816,8 @@ void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableN
         /// If this table is replicated in this case we call IBackupCoordination::addReplicatedDataPath() which will cause
         /// other replicas to fill the storage's data in the backup.
         /// If this table is not replicated we'll do nothing leaving the storage's data empty in the backup.
-        if (table_info.replicated_table_shared_id)
-            backup_coordination->addReplicatedDataPath(*table_info.replicated_table_shared_id, data_path_in_backup);
+        if (table_info.replicated_table_zk_path)
+            backup_coordination->addReplicatedDataPath(*table_info.replicated_table_zk_path, data_path_in_backup);
         return;
     }
 
diff --git a/src/Backups/BackupEntriesCollector.h b/src/Backups/BackupEntriesCollector.h
index 01e8d594334..c7bce077a2d 100644
--- a/src/Backups/BackupEntriesCollector.h
+++ b/src/Backups/BackupEntriesCollector.h
@@ -164,7 +164,7 @@ private:
         ASTPtr create_table_query;
         String metadata_path_in_backup;
         std::filesystem::path data_path_in_backup;
-        std::optional<String> replicated_table_shared_id;
+        std::optional<String> replicated_table_zk_path;
         std::optional<ASTs> partitions;
     };
 
diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp
index fb448fb64ad..fa8ed5855dd 100644
--- a/src/Backups/BackupUtils.cpp
+++ b/src/Backups/BackupUtils.cpp
@@ -103,7 +103,7 @@ bool compareRestoredTableDef(const IAST & restored_table_create_query, const IAS
     auto adjust_before_comparison = [&](const IAST & query) -> ASTPtr
     {
         auto new_query = query.clone();
-        adjustCreateQueryForBackup(new_query, global_context, nullptr);
+        adjustCreateQueryForBackup(new_query, global_context);
         ASTCreateQuery & create = typeid_cast<ASTCreateQuery &>(*new_query);
         create.setUUID({});
         create.if_not_exists = false;
diff --git a/src/Backups/DDLAdjustingForBackupVisitor.cpp b/src/Backups/DDLAdjustingForBackupVisitor.cpp
index 5ea91094b75..7e5ce91629b 100644
--- a/src/Backups/DDLAdjustingForBackupVisitor.cpp
+++ b/src/Backups/DDLAdjustingForBackupVisitor.cpp
@@ -27,9 +27,6 @@ namespace
     {
         /// Precondition: engine_name.starts_with("Replicated") && engine_name.ends_with("MergeTree")
 
-        if (data.replicated_table_shared_id)
-            *data.replicated_table_shared_id = StorageReplicatedMergeTree::tryGetTableSharedIDFromCreateQuery(*data.create_query, data.global_context);
-
        /// Before storing the metadata in a backup we have to find a zookeeper path in its definition and turn the table's UUID in there
        /// back into "{uuid}", and also we probably can remove the zookeeper path and replica name if they're default.
        /// So we're kind of reverting what we had done to the table's definition in registerStorageMergeTree.cpp before we created this table.
@@ -98,12 +95,9 @@ void DDLAdjustingForBackupVisitor::visit(ASTPtr ast, const Data & data)
         visitCreateQuery(*create, data);
 }
 
-void adjustCreateQueryForBackup(ASTPtr ast, const ContextPtr & global_context, std::optional<String> * replicated_table_shared_id)
+void adjustCreateQueryForBackup(ASTPtr ast, const ContextPtr & global_context)
 {
-    if (replicated_table_shared_id)
-        *replicated_table_shared_id = {};
-
-    DDLAdjustingForBackupVisitor::Data data{ast, global_context, replicated_table_shared_id};
+    DDLAdjustingForBackupVisitor::Data data{ast, global_context};
     DDLAdjustingForBackupVisitor::Visitor{data}.visit(ast);
 }
 
diff --git a/src/Backups/DDLAdjustingForBackupVisitor.h b/src/Backups/DDLAdjustingForBackupVisitor.h
index 63353dcc000..f0508434e02 100644
--- a/src/Backups/DDLAdjustingForBackupVisitor.h
+++ b/src/Backups/DDLAdjustingForBackupVisitor.h
@@ -12,9 +12,7 @@ class Context;
 using ContextPtr = std::shared_ptr<const Context>;
 
 /// Changes a create query to a form which is appropriate or suitable for saving in a backup.
-/// Also extracts a replicated table's shared ID from the create query if this is a create query for a replicated table.
-/// `replicated_table_shared_id` can be null if you don't need that.
-void adjustCreateQueryForBackup(ASTPtr ast, const ContextPtr & global_context, std::optional<String> * replicated_table_shared_id);
+void adjustCreateQueryForBackup(ASTPtr ast, const ContextPtr & global_context);
 
 /// Visits ASTCreateQuery and changes it to a form which is appropriate or suitable for saving in a backup.
 class DDLAdjustingForBackupVisitor
@@ -24,7 +22,6 @@ public:
     {
         ASTPtr create_query;
         ContextPtr global_context;
-        std::optional<String> * replicated_table_shared_id = nullptr;
     };
 
     using Visitor = InDepthNodeVisitor<DDLAdjustingForBackupVisitor, false>;
diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h
index f80b5dee883..4a9f8a23855 100644
--- a/src/Backups/IBackupCoordination.h
+++ b/src/Backups/IBackupCoordination.h
@@ -36,13 +36,13 @@ public:
     /// Multiple replicas of the replicated table call this function and then the added part names can be returned by call of the function
     /// getReplicatedPartNames().
     /// Checksums are used only to control that parts under the same names on different replicas are the same.
-    virtual void addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name,
+    virtual void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name,
                                         const std::vector<PartNameAndChecksum> & part_names_and_checksums) = 0;
 
     /// Returns the names of the parts which a specified replica of a replicated table should put to the backup.
     /// This is the same list as it was added by call of the function addReplicatedPartNames() but without duplications and without
     /// parts covered by another parts.
-    virtual Strings getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const = 0;
+    virtual Strings getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const = 0;
 
     struct MutationInfo
     {
@@ -51,10 +51,10 @@ public:
     };
 
     /// Adds information about mutations of a replicated table.
-    virtual void addReplicatedMutations(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector<MutationInfo> & mutations) = 0;
+    virtual void addReplicatedMutations(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector<MutationInfo> & mutations) = 0;
 
     /// Returns all mutations of a replicated table which are not finished for some data parts added by addReplicatedPartNames().
-    virtual std::vector<MutationInfo> getReplicatedMutations(const String & table_shared_id, const String & replica_name) const = 0;
+    virtual std::vector<MutationInfo> getReplicatedMutations(const String & table_zk_path, const String & replica_name) const = 0;
 
     /// Adds information about KeeperMap tables
     virtual void addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup) = 0;
@@ -65,10 +65,10 @@ public:
     /// Adds a data path in backup for a replicated table.
     /// Multiple replicas of the replicated table call this function and then all the added paths can be returned by call of the function
     /// getReplicatedDataPaths().
-    virtual void addReplicatedDataPath(const String & table_shared_id, const String & data_path) = 0;
+    virtual void addReplicatedDataPath(const String & table_zk_path, const String & data_path) = 0;
 
     /// Returns all the data paths in backup added for a replicated table (see also addReplicatedDataPath()).
-    virtual Strings getReplicatedDataPaths(const String & table_shared_id) const = 0;
+    virtual Strings getReplicatedDataPaths(const String & table_zk_path) const = 0;
 
     /// Adds a path to access.txt file keeping access entities of a ReplicatedAccessStorage.
     virtual void addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path) = 0;
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index c425035dfba..58d1846915f 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -187,7 +187,6 @@ namespace ErrorCodes
     extern const int NOT_INITIALIZED;
     extern const int TOO_LARGE_DISTRIBUTED_DEPTH;
     extern const int TABLE_IS_DROPPED;
-    extern const int CANNOT_BACKUP_TABLE;
     extern const int SUPPORT_IS_DISABLED;
     extern const int FAULT_INJECTED;
     extern const int CANNOT_FORGET_PARTITION;
@@ -310,8 +309,9 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
         true,   /// require_part_metadata
         mode,
         [this] (const std::string & name) { enqueuePartForCheck(name); })
-    , zookeeper_name(zkutil::extractZooKeeperName(zookeeper_path_))
-    , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ mode <= LoadingStrictnessLevel::CREATE, log.load()))
+    , full_zookeeper_path(zookeeper_path_)
+    , zookeeper_name(zkutil::extractZooKeeperName(full_zookeeper_path))
+    , zookeeper_path(zkutil::extractZooKeeperPath(full_zookeeper_path, /* check_starts_with_slash */ mode <= LoadingStrictnessLevel::CREATE, log.load()))
     , replica_name(replica_name_)
     , replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_)
     , reader(*this)
@@ -9242,24 +9242,6 @@ void StorageReplicatedMergeTree::createTableSharedID() const
 }
 
 
-std::optional<String> StorageReplicatedMergeTree::tryGetTableSharedIDFromCreateQuery(const IAST & create_query, const ContextPtr & global_context)
-{
-    auto zk_path = tryExtractZkPathFromCreateQuery(create_query, global_context);
-    if (!zk_path)
-        return {};
-
-    String zk_name = zkutil::extractZooKeeperName(*zk_path);
-    zk_path = zkutil::extractZooKeeperPath(*zk_path, false, nullptr);
-    zkutil::ZooKeeperPtr zookeeper = (zk_name == getDefaultZooKeeperName()) ? global_context->getZooKeeper() : global_context->getAuxiliaryZooKeeper(zk_name);
-
-    String id;
-    if (!zookeeper->tryGet(fs::path(*zk_path) / "table_shared_id", id))
-        return {};
-
-    return id;
-}
-
-
 zkutil::EphemeralNodeHolderPtr StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const
 {
     auto settings = getSettings();
@@ -10419,21 +10401,10 @@ void StorageReplicatedMergeTree::adjustCreateQueryForBackup(ASTPtr & create_quer
         auto metadata_diff = ReplicatedMergeTreeTableMetadata(*this, current_metadata).checkAndFindDiff(metadata_from_entry, current_metadata->getColumns(), getContext());
         auto adjusted_metadata = metadata_diff.getNewMetadata(columns_from_entry, getContext(), *current_metadata);
         applyMetadataChangesToCreateQuery(create_query, adjusted_metadata);
-
-        /// Check that tryGetTableSharedIDFromCreateQuery() works for this storage.
-        auto actual_table_shared_id = getTableSharedID();
-        auto expected_table_shared_id = tryGetTableSharedIDFromCreateQuery(*create_query, getContext());
-        if (actual_table_shared_id != expected_table_shared_id)
-        {
-            throw Exception(ErrorCodes::CANNOT_BACKUP_TABLE, "Table {} has its shared ID different from one from the create query: "
-                            "actual shared id = {}, expected shared id = {}, create query = {}",
-                            getStorageID().getNameForLogs(), actual_table_shared_id, expected_table_shared_id.value_or("nullopt"),
-                            create_query);
-        }
     }
     catch (...)
     {
-        /// We can continue making a backup with non-adjusted name.
+        /// We can continue making a backup with non-adjusted query.
        tryLogCurrentException(log, "Failed to adjust the create query of this table for backup");
     }
 }
@@ -10459,8 +10430,8 @@ void StorageReplicatedMergeTree::backupData(
     auto parts_backup_entries = backupParts(data_parts, /* data_path_in_backup */ "", backup_settings, read_settings, local_context);
 
     auto coordination = backup_entries_collector.getBackupCoordination();
-    String shared_id = getTableSharedID();
-    coordination->addReplicatedDataPath(shared_id, data_path_in_backup);
+
+    coordination->addReplicatedDataPath(full_zookeeper_path, data_path_in_backup);
 
     using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;
     std::vector<PartNameAndChecksum> part_names_with_hashes;
@@ -10469,7 +10440,7 @@ void StorageReplicatedMergeTree::backupData(
         part_names_with_hashes.emplace_back(PartNameAndChecksum{part_backup_entries.part_name, part_backup_entries.part_checksum});
 
     /// Send our list of part names to the coordination (to compare with other replicas).
-    coordination->addReplicatedPartNames(shared_id, getStorageID().getFullTableName(), getReplicaName(), part_names_with_hashes);
+    coordination->addReplicatedPartNames(full_zookeeper_path, getStorageID().getFullTableName(), getReplicaName(), part_names_with_hashes);
 
     /// Send a list of mutations to the coordination too (we need to find the mutations which are not finished for added part names).
     {
@@ -10511,25 +10482,25 @@ void StorageReplicatedMergeTree::backupData(
         }
 
         if (!mutation_infos.empty())
-            coordination->addReplicatedMutations(shared_id, getStorageID().getFullTableName(), getReplicaName(), mutation_infos);
+            coordination->addReplicatedMutations(full_zookeeper_path, getStorageID().getFullTableName(), getReplicaName(), mutation_infos);
         }
     }
 
     /// This task will be executed after all replicas have collected their parts and the coordination is ready to
     /// give us the final list of parts to add to the BackupEntriesCollector.
-    auto post_collecting_task = [shared_id,
+    auto post_collecting_task = [my_full_zookeeper_path = full_zookeeper_path,
                                  my_replica_name = getReplicaName(),
                                  coordination,
                                  my_parts_backup_entries = std::move(parts_backup_entries),
                                  &backup_entries_collector]()
     {
-        Strings data_paths = coordination->getReplicatedDataPaths(shared_id);
+        Strings data_paths = coordination->getReplicatedDataPaths(my_full_zookeeper_path);
         std::vector<fs::path> data_paths_fs;
         data_paths_fs.reserve(data_paths.size());
         for (const auto & data_path : data_paths)
             data_paths_fs.push_back(data_path);
 
-        Strings part_names = coordination->getReplicatedPartNames(shared_id, my_replica_name);
+        Strings part_names = coordination->getReplicatedPartNames(my_full_zookeeper_path, my_replica_name);
         std::unordered_set<String> part_names_set{part_names.begin(), part_names.end()};
 
         for (const auto & part_backup_entries : my_parts_backup_entries)
@@ -10542,7 +10513,7 @@ void StorageReplicatedMergeTree::backupData(
             }
         }
 
-        auto mutation_infos = coordination->getReplicatedMutations(shared_id, my_replica_name);
+        auto mutation_infos = coordination->getReplicatedMutations(my_full_zookeeper_path, my_replica_name);
         for (const auto & mutation_info : mutation_infos)
         {
             auto backup_entry = ReplicatedMergeTreeMutationEntry::parse(mutation_info.entry, mutation_info.id).backup();
@@ -10556,8 +10527,7 @@ void StorageReplicatedMergeTree::backupData(
 
 void StorageReplicatedMergeTree::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional<ASTs> & partitions)
 {
-    String full_zk_path = getZooKeeperName() + getZooKeeperPath();
-    if (!restorer.getRestoreCoordination()->acquireInsertingDataIntoReplicatedTable(full_zk_path))
+    if (!restorer.getRestoreCoordination()->acquireInsertingDataIntoReplicatedTable(full_zookeeper_path))
     {
         /// Other replica is already restoring the data of this table.
         /// We'll get them later due to replication, it's not necessary to read it from the backup.
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index c472c11e7f8..7f33c82e5c2 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -330,17 +330,14 @@ public:
 
     // Return default or custom zookeeper name for table
     const String & getZooKeeperName() const { return zookeeper_name; }
 
-    const String & getZooKeeperPath() const { return zookeeper_path; }
+    const String & getFullZooKeeperPath() const { return full_zookeeper_path; }
 
     // Return table id, common for different replicas
     String getTableSharedID() const override;
 
     std::map<std::string, MutationCommands> getUnfinishedMutationCommands() const override;
 
-    /// Returns the same as getTableSharedID(), but extracts it from a create query.
-    static std::optional<String> tryGetTableSharedIDFromCreateQuery(const IAST & create_query, const ContextPtr & global_context);
-
     static const String & getDefaultZooKeeperName() { return default_zookeeper_name; }
 
     /// Check if there are new broken disks and enqueue part recovery tasks.
@@ -420,9 +417,11 @@ private:
 
     bool is_readonly_metric_set = false;
 
+    const String full_zookeeper_path;
     static const String default_zookeeper_name;
     const String zookeeper_name;
     const String zookeeper_path;
+
     const String replica_name;
     const String replica_path;
 
From 6e579312633f2c0abb8784f122bfc75559a5d05a Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Thu, 25 Apr 2024 19:24:36 +0200
Subject: [PATCH 064/289] Get rid of code duplication in extractZkPathFromCreateQuery().
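
The same engine-argument parsing and macro expansion that runs on CREATE TABLE
is now also used when a backup needs the table's ZooKeeper path, instead of a
second, simplified copy of that logic. A minimal usage sketch of the new helper
(illustrative only; `context` is assumed to be a valid ContextPtr and
`create_query_ast` an already-parsed CREATE TABLE AST — neither line below is
part of this patch):

    // Hedged sketch of calling the new API. The helper returns std::nullopt
    // for non-replicated engines or when macro expansion fails.
    const auto & create = create_query_ast->as<ASTCreateQuery &>();
    std::optional<String> zk_path = extractZooKeeperPathFromReplicatedTableDef(create, context);
    if (zk_path)
        LOG_INFO(getLogger("Backup"), "Replicated table is coordinated at {}", *zk_path);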
---
 src/Backups/BackupEntriesCollector.cpp        |   4 +-
 .../extractZkPathFromCreateQuery.cpp          |  61 ---
 .../MergeTree/extractZkPathFromCreateQuery.h  |  19 -
 ...tractZooKeeperPathFromReplicatedTableDef.h |  18 +
 .../MergeTree/registerStorageMergeTree.cpp    | 401 +++++++++++-------
 src/Storages/StorageReplicatedMergeTree.cpp   |   1 -
 6 files changed, 272 insertions(+), 232 deletions(-)
 delete mode 100644 src/Storages/MergeTree/extractZkPathFromCreateQuery.cpp
 delete mode 100644 src/Storages/MergeTree/extractZkPathFromCreateQuery.h
 create mode 100644 src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h

diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp
index 136e3c49321..d91cf47c4d3 100644
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@@ -11,7 +11,7 @@
 #include
 #include
 #include
-#include <Storages/MergeTree/extractZkPathFromCreateQuery.h>
+#include <Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h>
 #include
 #include
 #include
@@ -776,7 +776,7 @@ void BackupEntriesCollector::makeBackupEntriesForTablesDefs()
         checkIsQueryCancelled();
 
         ASTPtr new_create_query = table_info.create_table_query;
-        table_info.replicated_table_zk_path = tryExtractZkPathFromCreateQuery(*new_create_query, context->getGlobalContext());
+        table_info.replicated_table_zk_path = extractZooKeeperPathFromReplicatedTableDef(new_create_query->as<ASTCreateQuery &>(), context);
         adjustCreateQueryForBackup(new_create_query, context->getGlobalContext());
         renameDatabaseAndTableNameInCreateQuery(new_create_query, renaming_map, context->getGlobalContext());
 
diff --git a/src/Storages/MergeTree/extractZkPathFromCreateQuery.cpp b/src/Storages/MergeTree/extractZkPathFromCreateQuery.cpp
deleted file mode 100644
index 8ea732b0243..00000000000
--- a/src/Storages/MergeTree/extractZkPathFromCreateQuery.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-namespace DB
-{
-
-std::optional<String> tryExtractZkPathFromCreateQuery(const IAST & create_query, const ContextPtr & global_context)
-{
-    const auto * create = create_query.as<ASTCreateQuery>();
-    if (!create || !create->storage || !create->storage->engine)
-        return {};
-
-    /// Check if the table engine is one of the ReplicatedMergeTree family.
-    const auto & ast_engine = *create->storage->engine;
-    if (!ast_engine.name.starts_with("Replicated") || !ast_engine.name.ends_with("MergeTree"))
-        return {};
-
-    /// Get the first argument.
-    const auto * ast_arguments = typeid_cast<ASTExpressionList *>(ast_engine.arguments.get());
-    if (!ast_arguments || ast_arguments->children.empty())
-        return {};
-
-    auto * ast_zk_path = typeid_cast<ASTLiteral *>(ast_arguments->children[0].get());
-    if (!ast_zk_path || (ast_zk_path->value.getType() != Field::Types::String))
-        return {};
-
-    String zk_path = ast_zk_path->value.safeGet<String>();
-
-    /// Expand macros.
-    Macros::MacroExpansionInfo info;
-    info.table_id.table_name = create->getTable();
-    info.table_id.database_name = create->getDatabase();
-    info.table_id.uuid = create->uuid;
-    auto database = DatabaseCatalog::instance().tryGetDatabase(info.table_id.database_name);
-    if (database && database->getEngineName() == "Replicated")
-    {
-        info.shard = getReplicatedDatabaseShardName(database);
-        info.replica = getReplicatedDatabaseReplicaName(database);
-    }
-
-    try
-    {
-        zk_path = global_context->getMacros()->expand(zk_path, info);
-    }
-    catch (...)
-    {
-        return {}; /// Couldn't expand macros.
-    }
-
-    return zk_path;
-}
-
-}
diff --git a/src/Storages/MergeTree/extractZkPathFromCreateQuery.h b/src/Storages/MergeTree/extractZkPathFromCreateQuery.h
deleted file mode 100644
index e22f76d2cd5..00000000000
--- a/src/Storages/MergeTree/extractZkPathFromCreateQuery.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-
-
-namespace DB
-{
-class IAST;
-class Context;
-using ContextPtr = std::shared_ptr<const Context>;
-
-/// Extracts a zookeeper path from a specified CREATE TABLE query. Returns std::nullopt if fails.
-/// The function takes the first argument of the ReplicatedMergeTree table engine and expands macros in it.
-/// It works like a part of what the create() function in registerStorageMergeTree.cpp does but in a simpler manner.
-std::optional<String> tryExtractZkPathFromCreateQuery(const IAST & create_query, const ContextPtr & global_context);
-
-}
diff --git a/src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h b/src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h
new file mode 100644
index 00000000000..1bd58392201
--- /dev/null
+++ b/src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include
+#include
+#include
+
+
+namespace DB
+{
+class ASTCreateQuery;
+class Context;
+using ContextPtr = std::shared_ptr<const Context>;
+
+/// Extracts a zookeeper path from a specified CREATE TABLE query. Returns std::nullopt if it fails.
+/// The function checks the table engine and if it is Replicated*MergeTree then it takes the first argument and expands macros in it.
+std::optional<String> extractZooKeeperPathFromReplicatedTableDef(const ASTCreateQuery & create_query, const ContextPtr & context);
+
+}
diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp
index d552a4b6fa5..9b0200d5a1c 100644
--- a/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include <Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h>
 #include
 #include
 #include
@@ -122,6 +123,248 @@ static void verifySortingKey(const KeyDescription & sorting_key)
     }
 }
 
+/// Returns whether a new syntax is used to define a table engine, i.e. MergeTree() PRIMARY KEY ... PARTITION BY ... SETTINGS ...
+/// instead of MergeTree(date, [sample_key], primary_key).
+static bool isExtendedStorageDef(const ASTCreateQuery & query)
+{
+    if (query.storage && query.storage->isExtendedStorageDefinition())
+        return true;
+
+    if (query.columns_list &&
+        ((query.columns_list->indices && !query.columns_list->indices->children.empty()) ||
+         (query.columns_list->projections && !query.columns_list->projections->children.empty())))
+    {
+        return true;
+    }
+
+    return false;
+}
+
+/// Evaluates expressions in engine arguments.
+/// In new syntax an argument can be literal or identifier or array/tuple of identifiers.
+static void evaluateEngineArgs(ASTs & engine_args, const ContextPtr & context)
+{
+    size_t arg_idx = 0;
+    try
+    {
+        for (; arg_idx < engine_args.size(); ++arg_idx)
+        {
+            auto & arg = engine_args[arg_idx];
+            auto * arg_func = arg->as<ASTFunction>();
+            if (!arg_func)
+                continue;
+
+            /// If we got ASTFunction, let's evaluate it and replace with ASTLiteral.
+            /// Do not try evaluate array or tuple, because it's array or tuple of column identifiers.
+            if (arg_func->name == "array" || arg_func->name == "tuple")
+                continue;
+            Field value = evaluateConstantExpression(arg, context).first;
+            arg = std::make_shared<ASTLiteral>(value);
+        }
+    }
+    catch (Exception & e)
+    {
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot evaluate engine argument {}: {} {}",
+                        arg_idx, e.message(), verbose_help_message);
+    }
+}
+
+/// Returns whether this is a Replicated table engine.
+static bool isReplicated(const String & engine_name)
+{
+    return engine_name.starts_with("Replicated") && engine_name.ends_with("MergeTree");
+}
+
+/// Returns the part of the name of a table engine between "Replicated" (if any) and "MergeTree".
+static std::string_view getNamePart(const String & engine_name)
+{
+    std::string_view name_part = engine_name;
+    if (name_part.starts_with("Replicated"))
+        name_part.remove_prefix(strlen("Replicated"));
+
+    if (name_part.ends_with("MergeTree"))
+        name_part.remove_suffix(strlen("MergeTree"));
+
+    return name_part;
+}
+
+/// Extracts zookeeper path and replica name from the table engine's arguments.
+/// The function can modify those arguments (that's why they're passed separately in `engine_args`) and also determines RenamingRestrictions.
+/// The function assumes the table engine is Replicated.
+static void extractZooKeeperPathAndReplicaNameFromEngineArgs(
+    const ASTCreateQuery & query,
+    const StorageID & table_id,
+    const String & engine_name,
+    ASTs & engine_args,
+    LoadingStrictnessLevel mode,
+    const ContextPtr & context,
+    String & zookeeper_path,
+    String & replica_name,
+    RenamingRestrictions & renaming_restrictions)
+{
+    chassert(isReplicated(engine_name));
+
+    zookeeper_path = "";
+    replica_name = "";
+    renaming_restrictions = RenamingRestrictions::ALLOW_ANY;
+
+    bool is_extended_storage_def = isExtendedStorageDef(query);
+
+    if (is_extended_storage_def)
+    {
+        /// Allow expressions in engine arguments.
+        /// In new syntax argument can be literal or identifier or array/tuple of identifiers.
+        evaluateEngineArgs(engine_args, context);
+    }
+
+    bool is_on_cluster = context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY;
+    bool is_replicated_database = context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY &&
+        DatabaseCatalog::instance().getDatabase(table_id.database_name)->getEngineName() == "Replicated";
+
+    /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries
+    /// and if UUID was explicitly passed in CREATE TABLE (like for ATTACH)
+    bool allow_uuid_macro = is_on_cluster || is_replicated_database || query.attach || query.has_uuid;
+
+    auto expand_macro = [&] (ASTLiteral * ast_zk_path, ASTLiteral * ast_replica_name)
+    {
+        /// Unfold {database} and {table} macro on table creation, so table can be renamed.
+        if (mode < LoadingStrictnessLevel::ATTACH)
+        {
+            Macros::MacroExpansionInfo info;
+            /// NOTE: it's not recursive
+            info.expand_special_macros_only = true;
+            info.table_id = table_id;
+            /// Avoid unfolding {uuid} macro on this step.
+            /// We did unfold it in previous versions to make moving table from Atomic to Ordinary database work correctly,
+            /// but now it's not allowed (and it was the only reason to unfold {uuid} macro).
+            info.table_id.uuid = UUIDHelpers::Nil;
+            zookeeper_path = context->getMacros()->expand(zookeeper_path, info);
+
+            info.level = 0;
+            replica_name = context->getMacros()->expand(replica_name, info);
+        }
+
+        ast_zk_path->value = zookeeper_path;
+        ast_replica_name->value = replica_name;
+
+        /// Expand other macros (such as {shard} and {replica}). We do not expand them on previous step
+        /// to make possible copying metadata files between replicas.
+        Macros::MacroExpansionInfo info;
+        info.table_id = table_id;
+        if (is_replicated_database)
+        {
+            auto database = DatabaseCatalog::instance().getDatabase(table_id.database_name);
+            info.shard = getReplicatedDatabaseShardName(database);
+            info.replica = getReplicatedDatabaseReplicaName(database);
+        }
+        if (!allow_uuid_macro)
+            info.table_id.uuid = UUIDHelpers::Nil;
+        zookeeper_path = context->getMacros()->expand(zookeeper_path, info);
+
+        info.level = 0;
+        info.table_id.uuid = UUIDHelpers::Nil;
+        replica_name = context->getMacros()->expand(replica_name, info);
+
+        /// We do not allow renaming table with these macros in metadata, because zookeeper_path will be broken after RENAME TABLE.
+        /// NOTE: it may happen if table was created by older version of ClickHouse (< 20.10) and macros was not unfolded on table creation
+        /// or if one of these macros is recursively expanded from some other macro.
+        /// Also do not allow to move table from Atomic to Ordinary database if there's {uuid} macro
+        if (info.expanded_database || info.expanded_table)
+            renaming_restrictions = RenamingRestrictions::DO_NOT_ALLOW;
+        else if (info.expanded_uuid)
+            renaming_restrictions = RenamingRestrictions::ALLOW_PRESERVING_UUID;
+    };
+
+    size_t arg_num = 0;
+    size_t arg_cnt = engine_args.size();
+
+    bool has_arguments = (arg_num + 2 <= arg_cnt);
+    bool has_valid_arguments = has_arguments && engine_args[arg_num]->as<ASTLiteral>() && engine_args[arg_num + 1]->as<ASTLiteral>();
+
+    if (has_valid_arguments)
+    {
+        /// Get path and name from engine arguments
+        auto * ast_zk_path = engine_args[arg_num]->as<ASTLiteral>();
+        if (ast_zk_path && ast_zk_path->value.getType() == Field::Types::String)
+            zookeeper_path = ast_zk_path->value.safeGet<String>();
+        else
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path in ZooKeeper must be a string literal{}", verbose_help_message);
+
+        auto * ast_replica_name = engine_args[arg_num + 1]->as<ASTLiteral>();
+        if (ast_replica_name && ast_replica_name->value.getType() == Field::Types::String)
+            replica_name = ast_replica_name->value.safeGet<String>();
+        else
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Replica name must be a string literal{}", verbose_help_message);
+
+        if (replica_name.empty())
+            throw Exception(ErrorCodes::NO_REPLICA_NAME_GIVEN, "No replica name in config{}", verbose_help_message);
+
+        expand_macro(ast_zk_path, ast_replica_name);
+    }
+    else if (is_extended_storage_def
+        && (arg_cnt == 0
+            || !engine_args[arg_num]->as<ASTLiteral>()
+            || (arg_cnt == 1 && (getNamePart(engine_name) == "Graphite"))))
+    {
+        /// Try use default values if arguments are not specified.
+        /// Note: {uuid} macro works for ON CLUSTER queries when database engine is Atomic.
+        const auto & server_settings = context->getServerSettings();
+        zookeeper_path = server_settings.default_replica_path;
+        /// TODO maybe use hostname if {replica} is not defined?
+        replica_name = server_settings.default_replica_name;
+
+        /// Modify query, so default values will be written to metadata
+        assert(arg_num == 0);
+        ASTs old_args;
+        std::swap(engine_args, old_args);
+        auto path_arg = std::make_shared<ASTLiteral>(zookeeper_path);
+        auto name_arg = std::make_shared<ASTLiteral>(replica_name);
+        auto * ast_zk_path = path_arg.get();
+        auto * ast_replica_name = name_arg.get();
+
+        expand_macro(ast_zk_path, ast_replica_name);
+
+        engine_args.emplace_back(std::move(path_arg));
+        engine_args.emplace_back(std::move(name_arg));
+        std::move(std::begin(old_args), std::end(old_args), std::back_inserter(engine_args));
+    }
+    else
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected two string literal arguments: zookeeper_path and replica_name");
+}
+
+/// Extracts a zookeeper path from a specified CREATE TABLE query. Returns std::nullopt if it fails.
+std::optional<String> extractZooKeeperPathFromReplicatedTableDef(const ASTCreateQuery & query, const ContextPtr & context)
+{
+    try
+    {
+        if (!query.storage || !query.storage->engine)
+            return {};
+
+        const String & engine_name = query.storage->engine->name;
+        if (!isReplicated(engine_name))
+            return {};
+
+        StorageID table_id{query.getDatabase(), query.getTable(), query.uuid};
+        ASTs engine_args;
+        if (query.storage->engine->arguments)
+            engine_args = query.storage->engine->arguments->children;
+        for (auto & engine_arg : engine_args)
+            engine_arg = engine_arg->clone();
+        LoadingStrictnessLevel mode = LoadingStrictnessLevel::CREATE;
+        String zookeeper_path;
+        String replica_name;
+        RenamingRestrictions renaming_restrictions;
+
+        extractZooKeeperPathAndReplicaNameFromEngineArgs(query, table_id, engine_name, engine_args, mode, context,
+                                                         zookeeper_path, replica_name, renaming_restrictions);
+
+        return zookeeper_path;
+    }
+    catch (...)
+    {
+        return {};
+    }
+}
 
 static StoragePtr create(const StorageFactory::Arguments & args)
 {
@@ -156,17 +399,12 @@ static StoragePtr create(const StorageFactory::Arguments & args)
      *  - Additional MergeTreeSettings in the SETTINGS clause;
      */
 
-    bool is_extended_storage_def = args.storage_def->isExtendedStorageDefinition()
-        || (args.query.columns_list->indices && !args.query.columns_list->indices->children.empty())
-        || (args.query.columns_list->projections && !args.query.columns_list->projections->children.empty());
+    bool is_extended_storage_def = isExtendedStorageDef(args.query);
 
     const Settings & local_settings = args.getLocalContext()->getSettingsRef();
 
-    String name_part = args.engine_name.substr(0, args.engine_name.size() - strlen("MergeTree"));
-
-    bool replicated = startsWith(name_part, "Replicated");
-    if (replicated)
-        name_part = name_part.substr(strlen("Replicated"));
+    bool replicated = isReplicated(args.engine_name);
+    std::string_view name_part = getNamePart(args.engine_name);
 
     MergeTreeData::MergingParams merging_params;
     merging_params.mode = MergeTreeData::MergingParams::Ordinary;
@@ -283,29 +521,7 @@ static StoragePtr create(const StorageFactory::Arguments & args)
     {
         /// Allow expressions in engine arguments.
         /// In new syntax argument can be literal or identifier or array/tuple of identifiers.
-        size_t arg_idx = 0;
-        try
-        {
-            for (; arg_idx < engine_args.size(); ++arg_idx)
-            {
-                auto & arg = engine_args[arg_idx];
-                auto * arg_func = arg->as<ASTFunction>();
-                if (!arg_func)
-                    continue;
-
-                /// If we got ASTFunction, let's evaluate it and replace with ASTLiteral.
-                /// Do not try evaluate array or tuple, because it's array or tuple of column identifiers.
-                if (arg_func->name == "array" || arg_func->name == "tuple")
-                    continue;
-                Field value = evaluateConstantExpression(arg, args.getLocalContext()).first;
-                arg = std::make_shared<ASTLiteral>(value);
-            }
-        }
-        catch (Exception & e)
-        {
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot evaluate engine argument {}: {} {}",
-                            arg_idx, e.message(), verbose_help_message);
-        }
+        evaluateEngineArgs(engine_args, args.getLocalContext());
     }
     else if (args.mode <= LoadingStrictnessLevel::CREATE && !local_settings.allow_deprecated_syntax_for_merge_tree)
     {
@@ -314,130 +530,17 @@ static StoragePtr create(const StorageFactory::Arguments & args)
                         "See also `allow_deprecated_syntax_for_merge_tree` setting.");
     }
 
-    /// For Replicated.
+    /// Extract zookeeper path and replica name from engine arguments.
     String zookeeper_path;
     String replica_name;
     RenamingRestrictions renaming_restrictions = RenamingRestrictions::ALLOW_ANY;
 
-    bool is_on_cluster = args.getLocalContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY;
-    bool is_replicated_database = args.getLocalContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY &&
-        DatabaseCatalog::instance().getDatabase(args.table_id.database_name)->getEngineName() == "Replicated";
-
-    /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries
-    /// and if UUID was explicitly passed in CREATE TABLE (like for ATTACH)
-    bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach || args.query.has_uuid;
-
-    auto expand_macro = [&] (ASTLiteral * ast_zk_path, ASTLiteral * ast_replica_name)
-    {
-        /// Unfold {database} and {table} macro on table creation, so table can be renamed.
-        if (args.mode < LoadingStrictnessLevel::ATTACH)
-        {
-            Macros::MacroExpansionInfo info;
-            /// NOTE: it's not recursive
-            info.expand_special_macros_only = true;
-            info.table_id = args.table_id;
-            /// Avoid unfolding {uuid} macro on this step.
-            /// We did unfold it in previous versions to make moving table from Atomic to Ordinary database work correctly,
-            /// but now it's not allowed (and it was the only reason to unfold {uuid} macro).
-            info.table_id.uuid = UUIDHelpers::Nil;
-            zookeeper_path = context->getMacros()->expand(zookeeper_path, info);
-
-            info.level = 0;
-            replica_name = context->getMacros()->expand(replica_name, info);
-        }
-
-        ast_zk_path->value = zookeeper_path;
-        ast_replica_name->value = replica_name;
-
-        /// Expand other macros (such as {shard} and {replica}). We do not expand them on previous step
-        /// to make possible copying metadata files between replicas.
-        Macros::MacroExpansionInfo info;
-        info.table_id = args.table_id;
-        if (is_replicated_database)
-        {
-            auto database = DatabaseCatalog::instance().getDatabase(args.table_id.database_name);
-            info.shard = getReplicatedDatabaseShardName(database);
-            info.replica = getReplicatedDatabaseReplicaName(database);
-        }
-        if (!allow_uuid_macro)
-            info.table_id.uuid = UUIDHelpers::Nil;
-        zookeeper_path = context->getMacros()->expand(zookeeper_path, info);
-
-        info.level = 0;
-        info.table_id.uuid = UUIDHelpers::Nil;
-        replica_name = context->getMacros()->expand(replica_name, info);
-
-        /// We do not allow renaming table with these macros in metadata, because zookeeper_path will be broken after RENAME TABLE.
-        /// NOTE: it may happen if table was created by older version of ClickHouse (< 20.10) and macros was not unfolded on table creation
-        /// or if one of these macros is recursively expanded from some other macro.
-        /// Also do not allow to move table from Atomic to Ordinary database if there's {uuid} macro
-        if (info.expanded_database || info.expanded_table)
-            renaming_restrictions = RenamingRestrictions::DO_NOT_ALLOW;
-        else if (info.expanded_uuid)
-            renaming_restrictions = RenamingRestrictions::ALLOW_PRESERVING_UUID;
-    };
-
     if (replicated)
     {
-        bool has_arguments = arg_num + 2 <= arg_cnt;
-        bool has_valid_arguments = has_arguments && engine_args[arg_num]->as<ASTLiteral>() && engine_args[arg_num + 1]->as<ASTLiteral>();
-
-        ASTLiteral * ast_zk_path;
-        ASTLiteral * ast_replica_name;
-
-        if (has_valid_arguments)
-        {
-            /// Get path and name from engine arguments
-            ast_zk_path = engine_args[arg_num]->as<ASTLiteral>();
-            if (ast_zk_path && ast_zk_path->value.getType() == Field::Types::String)
-                zookeeper_path = ast_zk_path->value.safeGet<String>();
-            else
-                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path in ZooKeeper must be a string literal{}", verbose_help_message);
-            ++arg_num;
-
-            ast_replica_name = engine_args[arg_num]->as<ASTLiteral>();
-            if (ast_replica_name && ast_replica_name->value.getType() == Field::Types::String)
-                replica_name = ast_replica_name->value.safeGet<String>();
-            else
-                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Replica name must be a string literal{}", verbose_help_message);
-
-            if (replica_name.empty())
-                throw Exception(ErrorCodes::NO_REPLICA_NAME_GIVEN, "No replica name in config{}", verbose_help_message);
-            ++arg_num;
-
-            expand_macro(ast_zk_path, ast_replica_name);
-        }
-        else if (is_extended_storage_def
-            && (arg_cnt == 0
-                || !engine_args[arg_num]->as<ASTLiteral>()
-                || (arg_cnt == 1 && merging_params.mode == MergeTreeData::MergingParams::Graphite)))
-        {
-            /// Try use default values if arguments are not specified.
-            /// Note: {uuid} macro works for ON CLUSTER queries when database engine is Atomic.
-            const auto & server_settings = args.getContext()->getServerSettings();
-            zookeeper_path = server_settings.default_replica_path;
-            /// TODO maybe use hostname if {replica} is not defined?
-            replica_name = server_settings.default_replica_name;
-
-            /// Modify query, so default values will be written to metadata
-            assert(arg_num == 0);
-            ASTs old_args;
-            std::swap(engine_args, old_args);
-            auto path_arg = std::make_shared<ASTLiteral>(zookeeper_path);
-            auto name_arg = std::make_shared<ASTLiteral>(replica_name);
-            ast_zk_path = path_arg.get();
-            ast_replica_name = name_arg.get();
-
-            expand_macro(ast_zk_path, ast_replica_name);
-
-            engine_args.emplace_back(std::move(path_arg));
-            engine_args.emplace_back(std::move(name_arg));
-            std::move(std::begin(old_args), std::end(old_args), std::back_inserter(engine_args));
-            arg_num = 2;
-            arg_cnt += 2;
-        }
-        else
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected two string literal arguments: zookeeper_path and replica_name");
+        extractZooKeeperPathAndReplicaNameFromEngineArgs(args.query, args.table_id, args.engine_name, args.engine_args, args.mode,
+                                                         args.getLocalContext(), zookeeper_path, replica_name, renaming_restrictions);
+        arg_cnt = engine_args.size(); /// Update `arg_cnt` here because extractZooKeeperPathAndReplicaNameFromEngineArgs() could add arguments.
+        arg_num = 2; /// zookeeper_path and replica_name together are always two arguments.
     }
 
     /// This merging param maybe used as part of sorting key
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 58d1846915f..0639b172d31 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -31,7 +31,6 @@
 #include
 #include
 #include
-#include <Storages/MergeTree/extractZkPathFromCreateQuery.h>
 #include
 #include
 #include
 
From faae8a4f2b683eed530b74f92ab58d1a76b5d001 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Wed, 24 Apr 2024 20:37:06 +0200
Subject: [PATCH 065/289] Add tests for backup on cluster with 2 shards and 2 replicas.
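
The test matrix covers a cluster of two shards with two replicas each, backed
up and restored as a whole. The interesting corner case is two tables whose
ZooKeeper paths both use the {uuid} macro: keying the backup coordination by
the full ZooKeeper path (see the previous commits) keeps them distinct. A
rough sketch of the property being exercised (illustrative C++, not part of
the test code; `uuid_a` and `uuid_b` stand for the UUIDs the tables get on
creation):

    // Two tables created from the same '/clickhouse/tables/{uuid}/{shard}'
    // pattern expand to different coordination keys, so their parts and
    // mutations can no longer be mixed up during a backup.
    String key_a = "/clickhouse/tables/" + toString(uuid_a) + "/1";  // table_a, shard 1
    String key_b = "/clickhouse/tables/" + toString(uuid_b) + "/1";  // table_b, shard 1
    chassert(key_a != key_b);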
---
 .../configs/cluster_2x2.xml                   |  26 +++
 .../test_backup_restore_on_cluster/test.py    |   1 -
 .../test_two_shards_two_replicas.py           | 153 ++++++++++++++++++
 3 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 tests/integration/test_backup_restore_on_cluster/configs/cluster_2x2.xml
 create mode 100644 tests/integration/test_backup_restore_on_cluster/test_two_shards_two_replicas.py

diff --git a/tests/integration/test_backup_restore_on_cluster/configs/cluster_2x2.xml b/tests/integration/test_backup_restore_on_cluster/configs/cluster_2x2.xml
new file mode 100644
index 00000000000..97e60fbbed7
--- /dev/null
+++ b/tests/integration/test_backup_restore_on_cluster/configs/cluster_2x2.xml
@@ -0,0 +1,26 @@
+<clickhouse>
+    <remote_servers>
+        <cluster_2x2>
+            <shard>
+                <replica>
+                    <host>node_1_1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node_1_2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+            <shard>
+                <replica>
+                    <host>node_2_1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node_2_2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </cluster_2x2>
+    </remote_servers>
+</clickhouse>
diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py
index d1520444df1..700ed6f15f5 100644
--- a/tests/integration/test_backup_restore_on_cluster/test.py
+++ b/tests/integration/test_backup_restore_on_cluster/test.py
@@ -41,7 +41,6 @@ node2 = cluster.add_instance(
     stay_alive=True,  # Necessary for the "test_stop_other_host_while_backup" test
 )
-
 node3 = cluster.add_instance(
     "node3",
     main_configs=main_configs,
diff --git a/tests/integration/test_backup_restore_on_cluster/test_two_shards_two_replicas.py b/tests/integration/test_backup_restore_on_cluster/test_two_shards_two_replicas.py
new file mode 100644
index 00000000000..c0e318c8bb7
--- /dev/null
+++ b/tests/integration/test_backup_restore_on_cluster/test_two_shards_two_replicas.py
@@ -0,0 +1,153 @@
+import pytest
+from helpers.cluster import ClickHouseCluster
+from helpers.test_tools import TSV
+
+
+cluster = ClickHouseCluster(__file__)
+
+main_configs = [
+    "configs/backups_disk.xml",
+    "configs/cluster_2x2.xml",
+    "configs/lesser_timeouts.xml",  # Default timeouts are quite big (a few minutes), the tests don't need them to be that big.
+]
+
+user_configs = [
+    "configs/zookeeper_retries.xml",
+]
+
+node_1_1 = cluster.add_instance(
+    "node_1_1",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "1", "shard": "1"},
+    with_zookeeper=True,
+)
+
+node_1_2 = cluster.add_instance(
+    "node_1_2",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "2", "shard": "1"},
+    with_zookeeper=True,
+)
+
+node_2_1 = cluster.add_instance(
+    "node_2_1",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "1", "shard": "2"},
+    with_zookeeper=True,
+)
+
+node_2_2 = cluster.add_instance(
+    "node_2_2",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "2", "shard": "2"},
+    with_zookeeper=True,
+)
+
+
+@pytest.fixture(scope="module", autouse=True)
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+@pytest.fixture(autouse=True)
+def drop_after_test():
+    try:
+        yield
+    finally:
+        node_1_1.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster_2x2' SYNC")
+        node_1_1.query("DROP TABLE IF EXISTS table_a ON CLUSTER 'cluster_2x2' SYNC")
+        node_1_1.query("DROP TABLE IF EXISTS table_b ON CLUSTER 'cluster_2x2' SYNC")
+
+
+backup_id_counter = 0
+
+
+def new_backup_name():
+    global backup_id_counter
+    backup_id_counter += 1
+    return f"Disk('backups', '{backup_id_counter}')"
+
+
+def test_replicated_table():
+    node_1_1.query(
+        "CREATE TABLE tbl ON CLUSTER 'cluster_2x2' ("
+        "x Int64"
+        ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/{shard}', '{replica}')"
+        "ORDER BY x"
+    )
+
+    node_1_1.query("INSERT INTO tbl VALUES (100), (200)")
+    node_2_1.query("INSERT INTO tbl VALUES (300), (400)")
+
+    backup_name = new_backup_name()
+
+    node_1_1.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster_2x2' TO {backup_name}")
+
+    node_1_1.query(f"DROP TABLE tbl ON CLUSTER 'cluster_2x2' SYNC")
+
+    node_1_1.query(f"RESTORE ALL ON CLUSTER 'cluster_2x2' FROM {backup_name}")
+
+    node_1_1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster_2x2' tbl")
+
+    assert node_1_1.query("SELECT * FROM tbl ORDER BY x") == TSV([[100], [200]])
+    assert node_1_2.query("SELECT * FROM tbl ORDER BY x") == TSV([[100], [200]])
+    assert node_2_1.query("SELECT * FROM tbl ORDER BY x") == TSV([[300], [400]])
+    assert node_2_2.query("SELECT * FROM tbl ORDER BY x") == TSV([[300], [400]])
+
+
+def test_two_tables_with_uuid_in_zk_path():
+    node_1_1.query(
+        "CREATE TABLE table_a ON CLUSTER 'cluster_2x2' ("
+        "x Int64"
+        ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')"
+        "ORDER BY x"
+    )
+
+    node_1_1.query(
+        "CREATE TABLE table_b ON CLUSTER 'cluster_2x2' ("
+        "x Int64"
+        ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')"
+        "ORDER BY x"
+    )
+
+    node_1_1.query("INSERT INTO table_a VALUES (100), (200)")
+    node_2_1.query("INSERT INTO table_a VALUES (300), (400)")
+
+    node_1_2.query("INSERT INTO table_b VALUES (500), (600)")
+    node_2_2.query("INSERT INTO table_b VALUES (700), (800)")
+
+    backup_name = new_backup_name()
+
+    node_1_1.query(
+        f"BACKUP TABLE table_a, TABLE table_b ON CLUSTER 'cluster_2x2' TO {backup_name}"
+    )
+
+    node_1_1.query(f"DROP TABLE table_a ON CLUSTER 'cluster_2x2' SYNC")
+    node_1_1.query(f"DROP TABLE table_b ON CLUSTER 'cluster_2x2' SYNC")
+
+    node_1_1.query(f"RESTORE ALL ON CLUSTER 'cluster_2x2' FROM {backup_name}")
+
ON CLUSTER 'cluster_2x2' table_a") + node_1_1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster_2x2' table_b") + + assert node_1_1.query("SELECT * FROM table_a ORDER BY x") == TSV([[100], [200]]) + assert node_1_2.query("SELECT * FROM table_a ORDER BY x") == TSV([[100], [200]]) + assert node_2_1.query("SELECT * FROM table_a ORDER BY x") == TSV([[300], [400]]) + assert node_2_2.query("SELECT * FROM table_a ORDER BY x") == TSV([[300], [400]]) + + assert node_1_1.query("SELECT * FROM table_b ORDER BY x") == TSV([[500], [600]]) + assert node_1_2.query("SELECT * FROM table_b ORDER BY x") == TSV([[500], [600]]) + assert node_2_1.query("SELECT * FROM table_b ORDER BY x") == TSV([[700], [800]]) + assert node_2_2.query("SELECT * FROM table_b ORDER BY x") == TSV([[700], [800]]) From bff72f3b2753e88aaa1b703258ad904ad11fc4ee Mon Sep 17 00:00:00 2001 From: skyoct Date: Fri, 26 Apr 2024 09:06:17 +0000 Subject: [PATCH 066/289] batter --- src/Functions/clamp.cpp | 2 +- tests/queries/0_stateless/03036_clamp.sql | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp index 3438377afbf..448711aed27 100644 --- a/src/Functions/clamp.cpp +++ b/src/Functions/clamp.cpp @@ -47,7 +47,7 @@ public: for (size_t row_num = 0; row_num < input_rows_count; ++row_num) { if (converted_columns[1]->compareAt(row_num, row_num, *converted_columns[2], 1) > 0) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} the minimum value cannot be greater than the maximum value", getName()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The minimum value cannot be greater than the maximum value for function {}", getName()); size_t best_arg = 0; if (converted_columns[1]->compareAt(row_num, row_num, *converted_columns[best_arg], 1) > 0) diff --git a/tests/queries/0_stateless/03036_clamp.sql b/tests/queries/0_stateless/03036_clamp.sql index d225be63f46..0ca1f99572a 100644 --- a/tests/queries/0_stateless/03036_clamp.sql +++ b/tests/queries/0_stateless/03036_clamp.sql @@ -10,6 +10,6 @@ select clamp(1, null, 5); select clamp(1, 6, null); select clamp(1, 5, nan); select clamp(toInt64(number), toInt64(number-1), toInt64(number+1)) from numbers(3); -select clamp(number, number-1, number+1) from numbers(3); -- { serverError 386 } -select clamp(1, 3, 2); -- { serverError 36 } +select clamp(number, number-1, number+1) from numbers(3); -- { serverError NO_COMMON_TYPE } +select clamp(1, 3, 2); -- { serverError BAD_ARGUMENTS } select clamp(1, data[1], data[2])from (select arrayJoin([[1, 2], [2,3], [3,2], [4, 4]]) as data); -- { serverError 36 } From fb375e7a74826a6741df24999936147606ea9c8d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 29 Apr 2024 14:59:31 +0200 Subject: [PATCH 067/289] Apply suggestions from code review --- src/Functions/clamp.cpp | 3 +-- tests/queries/0_stateless/03036_clamp.sql | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp index 448711aed27..bb347a575e4 100644 --- a/src/Functions/clamp.cpp +++ b/src/Functions/clamp.cpp @@ -22,8 +22,7 @@ public: static constexpr auto name = "clamp"; String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 0; } - bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 3; } bool useDefaultImplementationForConstants() const override { return true; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { 
From fb375e7a74826a6741df24999936147606ea9c8d Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Mon, 29 Apr 2024 14:59:31 +0200
Subject: [PATCH 067/289] Apply suggestions from code review

---
 src/Functions/clamp.cpp                   | 3 +--
 tests/queries/0_stateless/03036_clamp.sql | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/Functions/clamp.cpp b/src/Functions/clamp.cpp
index 448711aed27..bb347a575e4 100644
--- a/src/Functions/clamp.cpp
+++ b/src/Functions/clamp.cpp
@@ -22,8 +22,7 @@ public:
     static constexpr auto name = "clamp";
     String getName() const override { return name; }
-    size_t getNumberOfArguments() const override { return 0; }
-    bool isVariadic() const override { return true; }
+    size_t getNumberOfArguments() const override { return 3; }
     bool useDefaultImplementationForConstants() const override { return true; }
     bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
     static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionClamp>(); }

diff --git a/tests/queries/0_stateless/03036_clamp.sql b/tests/queries/0_stateless/03036_clamp.sql
index 0ca1f99572a..9973265c13b 100644
--- a/tests/queries/0_stateless/03036_clamp.sql
+++ b/tests/queries/0_stateless/03036_clamp.sql
@@ -12,4 +12,4 @@ select clamp(1, 5, nan);
 select clamp(toInt64(number), toInt64(number-1), toInt64(number+1)) from numbers(3);
 select clamp(number, number-1, number+1) from numbers(3); -- { serverError NO_COMMON_TYPE }
 select clamp(1, 3, 2); -- { serverError BAD_ARGUMENTS }
-select clamp(1, data[1], data[2])from (select arrayJoin([[1, 2], [2,3], [3,2], [4, 4]]) as data); -- { serverError 36 }
+select clamp(1, data[1], data[2])from (select arrayJoin([[1, 2], [2,3], [3,2], [4, 4]]) as data); -- { serverError BAD_ARGUMENTS }

From 155866b262e9cd3123b23522ea8bda301b514233 Mon Sep 17 00:00:00 2001
From: Max Kainov
Date: Mon, 29 Apr 2024 13:47:08 +0000
Subject: [PATCH 068/289] CI: try separate wf file for MQ

---
 .github/workflows/merge_queue.yml  | 97 ++++++++++++++++++++++++++++++
 .github/workflows/pull_request.yml | 33 +++++-----
 2 files changed, 113 insertions(+), 17 deletions(-)
 create mode 100644 .github/workflows/merge_queue.yml

diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml
new file mode 100644
index 00000000000..2b820926651
--- /dev/null
+++ b/.github/workflows/merge_queue.yml
@@ -0,0 +1,97 @@
+# yamllint disable rule:comments-indentation
+name: MergeQueueCI
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
+on: # yamllint disable-line rule:truthy
+  merge_group:
+
+jobs:
+  RunConfig:
+    runs-on: [self-hosted, style-checker-aarch64]
+    outputs:
+      data: ${{ steps.runconfig.outputs.CI_DATA }}
+    steps:
+      - name: GH event json
+        run: |
+          cat "$GITHUB_EVENT_PATH" ||:
+      - name: Check out repository code
+        uses: ClickHouse/checkout@v1
+        with:
+          clear-repository: true # to ensure correct digests
+          fetch-depth: 0 # to get version
+          filter: tree:0
+      - name: Python unit tests
+        run: |
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          echo "Testing the main ci directory"
+          python3 -m unittest discover -s . -p 'test_*.py'
+          for dir in *_lambda/; do
+            echo "Testing $dir"
+            python3 -m unittest discover -s "$dir" -p 'test_*.py'
+          done
+      - name: PrepareRunConfig
+        id: runconfig
+        run: |
+          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --outfile ${{ runner.temp }}/ci_run_data.json
+
+          echo "::group::CI configuration"
+          python3 -m json.tool ${{ runner.temp }}/ci_run_data.json
+          echo "::endgroup::"
+
+          {
+            echo 'CI_DATA<<EOF'
+            cat ${{ runner.temp }}/ci_run_data.json
+            echo 'EOF'
+          } >> "$GITHUB_OUTPUT"
+  BuildDockers:
+    needs: [RunConfig]
+    if: ${{ !failure() && !cancelled() && toJson(fromJson(needs.RunConfig.outputs.data).docker_data.missing_multi) != '[]' }}
+    uses: ./.github/workflows/reusable_docker.yml
+    with:
+      data: ${{ needs.RunConfig.outputs.data }}
+  StyleCheck:
+    needs: [RunConfig, BuildDockers]
+    if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'Style check')}}
+    uses: ./.github/workflows/reusable_test.yml
+    with:
+      test_name: Style check
+      runner_type: style-checker
+      run_command: |
+        python3 style_check.py
+      data: ${{ needs.RunConfig.outputs.data }}
+    secrets:
+      secret_envs: |
+        ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
+        ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
+        RCSK

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ ... @@
         } >> "$GITHUB_OUTPUT"
       - name: Re-create GH statuses for skipped jobs if any
-        if: ${{ github.event_name != 'merge_group' }}
         run: |
           python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ runner.temp }}/ci_run_data.json --update-gh-statuses
   BuildDockers:
@@ -83,7 +86,7 @@ jobs:
         ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
         RCSK
   FastTest:
-    needs: [RunConfig, BuildDockers]
+    needs: [RunConfig, BuildDockers, StyleCheck]
     if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'Fast test') }}
    uses: ./.github/workflows/reusable_test.yml
    with:
@@ -163,20 +166,16 @@ jobs:
     steps:
       - name: Check out repository code
         uses: ClickHouse/checkout@v1
-      - name: Check sync status
-        if: ${{ github.event_name == 'merge_group' }}
-        run: |
-          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 sync_pr.py --status
       - name: Finish label
         run: |
           cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 finish_check.py ${{ (contains(needs.*.result, 'failure') && github.event_name == 'merge_group') && '--pipeline-failure' || '' }}
-      - name: Auto merge if approved
-        if: ${{ github.event_name != 'merge_group' }}
-        run: |
-          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 merge_pr.py --check-approved
+          python3 finish_check.py
+      # FIXME: merge on approval does not work with MQ. Could be fixed by using default GH's automerge after some corrections in Mergeable Check status
+      # - name: Auto merge if approved
+      #   if: ${{ github.event_name != 'merge_group' }}
+      #   run: |
+      #     cd "$GITHUB_WORKSPACE/tests/ci"
+      #     python3 merge_pr.py --check-approved
 #############################################################################################

From 8f93df354ec51a6c8393cdeae67016dbb89aa217 Mon Sep 17 00:00:00 2001
From: HowePa <2873679104@qq.com>
Date: Tue, 30 Apr 2024 20:34:08 +0800
Subject: [PATCH 069/289] fix end symbol

---
 src/Processors/Formats/Impl/NpyOutputFormat.cpp              | 6 +++++-
 tests/queries/0_stateless/02895_npy_output_format.reference | 6 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/Processors/Formats/Impl/NpyOutputFormat.cpp b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
index b3d5042aa79..e02787b4f70 100644
--- a/src/Processors/Formats/Impl/NpyOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/NpyOutputFormat.cpp
@@ -40,7 +40,11 @@ void writeNumpyStrings(const ColumnPtr & column, size_t length, WriteBuffer & buf)
 {
     const auto * string_column = assert_cast<const ColumnString *>(column.get());
     for (size_t i = 0; i < string_column->size(); ++i)
-        buf.write(string_column->getDataAt(i).data, length);
+    {
+        auto data = string_column->getDataAt(i);
+        buf.write(data.data, data.size);
+        writeChar(0, length - data.size, buf);
+    }
 }

 }

diff --git a/tests/queries/0_stateless/02895_npy_output_format.reference b/tests/queries/0_stateless/02895_npy_output_format.reference
index b599f1dceea..77d9b55ca9f 100644
--- a/tests/queries/0_stateless/02895_npy_output_format.reference
+++ b/tests/queries/0_stateless/02895_npy_output_format.reference
@@ -52,9 +52,9 @@ array String
 [[0.1],[0.2]]
 [[0.1],[0.2]]
 [[0.1],[0.2]]
-[['abb','bbc'],['ccc','dddd']]
-[['abb','bbc'],['ccc','dddd']]
-[['abb','bbc'],['ccc','dddd']]
+[['a','bb'],['ccc','dddd']]
+[['a','bb'],['ccc','dddd']]
+[['a','bb'],['ccc','dddd']]
 array Array(Array(Array(Int8)))
 array Array(Array(Float64))
 array Array(Array(String))

From 8de80954809435e15aa65ab29fba59230bc1eae5 Mon Sep 17 00:00:00 2001
From: Blargian
Date: Thu, 2 May 2024 11:56:14 +0200
Subject: [PATCH 070/289] Update addXYZ documentation

---
 .../functions/date-time-functions.md          | 468 +++++++++++++++++-
 .../aspell-ignore/en/aspell-dict.txt          |   2 +
 2 files changed, 465 insertions(+), 5 deletions(-)

diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md
index 51f841657d9..629b3ca9305 100644
--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@@ -2557,13 +2557,27 @@ Like function `YYYYMMDDhhmmssToDate()` but produces a [DateTime64](../../sql-ref
 
 Accepts an additional, optional `precision` parameter after the `timezone` parameter.
 
-## addYears, addQuarters, addMonths, addWeeks, addDays, addHours, addMinutes, addSeconds, addMilliseconds, addMicroseconds, addNanoseconds
+## addYears
 
-These functions add units of the interval specified by the function name to a date, a date with time or a string-encoded date / date with time. A date or date with time is returned.
+Adds a specified number of years to a date, a date with time or a string-encoded date / date with time.
 
-Example:
+**Syntax**
 
-``` sql
+```sql
+addYears(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of years to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of years to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` years. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
 WITH
     toDate('2024-01-01') AS date,
     toDateTime('2024-01-01 00:00:00') AS date_time,
     '2024-01-01 00:00:00' AS date_time_string
 SELECT
+    addYears(date, 1) AS add_years_with_date,
+    addYears(date_time, 1) AS add_years_with_date_time,
+    addYears(date_time_string, 1) AS add_years_with_date_time_string
 ```
 
-``` text
+```response
 ┌─add_years_with_date─┬─add_years_with_date_time─┬─add_years_with_date_time_string─┐
 │          2025-01-01 │      2025-01-01 00:00:00 │         2025-01-01 00:00:00.000 │
 └─────────────────────┴──────────────────────────┴─────────────────────────────────┘
 ```
 
+## addQuarters
+
+Adds a specified number of quarters to a date, a date with time or a string-encoded date / date with time.
+
+**Syntax**
+
+```sql
+addQuarters(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of quarters to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of quarters to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` quarters. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDate('2024-01-01') AS date,
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addQuarters(date, 1) AS add_quarters_with_date,
+    addQuarters(date_time, 1) AS add_quarters_with_date_time,
+    addQuarters(date_time_string, 1) AS add_quarters_with_date_time_string
+```
+
+```response
+┌─add_quarters_with_date─┬─add_quarters_with_date_time─┬─add_quarters_with_date_time_string─┐
+│             2024-04-01 │         2024-04-01 00:00:00 │            2024-04-01 00:00:00.000 │
+└────────────────────────┴─────────────────────────────┴────────────────────────────────────┘
+```
+
+## addMonths
+
+Adds a specified number of months to a date, a date with time or a string-encoded date / date with time.
+
+**Syntax**
+
+```sql
+addMonths(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of months to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of months to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` months. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDate('2024-01-01') AS date,
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addMonths(date, 6) AS add_months_with_date,
+    addMonths(date_time, 6) AS add_months_with_date_time,
+    addMonths(date_time_string, 6) AS add_months_with_date_time_string
+```
+
+```response
+┌─add_months_with_date─┬─add_months_with_date_time─┬─add_months_with_date_time_string─┐
+│           2024-07-01 │       2024-07-01 00:00:00 │          2024-07-01 00:00:00.000 │
+└──────────────────────┴───────────────────────────┴──────────────────────────────────┘
+```
+
+## addWeeks
+
+Adds a specified number of weeks to a date, a date with time or a string-encoded date / date with time.
+
+**Syntax**
+
+```sql
+addWeeks(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of weeks to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of weeks to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` weeks. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDate('2024-01-01') AS date,
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addWeeks(date, 5) AS add_weeks_with_date,
+    addWeeks(date_time, 5) AS add_weeks_with_date_time,
+    addWeeks(date_time_string, 5) AS add_weeks_with_date_time_string
+```
+
+```response
+┌─add_weeks_with_date─┬─add_weeks_with_date_time─┬─add_weeks_with_date_time_string─┐
+│          2024-02-05 │      2024-02-05 00:00:00 │         2024-02-05 00:00:00.000 │
+└─────────────────────┴──────────────────────────┴─────────────────────────────────┘
+```
+
+## addDays
+
+Adds a specified number of days to a date, a date with time or a string-encoded date / date with time.
+
+**Syntax**
+
+```sql
+addDays(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of days to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of days to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` days. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDate('2024-01-01') AS date,
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addDays(date, 5) AS add_days_with_date,
+    addDays(date_time, 5) AS add_days_with_date_time,
+    addDays(date_time_string, 5) AS add_days_with_date_time_string
+```
+
+```response
+┌─add_days_with_date─┬─add_days_with_date_time─┬─add_days_with_date_time_string─┐
+│         2024-01-06 │     2024-01-06 00:00:00 │        2024-01-06 00:00:00.000 │
+└────────────────────┴─────────────────────────┴────────────────────────────────┘
+```
+
+## addHours
+
+Adds a specified number of hours to a date, a date with time or a string-encoded date / date with time.
+
+**Syntax**
+
+```sql
+addHours(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of hours to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of hours to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` hours. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDate('2024-01-01') AS date,
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addHours(date, 12) AS add_hours_with_date,
+    addHours(date_time, 12) AS add_hours_with_date_time,
+    addHours(date_time_string, 12) AS add_hours_with_date_time_string
+```
+
+```response
+┌─add_hours_with_date─┬─add_hours_with_date_time─┬─add_hours_with_date_time_string─┐
+│ 2024-01-01 12:00:00 │      2024-01-01 12:00:00 │         2024-01-01 12:00:00.000 │
+└─────────────────────┴──────────────────────────┴─────────────────────────────────┘
+```
+
+## addMinutes
+
+Adds a specified number of minutes to a date, a date with time or a string-encoded date / date with time.
+
+**Syntax**
+
+```sql
+addMinutes(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of minutes to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of minutes to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` minutes. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDate('2024-01-01') AS date,
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addMinutes(date, 20) AS add_minutes_with_date,
+    addMinutes(date_time, 20) AS add_minutes_with_date_time,
+    addMinutes(date_time_string, 20) AS add_minutes_with_date_time_string
+```
+
+```response
+┌─add_minutes_with_date─┬─add_minutes_with_date_time─┬─add_minutes_with_date_time_string─┐
+│   2024-01-01 00:20:00 │        2024-01-01 00:20:00 │           2024-01-01 00:20:00.000 │
+└───────────────────────┴────────────────────────────┴───────────────────────────────────┘
+```
+
+## addSeconds
+
+Adds a specified number of seconds to a date, a date with time or a string-encoded date / date with time.
+
+**Syntax**
+
+```sql
+addSeconds(date, x)
+```
+
+**Parameters**
+
+- `date`: Date / date with time to add specified number of seconds to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of seconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date` plus `x` seconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDate('2024-01-01') AS date,
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addSeconds(date, 30) AS add_seconds_with_date,
+    addSeconds(date_time, 30) AS add_seconds_with_date_time,
+    addSeconds(date_time_string, 30) AS add_seconds_with_date_time_string
+```
+
+```response
+┌─add_seconds_with_date─┬─add_seconds_with_date_time─┬─add_seconds_with_date_time_string─┐
+│   2024-01-01 00:00:30 │        2024-01-01 00:00:30 │           2024-01-01 00:00:30.000 │
+└───────────────────────┴────────────────────────────┴───────────────────────────────────┘
+```
+
+## addMilliseconds
+
+Adds a specified number of milliseconds to a date with time or a string-encoded date with time.
+
+**Syntax**
+
+```sql
+addMilliseconds(date_time, x)
+```
+
+**Parameters**
+
+- `date_time`: Date with time to add specified number of milliseconds to. [datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of milliseconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date_time` plus `x` milliseconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addMilliseconds(date_time, 1000) AS add_milliseconds_with_date_time,
+    addMilliseconds(date_time_string, 1000) AS add_milliseconds_with_date_time_string
+```
+
+```response
+┌─add_milliseconds_with_date_time─┬─add_milliseconds_with_date_time_string─┐
+│         2024-01-01 00:00:01.000 │                 2024-01-01 00:00:01.000 │
+└─────────────────────────────────┴────────────────────────────────────────┘
+```
+
+## addMicroseconds
+
+Adds a specified number of microseconds to a date with time or a string-encoded date with time.
+
+**Syntax**
+
+```sql
+addMicroseconds(date_time, x)
+```
+
+**Parameters**
+
+- `date_time`: Date with time to add specified number of microseconds to. [datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of microseconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date_time` plus `x` microseconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addMicroseconds(date_time, 1000000) AS add_microseconds_with_date_time,
+    addMicroseconds(date_time_string, 1000000) AS add_microseconds_with_date_time_string
+```
+
+```response
+┌─add_microseconds_with_date_time─┬─add_microseconds_with_date_time_string─┐
+│      2024-01-01 00:00:01.000000 │              2024-01-01 00:00:01.000000 │
+└─────────────────────────────────┴────────────────────────────────────────┘
+```
+
+## addNanoseconds
+
+Adds a specified number of nanoseconds to a date with time or a string-encoded date with time.
+
+**Syntax**
+
+```sql
+addNanoseconds(date_time, x)
+```
+
+**Parameters**
+
+- `date_time`: Date with time to add specified number of nanoseconds to. [datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `x`: Number of nanoseconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md).
+
+**Returned value**
+- Returns `date_time` plus `x` nanoseconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+```sql
+WITH
+    toDateTime('2024-01-01 00:00:00') AS date_time,
+    '2024-01-01 00:00:00' AS date_time_string
+SELECT
+    addNanoseconds(date_time, 1000) AS add_nanoseconds_with_date_time,
+    addNanoseconds(date_time_string, 1000) AS add_nanoseconds_with_date_time_string
+```
+
+```response
+┌─add_nanoseconds_with_date_time─┬─add_nanoseconds_with_date_time_string─┐
+│  2024-01-01 00:00:00.000001000 │          2024-01-01 00:00:00.000001000 │
+└────────────────────────────────┴───────────────────────────────────────┘
+```
+
+## addInterval
+
+Adds an interval to another interval or tuple of intervals.
+
+**Syntax**
+
+```sql
+addInterval(interval_1, interval_2)
+```
+
+**Parameters**
+
+- `interval_1`: First interval or tuple of intervals. [interval](../data-types/special-data-types/interval.md), [tuple](../data-types/tuple.md)([interval](../data-types/special-data-types/interval.md)).
+- `interval_2`: Second interval to be added. [interval](../data-types/special-data-types/interval.md).
+
+**Returned value**
+- Returns a tuple of intervals. [tuple](../data-types/tuple.md)([interval](../data-types/special-data-types/interval.md)).
+
+:::note
+If the types of the first interval (or the interval in the tuple) and the second interval are the same, they will be merged into one interval.
+:::
+
+**Example**
+
+Query:
+
+```sql
+SELECT addInterval(INTERVAL 1 DAY, INTERVAL 1 MONTH);
+SELECT addInterval((INTERVAL 1 DAY, INTERVAL 1 YEAR), INTERVAL 1 MONTH);
+SELECT addInterval(INTERVAL 2 DAY, INTERVAL 1 DAY);
+```
+
+Result:
+
+```response
+┌─addInterval(toIntervalDay(1), toIntervalMonth(1))─┐
+│ (1,1)                                             │
+└───────────────────────────────────────────────────┘
+┌─addInterval((toIntervalDay(1), toIntervalYear(1)), toIntervalMonth(1))─┐
+│ (1,1,1)                                                                │
+└────────────────────────────────────────────────────────────────────────┘
+┌─addInterval(toIntervalDay(2), toIntervalDay(1))─┐
+│ (3)                                             │
+└─────────────────────────────────────────────────┘
+```
+
+## addTupleOfIntervals
+
+Consecutively adds a tuple of intervals to a Date or a DateTime.
+
+**Syntax**
+
+```sql
+addTupleOfIntervals(date, intervals)
+```
+
+**Parameters**
+
+- `date`: Date or date with time to add `intervals` to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+- `intervals`: Tuple of intervals to add to `date`. [tuple](../data-types/tuple.md)([interval](../data-types/special-data-types/interval.md)).
+
+**Returned value**
+- Returns `date` with added `intervals`. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+
+**Example**
+
+Query:
+
+```sql
+WITH toDate('2018-01-01') AS date SELECT addTupleOfIntervals(date, (INTERVAL 1 DAY, INTERVAL 1 MONTH, INTERVAL 1 YEAR))
+```
+
+Result:
+
+```response
+┌─addTupleOfIntervals(date, (toIntervalDay(1), toIntervalMonth(1), toIntervalYear(1)))─┐
+│                                                                            2019-02-02 │
+└──────────────────────────────────────────────────────────────────────────────────────┘
+```
+
 ## subtractYears, subtractQuarters, subtractMonths, subtractWeeks, subtractDays, subtractHours, subtractMinutes, subtractSeconds, subtractMilliseconds, subtractMicroseconds, subtractNanoseconds
 
 These functions subtract units of the interval specified by the function name from a date, a date with time or a string-encoded date / date with time. A date or date with time is returned.
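For symmetry with the new `addYears` examples above, here is a minimal sketch of the subtraction counterpart (illustrative only, not part of this patch; the output follows from the documented semantics):

```sql
WITH
    toDate('2024-01-01') AS date,
    toDateTime('2024-01-01 00:00:00') AS date_time
SELECT
    subtractYears(date, 1) AS subtract_years_with_date,
    subtractYears(date_time, 1) AS subtract_years_with_date_time
```

```response
┌─subtract_years_with_date─┬─subtract_years_with_date_time─┐
│               2023-01-01 │           2023-01-01 00:00:00 │
└──────────────────────────┴───────────────────────────────┘
```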
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
index e7477ffc5e1..441df44dd3f 100644
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -1088,6 +1088,8 @@ addQuarters
 addSeconds
 addWeeks
 addYears
+addInterval
+addTupleOfIntervals
 addr
 addressToLine
 addressToLineWithInlines

From 4e6e234d24eb46d62c7e7f0e99a1ecccda5cc0f4 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Fri, 3 May 2024 12:19:57 +0200
Subject: [PATCH 071/289] Safer parent part access

---
 src/Storages/MergeTree/MergeTreeReadPoolBase.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp b/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp
index 0cbb0a86b2f..3e10285d6b0 100644
--- a/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp
+++ b/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp
@@ -113,9 +113,9 @@ MergeTreeReadTaskPtr MergeTreeReadPoolBase::createTask(
         ? std::make_unique(*read_info->shared_size_predictor)
         : nullptr; /// make a copy

-    auto get_part_name = [](const auto & task_info) -> const String &
+    auto get_part_name = [](const auto & task_info) -> String
     {
-        return task_info.data_part->isProjectionPart() ? task_info.data_part->getParentPart()->name : task_info.data_part->name;
+        return task_info.data_part->isProjectionPart() ? task_info.data_part->getParentPartName() : task_info.data_part->name;
     };

     auto extras = getExtras();

From 314573b7a66a2379aecb0b8e680d24561f00e03e Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Fri, 3 May 2024 12:40:05 +0200
Subject: [PATCH 072/289] Add debug check

---
 .../MergeTree/MergeTreeReadPoolBase.cpp       | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp b/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp
index 3e10285d6b0..c759a12e151 100644
--- a/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp
+++ b/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp
@@ -6,6 +6,11 @@
 namespace DB
 {

+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
 MergeTreeReadPoolBase::MergeTreeReadPoolBase(
     RangesInDataParts && parts_,
     VirtualFields shared_virtual_fields_,
@@ -115,7 +120,23 @@ MergeTreeReadTaskPtr MergeTreeReadPoolBase::createTask(
     auto get_part_name = [](const auto & task_info) -> String
     {
-        return task_info.data_part->isProjectionPart() ? task_info.data_part->getParentPartName() : task_info.data_part->name;
+        const auto & data_part = task_info.data_part;
+
+        if (data_part->isProjectionPart())
+        {
+            auto parent_part_name = data_part->getParentPartName();
+
+            auto parent_part = data_part->storage.getPartIfExists(
+                parent_part_name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated});
+
+            if (!parent_part)
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "Did not find parent part {} for potentially broken projection part {}",
+                    parent_part_name, data_part->getDataPartStorage().getFullPath());
+
+            return parent_part_name;
+        }
+
+        return data_part->name;
     };

     auto extras = getExtras();

From 9e670fe67cd560638ffd731ba391333bb7b78ee8 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Fri, 3 May 2024 18:38:09 +0200
Subject: [PATCH 073/289] Fix logical error during SELECT query after ALTER in
 rare case

---
 src/Storages/MergeTree/IMergeTreeReader.cpp   |   9 +-
 ...rop_column_zookeeper_on_steroids.reference |  11 ++
 ...r_add_drop_column_zookeeper_on_steroids.sh | 149 ++++++++++++++++++
 3 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.reference
 create mode 100755 tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.sh

diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp
index cf6b64aac85..54da03d1756 100644
--- a/src/Storages/MergeTree/IMergeTreeReader.cpp
+++ b/src/Storages/MergeTree/IMergeTreeReader.cpp
@@ -152,7 +152,14 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
         if (res_columns[pos] == nullptr)
             continue;

-        additional_columns.insert({res_columns[pos], name_and_type->type, name_and_type->name});
+        /// We must take column type from part if it exists. Because at the end of defaults
+        /// calculations we will materialize ALL the columns, not only missing.
+        /// If column doesn't exist in part then it will be substituted with default expression
+        const auto * column_in_part = part_columns.tryGet(name_and_type->name);
+        if (column_in_part != nullptr)
+            additional_columns.insert({res_columns[pos], column_in_part->type, name_and_type->name});
+        else
+            additional_columns.insert({res_columns[pos], name_and_type->type, name_and_type->name});
     }

     auto dag = DB::evaluateMissingDefaults(

diff --git a/tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.reference b/tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.reference
new file mode 100644
index 00000000000..ed135e928a9
--- /dev/null
+++ b/tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.reference
@@ -0,0 +1,11 @@
+Starting alters
+Finishing alters
+Equal number of columns
+Replication did not hang: synced all replicas of concurrent_alter_add_drop_steroids_
+Consistency: 1
+0
+0
+0
+0
+0
+0

diff --git a/tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.sh b/tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.sh
new file mode 100755
--- /dev/null
+++ b/tests/queries/0_stateless/03144_parallel_alter_add_drop_column_zookeeper_on_steroids.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+# Tags: zookeeper, no-parallel, no-fasttest
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
"$CURDIR"/../shell_config.sh +# shellcheck source=./replication.lib +. "$CURDIR"/replication.lib + +REPLICAS=3 + +for i in $(seq $REPLICAS); do + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS concurrent_alter_add_drop_steroids_$i" +done + + +for i in $(seq $REPLICAS); do + $CLICKHOUSE_CLIENT --query "CREATE TABLE concurrent_alter_add_drop_steroids_$i (key UInt64, value0 UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/concurrent_alter_add_drop_steroids_column', '$i') ORDER BY key SETTINGS max_replicated_mutations_in_queue = 1000, number_of_free_entries_in_pool_to_execute_mutation = 0, max_replicated_merges_in_queue = 1000, index_granularity = 8192, index_granularity_bytes = '10Mi'" +done + +$CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_alter_add_drop_steroids_1 SELECT number, number + 10 from numbers(100000)" + +for i in $(seq $REPLICAS); do + $CLICKHOUSE_CLIENT --query "SYSTEM SYNC REPLICA concurrent_alter_add_drop_steroids_$i" +done + + +function alter_thread() +{ + while true; do + REPLICA=$(($RANDOM % 3 + 1)) + ADD=$(($RANDOM % 5 + 1)) + $CLICKHOUSE_CLIENT --query "ALTER TABLE concurrent_alter_add_drop_steroids_$REPLICA ADD COLUMN value$ADD UInt32 DEFAULT 42 SETTINGS replication_alter_partitions_sync=0"; # additionaly we don't wait anything for more heavy concurrency + DROP=$(($RANDOM % 5 + 1)) + $CLICKHOUSE_CLIENT --query "ALTER TABLE concurrent_alter_add_drop_steroids_$REPLICA DROP COLUMN value$DROP SETTINGS replication_alter_partitions_sync=0"; # additionaly we don't wait anything for more heavy concurrency + sleep 0.$RANDOM + done +} + +function alter_thread_1() +{ + while true; do + REPLICA=$(($RANDOM % 3 + 1)) + ${CLICKHOUSE_CLIENT} --query "ALTER TABLE concurrent_alter_add_drop_steroids_1 MODIFY COLUMN value0 String SETTINGS mutations_sync = 0" + sleep 1.$RANDOM + ${CLICKHOUSE_CLIENT} --query "ALTER TABLE concurrent_alter_add_drop_steroids_1 MODIFY COLUMN value0 UInt8 SETTINGS mutations_sync = 0" + sleep 1.$RANDOM + done + +} + +function optimize_thread() +{ + while true; do + REPLICA=$(($RANDOM % 3 + 1)) + $CLICKHOUSE_CLIENT --query "OPTIMIZE TABLE concurrent_alter_add_drop_steroids_$REPLICA FINAL SETTINGS replication_alter_partitions_sync=0"; + sleep 0.$RANDOM + done +} + +function insert_thread() +{ + while true; do + REPLICA=$(($RANDOM % 3 + 1)) + $CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_alter_add_drop_steroids_$REPLICA VALUES($RANDOM, 7)" + sleep 0.$RANDOM + done +} + +function select_thread() +{ + while true; do + REPLICA=$(($RANDOM % 3 + 1)) + $CLICKHOUSE_CLIENT --query "SELECT * FROM merge(currentDatabase(), 'concurrent_alter_add_drop_steroids_') FORMAT Null" + sleep 0.$RANDOM + done +} + + +echo "Starting alters" +export -f alter_thread; +export -f alter_thread_1; +export -f select_thread; +export -f optimize_thread; +export -f insert_thread; + + +TIMEOUT=30 + +# Sometimes we detach and attach tables +timeout $TIMEOUT bash -c alter_thread 2> /dev/null & +timeout $TIMEOUT bash -c alter_thread 2> /dev/null & +timeout $TIMEOUT bash -c alter_thread 2> /dev/null & + +timeout $TIMEOUT bash -c alter_thread_1 2> /dev/null & +timeout $TIMEOUT bash -c alter_thread_1 2> /dev/null & +timeout $TIMEOUT bash -c alter_thread_1 2> /dev/null & + +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & + +timeout $TIMEOUT bash -c optimize_thread 2> /dev/null & +timeout $TIMEOUT bash -c optimize_thread 2> /dev/null & +timeout 
$TIMEOUT bash -c optimize_thread 2> /dev/null & + +timeout $TIMEOUT bash -c insert_thread 2> /dev/null & +timeout $TIMEOUT bash -c insert_thread 2> /dev/null & +timeout $TIMEOUT bash -c insert_thread 2> /dev/null & +timeout $TIMEOUT bash -c insert_thread 2> /dev/null & +timeout $TIMEOUT bash -c insert_thread 2> /dev/null & + +wait + +echo "Finishing alters" + +columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_steroids_1' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null) +columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_steroids_2' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null) +columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_steroids_3' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null) + +while [ "$columns1" != "$columns2" ] || [ "$columns2" != "$columns3" ]; do + columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_steroids_1' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null) + columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_steroids_2' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null) + columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_steroids_3' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null) + + sleep 1 +done + +echo "Equal number of columns" + +# This alter will finish all previous, but replica 1 maybe still not up-to-date +while [[ $(timeout 120 ${CLICKHOUSE_CLIENT} --query "ALTER TABLE concurrent_alter_add_drop_steroids_1 MODIFY COLUMN value0 String SETTINGS replication_alter_partitions_sync=2" 2>&1) ]]; do + sleep 1 +done + +check_replication_consistency "concurrent_alter_add_drop_steroids_" "count(), sum(key), sum(cityHash64(value0))" + +for i in $(seq $REPLICAS); do + $CLICKHOUSE_CLIENT --query "SYSTEM SYNC REPLICA concurrent_alter_add_drop_steroids_$i" + $CLICKHOUSE_CLIENT --query "SELECT COUNT() FROM system.mutations WHERE is_done = 0 and table = 'concurrent_alter_add_drop_steroids_$i'" + $CLICKHOUSE_CLIENT --query "SELECT * FROM system.mutations WHERE is_done = 0 and table = 'concurrent_alter_add_drop_steroids_$i'" + $CLICKHOUSE_CLIENT --query "SELECT COUNT() FROM system.replication_queue WHERE table = 'concurrent_alter_add_drop_steroids_$i'" + $CLICKHOUSE_CLIENT --query "SELECT * FROM system.replication_queue WHERE table = 'concurrent_alter_add_drop_steroids_$i' and (type = 'ALTER_METADATA' or type = 'MUTATE_PART')" + + $CLICKHOUSE_CLIENT --query "DETACH TABLE concurrent_alter_add_drop_steroids_$i" + $CLICKHOUSE_CLIENT --query "ATTACH TABLE concurrent_alter_add_drop_steroids_$i" + + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS concurrent_alter_add_drop_steroids_$i" +done From ec15298b824d6e51a7549ec79260a0c349174d98 Mon Sep 17 00:00:00 2001 From: Maxim Alexeev Date: Sun, 28 Apr 2024 14:20:59 +0300 Subject: [PATCH 074/289] First working prototype --- src/Core/Settings.h | 1 + src/Interpreters/GraceHashJoin.cpp | 1 + src/Interpreters/HashJoin.cpp | 37 +++++++++-- src/Interpreters/HashJoin.h | 14 ++++ src/Interpreters/JoinedTables.cpp | 2 +- src/Interpreters/TableJoin.cpp | 4 +- src/Interpreters/TableJoin.h | 10 ++- src/Interpreters/TemporaryDataOnDisk.cpp | 85 ++++++++++++++++-------- src/Interpreters/TemporaryDataOnDisk.h | 27 +++++++- src/Planner/PlannerJoinTree.cpp | 2 +- 10 files 
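The essence of the race this stress test hammers on can be sketched in a few statements (a hypothetical minimal repro distilled from the test; table and column names follow the test script):

```sql
-- A replica keeps parts whose physical column type lags behind the
-- declared type while the mutation is still in flight:
ALTER TABLE concurrent_alter_add_drop_steroids_1
    MODIFY COLUMN value0 String SETTINGS mutations_sync = 0;

-- A concurrent read over such parts has to evaluate missing defaults;
-- before this fix it used the declared type (String) even for a column
-- physically stored as UInt8, which could trip a logical error:
SELECT * FROM merge(currentDatabase(), 'concurrent_alter_add_drop_steroids_') FORMAT Null;
```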

From ec15298b824d6e51a7549ec79260a0c349174d98 Mon Sep 17 00:00:00 2001
From: Maxim Alexeev
Date: Sun, 28 Apr 2024 14:20:59 +0300
Subject: [PATCH 074/289] First working prototype

---
 src/Core/Settings.h                      |  1 +
 src/Interpreters/GraceHashJoin.cpp       |  1 +
 src/Interpreters/HashJoin.cpp            | 37 +++++++++--
 src/Interpreters/HashJoin.h              | 14 ++++
 src/Interpreters/JoinedTables.cpp        |  2 +-
 src/Interpreters/TableJoin.cpp           |  4 +-
 src/Interpreters/TableJoin.h             | 10 ++-
 src/Interpreters/TemporaryDataOnDisk.cpp | 85 ++++++++++++++--------
 src/Interpreters/TemporaryDataOnDisk.h   | 27 +++++++-
 src/Planner/PlannerJoinTree.cpp          |  2 +-
 10 files changed, 147 insertions(+), 36 deletions(-)

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 7ed175d38f4..5dc80912ebb 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -463,6 +463,7 @@ class IColumn;
     M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, "Split right-hand joining data in blocks of specified size. It's a portion of data indexed by min-max values and possibly unloaded on disk.", 0) \
     M(UInt64, join_on_disk_max_files_to_merge, 64, "For MergeJoin on disk set how much files it's allowed to sort simultaneously. Then this value bigger then more memory used and then less disk I/O needed. Minimum is 2.", 0) \
     M(UInt64, max_rows_in_set_to_optimize_join, 0, "Maximal size of the set to filter joined tables by each other row sets before joining. 0 - disable.", 0) \
+    M(UInt64, cross_join_max_bytes_inmemory, 1000000, "Maximal bytes to complete cross join in memory.", 0) \
     \
     M(Bool, compatibility_ignore_collation_in_create_table, true, "Compatibility ignore collation in create table", 0) \
     \

diff --git a/src/Interpreters/GraceHashJoin.cpp b/src/Interpreters/GraceHashJoin.cpp
index 53d1f48c291..ba51953e979 100644
--- a/src/Interpreters/GraceHashJoin.cpp
+++ b/src/Interpreters/GraceHashJoin.cpp
@@ -657,6 +657,7 @@ IBlocksStreamPtr GraceHashJoin::getDelayedBlocks()

 GraceHashJoin::InMemoryJoinPtr GraceHashJoin::makeInMemoryJoin(const String & bucket_id, size_t reserve_num)
 {
+    LOG_INFO(log, "GraceHashJoin\n");
     return std::make_unique<HashJoin>(table_join, right_sample_block, any_take_last_row, reserve_num, bucket_id);
 }

diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp
index 9b05edbce36..e194df4e660 100644
--- a/src/Interpreters/HashJoin.cpp
+++ b/src/Interpreters/HashJoin.cpp
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include "Core/Joins.h"
 #include
 #include

@@ -249,11 +250,13 @@ HashJoin::HashJoin(std::shared_ptr<TableJoin> table_join_, const Block & right_s
     , instance_id(instance_id_)
     , asof_inequality(table_join->getAsofInequality())
     , data(std::make_shared<RightTableData>())
+    , tmp_data(std::make_unique<TemporaryDataOnDisk>(table_join_->getTempDataOnDisk()))
     , right_sample_block(right_sample_block_)
     , max_joined_block_rows(table_join->maxJoinedBlockRows())
     , instance_log_id(!instance_id_.empty() ? "(" + instance_id_ + ") " : "")
     , log(getLogger("HashJoin"))
 {
+    LOG_INFO(log, "KEK CONSTRUCTOR {}\n", reserve_num);
     LOG_TRACE(log, "{}Keys: {}, datatype: {}, kind: {}, strictness: {}, right header: {}",
         instance_log_id, TableJoin::formatClauses(table_join->getClauses(), true), data->type, kind, strictness, right_sample_block.dumpStructure());

@@ -827,6 +830,16 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)
     if (shrink_blocks)
         block_to_save = block_to_save.shrinkToFit();

+
+    if (kind == JoinKind::Cross)
+    {
+        if (tmp_stream == nullptr)
+        {
+            tmp_stream = &tmp_data->createStream(right_sample_block);
+        }
+        tmp_stream->write(block_to_save);
+    }
+
     size_t total_rows = 0;
     size_t total_bytes = 0;
     {
@@ -928,6 +941,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)

         if (!flag_per_row && !is_inserted)
         {
+            LOG_INFO(log, "LOL\n\n\n\n\n\n");
             LOG_TRACE(log, "Skipping inserting block with {} rows", rows);
             data->blocks_allocated_size -= stored_block->allocatedBytes();
             data->blocks.pop_back();
@@ -944,7 +958,6 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)

     shrinkStoredBlocksToFit(total_bytes);

-
     return table_join->sizeLimits().check(total_rows, total_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
 }

@@ -2275,13 +2288,13 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed)
     for (size_t left_row = start_left_row; left_row < rows_left; ++left_row)
     {
         size_t block_number = 0;
-        for (const Block & compressed_block_right : data->blocks)
+
+        auto process_right_block = [&](const Block & block_right)
         {
             ++block_number;
             if (block_number < start_right_block)
-                continue;
+                return;

-            auto block_right = compressed_block_right.decompress();
             size_t rows_right = block_right.rows();
             rows_added += rows_right;

@@ -2294,6 +2307,22 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed)
                 const IColumn & column_right = *block_right.getByPosition(col_num).column;
                 dst_columns[num_existing_columns + col_num]->insertRangeFrom(column_right, 0, rows_right);
             }
+        };
+
+        if (tmp_stream)
+        {
+            tmp_stream->finishWritingAsyncSafe();
+            auto reader = tmp_stream->getReadStream();
+            while (auto block_right = reader->read())
+            {
+                process_right_block(block_right);
+            }
+        }
+
+        for (const Block & compressed_block_right : data->blocks)
+        {
+            auto block_right = compressed_block_right.decompress();
+            process_right_block(block_right);
         }

         start_right_block = 0;

diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h
index b7f41a7eb6b..1eb4d0f8030 100644
--- a/src/Interpreters/HashJoin.h
+++ b/src/Interpreters/HashJoin.h
@@ -26,6 +26,7 @@
 #include

 #include
+#include

 namespace DB
 {
@@ -414,6 +415,15 @@ public:

     void setMaxJoinedBlockRows(size_t value) { max_joined_block_rows = value; }

+    TemporaryFileStream* getStreamForCrossJoin()
+    {
+        auto streams = tmp_data->getStreams();
+        assert(streams.size() <= 1);
+        if (streams.empty())
+            return nullptr;
+        return streams[0];
+    }
+
 private:
     friend class NotJoinedHash;

@@ -442,6 +452,10 @@ private:
     RightTableDataPtr data;
     std::vector<Sizes> key_sizes;

+    /// Needed to do external cross join
+    TemporaryDataOnDiskPtr tmp_data;
+    TemporaryFileStream* tmp_stream{nullptr};
+
     /// Block with columns from the right-side table.
     Block right_sample_block;
     /// Block with columns from the right-side table except key columns.
diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp
index 5b549a19083..457ed3ef4a6 100644
--- a/src/Interpreters/JoinedTables.cpp
+++ b/src/Interpreters/JoinedTables.cpp
@@ -310,7 +310,7 @@ std::shared_ptr<TableJoin> JoinedTables::makeTableJoin(const ASTSelectQuery & se
     auto settings = context->getSettingsRef();
     MultiEnum<JoinAlgorithm> join_algorithm = settings.join_algorithm;
     bool try_use_direct_join = join_algorithm.isSet(JoinAlgorithm::DIRECT) || join_algorithm.isSet(JoinAlgorithm::DEFAULT);
-    auto table_join = std::make_shared<TableJoin>(settings, context->getGlobalTemporaryVolume());
+    auto table_join = std::make_shared<TableJoin>(settings, context->getGlobalTemporaryVolume(), context->getTempDataOnDisk());

     const ASTTablesInSelectQueryElement * ast_join = select_query_.join();
     const auto & table_to_join = ast_join->table_expression->as();

diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp
index 48d59dd3b24..7ceb90704f3 100644
--- a/src/Interpreters/TableJoin.cpp
+++ b/src/Interpreters/TableJoin.cpp
@@ -103,7 +103,7 @@ bool forAllKeys(OnExpr & expressions, Func callback)
 }

-TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_)
+TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_, TemporaryDataOnDiskScopePtr tmp_data_)
     : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode})
     , default_max_bytes(settings.default_max_bytes_in_join)
     , join_use_nulls(settings.join_use_nulls)
@@ -111,12 +111,14 @@ TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_)
     , cross_join_min_bytes_to_compress(settings.cross_join_min_bytes_to_compress)
     , max_joined_block_rows(settings.max_joined_block_size_rows)
     , join_algorithm(settings.join_algorithm)
+    , cross_join_max_bytes_inmemory(settings.cross_join_max_bytes_inmemory)
     , partial_merge_join_rows_in_right_blocks(settings.partial_merge_join_rows_in_right_blocks)
     , partial_merge_join_left_table_buffer_bytes(settings.partial_merge_join_left_table_buffer_bytes)
     , max_files_to_merge(settings.join_on_disk_max_files_to_merge)
     , temporary_files_codec(settings.temporary_files_codec)
     , max_memory_usage(settings.max_memory_usage)
     , tmp_volume(tmp_volume_)
+    , tmp_data(tmp_data_)
 {
 }

diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h
index 88905edd3e8..d6920d6afbf 100644
--- a/src/Interpreters/TableJoin.h
+++ b/src/Interpreters/TableJoin.h
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
@@ -144,6 +145,7 @@ private:
     const UInt64 cross_join_min_bytes_to_compress = 10000;
     const size_t max_joined_block_rows = 0;
     std::vector join_algorithm;
+    const UInt64 cross_join_max_bytes_inmemory = 1000000;
     const size_t partial_merge_join_rows_in_right_blocks = 0;
     const size_t partial_merge_join_left_table_buffer_bytes = 0;
     const size_t max_files_to_merge = 0;
@@ -188,6 +190,8 @@ private:

     VolumePtr tmp_volume;

+    TemporaryDataOnDiskScopePtr tmp_data;
+
     std::shared_ptr right_storage_join;

     std::shared_ptr right_kv_storage;
@@ -233,7 +237,7 @@ private:

 public:
     TableJoin() = default;

-    TableJoin(const Settings & settings, VolumePtr tmp_volume_);
+    TableJoin(const Settings & settings, VolumePtr tmp_volume_, TemporaryDataOnDiskScopePtr tmp_data_);

     /// for StorageJoin
     TableJoin(SizeLimits limits, bool use_nulls, JoinKind kind, JoinStrictness strictness,
@@ -259,6 +263,8 @@ public:

     VolumePtr getGlobalTemporaryVolume() { return tmp_volume; }

+    TemporaryDataOnDiskScopePtr getTempDataOnDisk() { return tmp_data; }
+
     ActionsDAGPtr createJoinedBlockActions(ContextPtr context) const;

     const std::vector & getEnabledJoinAlgorithms() const { return join_algorithm; }
@@ -275,6 +281,8 @@ public:

     bool allowParallelHashJoin() const;

+    UInt64 crossJoinMaxBytesInmemory() const { return cross_join_max_bytes_inmemory; }
+
     bool joinUseNulls() const { return join_use_nulls; }

     UInt64 crossJoinMinRowsToCompress() const { return cross_join_min_rows_to_compress; }

diff --git a/src/Interpreters/TemporaryDataOnDisk.cpp b/src/Interpreters/TemporaryDataOnDisk.cpp
index 9a237738b3e..e50d501f6d3 100644
--- a/src/Interpreters/TemporaryDataOnDisk.cpp
+++ b/src/Interpreters/TemporaryDataOnDisk.cpp
@@ -1,12 +1,11 @@
+#include
+#include
 #include
 #include
-#include
 #include
-#include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -14,6 +13,7 @@
 #include
 #include
+#include "Common/Exception.h"

 namespace ProfileEvents
 {
@@ -224,33 +224,26 @@ struct TemporaryFileStream::OutputWriter
     bool finalized = false;
 };

-struct TemporaryFileStream::InputReader
+InputReader::InputReader(const String & path, const Block & header_, size_t size)
+    : in_file_buf(path, size ? std::min(DBMS_DEFAULT_BUFFER_SIZE, size) : DBMS_DEFAULT_BUFFER_SIZE)
+    , in_compressed_buf(in_file_buf)
+    , in_reader(in_compressed_buf, header_, DBMS_TCP_PROTOCOL_VERSION)
 {
-    InputReader(const String & path, const Block & header_, size_t size = 0)
-        : in_file_buf(path, size ? std::min(DBMS_DEFAULT_BUFFER_SIZE, size) : DBMS_DEFAULT_BUFFER_SIZE)
-        , in_compressed_buf(in_file_buf)
-        , in_reader(in_compressed_buf, header_, DBMS_TCP_PROTOCOL_VERSION)
-    {
-        LOG_TEST(getLogger("TemporaryFileStream"), "Reading {} from {}", header_.dumpStructure(), path);
-    }
+    LOG_TEST(getLogger("TemporaryFileStream"), "Reading {} from {}", header_.dumpStructure(), path);
+}

-    explicit InputReader(const String & path, size_t size = 0)
-        : in_file_buf(path, size ? std::min(DBMS_DEFAULT_BUFFER_SIZE, size) : DBMS_DEFAULT_BUFFER_SIZE)
-        , in_compressed_buf(in_file_buf)
-        , in_reader(in_compressed_buf, DBMS_TCP_PROTOCOL_VERSION)
-    {
-        LOG_TEST(getLogger("TemporaryFileStream"), "Reading from {}", path);
-    }
+InputReader::InputReader(const String & path, size_t size)
+    : in_file_buf(path, size ? std::min(DBMS_DEFAULT_BUFFER_SIZE, size) : DBMS_DEFAULT_BUFFER_SIZE)
+    , in_compressed_buf(in_file_buf)
+    , in_reader(in_compressed_buf, DBMS_TCP_PROTOCOL_VERSION)
+{
+    LOG_TEST(getLogger("TemporaryFileStream"), "Reading from {}", path);
+}

-    Block read()
-    {
-        return in_reader.read();
-    }
-
-    ReadBufferFromFile in_file_buf;
-    CompressedReadBuffer in_compressed_buf;
-    NativeReader in_reader;
-};
+Block InputReader::read()
+{
+    return in_reader.read();
+}

 TemporaryFileStream::TemporaryFileStream(TemporaryFileOnDiskHolder file_, const Block & header_, TemporaryDataOnDisk * parent_)
     : parent(parent_)
@@ -310,6 +303,20 @@ TemporaryFileStream::Stat TemporaryFileStream::finishWriting()
     return stat;
 }

+TemporaryFileStream::Stat TemporaryFileStream::finishWritingAsyncSafe()
+{
+    if (!writing_finished.load(std::memory_order_relaxed))
+    {
+        std::lock_guard lock(finish_writing);
+        if (!writing_finished.load())
+        {
+            return finishWriting();
+        }
+        writing_finished.store(true);
+    }
+    return stat;
+}
+
 bool TemporaryFileStream::isWriteFinished() const
 {
     assert(in_reader == nullptr || out_writer == nullptr);
@@ -324,6 +331,12 @@ Block TemporaryFileStream::read()
     if (isEof())
         return {};

+    if (auto type = read_type.exchange(1); type == 2)
+    {
+        read_type.store(2);
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Different type of reading was requested earlier");
+    }
+
     if (!in_reader)
     {
         in_reader = std::make_unique<InputReader>(getPath(), header, getSize());
@@ -334,10 +347,28 @@ Block TemporaryFileStream::read()
     {
         /// finalize earlier to release resources, do not wait for the destructor
         this->release();
+        in_reader.reset();
     }
     return block;
 }

+std::unique_ptr<InputReader> TemporaryFileStream::getReadStream()
+{
+    if (!isWriteFinished())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing has been not finished");
+
+    if (isEof())
+        return nullptr;
+
+    if (auto type = read_type.exchange(2); type == 1)
+    {
+        read_type.store(1);
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Different type of reading was requested earlier");
+    }
+
+    return std::make_unique<InputReader>(getPath(), header, getSize());
+}
+
 void TemporaryFileStream::updateAllocAndCheck()
 {
     assert(out_writer);

diff --git a/src/Interpreters/TemporaryDataOnDisk.h b/src/Interpreters/TemporaryDataOnDisk.h
index 40100a62b44..792988e94bd 100644
--- a/src/Interpreters/TemporaryDataOnDisk.h
+++ b/src/Interpreters/TemporaryDataOnDisk.h
@@ -1,7 +1,11 @@
 #pragma once

+#include
 #include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -130,6 +134,19 @@ private:
     typename CurrentMetrics::Metric current_metric_scope = CurrentMetrics::TemporaryFilesUnknown;
 };

+struct InputReader
+{
+    InputReader(const String & path, const Block & header_, size_t size = 0);
+
+    explicit InputReader(const String & path, size_t size = 0);
+
+    Block read();
+
+    ReadBufferFromFile in_file_buf;
+    CompressedReadBuffer in_compressed_buf;
+    NativeReader in_reader;
+};
+
 /*
  * Data can be written into this stream and then read.
  * After finish writing, call `finishWriting` and then `read` to read the data.
@@ -154,8 +171,11 @@ public:
     void flush();

     Stat finishWriting();
+    Stat finishWritingAsyncSafe();
     bool isWriteFinished() const;

+    std::unique_ptr<InputReader> getReadStream();
+
     Block read();

     String getPath() const;
@@ -184,10 +204,15 @@ private:

     Stat stat;

+    /// 0 - means that we haven't requested any read, 1 - read from function TemporaryFileStream::read, 2 - read via getReadStream
+    std::atomic_char read_type{0};
+
+    mutable std::mutex finish_writing;
+    std::atomic_bool writing_finished{false};
+
     struct OutputWriter;
     std::unique_ptr<OutputWriter> out_writer;

-    struct InputReader;
     std::unique_ptr<InputReader> in_reader;
 };

diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp
index 514c19b0f89..3467299812a 100644
--- a/src/Planner/PlannerJoinTree.cpp
+++ b/src/Planner/PlannerJoinTree.cpp
@@ -1196,7 +1196,7 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_
         }
     }

-    auto table_join = std::make_shared<TableJoin>(settings, query_context->getGlobalTemporaryVolume());
+    auto table_join = std::make_shared<TableJoin>(settings, query_context->getGlobalTemporaryVolume(), query_context->getTempDataOnDisk());
     table_join->getTableJoin() = join_node.toASTTableJoin()->as();

     if (join_constant)
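A small sketch of where the new setting surfaces (hypothetical session; in this first prototype the setting is added and plumbed through `TableJoin`, but the diff shown here does not yet consult it to gate the spill):

```sql
-- Intended in-memory budget for the right side of a CROSS JOIN before
-- the temporary-file stream takes over (default value from this patch):
SET cross_join_max_bytes_inmemory = 1000000;

SELECT count()
FROM numbers(1000) AS l
CROSS JOIN numbers(1000) AS r;
```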
"TabSeparatedRaw" : "TabSeparated", register_func); registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); + if (is_raw) + registerWithNamesAndTypes("Raw", register_func); } } @@ -433,6 +435,8 @@ void registerTSVSchemaReader(FormatFactory & factory) registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); + if (is_raw) + registerWithNamesAndTypes("Raw", register_func); } } @@ -506,8 +510,12 @@ void registerFileSegmentationEngineTabSeparated(FormatFactory & factory) registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + if (is_raw) + registerWithNamesAndTypes("Raw", register_func); markFormatWithNamesAndTypesSupportsSamplingColumns(is_raw ? "TSVRaw" : "TSV", factory); markFormatWithNamesAndTypesSupportsSamplingColumns(is_raw ? "TabSeparatedRaw" : "TabSeparated", factory); + if (is_raw) + markFormatWithNamesAndTypesSupportsSamplingColumns("Raw", factory); } // We can use the same segmentation engine for TSKV. diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp index a4a5aea26cb..c8384c09be6 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp @@ -95,7 +95,10 @@ void registerOutputFormatTabSeparated(FormatFactory & factory) registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); if (is_raw) + { registerWithNamesAndTypes("LineAsString", register_func); + registerWithNamesAndTypes("Raw", register_func); + } } } diff --git a/tests/queries/0_stateless/00397_tsv_format_synonym.reference b/tests/queries/0_stateless/00397_tsv_format_synonym.reference index c4a86983be3..c91169a06fa 100644 --- a/tests/queries/0_stateless/00397_tsv_format_synonym.reference +++ b/tests/queries/0_stateless/00397_tsv_format_synonym.reference @@ -28,3 +28,6 @@ UInt8 String String 1 hello world 2 hello world 3 hello world +1 hello world +2 hello world +3 hello world diff --git a/tests/queries/0_stateless/00397_tsv_format_synonym.sql b/tests/queries/0_stateless/00397_tsv_format_synonym.sql index 8c69a795857..51283c6ced9 100644 --- a/tests/queries/0_stateless/00397_tsv_format_synonym.sql +++ b/tests/queries/0_stateless/00397_tsv_format_synonym.sql @@ -9,3 +9,4 @@ SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TSVWithN SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TabSeparatedRaw; SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TSVRaw; +SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT Raw; From ab7d457c51d92123d1845486a2f3243324f06503 Mon Sep 17 00:00:00 2001 From: Yohann Jardin Date: Mon, 6 May 2024 11:35:48 +0200 Subject: [PATCH 076/289] Update ErrorCodes for functions using NUMBER_OF_ARGUMENTS_DOESNT_MATCH --- src/AggregateFunctions/AggregateFunctionAggThrow.cpp | 4 ++-- .../AggregateFunctionGroupArrayInsertAt.cpp | 3 ++- src/AggregateFunctions/AggregateFunctionMLMethod.cpp | 7 ++++--- src/AggregateFunctions/AggregateFunctionMannWhitney.cpp | 4 ++-- src/AggregateFunctions/AggregateFunctionQuantile.cpp | 4 ++-- .../AggregateFunctionQuantileBFloat16Weighted.cpp | 4 ++-- src/AggregateFunctions/AggregateFunctionQuantileDD.cpp | 4 ++-- 
.../AggregateFunctionQuantileExactHigh.cpp | 4 ++-- .../AggregateFunctionQuantileExactInclusive.cpp | 4 ++-- .../AggregateFunctionQuantileTiming.cpp | 4 ++-- src/AggregateFunctions/AggregateFunctionTopK.cpp | 4 ++-- src/AggregateFunctions/AggregateFunctionWelchTTest.cpp | 4 ++-- .../Combinators/AggregateFunctionArray.cpp | 4 ++-- src/AggregateFunctions/Combinators/AggregateFunctionIf.h | 4 ++-- src/Functions/Kusto/KqlArraySort.cpp | 4 ++-- src/Functions/MultiSearchFirstPositionImpl.h | 4 ++-- src/Functions/MultiSearchImpl.h | 4 ++-- src/Functions/array/arrayEnumerateExtended.h | 4 ++-- src/Functions/array/arrayEnumerateRanked.h | 7 ++++--- src/Functions/array/arrayFold.cpp | 6 +++--- src/Functions/array/arrayReduce.cpp | 4 ++-- src/Functions/array/arrayZip.cpp | 4 ++-- src/Functions/concat.cpp | 6 +++--- src/Functions/generateULID.cpp | 4 ++-- src/Functions/jsonMergePatch.cpp | 4 ++-- src/Functions/nested.cpp | 9 +++++---- src/Functions/now64.cpp | 4 ++-- src/Functions/nowInBlock.cpp | 4 ++-- src/Functions/parseTimeDelta.cpp | 7 ++++--- src/Functions/pointInPolygon.cpp | 4 ++-- src/TableFunctions/TableFunctionValues.cpp | 4 ++-- 31 files changed, 73 insertions(+), 68 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionAggThrow.cpp b/src/AggregateFunctions/AggregateFunctionAggThrow.cpp index 27432bc94ba..ebf6cabbcfd 100644 --- a/src/AggregateFunctions/AggregateFunctionAggThrow.cpp +++ b/src/AggregateFunctions/AggregateFunctionAggThrow.cpp @@ -16,7 +16,7 @@ struct Settings; namespace ErrorCodes { extern const int AGGREGATE_FUNCTION_THROW; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } namespace @@ -116,7 +116,7 @@ void registerAggregateFunctionAggThrow(AggregateFunctionFactory & factory) if (parameters.size() == 1) throw_probability = parameters[0].safeGet(); else if (parameters.size() > 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} cannot have more than one parameter", name); + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} cannot have more than one parameter", name); return std::make_shared(argument_types, parameters, throw_probability); }); diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.cpp b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.cpp index 60e8df64283..8674aac2e90 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.cpp @@ -27,6 +27,7 @@ struct Settings; namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; extern const int TOO_LARGE_ARRAY_SIZE; extern const int CANNOT_CONVERT_TYPE; extern const int ILLEGAL_TYPE_OF_ARGUMENT; @@ -74,7 +75,7 @@ public: if (!params.empty()) { if (params.size() > 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at most two parameters.", getName()); + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at most two parameters.", getName()); default_value = params[0]; diff --git a/src/AggregateFunctions/AggregateFunctionMLMethod.cpp b/src/AggregateFunctions/AggregateFunctionMLMethod.cpp index d2178437770..730b5cfa5cc 100644 --- a/src/AggregateFunctions/AggregateFunctionMLMethod.cpp +++ b/src/AggregateFunctions/AggregateFunctionMLMethod.cpp @@ -22,7 +22,8 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern 
const int LOGICAL_ERROR; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } namespace @@ -34,12 +35,12 @@ namespace const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) { if (parameters.size() > 4) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at most four parameters: " "learning_rate, l2_regularization_coef, mini-batch size and weights_updater method", name); if (argument_types.size() < 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at least two arguments: target and model's parameters", name); for (size_t i = 0; i < argument_types.size(); ++i) diff --git a/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp b/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp index e7bc5df335f..d185058dbd0 100644 --- a/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp +++ b/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp @@ -21,7 +21,7 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; extern const int BAD_ARGUMENTS; } @@ -141,7 +141,7 @@ public: : IAggregateFunctionDataHelper ({arguments}, {}, createResultType()) { if (params.size() > 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require two parameter or less", getName()); + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} require two parameter or less", getName()); if (params.empty()) { diff --git a/src/AggregateFunctions/AggregateFunctionQuantile.cpp b/src/AggregateFunctions/AggregateFunctionQuantile.cpp index 4d37ec69d26..f72b28030c3 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantile.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantile.cpp @@ -14,7 +14,7 @@ struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NOT_IMPLEMENTED; } @@ -118,7 +118,7 @@ AggregateFunctionPtr createAggregateFunctionQuantile( const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { if (argument_types.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one argument", name); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at least one argument", name); const DataTypePtr & argument_type = argument_types[0]; WhichDataType which(argument_type); diff --git a/src/AggregateFunctions/AggregateFunctionQuantileBFloat16Weighted.cpp b/src/AggregateFunctions/AggregateFunctionQuantileBFloat16Weighted.cpp index 6d881b77c16..b0a39b1fdab 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileBFloat16Weighted.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileBFloat16Weighted.cpp @@ -12,7 +12,7 @@ struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -27,7 
+27,7 @@ AggregateFunctionPtr createAggregateFunctionQuantile( const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { if (argument_types.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one argument", name); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at least one argument", name); const DataTypePtr & argument_type = argument_types[0]; WhichDataType which(argument_type); diff --git a/src/AggregateFunctions/AggregateFunctionQuantileDD.cpp b/src/AggregateFunctions/AggregateFunctionQuantileDD.cpp index f3d6b26ee75..7b1ae43038a 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileDD.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileDD.cpp @@ -13,7 +13,7 @@ struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -29,7 +29,7 @@ AggregateFunctionPtr createAggregateFunctionQuantile( const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { if (argument_types.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one argument", name); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at least one argument", name); const DataTypePtr & argument_type = argument_types[0]; WhichDataType which(argument_type); diff --git a/src/AggregateFunctions/AggregateFunctionQuantileExactHigh.cpp b/src/AggregateFunctions/AggregateFunctionQuantileExactHigh.cpp index b44df755158..6fa4158076c 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileExactHigh.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileExactHigh.cpp @@ -13,7 +13,7 @@ struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -29,7 +29,7 @@ AggregateFunctionPtr createAggregateFunctionQuantile( const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { if (argument_types.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one argument", name); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at least one argument", name); const DataTypePtr & argument_type = argument_types[0]; WhichDataType which(argument_type); diff --git a/src/AggregateFunctions/AggregateFunctionQuantileExactInclusive.cpp b/src/AggregateFunctions/AggregateFunctionQuantileExactInclusive.cpp index d8287be86ca..bed5cf95bd8 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileExactInclusive.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileExactInclusive.cpp @@ -13,7 +13,7 @@ struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -28,7 +28,7 @@ AggregateFunctionPtr createAggregateFunctionQuantile( const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { if (argument_types.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one argument", name); + throw 
Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at least one argument", name); const DataTypePtr & argument_type = argument_types[0]; WhichDataType which(argument_type); diff --git a/src/AggregateFunctions/AggregateFunctionQuantileTiming.cpp b/src/AggregateFunctions/AggregateFunctionQuantileTiming.cpp index 1bb77892c0d..e293fc6e59d 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileTiming.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileTiming.cpp @@ -13,7 +13,7 @@ struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -28,7 +28,7 @@ AggregateFunctionPtr createAggregateFunctionQuantile( const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { if (argument_types.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one argument", name); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires at least one argument", name); const DataTypePtr & argument_type = argument_types[0]; WhichDataType which(argument_type); diff --git a/src/AggregateFunctions/AggregateFunctionTopK.cpp b/src/AggregateFunctions/AggregateFunctionTopK.cpp index d0e4f507d46..26f756abe18 100644 --- a/src/AggregateFunctions/AggregateFunctionTopK.cpp +++ b/src/AggregateFunctions/AggregateFunctionTopK.cpp @@ -35,7 +35,7 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } @@ -467,7 +467,7 @@ AggregateFunctionPtr createAggregateFunctionTopK(const std::string & name, const if (!params.empty()) { if (params.size() > 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Aggregate function '{}' requires three parameters or less", name); threshold = applyVisitor(FieldVisitorConvertToNumber(), params[0]); diff --git a/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp b/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp index 849f81279e7..7f1c4861fbc 100644 --- a/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp +++ b/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp @@ -7,7 +7,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } namespace DB @@ -80,7 +80,7 @@ AggregateFunctionPtr createAggregateFunctionWelchTTest( assertBinary(name, argument_types); if (parameters.size() > 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires zero or one parameter.", name); + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} requires zero or one parameter.", name); if (!isNumber(argument_types[0]) || !isNumber(argument_types[1])) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} only supports numerical types", name); diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp b/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp index a54092f0bcb..6b60faadf07 100644 --- a/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp +++ b/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp @@ -9,7 +9,7 @@ 
struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -26,7 +26,7 @@ public: DataTypes transformArguments(const DataTypes & arguments) const override { if (arguments.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "-Array aggregate functions require at least one argument"); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "-Array aggregate functions require at least one argument"); DataTypes nested_arguments; for (const auto & type : arguments) diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionIf.h b/src/AggregateFunctions/Combinators/AggregateFunctionIf.h index a893fc91780..f57cfa41752 100644 --- a/src/AggregateFunctions/Combinators/AggregateFunctionIf.h +++ b/src/AggregateFunctions/Combinators/AggregateFunctionIf.h @@ -18,7 +18,7 @@ struct Settings; namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -42,7 +42,7 @@ public: , nested_func(nested), num_arguments(types.size()) { if (num_arguments == 0) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require at least one argument", getName()); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Aggregate function {} require at least one argument", getName()); only_null_condition = types.back()->onlyNull(); diff --git a/src/Functions/Kusto/KqlArraySort.cpp b/src/Functions/Kusto/KqlArraySort.cpp index 5be36328cc3..ac301c474e8 100644 --- a/src/Functions/Kusto/KqlArraySort.cpp +++ b/src/Functions/Kusto/KqlArraySort.cpp @@ -11,7 +11,7 @@ namespace DB { namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; } @@ -35,7 +35,7 @@ public: { if (arguments.empty()) throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} needs at least one argument; passed {}.", getName(), arguments.size()); diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 99dd3f9d394..aca96cabf04 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -10,7 +10,7 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } template @@ -37,7 +37,7 @@ struct MultiSearchFirstPositionImpl { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. 
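The comment above carries the real constraint behind these two hunks: Volnitsky multi-search stores the index of the matched pattern in a single byte, so at most 255 needles can take part in one search pass, and exceeding that is reported as an argument-count error (now `TOO_MANY_ARGUMENTS_FOR_FUNCTION`). A minimal self-contained sketch of that guard, using illustrative names rather than the real ClickHouse API:

```cpp
#include <cstddef>
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

// Sketch only: a Volnitsky-style multi-search keeps the pattern index in one
// byte, so no more than 255 needles may take part in a single search pass.
void checkNeedleCount(const std::vector<std::string> & needles)
{
    constexpr std::size_t max_needles = std::numeric_limits<std::uint8_t>::max();
    if (needles.size() > max_needles)
        throw std::invalid_argument(
            "too many needles: passed " + std::to_string(needles.size())
            + ", should be at most " + std::to_string(max_needles));
}
```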
if (needles_arr.size() > std::numeric_limits<UInt8>::max()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at most {}", name, std::to_string(needles_arr.size()), std::to_string(std::numeric_limits<UInt8>::max())); diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index fb7d56f302a..d3d1680481f 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -10,7 +10,7 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } template @@ -37,7 +37,7 @@ struct MultiSearchImpl { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. if (needles_arr.size() > std::numeric_limits<UInt8>::max()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at most {}", name, needles_arr.size(), std::to_string(std::numeric_limits<UInt8>::max())); diff --git a/src/Functions/array/arrayEnumerateExtended.h b/src/Functions/array/arrayEnumerateExtended.h index cf38afcfa5a..67c0981108f 100644 --- a/src/Functions/array/arrayEnumerateExtended.h +++ b/src/Functions/array/arrayEnumerateExtended.h @@ -18,7 +18,7 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int SIZES_OF_ARRAYS_DONT_MATCH; @@ -43,7 +43,7 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (arguments.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at least 1.", getName(), arguments.size()); diff --git a/src/Functions/array/arrayEnumerateRanked.h b/src/Functions/array/arrayEnumerateRanked.h index 04fa305368d..ad325fe542a 100644 --- a/src/Functions/array/arrayEnumerateRanked.h +++ b/src/Functions/array/arrayEnumerateRanked.h @@ -59,7 +59,8 @@ namespace DB { namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int SIZES_OF_ARRAYS_DONT_MATCH; } @@ -101,7 +102,7 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { if (arguments.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at least 1.", getName(), arguments.size()); @@ -238,7 +239,7 @@ ColumnPtr FunctionArrayEnumerateRankedExtended::executeImpl( } if (offsets_by_depth.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "No arrays passed to function {}", getName()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No arrays passed to function {}", getName()); auto res_nested = ColumnUInt32::create(); diff --git a/src/Functions/array/arrayFold.cpp b/src/Functions/array/arrayFold.cpp index 63c14f475fc..5170c9a5b5f 100644 --- a/src/Functions/array/arrayFold.cpp +++ b/src/Functions/array/arrayFold.cpp @@ -14,7 +14,7 @@ namespace ErrorCodes { extern
const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int SIZES_OF_ARRAYS_DONT_MATCH; extern const int TYPE_MISMATCH; } @@ -41,7 +41,7 @@ public: void getLambdaArgumentTypes(DataTypes & arguments) const override { if (arguments.size() < 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires as arguments a lambda function, at least one array and an accumulator", getName()); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} requires as arguments a lambda function, at least one array and an accumulator", getName()); DataTypes accumulator_and_array_types(arguments.size() - 1); accumulator_and_array_types[0] = arguments.back(); @@ -64,7 +64,7 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { if (arguments.size() < 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires as arguments a lambda function, at least one array and an accumulator", getName()); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} requires as arguments a lambda function, at least one array and an accumulator", getName()); const auto * lambda_function_type = checkAndGetDataType(arguments[0].type.get()); if (!lambda_function_type) diff --git a/src/Functions/array/arrayReduce.cpp b/src/Functions/array/arrayReduce.cpp index d47d1ae98cc..d70b8b40233 100644 --- a/src/Functions/array/arrayReduce.cpp +++ b/src/Functions/array/arrayReduce.cpp @@ -21,7 +21,7 @@ namespace DB namespace ErrorCodes { extern const int SIZES_OF_ARRAYS_DONT_MATCH; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int BAD_ARGUMENTS; @@ -73,7 +73,7 @@ DataTypePtr FunctionArrayReduce::getReturnTypeImpl(const ColumnsWithTypeAndName /// (possibly with parameters in parentheses, for example: "quantile(0.99)"). if (arguments.size() < 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at least 2.", getName(), arguments.size()); diff --git a/src/Functions/array/arrayZip.cpp b/src/Functions/array/arrayZip.cpp index 44c323e3fe3..6c6fff5926b 100644 --- a/src/Functions/array/arrayZip.cpp +++ b/src/Functions/array/arrayZip.cpp @@ -14,7 +14,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int SIZES_OF_ARRAYS_DONT_MATCH; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_COLUMN; } @@ -39,7 +39,7 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { if (arguments.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} needs at least one argument; passed {}." 
, getName(), arguments.size()); DataTypes arguments_types; diff --git a/src/Functions/concat.cpp b/src/Functions/concat.cpp index c75a806559c..68cfcdb8d90 100644 --- a/src/Functions/concat.cpp +++ b/src/Functions/concat.cpp @@ -16,7 +16,7 @@ namespace DB { namespace ErrorCodes { -extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; } using namespace GatherUtils; @@ -48,7 +48,7 @@ public: { if (arguments.size() < 2) throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at least 2", getName(), arguments.size()); @@ -225,7 +225,7 @@ public: { if (arguments.empty()) throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at least 1.", getName(), arguments.size()); diff --git a/src/Functions/generateULID.cpp b/src/Functions/generateULID.cpp index 9c5c9403185..f2f2d8ae3b9 100644 --- a/src/Functions/generateULID.cpp +++ b/src/Functions/generateULID.cpp @@ -17,7 +17,7 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } class FunctionGenerateULID : public IFunction @@ -45,7 +45,7 @@ public: { if (arguments.size() > 1) throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be 0 or 1.", getName(), arguments.size()); diff --git a/src/Functions/jsonMergePatch.cpp b/src/Functions/jsonMergePatch.cpp index 65946721432..a83daacdbf6 100644 --- a/src/Functions/jsonMergePatch.cpp +++ b/src/Functions/jsonMergePatch.cpp @@ -25,7 +25,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -53,7 +53,7 @@ namespace DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { if (arguments.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least one argument.", getName()); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} requires at least one argument.", getName()); for (const auto & arg : arguments) if (!isString(arg.type)) diff --git a/src/Functions/nested.cpp b/src/Functions/nested.cpp index 679bb4f73d8..bdaf57d65c9 100644 --- a/src/Functions/nested.cpp +++ b/src/Functions/nested.cpp @@ -18,9 +18,10 @@ namespace DB namespace ErrorCodes { + extern const int BAD_ARGUMENTS; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SIZES_OF_ARRAYS_DONT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; } namespace @@ -64,19 +65,19 @@ public: { size_t arguments_size = arguments.size(); if (arguments_size < 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be at least 2", getName(), arguments_size); Names nested_names = extractNestedNames(arguments[0].column); if (nested_names.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::BAD_ARGUMENTS, "First argument 
for function {} must be constant column with array of strings", getName()); if (nested_names.size() != arguments_size - 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Size of nested names array for function {} does not match arrays arguments size. Actual {}. Expected {}", getName(), nested_names.size(), diff --git a/src/Functions/now64.cpp b/src/Functions/now64.cpp index 0f1e8a04236..d6f8474c984 100644 --- a/src/Functions/now64.cpp +++ b/src/Functions/now64.cpp @@ -18,7 +18,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int CANNOT_CLOCK_GETTIME; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; } namespace @@ -128,7 +128,7 @@ public: if (arguments.size() > 2) { - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Arguments size of function {} should be 0, or 1, or 2", getName()); + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Arguments size of function {} should be 0, or 1, or 2", getName()); } if (!arguments.empty()) { diff --git a/src/Functions/nowInBlock.cpp b/src/Functions/nowInBlock.cpp index 0d5f9c45780..74f420986c8 100644 --- a/src/Functions/nowInBlock.cpp +++ b/src/Functions/nowInBlock.cpp @@ -12,7 +12,7 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -63,7 +63,7 @@ public: { if (arguments.size() > 1) { - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Arguments size of function {} should be 0 or 1", getName()); + throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Arguments size of function {} should be 0 or 1", getName()); } if (arguments.size() == 1 && !isStringOrFixedString(arguments[0].type)) { diff --git a/src/Functions/parseTimeDelta.cpp b/src/Functions/parseTimeDelta.cpp index 7743a0cb664..44eeb1a289f 100644 --- a/src/Functions/parseTimeDelta.cpp +++ b/src/Functions/parseTimeDelta.cpp @@ -11,7 +11,8 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; + extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int BAD_ARGUMENTS; } @@ -117,14 +118,14 @@ namespace { if (arguments.empty()) throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be 1.", getName(), arguments.size()); if (arguments.size() > 1) throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION, "Number of arguments for function {} doesn't match: passed {}, should be 1.", getName(), arguments.size()); diff --git a/src/Functions/pointInPolygon.cpp b/src/Functions/pointInPolygon.cpp index 0e4467a8210..55f89b71d6f 100644 --- a/src/Functions/pointInPolygon.cpp +++ b/src/Functions/pointInPolygon.cpp @@ -37,7 +37,7 @@ namespace DB { namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int BAD_ARGUMENTS; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; @@ -87,7 +87,7 @@ public: { if (arguments.size() < 2) { - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least 2 arguments", getName()); + throw 
Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} requires at least 2 arguments", getName()); } /** We allow function invocation in one of the following forms: diff --git a/src/TableFunctions/TableFunctionValues.cpp b/src/TableFunctions/TableFunctionValues.cpp index 7b2a61c25eb..4b56fa57091 100644 --- a/src/TableFunctions/TableFunctionValues.cpp +++ b/src/TableFunctions/TableFunctionValues.cpp @@ -25,7 +25,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } @@ -112,7 +112,7 @@ void TableFunctionValues::parseArguments(const ASTPtr & ast_function, ContextPtr ASTs & args = args_func.at(0)->children; if (args.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires at least 1 argument", getName()); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Table function '{}' requires at least 1 argument", getName()); const auto & literal = args[0]->as(); String value; From cb4f78d9d83de9ec29569652a0d4e6b826e9c0fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Mon, 6 May 2024 13:04:28 +0300 Subject: [PATCH 077/289] Allow to create MaterializedMySQL database without connection --- .../MySQL/DatabaseMaterializedMySQL.cpp | 5 +--- .../materialized_with_ddl.py | 26 +++++++++++++++++++ .../test_materialized_mysql_database/test.py | 8 ++++++ 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp index d8360a24bcb..6d89cc23590 100644 --- a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp +++ b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp @@ -81,12 +81,9 @@ LoadTaskPtr DatabaseMaterializedMySQL::startupDatabaseAsync(AsyncLoader & async_ base->goals(), TablesLoaderBackgroundStartupPoolId, fmt::format("startup MaterializedMySQL database {}", getDatabaseName()), - [this, mode] (AsyncLoader &, const LoadJobPtr &) + [this] (AsyncLoader &, const LoadJobPtr &) { LOG_TRACE(log, "Starting MaterializeMySQL database"); - if (mode < LoadingStrictnessLevel::FORCE_ATTACH) - materialize_thread.assertMySQLAvailable(); - materialize_thread.startSynchronization(); started_up = true; }); diff --git a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py index 57f2ccd720d..dcb2546bad3 100644 --- a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py @@ -3413,3 +3413,29 @@ def gtid_after_attach_test(clickhouse_node, mysql_node, replication): interval_seconds=1, retry_count=300, ) + + +def mysql_create_database_without_connection(clickhouse_node, mysql_node, service_name): + mysql_node.query("DROP DATABASE IF EXISTS create_without_connection") + clickhouse_node.query("DROP DATABASE IF EXISTS create_without_connection") + mysql_node.query("CREATE DATABASE create_without_connection") + mysql_node.query( + "CREATE TABLE create_without_connection.test ( `id` int(11) NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB;" + ) + + clickhouse_node.cluster.pause_container(service_name) + + clickhouse_node.query( + "CREATE DATABASE create_without_connection ENGINE = 
MaterializedMySQL('{}:3306', 'create_without_connection', 'root', 'clickhouse') SETTINGS max_wait_time_when_mysql_unavailable=-1".format( + service_name + ) + ) + + clickhouse_node.cluster.unpause_container(service_name) + mysql_node.alloc_connection() + + check_query( + clickhouse_node, + "SHOW TABLES FROM create_without_connection FORMAT TSV", + "test\n", + ) diff --git a/tests/integration/test_materialized_mysql_database/test.py b/tests/integration/test_materialized_mysql_database/test.py index 57e496fe737..080a850a8c6 100644 --- a/tests/integration/test_materialized_mysql_database/test.py +++ b/tests/integration/test_materialized_mysql_database/test.py @@ -721,3 +721,11 @@ def test_binlog_client(started_cluster, started_mysql_8_0, replication): materialized_with_ddl.gtid_after_attach_test( node_db, started_mysql_8_0, replication ) + + +def test_create_database_without_mysql_connection( + started_cluster, started_mysql_8_0, clickhouse_node: ClickHouseInstance +): + materialized_with_ddl.mysql_create_database_without_connection( + clickhouse_node, started_mysql_8_0, "mysql80" + ) From 48d6c3760d3890280ce4308e9d51afe9b281232d Mon Sep 17 00:00:00 2001 From: serxa Date: Mon, 6 May 2024 12:05:37 +0000 Subject: [PATCH 078/289] workaround for `oklch()` inside canvas bug for firefox --- programs/server/dashboard.html | 50 +++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 901211e8ad9..b21d4b86314 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -538,9 +538,57 @@ let params = default_params; /// Palette generation for charts function generatePalette(numColors) { + // oklch() does not work in firefox<=125 inside element so we convert it back to rgb for now. + // Based on https://github.com/color-js/color.js/blob/main/src/spaces/oklch.js + const multiplyMatrices = (A, B) => { + return [ + A[0]*B[0] + A[1]*B[1] + A[2]*B[2], + A[3]*B[0] + A[4]*B[1] + A[5]*B[2], + A[6]*B[0] + A[7]*B[1] + A[8]*B[2] + ]; + } + + const oklch2oklab = ([l, c, h]) => [ + l, + isNaN(h) ? 0 : c * Math.cos(h * Math.PI / 180), + isNaN(h) ? 0 : c * Math.sin(h * Math.PI / 180) + ] + + const srgbLinear2rgb = rgb => rgb.map(c => + Math.abs(c) > 0.0031308 ? + (c < 0 ? -1 : 1) * (1.055 * (Math.abs(c) ** (1 / 2.4)) - 0.055) : + 12.92 * c + ) + + const oklab2xyz = lab => { + const LMSg = multiplyMatrices([ + 1, 0.3963377773761749, 0.2158037573099136, + 1, -0.1055613458156586, -0.0638541728258133, + 1, -0.0894841775298119, -1.2914855480194092, + ], lab) + const LMS = LMSg.map(val => val ** 3) + return multiplyMatrices([ + 1.2268798758459243, -0.5578149944602171, 0.2813910456659647, + -0.0405757452148008, 1.1122868032803170, -0.0717110580655164, + -0.0763729366746601, -0.4214933324022432, 1.5869240198367816 + ], LMS) + } + + const xyz2rgbLinear = xyz => { + return multiplyMatrices([ + 3.2409699419045226, -1.537383177570094, -0.4986107602930034, + -0.9692436362808796, 1.8759675015077202, 0.04155505740717559, + 0.05563007969699366, -0.20397695888897652, 1.0569715142428786 + ], xyz) + } + + const oklch2rgb = lch => srgbLinear2rgb(xyz2rgbLinear(oklab2xyz(oklch2oklab(lch)))) + palette = []; for (let i = 0; i < numColors; i++) { - palette.push(`oklch(${theme != 'dark' ? 0.75 : 0.5}, 0.15, ${360 * i / numColors})`); + //palette.push(`oklch(${theme != 'dark' ? 0.75 : 0.5}, 0.15, ${360 * i / numColors})`); + let rgb = oklch2rgb([theme != 'dark' ? 
0.75 : 0.5, 0.15, 360 * i / numColors]); + palette.push(`rgb(${rgb[0] * 255}, ${rgb[1] * 255}, ${rgb[2] * 255})`); } return palette; } From 731d05491cf44d8356f1d6971883004a862fcd0d Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Mon, 6 May 2024 12:08:02 +0000 Subject: [PATCH 079/289] simplify estimation of number of objects in bucket --- src/Storages/StorageS3.cpp | 33 ++++++++++++++------------------- src/Storages/StorageS3.h | 1 - 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index cb5734cfe0c..8a4e30fed1d 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -485,12 +485,15 @@ StorageS3Source::KeyWithInfoPtr StorageS3Source::DisclosedGlobIterator::next(siz size_t StorageS3Source::DisclosedGlobIterator::estimatedKeysCount() { - return pimpl->objectsCount(); -} - -bool StorageS3Source::DisclosedGlobIterator::hasMore() -{ - return pimpl->hasMore(); + if (pimpl->hasMore()) + { + /// 1000 files were listed, and we cannot make any estimation of _how many more_ there are (because we list bucket lazily); + /// If there are more objects in the bucket, limiting the number of streams is the last thing we may want to do + /// as it would lead to serious (up to times) reading performance degradation. + return std::numeric_limits::max(); + } + else + return pimpl->objectsCount(); } class StorageS3Source::KeysIterator::Impl @@ -1285,21 +1288,13 @@ void ReadFromStorageS3Step::initializePipeline(QueryPipelineBuilder & pipeline, createIterator(nullptr); size_t estimated_keys_count = iterator_wrapper->estimatedKeysCount(); - const auto glob_iter = std::dynamic_pointer_cast(iterator_wrapper); - - if (!(glob_iter && glob_iter->hasMore())) + if (estimated_keys_count > 1) + num_streams = std::min(num_streams, estimated_keys_count); + else { - if (estimated_keys_count > 1) - num_streams = std::min(num_streams, estimated_keys_count); - else - { - /// The amount of keys (zero) was probably underestimated. We will keep one stream for this particular case. - num_streams = 1; - } + /// The amount of keys (zero) was probably underestimated. We will keep one stream for this particular case. + num_streams = 1; } - /// OTHERWISE, 1000 files were listed, but we cannot make any estimation of _how many_ there are (because we list bucket lazily); - /// If there are more objects in the bucket, limiting the number of streams is the last thing we may want to do - /// as it would lead to serious (up to times) reading performance degradation. const size_t max_threads = context->getSettingsRef().max_threads; const size_t max_parsing_threads = num_streams >= max_threads ? 
1 : (max_threads / std::max(num_streams, 1ul)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index b841e973a9b..c8ab28fb20e 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -83,7 +83,6 @@ public: KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT size_t estimatedKeysCount() override; - bool hasMore(); private: class Impl; From 9d55bc82d6609633a135a5044f05aadeaff21755 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 6 May 2024 16:19:57 +0200 Subject: [PATCH 080/289] Revert "Revert "Do not remove server constants from GROUP BY key for secondary query."" --- src/Planner/PlannerExpressionAnalysis.cpp | 6 ++++-- .../03095_group_by_server_constants_bug.reference | 1 + .../0_stateless/03095_group_by_server_constants_bug.sql | 5 +++++ 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/03095_group_by_server_constants_bug.reference create mode 100644 tests/queries/0_stateless/03095_group_by_server_constants_bug.sql diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index ad8db83d66c..d7fa270a643 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -85,6 +85,8 @@ std::optional analyzeAggregation(const QueryTreeNodeP bool group_by_use_nulls = planner_context->getQueryContext()->getSettingsRef().group_by_use_nulls && (query_node.isGroupByWithGroupingSets() || query_node.isGroupByWithRollup() || query_node.isGroupByWithCube()); + bool is_secondary_query = planner_context->getQueryContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; + if (query_node.hasGroupBy()) { if (query_node.isGroupByWithGroupingSets()) @@ -100,7 +102,7 @@ std::optional analyzeAggregation(const QueryTreeNodeP auto is_constant_key = grouping_set_key_node->as() != nullptr; group_by_with_constant_keys |= is_constant_key; - if (is_constant_key && !aggregates_descriptions.empty()) + if (!is_secondary_query && is_constant_key && !aggregates_descriptions.empty()) continue; auto expression_dag_nodes = actions_visitor.visit(before_aggregation_actions, grouping_set_key_node); @@ -152,7 +154,7 @@ std::optional analyzeAggregation(const QueryTreeNodeP auto is_constant_key = group_by_key_node->as() != nullptr; group_by_with_constant_keys |= is_constant_key; - if (is_constant_key && !aggregates_descriptions.empty()) + if (!is_secondary_query && is_constant_key && !aggregates_descriptions.empty()) continue; auto expression_dag_nodes = actions_visitor.visit(before_aggregation_actions, group_by_key_node); diff --git a/tests/queries/0_stateless/03095_group_by_server_constants_bug.reference b/tests/queries/0_stateless/03095_group_by_server_constants_bug.reference new file mode 100644 index 00000000000..80ab3c879bb --- /dev/null +++ b/tests/queries/0_stateless/03095_group_by_server_constants_bug.reference @@ -0,0 +1 @@ +r1 2 diff --git a/tests/queries/0_stateless/03095_group_by_server_constants_bug.sql b/tests/queries/0_stateless/03095_group_by_server_constants_bug.sql new file mode 100644 index 00000000000..9f9fda1ef62 --- /dev/null +++ b/tests/queries/0_stateless/03095_group_by_server_constants_bug.sql @@ -0,0 +1,5 @@ +SELECT serverUUID() AS s, count() FROM remote('127.0.0.{1,2}', system.one) GROUP BY s format Null; + +select getMacro('replica') as s, count() from remote('127.0.0.{1,2}', system.one) group by s; + +select uptime() as s, count() FROM remote('127.0.0.{1,2}', system.one) group by s format Null; From 
c0d2a99a9e5cb02de730fc237166c41b79d84034 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Mon, 6 May 2024 16:51:23 +0200 Subject: [PATCH 081/289] Update date-time-functions.md --- .../functions/date-time-functions.md | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 629b3ca9305..493cc9b7648 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -2564,16 +2564,16 @@ Adds a specified number of years to a date, a date with time or a string-encoded **Syntax** ```sql -addYears(date, x) +addYears(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of years to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of years to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of years to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of years to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` years. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` years. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2601,16 +2601,16 @@ Adds a specified number of quarters to a date, a date with time or a string-enco **Syntax** ```sql -addQuarters(date, x) +addQuarters(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of quarters to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of quarters to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of quarters to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of quarters to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` quarters. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` quarters. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2638,16 +2638,16 @@ Adds a specified number of months to a date, a date with time or a string-encode **Syntax** ```sql -addMonths(date, x) +addMonths(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of months to. 
[date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of months to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of months to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of months to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` months. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` months. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2675,16 +2675,16 @@ Adds a specified number of weeks to a date, a date with time or a string-encoded **Syntax** ```sql -addWeeks(date, x) +addWeeks(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of weeks to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of weeks to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of weeks to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of weeks to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` weeks. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` weeks. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2712,16 +2712,16 @@ Adds a specified number of days to a date, a date with time or a string-encoded **Syntax** ```sql -addDays(date, x) +addDays(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of days to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of days to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of days to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of days to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` days. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` days. 
[Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2749,16 +2749,16 @@ Adds a specified number of hours to a date, a date with time or a string-encoded **Syntax** ```sql -addHours(date, x) +addHours(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of hours to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of hours to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of hours to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of hours to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` hours. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` hours. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2786,16 +2786,16 @@ Adds a specified number of minutes to a date, a date with time or a string-encod **Syntax** ```sql -addMinutes(date, x) +addMinutes(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of minutes to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of minutes to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of minutes to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of minutes to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` minutes. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` minutes. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2823,16 +2823,16 @@ Adds a specified number of seconds to a date, a date with time or a string-encod **Syntax** ```sql -addSeconds(date, x) +addSeconds(date, num) ``` **Parameters** -- `date`: Date / date with time to add specified number of seconds to. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of seconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date`: Date / date with time to add specified number of seconds to. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `num`: Number of seconds to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date` plus `x` seconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date` plus `num` seconds. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2860,16 +2860,16 @@ Adds a specified number of milliseconds to a date with time or a string-encoded **Syntax** ```sql -addMilliseconds(date_time, x) +addMilliseconds(date_time, num) ``` **Parameters** -- `date_time`: Date with time to add specified number of milliseconds to. [datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of milliseconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date_time`: Date with time to add specified number of milliseconds to. [DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of milliseconds to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date_time` plus `x` milliseconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date_time` plus `num` milliseconds. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2895,16 +2895,16 @@ Adds a specified number of microseconds to a date with time or a string-encoded **Syntax** ```sql -addMicroseconds(date_time, x) +addMicroseconds(date_time, num) ``` **Parameters** -- `date_time`: Date with time to add specified number of microseconds to. [datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of microseconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date_time`: Date with time to add specified number of microseconds to. [DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md). +- `num`: Number of microseconds to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** -- Returns `date_time` plus `x` microseconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md). +- Returns `date_time` plus `num` microseconds. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md). **Example** @@ -2930,16 +2930,16 @@ Adds a specified number of nanoseconds to a date with time or a string-encoded **Syntax** ```sql -addNanoseconds(date_time, x) +addNanoseconds(date_time, num) ``` **Parameters** -- `date_time`: Date with time to add specified number of nanoseconds to. [datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md), [String](../data-types/string.md). -- `x`: Number of nanoseconds to add. [(U)Int*](../data-types/int-uint.md), [float*](../data-types/float.md). +- `date_time`: Date with time to add specified number of nanoseconds to.
[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md), [String](../data-types/string.md).
+- `num`: Number of nanoseconds to add. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md).
 
 **Returned value**
 
-- Returns `date_time` plus `x` nanoseconds. [date](../data-types/date.md)/[date32](../data-types/date32.md)/[datetime](../data-types/datetime.md)/[datetime64](../data-types/datetime64.md).
+- Returns `date_time` plus `num` nanoseconds. [Date](../data-types/date.md)/[Date32](../data-types/date32.md)/[DateTime](../data-types/datetime.md)/[DateTime64](../data-types/datetime64.md).
 
 **Example**

From ba4f5f9b3fe74d44295633995033c9d484b8a9c6 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Mon, 6 May 2024 19:46:57 +0200
Subject: [PATCH 082/289] Fix backup of parts with projections but without
 projections in metadata

---
 src/Storages/MergeTree/MergeTreeData.cpp      | 42 +++++++++++++---
 ...145_non_loaded_projection_backup.reference |  7 +++
 .../03145_non_loaded_projection_backup.sh     | 49 +++++++++++++++++++
 3 files changed, 92 insertions(+), 6 deletions(-)
 create mode 100644 tests/queries/0_stateless/03145_non_loaded_projection_backup.reference
 create mode 100755 tests/queries/0_stateless/03145_non_loaded_projection_backup.sh

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 9350b24c96a..fbe79a32b8e 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -5361,20 +5361,50 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts(
                 &temp_dirs,
                 false, false);
 
-            auto projection_parts = part->getProjectionParts();
-            for (const auto & [projection_name, projection_part] : projection_parts)
+            auto backup_projection = [&](IDataPartStorage & storage, IMergeTreeDataPart & projection_part)
             {
-                projection_part->getDataPartStorage().backup(
-                    projection_part->checksums,
-                    projection_part->getFileNamesWithoutChecksums(),
+                storage.backup(
+                    projection_part.checksums,
+                    projection_part.getFileNamesWithoutChecksums(),
                     fs::path{data_path_in_backup} / part->name,
                     backup_settings,
                     read_settings,
                     make_temporary_hard_links,
                     backup_entries_from_part,
                     &temp_dirs,
-                    projection_part->is_broken,
+                    projection_part.is_broken,
                     backup_settings.allow_backup_broken_projections);
+            };
+
+            auto projection_parts = part->getProjectionParts();
+            std::string proj_suffix = ".proj";
+            std::unordered_set<String> defined_projections;
+
+            for (const auto & [projection_name, projection_part] : projection_parts)
+            {
+                defined_projections.emplace(projection_name);
+                backup_projection(projection_part->getDataPartStorage(), *projection_part);
+            }
+
+            /// It is possible that the part has a written but not loaded projection,
+            /// e.g. it is written to the parent part's checksums.txt and exists on disk,
+            /// but does not exist in the table's projections definition.
+            /// Such a part can appear if the server was restarted after DROP PROJECTION but before the old part was removed.
+            /// In this case, the old part will load only projections from metadata.
+            /// See 03145_non_loaded_projection_backup.sh.
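+            /// (Illustration: a checksums entry such as "pp.proj", like the projection
+            /// in the test added below, has stem "pp", which is then checked against
+            /// the projections defined in the metadata.)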
+ for (const auto & [name, _] : part->checksums.files) + { + auto projection_name = fs::path(name).stem().string(); + if (endsWith(name, proj_suffix) && !defined_projections.contains(projection_name)) + { + auto projection_storage = part->getDataPartStorage().getProjection(projection_name + proj_suffix); + if (projection_storage->exists("checksums.txt")) + { + auto projection_part = const_cast(*part).getProjectionPartBuilder( + projection_name, /* is_temp_projection */false).withPartFormatFromDisk().build(); + backup_projection(projection_part->getDataPartStorage(), *projection_part); + } + } } if (hold_storage_and_part_ptrs) diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.reference b/tests/queries/0_stateless/03145_non_loaded_projection_backup.reference new file mode 100644 index 00000000000..a11ee210e62 --- /dev/null +++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.reference @@ -0,0 +1,7 @@ +7 +Found unexpected projection directories: pp.proj +BACKUP_CREATED +RESTORED +7 +Found unexpected projection directories: pp.proj +0 diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh new file mode 100755 index 00000000000..721ed784fc2 --- /dev/null +++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -nm -q " +drop table if exists tp_1; +create table tp_1 (x Int32, y Int32, projection p (select x, y order by x)) engine = MergeTree order by y partition by intDiv(y, 100); +insert into tp_1 select number, number from numbers(3); + +set mutations_sync = 2; + +alter table tp_1 add projection pp (select x, count() group by x); +insert into tp_1 select number, number from numbers(4); +select count() from tp_1; + +-- Here we have a part with written projection pp +alter table tp_1 detach partition '0'; +-- Move part to detached +alter table tp_1 clear projection pp; +-- Remove projection from table metadata +alter table tp_1 drop projection pp; +-- Now, we don't load projection pp for attached part, but it is written on disk +alter table tp_1 attach partition '0'; +" + +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +check table tp_1 settings check_query_single_value_result = 0;" | grep -o "Found unexpected projection directories: pp.proj" + +backup_id="$CLICKHOUSE_TEST_UNIQUE_NAME" +$CLICKHOUSE_CLIENT -q " +backup table tp_1 to Disk('backups', '$backup_id'); +" | grep -o "BACKUP_CREATED" + +$CLICKHOUSE_CLIENT -nm -q " +drop table tp_1; +restore table tp_1 from Disk('backups', '$backup_id'); +" | grep -o "RESTORED" + +$CLICKHOUSE_CLIENT -q "select count() from tp_1;" +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +check table tp_1 settings check_query_single_value_result = 0;" | grep -o "Found unexpected projection directories: pp.proj" +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +check table tp_1" +$CLICKHOUSE_CLIENT -q "drop table tp_1 sync" From 0b0e97917e3e4ab27a17cbf14d9e73163a20adbe Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 6 May 2024 21:03:04 +0200 Subject: [PATCH 083/289] Do in reverse order --- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp 
b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index eb757e1d8c7..492e4065502 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -1007,6 +1007,10 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar filterColumns(columns, read_result.final_filter); } + /// If columns not empty, then apply on-fly alter conversions if any required + if (!prewhere_info || prewhere_info->perform_alter_conversions) + merge_tree_reader->performRequiredConversions(columns); + /// If some columns absent in part, then evaluate default values if (should_evaluate_missing_defaults) { @@ -1018,9 +1022,6 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar merge_tree_reader->evaluateMissingDefaults(additional_columns, columns); } - /// If columns not empty, then apply on-fly alter conversions if any required - if (!prewhere_info || prewhere_info->perform_alter_conversions) - merge_tree_reader->performRequiredConversions(columns); } read_result.columns.reserve(read_result.columns.size() + columns.size()); @@ -1046,14 +1047,14 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar bool should_evaluate_missing_defaults; merge_tree_reader->fillMissingColumns(columns, should_evaluate_missing_defaults, read_result.num_rows); - /// If some columns absent in part, then evaluate default values - if (should_evaluate_missing_defaults) - merge_tree_reader->evaluateMissingDefaults({}, columns); - /// If result not empty, then apply on-fly alter conversions if any required if (!prewhere_info || prewhere_info->perform_alter_conversions) merge_tree_reader->performRequiredConversions(columns); + /// If some columns absent in part, then evaluate default values + if (should_evaluate_missing_defaults) + merge_tree_reader->evaluateMissingDefaults({}, columns); + for (size_t i = 0; i < columns.size(); ++i) read_result.columns[i] = std::move(columns[i]); } From 1b7b4fc858fe7918d274691d233b69df701654fa Mon Sep 17 00:00:00 2001 From: Maxim Alexeev Date: Mon, 6 May 2024 22:08:13 +0300 Subject: [PATCH 084/289] removed unnecessary setting --- src/Core/Settings.h | 1 - src/Interpreters/HashJoin.cpp | 12 ++++++------ src/Interpreters/TableJoin.cpp | 1 - src/Interpreters/TableJoin.h | 3 --- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 5dc80912ebb..7ed175d38f4 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -463,7 +463,6 @@ class IColumn; M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, "Split right-hand joining data in blocks of specified size. It's a portion of data indexed by min-max values and possibly unloaded on disk.", 0) \ M(UInt64, join_on_disk_max_files_to_merge, 64, "For MergeJoin on disk set how much files it's allowed to sort simultaneously. Then this value bigger then more memory used and then less disk I/O needed. Minimum is 2.", 0) \ M(UInt64, max_rows_in_set_to_optimize_join, 0, "Maximal size of the set to filter joined tables by each other row sets before joining. 
0 - disable.", 0) \ - M(UInt64, cross_join_max_bytes_inmemory, 1000000, "Maximal bytes to complete cross join inmemory.", 0) \ \ M(Bool, compatibility_ignore_collation_in_create_table, true, "Compatibility ignore collation in create table", 0) \ \ diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index e194df4e660..52c804c261a 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -2309,6 +2309,12 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) } }; + for (const Block & compressed_block_right : data->blocks) + { + auto block_right = compressed_block_right.decompress(); + process_right_block(block_right); + } + if (tmp_stream) { tmp_stream->finishWritingAsyncSafe(); @@ -2319,12 +2325,6 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) } } - for (const Block & compressed_block_right : data->blocks) - { - auto block_right = compressed_block_right.decompress(); - process_right_block(block_right); - } - start_right_block = 0; if (rows_added > max_joined_block_rows) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 7ceb90704f3..1ee8ca14b2f 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -111,7 +111,6 @@ TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_, Temporary , cross_join_min_bytes_to_compress(settings.cross_join_min_bytes_to_compress) , max_joined_block_rows(settings.max_joined_block_size_rows) , join_algorithm(settings.join_algorithm) - , cross_join_max_bytes_inmemory(settings.cross_join_max_bytes_inmemory) , partial_merge_join_rows_in_right_blocks(settings.partial_merge_join_rows_in_right_blocks) , partial_merge_join_left_table_buffer_bytes(settings.partial_merge_join_left_table_buffer_bytes) , max_files_to_merge(settings.join_on_disk_max_files_to_merge) diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index d6920d6afbf..8e83233e54c 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -145,7 +145,6 @@ private: const UInt64 cross_join_min_bytes_to_compress = 10000; const size_t max_joined_block_rows = 0; std::vector join_algorithm; - const UInt64 cross_join_max_bytes_inmemory = 1000000; const size_t partial_merge_join_rows_in_right_blocks = 0; const size_t partial_merge_join_left_table_buffer_bytes = 0; const size_t max_files_to_merge = 0; @@ -281,8 +280,6 @@ public: bool allowParallelHashJoin() const; - UInt64 crossJoinMaxBytesInmemory() const { return cross_join_max_bytes_inmemory; } - bool joinUseNulls() const { return join_use_nulls; } UInt64 crossJoinMinRowsToCompress() const { return cross_join_min_rows_to_compress; } From 670c8c11886ee5763639bccb28cf45f74f683c05 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 6 May 2024 21:42:25 +0200 Subject: [PATCH 085/289] fix exception --- src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp b/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp index 6b60faadf07..16646c702bf 100644 --- a/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp +++ b/src/AggregateFunctions/Combinators/AggregateFunctionArray.cpp @@ -26,7 +26,7 @@ public: DataTypes transformArguments(const DataTypes & arguments) const override { if (arguments.empty()) - throw 
Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "-Array aggregate functions require at least one argument"); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Array aggregate functions require at least one argument"); DataTypes nested_arguments; for (const auto & type : arguments) From 1660a4fe867d3d8d5e2fb0167c689f936226cfde Mon Sep 17 00:00:00 2001 From: Maxim Alexeev Date: Mon, 6 May 2024 23:22:41 +0300 Subject: [PATCH 086/289] first working version --- src/Interpreters/HashJoin.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 52c804c261a..68186c00663 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -64,6 +64,7 @@ struct NotProcessedCrossJoin : public ExtraBlock { size_t left_position; size_t right_block; + std::unique_ptr reader; }; @@ -256,7 +257,6 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s , instance_log_id(!instance_id_.empty() ? "(" + instance_id_ + ") " : "") , log(getLogger("HashJoin")) { - LOG_INFO(log, "KEK CONSTRUCTOR {}\n", reserve_num); LOG_TRACE(log, "{}Keys: {}, datatype: {}, kind: {}, strictness: {}, right header: {}", instance_log_id, TableJoin::formatClauses(table_join->getClauses(), true), data->type, kind, strictness, right_sample_block.dumpStructure()); @@ -830,14 +830,23 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits) if (shrink_blocks) block_to_save = block_to_save.shrinkToFit(); + size_t max_bytes_in_join = table_join->sizeLimits().max_bytes; + size_t max_rows_in_join = table_join->sizeLimits().max_rows; + if (!table_join->sizeLimits().hasLimits()) + { + max_bytes_in_join = table_join->defaultMaxBytes(); + } - if (kind == JoinKind::Cross) + if (kind == JoinKind::Cross + && (tmp_stream || (max_bytes_in_join && getTotalByteCount() + block_to_save.allocatedBytes() >= max_bytes_in_join) + || (max_rows_in_join && getTotalRowCount() + block_to_save.rows() >= max_rows_in_join))) { if (tmp_stream == nullptr) { tmp_stream = &tmp_data->createStream(right_sample_block); } tmp_stream->write(block_to_save); + return true; } size_t total_rows = 0; @@ -2251,11 +2260,13 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) { size_t start_left_row = 0; size_t start_right_block = 0; + std::unique_ptr reader = nullptr; if (not_processed) { auto & continuation = static_cast(*not_processed); start_left_row = continuation.left_position; start_right_block = continuation.right_block; + reader = std::move(continuation.reader); not_processed.reset(); } @@ -2317,8 +2328,11 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) if (tmp_stream) { - tmp_stream->finishWritingAsyncSafe(); - auto reader = tmp_stream->getReadStream(); + if (reader == nullptr) + { + tmp_stream->finishWritingAsyncSafe(); + reader = tmp_stream->getReadStream(); + } while (auto block_right = reader->read()) { process_right_block(block_right); @@ -2330,7 +2344,7 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) if (rows_added > max_joined_block_rows) { not_processed = std::make_shared( - NotProcessedCrossJoin{{block.cloneEmpty()}, left_row, block_number + 1}); + NotProcessedCrossJoin{{block.cloneEmpty()}, left_row, block_number + 1, std::move(reader)}); not_processed->block.swap(block); break; } From 3dd04e4e58585bd467fc3549200b9759886f8a88 Mon Sep 17 00:00:00 2001 From: Maxim Alexeev Date: Mon, 6 May 
2024 23:27:28 +0300 Subject: [PATCH 087/289] removed garbage --- src/Interpreters/GraceHashJoin.cpp | 1 - src/Interpreters/HashJoin.h | 9 --------- 2 files changed, 10 deletions(-) diff --git a/src/Interpreters/GraceHashJoin.cpp b/src/Interpreters/GraceHashJoin.cpp index ba51953e979..53d1f48c291 100644 --- a/src/Interpreters/GraceHashJoin.cpp +++ b/src/Interpreters/GraceHashJoin.cpp @@ -657,7 +657,6 @@ IBlocksStreamPtr GraceHashJoin::getDelayedBlocks() GraceHashJoin::InMemoryJoinPtr GraceHashJoin::makeInMemoryJoin(const String & bucket_id, size_t reserve_num) { - LOG_INFO(log, "GreaceHashJoin\n"); return std::make_unique(table_join, right_sample_block, any_take_last_row, reserve_num, bucket_id); } diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 1eb4d0f8030..04a04b74dd0 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -415,15 +415,6 @@ public: void setMaxJoinedBlockRows(size_t value) { max_joined_block_rows = value; } - TemporaryFileStream* getStreamForCrossJoin() - { - auto streams = tmp_data->getStreams(); - assert(streams.size() <= 1); - if (streams.empty()) - return nullptr; - return streams[0]; - } - private: friend class NotJoinedHash; From d4a0325350673a89e874d4f3a5a1500044587ced Mon Sep 17 00:00:00 2001 From: Maxim Alexeev Date: Mon, 6 May 2024 23:30:27 +0300 Subject: [PATCH 088/289] added check that tmp_data is not nullptr --- src/Interpreters/HashJoin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 68186c00663..f4408271fbf 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -837,7 +837,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits) max_bytes_in_join = table_join->defaultMaxBytes(); } - if (kind == JoinKind::Cross + if (kind == JoinKind::Cross && tmp_data && (tmp_stream || (max_bytes_in_join && getTotalByteCount() + block_to_save.allocatedBytes() >= max_bytes_in_join) || (max_rows_in_join && getTotalRowCount() + block_to_save.rows() >= max_rows_in_join))) { From d531e2006d9326bce2877efda9823aa880e4c8cb Mon Sep 17 00:00:00 2001 From: Maxim Alexeev Date: Tue, 7 May 2024 00:10:24 +0300 Subject: [PATCH 089/289] Deleted check of read type and added comment --- src/Interpreters/TemporaryDataOnDisk.cpp | 12 ------------ src/Interpreters/TemporaryDataOnDisk.h | 5 +---- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/src/Interpreters/TemporaryDataOnDisk.cpp b/src/Interpreters/TemporaryDataOnDisk.cpp index e50d501f6d3..9bd0fa9e8ed 100644 --- a/src/Interpreters/TemporaryDataOnDisk.cpp +++ b/src/Interpreters/TemporaryDataOnDisk.cpp @@ -331,12 +331,6 @@ Block TemporaryFileStream::read() if (isEof()) return {}; - if (auto type = read_type.exchange(1); type == 2) - { - read_type.store(2); - throw Exception(ErrorCodes::LOGICAL_ERROR, "Different type of reading was requested earlier"); - } - if (!in_reader) { in_reader = std::make_unique(getPath(), header, getSize()); @@ -360,12 +354,6 @@ std::unique_ptr TemporaryFileStream::getReadStream() if (isEof()) return nullptr; - if (auto type = read_type.exchange(2); type == 1) - { - read_type.store(1); - throw Exception(ErrorCodes::LOGICAL_ERROR, "Different type of reading was requested earlier"); - } - return std::make_unique(getPath(), header, getSize()); } diff --git a/src/Interpreters/TemporaryDataOnDisk.h b/src/Interpreters/TemporaryDataOnDisk.h index 792988e94bd..a5141dd373e 100644 --- a/src/Interpreters/TemporaryDataOnDisk.h 
+++ b/src/Interpreters/TemporaryDataOnDisk.h
@@ -149,7 +149,7 @@ struct InputReader
 
 /*
  * Data can be written into this stream and then read.
- * After finish writing, call `finishWriting` and then `read` to read the data.
+ * After writing is finished, call `finishWriting` and then either `read` or `getReadStream` (only one of the two) to read the data.
  * Account amount of data written to disk in parent scope.
  */
 class TemporaryFileStream : boost::noncopyable
@@ -204,9 +204,6 @@ private:
 
     Stat stat;
 
-    /// 0 - means that we haven't requested any read, 1 - read from function TemporaryFileStream::read, 2 -
-    std::atomic_char read_type{0};
-
     mutable std::mutex finish_writing;
 
    std::atomic_bool writing_finished{false};

From 9fcb5d2ebfb3e2105b5d6171a1a53ff75c0023f4 Mon Sep 17 00:00:00 2001
From: Yohann Jardin
Date: Mon, 6 May 2024 23:44:27 +0200
Subject: [PATCH 090/289] Move ErrorCodes inside DB namespace

---
 .../AggregateFunctionMannWhitney.cpp          | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp b/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp
index d185058dbd0..8434f11eae6 100644
--- a/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp
+++ b/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp
@@ -17,6 +17,11 @@
 #include
 
+namespace DB
+{
+
+struct Settings;
+
 namespace ErrorCodes
 {
     extern const int NOT_IMPLEMENTED;
@@ -25,11 +30,6 @@ namespace ErrorCodes
     extern const int BAD_ARGUMENTS;
 }
 
-namespace DB
-{
-
-struct Settings;
-
 namespace
 {

From 4703eb943b9586be7cab2429a170d9cf9131558f Mon Sep 17 00:00:00 2001
From: Yohann Jardin
Date: Mon, 6 May 2024 23:59:59 +0200
Subject: [PATCH 091/289] Move ErrorCodes inside DB namespace

---
 src/AggregateFunctions/AggregateFunctionWelchTTest.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp b/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp
index 7f1c4861fbc..7efd6154227 100644
--- a/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp
+++ b/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp
@@ -4,16 +4,16 @@
 #include
 
+namespace DB
+{
+struct Settings;
+
 namespace ErrorCodes
 {
     extern const int BAD_ARGUMENTS;
     extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION;
 }
 
-namespace DB
-{
-struct Settings;
-
 namespace
 {

From d14fc62d4d4b597435e482f60a2f7c83412e53cc Mon Sep 17 00:00:00 2001
From: Michael Kolupaev
Date: Fri, 29 Mar 2024 08:59:07 +0000
Subject: [PATCH 092/289] Avoid crashing on column type mismatch in a few
 dozen places

---
 src/Columns/ColumnArray.cpp                   |  2 +-
 src/Columns/ColumnLowCardinality.cpp          |  2 +-
 src/Columns/ColumnUnique.h                    |  4 +-
 src/Columns/FilterDescription.cpp             |  4 +-
 src/Columns/IColumn.h                         |  8 +++-
 src/Columns/MaskOperations.cpp                |  4 +-
 src/Common/ColumnsHashing.h                   | 12 +++---
 src/Common/ColumnsHashingImpl.h               |  2 +-
 src/Core/DecimalComparison.h                  | 16 ++++----
 src/DataTypes/ObjectUtils.cpp                 |  2 +-
 .../Serializations/SerializationBool.cpp      |  4 +-
 .../Serializations/SerializationInterval.cpp  |  2 +-
 .../SerializationLowCardinality.cpp           |  2 +-
 .../HierarchyDictionariesUtils.cpp            |  8 ++--
 src/Formats/insertNullAsDefaultIfNeeded.cpp   | 40 +++++++++----------
 src/Functions/FunctionBase64Conversion.h      |  4 +-
 src/Functions/FunctionBinaryArithmetic.h      | 20 +++++-----
 src/Functions/FunctionHelpers.cpp             | 10 ++---
 src/Functions/FunctionHelpers.h               | 25 +++++++++++-
 src/Functions/FunctionUnixTimestamp64.h       |  2 +-
 src/Functions/FunctionsBitmap.h               |  4 +-
 src/Functions/FunctionsCodingIP.cpp           |  4
+- src/Functions/FunctionsConversion.cpp | 2 +- src/Functions/FunctionsRound.h | 18 ++++----- src/Functions/FunctionsStringHash.h | 12 +++--- src/Functions/Kusto/KqlArraySort.cpp | 4 +- src/Functions/MultiMatchAllIndicesImpl.h | 4 +- src/Functions/MultiMatchAnyImpl.h | 4 +- src/Functions/MultiSearchAllPositionsImpl.h | 4 +- src/Functions/MultiSearchFirstIndexImpl.h | 4 +- src/Functions/MultiSearchFirstPositionImpl.h | 4 +- src/Functions/MultiSearchImpl.h | 4 +- .../URL/FirstSignificantSubdomainCustomImpl.h | 2 +- src/Functions/array/FunctionArrayMapped.h | 2 +- src/Functions/array/arrayCompact.cpp | 2 +- src/Functions/array/arrayDistinct.cpp | 8 ++-- src/Functions/array/arrayElement.cpp | 6 +-- src/Functions/array/arrayEnumerateExtended.h | 2 +- src/Functions/array/arrayIndex.h | 10 ++--- src/Functions/array/arrayJaccardIndex.cpp | 8 ++-- src/Functions/array/arrayUniq.cpp | 2 +- src/Functions/array/emptyArrayToSingle.cpp | 2 +- src/Functions/array/range.cpp | 6 +-- src/Functions/arrayStringConcat.cpp | 2 +- src/Functions/assumeNotNull.cpp | 2 +- src/Functions/coalesce.cpp | 10 ++--- src/Functions/fromModifiedJulianDay.cpp | 4 +- src/Functions/grouping.h | 4 +- src/Functions/hasColumnInTable.cpp | 4 +- src/Functions/if.cpp | 18 ++++----- src/Functions/isNotNull.cpp | 14 +++---- src/Functions/isNull.cpp | 14 +++---- src/Functions/minSampleSize.cpp | 8 ++-- src/Functions/multiIf.cpp | 2 +- src/Functions/readWkt.cpp | 4 +- src/Functions/repeat.cpp | 8 ++-- src/Functions/seriesOutliersDetectTukey.cpp | 6 +-- src/Functions/seriesPeriodDetectFFT.cpp | 6 +-- src/Functions/space.cpp | 6 +-- src/Functions/toStartOfInterval.cpp | 6 +-- src/Functions/ztest.cpp | 12 +++--- src/Interpreters/BloomFilterHash.h | 2 +- src/Interpreters/HashJoin.cpp | 2 +- src/Interpreters/InterpreterCheckQuery.cpp | 6 +-- src/Interpreters/JoinUtils.cpp | 20 +++++----- src/Interpreters/MergeJoin.cpp | 8 ++-- src/Interpreters/NullableUtils.cpp | 4 +- src/Interpreters/SetVariants.cpp | 2 +- src/Interpreters/SetVariants.h | 2 +- .../Transforms/CheckConstraintsTransform.cpp | 2 +- .../Transforms/MergeJoinTransform.cpp | 8 ++-- src/Processors/Transforms/WindowTransform.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 2 +- 73 files changed, 253 insertions(+), 228 deletions(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 7b268b80116..8d5c246c48c 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -1283,7 +1283,7 @@ ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const size_t ColumnArray::getNumberOfDimensions() const { - const auto * nested_array = checkAndGetColumn(*data); + const auto * nested_array = checkAndGetColumn(&*data); if (!nested_array) return 1; return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion. 
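The `IColumn.h` hunk above is the core of this patch: the overload of `checkAndGetColumn` that takes a reference now returns a reference and throws `LOGICAL_ERROR` on a type mismatch, instead of returning a possibly-null pointer, while the overload that takes a pointer keeps returning nullptr. A minimal sketch of the intended call pattern (the two helper functions are illustrative, not part of the patch):

```cpp
#include <Columns/ColumnNullable.h>
#include <Columns/IColumn.h>

using namespace DB;

/// A mismatch is an expected case: pass a pointer and branch on nullptr.
bool isNullableColumn(const IColumn & column)
{
    return checkAndGetColumn<ColumnNullable>(&column) != nullptr;
}

/// A mismatch would be a logic bug: pass a reference and let it throw
/// LOGICAL_ERROR, where the old pattern dereferenced an unchecked pointer.
const NullMap & getNullMap(const IColumn & column)
{
    return checkAndGetColumn<ColumnNullable>(column).getNullMapData();
}
```

The dozens of mechanical hunks below fall out of this split: call sites that genuinely branch keep pointer semantics (hence the recurring `*column` to `&*column` rewrites), while call sites that used to dereference the result unconditionally switch to the reference form.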
diff --git a/src/Columns/ColumnLowCardinality.cpp b/src/Columns/ColumnLowCardinality.cpp index a3696c78669..a032c2b25b7 100644 --- a/src/Columns/ColumnLowCardinality.cpp +++ b/src/Columns/ColumnLowCardinality.cpp @@ -903,7 +903,7 @@ ColumnPtr ColumnLowCardinality::cloneWithDefaultOnNull() const bool isColumnLowCardinalityNullable(const IColumn & column) { - if (const auto * lc_column = checkAndGetColumn(column)) + if (const auto * lc_column = checkAndGetColumn(&column)) return lc_column->nestedIsNullable(); return false; } diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index a8873140817..0311efd4c83 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -376,7 +376,7 @@ size_t ColumnUnique::uniqueInsertFrom(const IColumn & src, size_t n) if (is_nullable && src.isNullAt(n)) return getNullValueIndex(); - if (const auto * nullable = checkAndGetColumn(src)) + if (const auto * nullable = checkAndGetColumn(&src)) return uniqueInsertFrom(nullable->getNestedColumn(), n); auto ref = src.getDataAt(n); @@ -569,7 +569,7 @@ MutableColumnPtr ColumnUnique::uniqueInsertRangeImpl( return nullptr; }; - if (const auto * nullable_column = checkAndGetColumn(src)) + if (const auto * nullable_column = checkAndGetColumn(&src)) { src_column = typeid_cast(&nullable_column->getNestedColumn()); null_map = &nullable_column->getNullMapData(); diff --git a/src/Columns/FilterDescription.cpp b/src/Columns/FilterDescription.cpp index 62d2babe17a..56c99a5b49c 100644 --- a/src/Columns/FilterDescription.cpp +++ b/src/Columns/FilterDescription.cpp @@ -32,7 +32,7 @@ ConstantFilterDescription::ConstantFilterDescription(const IColumn & column) if (!typeid_cast(column_nested.get())) { - const ColumnNullable * column_nested_nullable = checkAndGetColumn(*column_nested); + const ColumnNullable * column_nested_nullable = checkAndGetColumn(&*column_nested); if (!column_nested_nullable || !typeid_cast(&column_nested_nullable->getNestedColumn())) { throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER, @@ -66,7 +66,7 @@ FilterDescription::FilterDescription(const IColumn & column_) return; } - if (const auto * nullable_column = checkAndGetColumn(column)) + if (const auto * nullable_column = checkAndGetColumn(&column)) { ColumnPtr nested_column = nullable_column->getNestedColumnPtr(); MutableColumnPtr mutable_holder = IColumn::mutate(std::move(nested_column)); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index cea8d7c9f55..cf2693e008c 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -640,12 +640,16 @@ template <> struct IsMutableColumns<> { static const bool value = true; }; +/// Throws LOGICAL_ERROR if the type doesn't match. template -const Type * checkAndGetColumn(const IColumn & column) +const Type & checkAndGetColumn(const IColumn & column) { - return typeid_cast(&column); + return typeid_cast(column); } +/// Returns nullptr if the type doesn't match. +/// If you're going to dereference the returned pointer without checking for null, use the +/// `const IColumn &` overload above instead. template const Type * checkAndGetColumn(const IColumn * column) { diff --git a/src/Columns/MaskOperations.cpp b/src/Columns/MaskOperations.cpp index 8f5299cf6e8..2c54a416850 100644 --- a/src/Columns/MaskOperations.cpp +++ b/src/Columns/MaskOperations.cpp @@ -205,10 +205,10 @@ static MaskInfo extractMaskImpl( auto column = col->convertToFullColumnIfLowCardinality(); /// Special implementation for Null and Const columns. 
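    /// (A pointer argument is used in checks like the one below so that a mismatch
    /// yields nullptr for the boolean test instead of a throw from the new reference overload.)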
- if (column->onlyNull() || checkAndGetColumn(*column)) + if (column->onlyNull() || checkAndGetColumn(&*column)) return extractMaskFromConstOrNull(mask, column, null_value, nulls); - if (const auto * nullable_column = checkAndGetColumn(*column)) + if (const auto * nullable_column = checkAndGetColumn(&*column)) { const PaddedPODArray & null_map = nullable_column->getNullMapData(); return extractMaskImpl(mask, nullable_column->getNestedColumnPtr(), null_value, &null_map, nulls); diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index 25d2ddc684d..bd3880eb83b 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -44,8 +44,8 @@ struct HashMethodOneNumber { if constexpr (nullable) { - const auto * null_column = checkAndGetColumn(key_columns[0]); - vec = null_column->getNestedColumnPtr()->getRawData().data(); + const auto & null_column = checkAndGetColumn(*key_columns[0]); + vec = null_column.getNestedColumnPtr()->getRawData().data(); } else { @@ -57,8 +57,8 @@ struct HashMethodOneNumber { if constexpr (nullable) { - const auto * null_column = checkAndGetColumn(column); - vec = null_column->getNestedColumnPtr()->getRawData().data(); + const auto & null_column = checkAndGetColumn(*column); + vec = null_column.getNestedColumnPtr()->getRawData().data(); } else { @@ -105,7 +105,7 @@ struct HashMethodString const IColumn * column; if constexpr (nullable) { - column = checkAndGetColumn(key_columns[0])->getNestedColumnPtr().get(); + column = checkAndGetColumn(*key_columns[0]).getNestedColumnPtr().get(); } else { @@ -153,7 +153,7 @@ struct HashMethodFixedString const IColumn * column; if constexpr (nullable) { - column = checkAndGetColumn(key_columns[0])->getNestedColumnPtr().get(); + column = checkAndGetColumn(*key_columns[0]).getNestedColumnPtr().get(); } else { diff --git a/src/Common/ColumnsHashingImpl.h b/src/Common/ColumnsHashingImpl.h index 6b3cc24d870..f74a56292ae 100644 --- a/src/Common/ColumnsHashingImpl.h +++ b/src/Common/ColumnsHashingImpl.h @@ -305,7 +305,7 @@ protected: } if constexpr (nullable) - null_map = &checkAndGetColumn(column)->getNullMapColumn(); + null_map = &checkAndGetColumn(*column).getNullMapColumn(); } template diff --git a/src/Core/DecimalComparison.h b/src/Core/DecimalComparison.h index 08569aa534c..63817e77805 100644 --- a/src/Core/DecimalComparison.h +++ b/src/Core/DecimalComparison.h @@ -170,11 +170,11 @@ private: if (c0_is_const && c1_is_const) { - const ColumnConst * c0_const = checkAndGetColumnConst(c0.get()); - const ColumnConst * c1_const = checkAndGetColumnConst(c1.get()); + const ColumnConst & c0_const = checkAndGetColumnConst(*c0); + const ColumnConst & c1_const = checkAndGetColumnConst(*c1); - A a = c0_const->template getValue(); - B b = c1_const->template getValue(); + A a = c0_const.template getValue(); + B b = c1_const.template getValue(); UInt8 res = apply(a, b, scale); return DataTypeUInt8().createColumnConst(c0->size(), toField(res)); } @@ -184,8 +184,8 @@ private: if (c0_is_const) { - const ColumnConst * c0_const = checkAndGetColumnConst(c0.get()); - A a = c0_const->template getValue(); + const ColumnConst & c0_const = checkAndGetColumnConst(*c0); + A a = c0_const.template getValue(); if (const ColVecB * c1_vec = checkAndGetColumn(c1.get())) constantVector(a, c1_vec->getData(), vec_res, scale); else @@ -193,8 +193,8 @@ private: } else if (c1_is_const) { - const ColumnConst * c1_const = checkAndGetColumnConst(c1.get()); - B b = c1_const->template getValue(); + const ColumnConst & c1_const = 
checkAndGetColumnConst(*c1); + B b = c1_const.template getValue(); if (const ColVecA * c0_vec = checkAndGetColumn(c0.get())) vectorConstant(c0_vec->getData(), b, vec_res, scale); else diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 99cf092e6cd..7c671fcf44f 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -47,7 +47,7 @@ size_t getNumberOfDimensions(const IDataType & type) size_t getNumberOfDimensions(const IColumn & column) { - if (const auto * column_array = checkAndGetColumn(column)) + if (const auto * column_array = checkAndGetColumn(&column)) return column_array->getNumberOfDimensions(); return 0; } diff --git a/src/DataTypes/Serializations/SerializationBool.cpp b/src/DataTypes/Serializations/SerializationBool.cpp index d6a74e5cb8f..b63f25ddc35 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -28,7 +28,7 @@ constexpr char str_false[6] = "false"; const ColumnUInt8 * checkAndGetSerializeColumnType(const IColumn & column) { const auto * col = checkAndGetColumn(&column); - if (!checkAndGetColumn(&column)) + if (!col) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Bool type can only serialize columns of type UInt8.{}", column.getName()); return col; } @@ -36,7 +36,7 @@ const ColumnUInt8 * checkAndGetSerializeColumnType(const IColumn & column) ColumnUInt8 * checkAndGetDeserializeColumnType(IColumn & column) { auto * col = typeid_cast(&column); - if (!checkAndGetColumn(&column)) + if (!col) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Bool type can only deserialize columns of type UInt8.{}", column.getName()); return col; diff --git a/src/DataTypes/Serializations/SerializationInterval.cpp b/src/DataTypes/Serializations/SerializationInterval.cpp index 59086d8aef3..c4ef34b4325 100644 --- a/src/DataTypes/Serializations/SerializationInterval.cpp +++ b/src/DataTypes/Serializations/SerializationInterval.cpp @@ -17,7 +17,7 @@ namespace ErrorCodes void SerializationKustoInterval::serializeText( const IColumn & column, const size_t row, WriteBuffer & ostr, const FormatSettings &) const { - const auto * interval_column = checkAndGetColumn(column); + const auto * interval_column = checkAndGetColumn(&column); if (!interval_column) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Expected column of underlying type of Interval"); diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 9efe05042ed..2d2be195098 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -477,7 +477,7 @@ void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams( settings.low_cardinality_max_dictionary_size); } - if (const auto * nullable_keys = checkAndGetColumn(*keys)) + if (const auto * nullable_keys = checkAndGetColumn(&*keys)) keys = nullable_keys->getNestedColumnPtr(); bool need_additional_keys = !keys->empty(); diff --git a/src/Dictionaries/HierarchyDictionariesUtils.cpp b/src/Dictionaries/HierarchyDictionariesUtils.cpp index fd59a0c37db..e1119982a34 100644 --- a/src/Dictionaries/HierarchyDictionariesUtils.cpp +++ b/src/Dictionaries/HierarchyDictionariesUtils.cpp @@ -95,7 +95,7 @@ namespace parent_key_column_non_null = parent_key_column_typed->getNestedColumnPtr(); } - const auto * parent_key_column_typed = checkAndGetColumn>(*parent_key_column_non_null); + const auto * parent_key_column_typed = 
checkAndGetColumn>(&*parent_key_column_non_null); if (!parent_key_column_typed) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Parent key column should be UInt64. Actual {}", @@ -166,7 +166,7 @@ ColumnPtr getKeysHierarchyDefaultImplementation( valid_keys = 0; key_column = key_column->convertToFullColumnIfConst(); - const auto * key_column_typed = checkAndGetColumn>(*key_column); + const auto * key_column_typed = checkAndGetColumn>(&*key_column); if (!key_column_typed) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); @@ -224,11 +224,11 @@ ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( key_column = key_column->convertToFullColumnIfConst(); in_key_column = in_key_column->convertToFullColumnIfConst(); - const auto * key_column_typed = checkAndGetColumn>(*key_column); + const auto * key_column_typed = checkAndGetColumn>(&*key_column); if (!key_column_typed) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); - const auto * in_key_column_typed = checkAndGetColumn>(*in_key_column); + const auto * in_key_column_typed = checkAndGetColumn>(&*in_key_column); if (!in_key_column_typed) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); diff --git a/src/Formats/insertNullAsDefaultIfNeeded.cpp b/src/Formats/insertNullAsDefaultIfNeeded.cpp index c42b8c54d73..ff40d036fb5 100644 --- a/src/Formats/insertNullAsDefaultIfNeeded.cpp +++ b/src/Formats/insertNullAsDefaultIfNeeded.cpp @@ -19,45 +19,45 @@ bool insertNullAsDefaultIfNeeded(ColumnWithTypeAndName & input_column, const Col if (isArray(input_column.type) && isArray(header_column.type)) { ColumnWithTypeAndName nested_input_column; - const auto * array_input_column = checkAndGetColumn(input_column.column.get()); - nested_input_column.column = array_input_column->getDataPtr(); + const auto & array_input_column = checkAndGetColumn(*input_column.column); + nested_input_column.column = array_input_column.getDataPtr(); nested_input_column.type = checkAndGetDataType(input_column.type.get())->getNestedType(); ColumnWithTypeAndName nested_header_column; - nested_header_column.column = checkAndGetColumn(header_column.column.get())->getDataPtr(); + nested_header_column.column = checkAndGetColumn(*header_column.column).getDataPtr(); nested_header_column.type = checkAndGetDataType(header_column.type.get())->getNestedType(); if (!insertNullAsDefaultIfNeeded(nested_input_column, nested_header_column, 0, nullptr)) return false; - input_column.column = ColumnArray::create(nested_input_column.column, array_input_column->getOffsetsPtr()); + input_column.column = ColumnArray::create(nested_input_column.column, array_input_column.getOffsetsPtr()); input_column.type = std::make_shared(std::move(nested_input_column.type)); return true; } if (isTuple(input_column.type) && isTuple(header_column.type)) { - const auto * tuple_input_column = checkAndGetColumn(input_column.column.get()); - const auto * tuple_input_type = checkAndGetDataType(input_column.type.get()); - const auto * tuple_header_column = checkAndGetColumn(header_column.column.get()); - const auto * tuple_header_type = checkAndGetDataType(header_column.type.get()); + const auto & tuple_input_column = checkAndGetColumn(*input_column.column); + const auto & tuple_input_type = checkAndGetDataType(*input_column.type); + const auto & tuple_header_column = checkAndGetColumn(*header_column.column); + const auto & tuple_header_type = checkAndGetDataType(*header_column.type); - if (tuple_input_type->getElements().size() != 
tuple_header_type->getElements().size()) + if (tuple_input_type.getElements().size() != tuple_header_type.getElements().size()) return false; Columns nested_input_columns; - nested_input_columns.reserve(tuple_input_type->getElements().size()); + nested_input_columns.reserve(tuple_input_type.getElements().size()); DataTypes nested_input_types; - nested_input_types.reserve(tuple_input_type->getElements().size()); + nested_input_types.reserve(tuple_input_type.getElements().size()); bool changed = false; - for (size_t i = 0; i != tuple_input_type->getElements().size(); ++i) + for (size_t i = 0; i != tuple_input_type.getElements().size(); ++i) { ColumnWithTypeAndName nested_input_column; - nested_input_column.column = tuple_input_column->getColumnPtr(i); - nested_input_column.type = tuple_input_type->getElement(i); + nested_input_column.column = tuple_input_column.getColumnPtr(i); + nested_input_column.type = tuple_input_type.getElement(i); ColumnWithTypeAndName nested_header_column; - nested_header_column.column = tuple_header_column->getColumnPtr(i); - nested_header_column.type = tuple_header_type->getElement(i); + nested_header_column.column = tuple_header_column.getColumnPtr(i); + nested_header_column.type = tuple_header_type.getElement(i); changed |= insertNullAsDefaultIfNeeded(nested_input_column, nested_header_column, 0, nullptr); nested_input_columns.push_back(std::move(nested_input_column.column)); nested_input_types.push_back(std::move(nested_input_column.type)); @@ -74,12 +74,12 @@ bool insertNullAsDefaultIfNeeded(ColumnWithTypeAndName & input_column, const Col if (isMap(input_column.type) && isMap(header_column.type)) { ColumnWithTypeAndName nested_input_column; - nested_input_column.column = checkAndGetColumn(input_column.column.get())->getNestedColumnPtr(); - nested_input_column.type = checkAndGetDataType(input_column.type.get())->getNestedType(); + nested_input_column.column = checkAndGetColumn(*input_column.column).getNestedColumnPtr(); + nested_input_column.type = checkAndGetDataType(*input_column.type).getNestedType(); ColumnWithTypeAndName nested_header_column; - nested_header_column.column = checkAndGetColumn(header_column.column.get())->getNestedColumnPtr(); - nested_header_column.type = checkAndGetDataType(header_column.type.get())->getNestedType(); + nested_header_column.column = checkAndGetColumn(*header_column.column).getNestedColumnPtr(); + nested_header_column.type = checkAndGetDataType(*header_column.type).getNestedType(); if (!insertNullAsDefaultIfNeeded(nested_input_column, nested_header_column, 0, nullptr)) return false; diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 979c589c64b..3906563a254 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -111,9 +111,9 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const auto & input_column = arguments[0].column; - if (const auto * src_column_as_fixed_string = checkAndGetColumn(*input_column)) + if (const auto * src_column_as_fixed_string = checkAndGetColumn(&*input_column)) return execute(*src_column_as_fixed_string, input_rows_count); - else if (const auto * src_column_as_string = checkAndGetColumn(*input_column)) + else if (const auto * src_column_as_string = checkAndGetColumn(&*input_column)) return execute(*src_column_as_string, input_rows_count); throw Exception( diff --git a/src/Functions/FunctionBinaryArithmetic.h 
b/src/Functions/FunctionBinaryArithmetic.h index 89ff63995b1..8ac602faed9 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -1767,8 +1767,8 @@ public: { if (const auto * col_right_const = checkAndGetColumnConst(col_right_raw)) { - const auto * col_left = checkAndGetColumn(col_left_const->getDataColumn()); - const auto * col_right = checkAndGetColumn(col_right_const->getDataColumn()); + const auto * col_left = &checkAndGetColumn(col_left_const->getDataColumn()); + const auto * col_right = &checkAndGetColumn(col_right_const->getDataColumn()); if (col_left->getN() != col_right->getN()) return nullptr; @@ -1805,11 +1805,11 @@ public: const auto * col_left = is_left_column_const ? checkAndGetColumn( - checkAndGetColumnConst(col_left_raw)->getDataColumn()) + &checkAndGetColumnConst(col_left_raw)->getDataColumn()) : checkAndGetColumn(col_left_raw); const auto * col_right = is_right_column_const ? checkAndGetColumn( - checkAndGetColumnConst(col_right_raw)->getDataColumn()) + &checkAndGetColumnConst(col_right_raw)->getDataColumn()) : checkAndGetColumn(col_right_raw); if (col_left && col_right) @@ -1881,8 +1881,8 @@ public: { if (const auto * col_right_const = checkAndGetColumnConst(col_right_raw)) { - const auto * col_left = checkAndGetColumn(col_left_const->getDataColumn()); - const auto * col_right = checkAndGetColumn(col_right_const->getDataColumn()); + const auto * col_left = &checkAndGetColumn(col_left_const->getDataColumn()); + const auto * col_right = &checkAndGetColumn(col_right_const->getDataColumn()); std::string_view a = col_left->getDataAt(0).toView(); std::string_view b = col_right->getDataAt(0).toView(); @@ -1897,10 +1897,10 @@ public: const bool is_right_column_const = checkAndGetColumnConst(col_right_raw) != nullptr; const auto * col_left = is_left_column_const - ? checkAndGetColumn(checkAndGetColumnConst(col_left_raw)->getDataColumn()) + ? &checkAndGetColumn(checkAndGetColumnConst(col_left_raw)->getDataColumn()) : checkAndGetColumn(col_left_raw); const auto * col_right = is_right_column_const - ? checkAndGetColumn(checkAndGetColumnConst(col_right_raw)->getDataColumn()) + ? &checkAndGetColumn(checkAndGetColumnConst(col_right_raw)->getDataColumn()) : checkAndGetColumn(col_right_raw); if (col_left && col_right) @@ -1948,7 +1948,7 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A const ColumnConst * const col_left_const = checkAndGetColumnConst(col_left_raw); - const auto * col_left = col_left_const ? checkAndGetColumn(col_left_const->getDataColumn()) + const auto * col_left = col_left_const ? &checkAndGetColumn(col_left_const->getDataColumn()) : checkAndGetColumn(col_left_raw); if (!col_left) @@ -2231,7 +2231,7 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A bool is_const = checkColumnConst(right_argument.column.get()); const ColumnNullable * nullable_column = is_const ? 
checkAndGetColumnConstData(right_argument.column.get()) - : checkAndGetColumn(*right_argument.column); + : checkAndGetColumn(right_argument.column.get()); const auto & null_bytemap = nullable_column->getNullMapData(); auto res = executeImpl2(createBlockWithNestedColumns(arguments), removeNullable(result_type), input_rows_count, &null_bytemap); diff --git a/src/Functions/FunctionHelpers.cpp b/src/Functions/FunctionHelpers.cpp index ce83a489a3d..048a601de81 100644 --- a/src/Functions/FunctionHelpers.cpp +++ b/src/Functions/FunctionHelpers.cpp @@ -58,14 +58,14 @@ ColumnWithTypeAndName columnGetNested(const ColumnWithTypeAndName & col) { return ColumnWithTypeAndName{nullptr, nested_type, col.name}; } - else if (const auto * nullable = checkAndGetColumn(*col.column)) + else if (const auto * nullable = checkAndGetColumn(&*col.column)) { const auto & nested_col = nullable->getNestedColumnPtr(); return ColumnWithTypeAndName{nested_col, nested_type, col.name}; } - else if (const auto * const_column = checkAndGetColumn(*col.column)) + else if (const auto * const_column = checkAndGetColumn(&*col.column)) { - const auto * nullable_column = checkAndGetColumn(const_column->getDataColumn()); + const auto * nullable_column = checkAndGetColumn(&const_column->getDataColumn()); ColumnPtr nullable_res; if (nullable_column) @@ -226,7 +226,7 @@ ColumnPtr wrapInNullable(const ColumnPtr & src, const ColumnsWithTypeAndName & a if (src->onlyNull()) return src; - else if (const auto * nullable = checkAndGetColumn(*src)) + else if (const auto * nullable = checkAndGetColumn(&*src)) { src_not_nullable = nullable->getNestedColumnPtr(); result_null_map_column = nullable->getNullMapColumnPtr(); @@ -247,7 +247,7 @@ ColumnPtr wrapInNullable(const ColumnPtr & src, const ColumnsWithTypeAndName & a if (isColumnConst(*elem.column)) continue; - if (const auto * nullable = checkAndGetColumn(*elem.column)) + if (const auto * nullable = checkAndGetColumn(&*elem.column)) { const ColumnPtr & null_map_column = nullable->getNullMapColumnPtr(); if (!result_null_map_column) diff --git a/src/Functions/FunctionHelpers.h b/src/Functions/FunctionHelpers.h index 9f44d3e95c2..89b224e79ef 100644 --- a/src/Functions/FunctionHelpers.h +++ b/src/Functions/FunctionHelpers.h @@ -25,6 +25,13 @@ const Type * checkAndGetDataType(const IDataType * data_type) return typeid_cast(data_type); } +/// Throws on mismatch. 
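+/// (Specifically LOGICAL_ERROR, the same contract as the checkAndGetColumn
+/// reference overload documented in IColumn.h.)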
+template +const Type & checkAndGetDataType(const IDataType & data_type) +{ + return typeid_cast(data_type); +} + template bool checkDataTypes(const IDataType * data_type) { @@ -34,10 +41,12 @@ bool checkDataTypes(const IDataType * data_type) template const ColumnConst * checkAndGetColumnConst(const IColumn * column) { - if (!column || !isColumnConst(*column)) + if (!column) return {}; - const ColumnConst * res = assert_cast(column); + const ColumnConst * res = checkAndGetColumn(column); + if (!res) + return {}; if (!checkColumn(&res->getDataColumn())) return {}; @@ -45,6 +54,18 @@ const ColumnConst * checkAndGetColumnConst(const IColumn * column) return res; } +template +const ColumnConst & checkAndGetColumnConst(const IColumn & column) +{ + const ColumnConst & res = checkAndGetColumn(column); + + const auto & data_column = res.getDataColumn(); + if (!checkColumn(&data_column)) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unexpected const column type: expected {}, got {}", demangle(typeid(Type).name()), demangle(typeid(data_column).name())); + + return res; +} + template const Type * checkAndGetColumnConstData(const IColumn * column) { diff --git a/src/Functions/FunctionUnixTimestamp64.h b/src/Functions/FunctionUnixTimestamp64.h index 53421a565cb..c418163343b 100644 --- a/src/Functions/FunctionUnixTimestamp64.h +++ b/src/Functions/FunctionUnixTimestamp64.h @@ -140,7 +140,7 @@ public: const auto & src = arguments[0]; const auto & col = *src.column; - if (!checkAndGetColumn>(col)) + if (!checkAndGetColumn>(&col)) return false; auto & result_data = result_column->getData(); diff --git a/src/Functions/FunctionsBitmap.h b/src/Functions/FunctionsBitmap.h index 22d46fa7728..92ec71a3118 100644 --- a/src/Functions/FunctionsBitmap.h +++ b/src/Functions/FunctionsBitmap.h @@ -193,8 +193,8 @@ private: const ColumnArray * array = typeid_cast(arguments[0].column.get()); const ColumnPtr & mapped = array->getDataPtr(); const ColumnArray::Offsets & offsets = array->getOffsets(); - const ColumnVector * column = checkAndGetColumn>(&*mapped); - const typename ColumnVector::Container & input_data = column->getData(); + const ColumnVector & column = checkAndGetColumn>(*mapped); + const typename ColumnVector::Container & input_data = column.getData(); // output data Array params_row; diff --git a/src/Functions/FunctionsCodingIP.cpp b/src/Functions/FunctionsCodingIP.cpp index 7bdbac6531d..54f7b6dd1f4 100644 --- a/src/Functions/FunctionsCodingIP.cpp +++ b/src/Functions/FunctionsCodingIP.cpp @@ -536,7 +536,7 @@ public: const auto & col_type_name = arguments[0]; const ColumnPtr & column = col_type_name.column; - if (const auto * col_in = checkAndGetColumn(*column)) + if (const auto * col_in = checkAndGetColumn(&*column)) { auto col_res = ColumnIPv6::create(); @@ -551,7 +551,7 @@ public: return col_res; } - if (const auto * col_in = checkAndGetColumn(*column)) + if (const auto * col_in = checkAndGetColumn(&*column)) { auto col_res = ColumnFixedString::create(IPV6_BINARY_LENGTH); diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index a16ce136b9a..02da450e0c2 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -3302,7 +3302,7 @@ private: /// both columns have type UInt8, but we shouldn't use identity wrapper, /// because Bool column can contain only 0 and 1. 
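        /// (i.e. each source byte is normalized to 0 or 1 below rather than copied as-is)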
auto res_column = to_type->createColumn(); - const auto & data_from = checkAndGetColumn(arguments[0].column.get())->getData(); + const auto & data_from = checkAndGetColumn(*arguments[0].column).getData(); auto & data_to = assert_cast(res_column.get())->getData(); data_to.resize(data_from.size()); for (size_t i = 0; i != data_from.size(); ++i) diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 3d1028c6d35..24ffb5eb0c1 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -467,28 +467,28 @@ struct Dispatcher static ColumnPtr apply(const IColumn * col_general, Scale scale_arg) { - const auto * const col = checkAndGetColumn>(col_general); + const auto & col = checkAndGetColumn>(*col_general); auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(col->getData().size()); + vec_res.resize(col.getData().size()); if (!vec_res.empty()) { if (scale_arg == 0) { size_t scale = 1; - FunctionRoundingImpl::apply(col->getData(), scale, vec_res); + FunctionRoundingImpl::apply(col.getData(), scale, vec_res); } else if (scale_arg > 0) { size_t scale = intExp10(scale_arg); - FunctionRoundingImpl::apply(col->getData(), scale, vec_res); + FunctionRoundingImpl::apply(col.getData(), scale, vec_res); } else { size_t scale = intExp10(-scale_arg); - FunctionRoundingImpl::apply(col->getData(), scale, vec_res); + FunctionRoundingImpl::apply(col.getData(), scale, vec_res); } } @@ -502,14 +502,14 @@ struct Dispatcher public: static ColumnPtr apply(const IColumn * col_general, Scale scale_arg) { - const auto * const col = checkAndGetColumn>(col_general); - const typename ColumnDecimal::Container & vec_src = col->getData(); + const auto & col = checkAndGetColumn>(*col_general); + const typename ColumnDecimal::Container & vec_src = col.getData(); - auto col_res = ColumnDecimal::create(vec_src.size(), col->getScale()); + auto col_res = ColumnDecimal::create(vec_src.size(), col.getScale()); auto & vec_res = col_res->getData(); if (!vec_res.empty()) - DecimalRoundingImpl::apply(col->getData(), col->getScale(), vec_res, scale_arg); + DecimalRoundingImpl::apply(col.getData(), col.getScale(), vec_res, scale_arg); return col_res; } diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index d951e77395e..fcd4c970a47 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -153,8 +153,8 @@ public: auto col_res = ColumnVector::create(); auto & vec_res = col_res->getData(); vec_res.resize(column->size()); - const ColumnString * col_str_vector = checkAndGetColumn(&*column); - Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), shingle_size, vec_res); + const ColumnString & col_str_vector = checkAndGetColumn(*column); + Impl::apply(col_str_vector.getChars(), col_str_vector.getOffsets(), shingle_size, vec_res); return col_res; } else if constexpr (is_arg) // Min hash arg @@ -170,8 +170,8 @@ public: auto min_tuple = ColumnTuple::create(std::move(min_columns)); auto max_tuple = ColumnTuple::create(std::move(max_columns)); - const ColumnString * col_str_vector = checkAndGetColumn(&*column); - Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), shingle_size, num_hashes, nullptr, nullptr, min_tuple.get(), max_tuple.get()); + const ColumnString & col_str_vector = checkAndGetColumn(*column); + Impl::apply(col_str_vector.getChars(), col_str_vector.getOffsets(), shingle_size, num_hashes, nullptr, nullptr, 
min_tuple.get(), max_tuple.get()); MutableColumns tuple_columns; tuple_columns.emplace_back(std::move(min_tuple)); @@ -186,8 +186,8 @@ public: auto & vec_h2 = col_h2->getData(); vec_h1.resize(column->size()); vec_h2.resize(column->size()); - const ColumnString * col_str_vector = checkAndGetColumn(&*column); - Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), shingle_size, num_hashes, &vec_h1, &vec_h2, nullptr, nullptr); + const ColumnString & col_str_vector = checkAndGetColumn(*column); + Impl::apply(col_str_vector.getChars(), col_str_vector.getOffsets(), shingle_size, num_hashes, &vec_h1, &vec_h2, nullptr, nullptr); MutableColumns tuple_columns; tuple_columns.emplace_back(std::move(col_h1)); tuple_columns.emplace_back(std::move(col_h2)); diff --git a/src/Functions/Kusto/KqlArraySort.cpp b/src/Functions/Kusto/KqlArraySort.cpp index 5be36328cc3..22544f4302c 100644 --- a/src/Functions/Kusto/KqlArraySort.cpp +++ b/src/Functions/Kusto/KqlArraySort.cpp @@ -158,12 +158,12 @@ public: auto out_tmp = ColumnArray::create(nested_types[i]->createColumn()); size_t array_size = tuple_coulmn->size(); - const auto * arr = checkAndGetColumn(tuple_coulmn.get()); + const auto & arr = checkAndGetColumn(*tuple_coulmn); for (size_t j = 0; j < array_size; ++j) { Field arr_field; - arr->get(j, arr_field); + arr.get(j, arr_field); out_tmp->insert(arr_field); } diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index d655311f532..3e9c8fba215 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -185,7 +185,7 @@ struct MultiMatchAllIndicesImpl size_t prev_haystack_offset = 0; size_t prev_needles_offset = 0; - const ColumnString * needles_data_string = checkAndGetColumn(&needles_data); + const ColumnString & needles_data_string = checkAndGetColumn(needles_data); std::vector needles; @@ -195,7 +195,7 @@ struct MultiMatchAllIndicesImpl for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) { - needles.emplace_back(needles_data_string->getDataAt(j).toView()); + needles.emplace_back(needles_data_string.getDataAt(j).toView()); } if (needles.empty()) diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 0b9eee2382f..000c334f6c1 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -212,7 +212,7 @@ struct MultiMatchAnyImpl size_t prev_haystack_offset = 0; size_t prev_needles_offset = 0; - const ColumnString * needles_data_string = checkAndGetColumn(&needles_data); + const ColumnString & needles_data_string = checkAndGetColumn(needles_data); std::vector needles; @@ -221,7 +221,7 @@ struct MultiMatchAnyImpl needles.reserve(needles_offsets[i] - prev_needles_offset); for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) - needles.emplace_back(needles_data_string->getDataAt(j).toView()); + needles.emplace_back(needles_data_string.getDataAt(j).toView()); if (needles.empty()) { diff --git a/src/Functions/MultiSearchAllPositionsImpl.h b/src/Functions/MultiSearchAllPositionsImpl.h index 6e1f13d87b6..cfe60e51bcd 100644 --- a/src/Functions/MultiSearchAllPositionsImpl.h +++ b/src/Functions/MultiSearchAllPositionsImpl.h @@ -89,7 +89,7 @@ struct MultiSearchAllPositionsImpl offsets_res.reserve(haystack_offsets.size()); - const ColumnString * needles_data_string = checkAndGetColumn(&needles_data); + const ColumnString & needles_data_string = checkAndGetColumn(needles_data); std::vector needles; @@ -99,7 +99,7 @@ struct 
MultiSearchAllPositionsImpl for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) { - needles.emplace_back(needles_data_string->getDataAt(j).toView()); + needles.emplace_back(needles_data_string.getDataAt(j).toView()); } const size_t needles_size = needles.size(); diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index 73f3c92adfb..36a5fd514d9 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -88,7 +88,7 @@ struct MultiSearchFirstIndexImpl size_t prev_haystack_offset = 0; size_t prev_needles_offset = 0; - const ColumnString * needles_data_string = checkAndGetColumn(&needles_data); + const ColumnString & needles_data_string = checkAndGetColumn(needles_data); std::vector needles; @@ -98,7 +98,7 @@ struct MultiSearchFirstIndexImpl for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) { - needles.emplace_back(needles_data_string->getDataAt(j).toView()); + needles.emplace_back(needles_data_string.getDataAt(j).toView()); } auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 99dd3f9d394..8b137bb67d1 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -97,7 +97,7 @@ struct MultiSearchFirstPositionImpl size_t prev_haystack_offset = 0; size_t prev_needles_offset = 0; - const ColumnString * needles_data_string = checkAndGetColumn(&needles_data); + const ColumnString & needles_data_string = checkAndGetColumn(needles_data); std::vector needles; @@ -112,7 +112,7 @@ struct MultiSearchFirstPositionImpl for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) { - needles.emplace_back(needles_data_string->getDataAt(j).toView()); + needles.emplace_back(needles_data_string.getDataAt(j).toView()); } auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index fb7d56f302a..494eb323639 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -87,7 +87,7 @@ struct MultiSearchImpl size_t prev_haystack_offset = 0; size_t prev_needles_offset = 0; - const ColumnString * needles_data_string = checkAndGetColumn(&needles_data); + const ColumnString & needles_data_string = checkAndGetColumn(needles_data); std::vector needles; @@ -97,7 +97,7 @@ struct MultiSearchImpl for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) { - needles.emplace_back(needles_data_string->getDataAt(j).toView()); + needles.emplace_back(needles_data_string.getDataAt(j).toView()); } const auto * const haystack = &haystack_data[prev_haystack_offset]; diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h index 93691e35741..68582198ea3 100644 --- a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h +++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h @@ -69,7 +69,7 @@ public: const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); FirstSignificantSubdomainCustomLookup tld_lookup(column_tld_list_name->getValue()); - if (const ColumnString * col = checkAndGetColumn(*arguments[0].column)) + if (const ColumnString * col = checkAndGetColumn(&*arguments[0].column)) { auto col_res = ColumnString::create(); vector(tld_lookup, col->getChars(), 
col->getOffsets(), col_res->getChars(), col_res->getOffsets()); diff --git a/src/Functions/array/FunctionArrayMapped.h b/src/Functions/array/FunctionArrayMapped.h index 5d6d70521b0..3c8dc04de46 100644 --- a/src/Functions/array/FunctionArrayMapped.h +++ b/src/Functions/array/FunctionArrayMapped.h @@ -317,7 +317,7 @@ public: ErrorCodes::ILLEGAL_COLUMN, "Expected Array column, found {}", column_array_ptr->getName()); column_array_ptr = recursiveRemoveLowCardinality(column_const_array->convertToFullColumn()); - column_array = checkAndGetColumn(column_array_ptr.get()); + column_array = &checkAndGetColumn(*column_array_ptr); } if (!array_type) diff --git a/src/Functions/array/arrayCompact.cpp b/src/Functions/array/arrayCompact.cpp index 7d09d1078d5..fe870bebd84 100644 --- a/src/Functions/array/arrayCompact.cpp +++ b/src/Functions/array/arrayCompact.cpp @@ -34,7 +34,7 @@ struct ArrayCompactImpl using ColVecType = ColumnVectorOrDecimal; const ColVecType * check_values_column = checkAndGetColumn(mapped.get()); - const ColVecType * src_values_column = checkAndGetColumn(array.getData()); + const ColVecType * src_values_column = checkAndGetColumn(&array.getData()); if (!src_values_column || !check_values_column) return false; diff --git a/src/Functions/array/arrayDistinct.cpp b/src/Functions/array/arrayDistinct.cpp index ea331d6bdad..2344742e4fd 100644 --- a/src/Functions/array/arrayDistinct.cpp +++ b/src/Functions/array/arrayDistinct.cpp @@ -89,20 +89,20 @@ private: ColumnPtr FunctionArrayDistinct::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const { ColumnPtr array_ptr = arguments[0].column; - const ColumnArray * array = checkAndGetColumn(array_ptr.get()); + const ColumnArray & array = checkAndGetColumn(*array_ptr); const auto & return_type = result_type; auto res_ptr = return_type->createColumn(); ColumnArray & res = assert_cast(*res_ptr); - const IColumn & src_data = array->getData(); - const ColumnArray::Offsets & offsets = array->getOffsets(); + const IColumn & src_data = array.getData(); + const ColumnArray::Offsets & offsets = array.getOffsets(); IColumn & res_data = res.getData(); ColumnArray::Offsets & res_offsets = res.getOffsets(); - const ColumnNullable * nullable_col = checkAndGetColumn(src_data); + const ColumnNullable * nullable_col = checkAndGetColumn(&src_data); const IColumn * inner_col; diff --git a/src/Functions/array/arrayElement.cpp b/src/Functions/array/arrayElement.cpp index 8669fd1f3a7..227b29d5d9f 100644 --- a/src/Functions/array/arrayElement.cpp +++ b/src/Functions/array/arrayElement.cpp @@ -1538,9 +1538,9 @@ ColumnPtr FunctionArrayElement::executeMap2(const ColumnsWithTypeAndName & argum return nullptr; const ColumnArray * col_map_nested = &col_map->getNestedColumn(); - const ColumnTuple * col_map_kv = checkAndGetColumn(col_map_nested->getDataPtr().get()); - ColumnPtr col_map_keys = col_map_kv->getColumnPtr(0); - ColumnPtr col_map_values = col_map_kv->getColumnPtr(1); + const ColumnTuple & col_map_kv = checkAndGetColumn(*col_map_nested->getDataPtr()); + ColumnPtr col_map_keys = col_map_kv.getColumnPtr(0); + ColumnPtr col_map_values = col_map_kv.getColumnPtr(1); const DataTypeMap & map_type = typeid_cast(*typeid_cast(*arguments[0].type).getNestedType()); diff --git a/src/Functions/array/arrayEnumerateExtended.h b/src/Functions/array/arrayEnumerateExtended.h index cf38afcfa5a..62850a1cbf0 100644 --- a/src/Functions/array/arrayEnumerateExtended.h +++ b/src/Functions/array/arrayEnumerateExtended.h @@ -165,7 
+165,7 @@ ColumnPtr FunctionArrayEnumerateExtended::executeImpl(const ColumnsWith for (size_t i = 0; i < num_arguments; ++i) { - if (const auto * nullable_col = checkAndGetColumn(*data_columns[i])) + if (const auto * nullable_col = checkAndGetColumn(data_columns[i])) { if (num_arguments == 1) data_columns[i] = &nullable_col->getNestedColumn(); diff --git a/src/Functions/array/arrayIndex.h b/src/Functions/array/arrayIndex.h index cd537763b4a..c0b1c773387 100644 --- a/src/Functions/array/arrayIndex.h +++ b/src/Functions/array/arrayIndex.h @@ -506,10 +506,10 @@ private: const ColumnNullable * nullable = nullptr; if (col_array) - nullable = checkAndGetColumn(col_array->getData()); + nullable = checkAndGetColumn(&col_array->getData()); const auto & arg_column = arguments[1].column; - const ColumnNullable * arg_nullable = checkAndGetColumn(*arg_column); + const ColumnNullable * arg_nullable = checkAndGetColumn(&*arg_column); if (!nullable && !arg_nullable) { @@ -738,7 +738,7 @@ private: const auto [null_map_data, null_map_item] = getNullMaps(arguments); - if (const ColumnConst * col_arg_const = checkAndGetColumn(*arguments[1].column)) + if (const ColumnConst * col_arg_const = checkAndGetColumn(&*arguments[1].column)) { const IColumnUnique & col_lc_dict = col_lc->getDictionary(); @@ -754,7 +754,7 @@ private: if (!col_arg_cloned->isNullAt(0)) { if (col_arg_cloned->isNullable()) - col_arg_cloned = checkAndGetColumn(*col_arg_cloned)->getNestedColumnPtr(); + col_arg_cloned = checkAndGetColumn(*col_arg_cloned).getNestedColumnPtr(); StringRef elem = col_arg_cloned->getDataAt(0); @@ -786,7 +786,7 @@ private: else if (col_lc->nestedIsNullable()) // LowCardinality(Nullable(T)) and U { const ColumnPtr left_casted = col_lc->convertToFullColumnIfLowCardinality(); // Nullable(T) - const ColumnNullable& left_nullable = *checkAndGetColumn(left_casted.get()); + const ColumnNullable & left_nullable = checkAndGetColumn(*left_casted); const NullMap * const null_map_left_casted = &left_nullable.getNullMapColumn().getData(); diff --git a/src/Functions/array/arrayJaccardIndex.cpp b/src/Functions/array/arrayJaccardIndex.cpp index 9cb74a7aa62..87f3390ac73 100644 --- a/src/Functions/array/arrayJaccardIndex.cpp +++ b/src/Functions/array/arrayJaccardIndex.cpp @@ -97,8 +97,8 @@ public: { if (const ColumnConst * col_const = typeid_cast(col.column.get())) { - const ColumnArray * col_const_array = checkAndGetColumn(col_const->getDataColumnPtr().get()); - return {col_const_array, true}; + const ColumnArray & col_const_array = checkAndGetColumn(*col_const->getDataColumnPtr()); + return {&col_const_array, true}; } else if (const ColumnArray * col_non_const_array = checkAndGetColumn(col.column.get())) return {col_non_const_array, false}; @@ -128,8 +128,8 @@ public: vectorWithEmptyIntersect(left_array->getOffsets(), right_array->getOffsets(), vec_res); \ else \ { \ - const ColumnArray * intersect_column_array = checkAndGetColumn(intersect_column.column.get()); \ - vector(intersect_column_array->getOffsets(), left_array->getOffsets(), right_array->getOffsets(), vec_res); \ + const ColumnArray & intersect_column_array = checkAndGetColumn(*intersect_column.column); \ + vector(intersect_column_array.getOffsets(), left_array->getOffsets(), right_array->getOffsets(), vec_res); \ } if (!left_is_const && !right_is_const) diff --git a/src/Functions/array/arrayUniq.cpp b/src/Functions/array/arrayUniq.cpp index 81ba5b62094..f92daf384d2 100644 --- a/src/Functions/array/arrayUniq.cpp +++ b/src/Functions/array/arrayUniq.cpp @@ -162,7 +162,7 @@ 
ColumnPtr FunctionArrayUniq::executeImpl(const ColumnsWithTypeAndName & argument for (size_t i = 0; i < num_arguments; ++i) { - if (const auto * nullable_col = checkAndGetColumn(*data_columns[i])) + if (const auto * nullable_col = checkAndGetColumn(data_columns[i])) { if (num_arguments == 1) data_columns[i] = &nullable_col->getNestedColumn(); diff --git a/src/Functions/array/emptyArrayToSingle.cpp b/src/Functions/array/emptyArrayToSingle.cpp index 86d4c32265a..2071abf9911 100644 --- a/src/Functions/array/emptyArrayToSingle.cpp +++ b/src/Functions/array/emptyArrayToSingle.cpp @@ -391,7 +391,7 @@ ColumnPtr FunctionEmptyArrayToSingle::executeImpl(const ColumnsWithTypeAndName & const IColumn * inner_col; IColumn * inner_res_col; - const auto * nullable_col = checkAndGetColumn(src_data); + const auto * nullable_col = checkAndGetColumn(&src_data); if (nullable_col) { inner_col = &nullable_col->getNestedColumn(); diff --git a/src/Functions/array/range.cpp b/src/Functions/array/range.cpp index 57679ccb180..f939ea7b462 100644 --- a/src/Functions/array/range.cpp +++ b/src/Functions/array/range.cpp @@ -404,7 +404,7 @@ private: { if (!col.type->isNullable()) return; - const ColumnNullable * nullable_col = checkAndGetColumn(*col.column); + const ColumnNullable * nullable_col = checkAndGetColumn(col.column.get()); if (!nullable_col) nullable_col = checkAndGetColumnConstData(col.column.get()); if (!nullable_col) @@ -421,8 +421,8 @@ private: const auto * col = arguments[0].column.get(); if (arguments[0].type->isNullable()) { - const auto * nullable = checkAndGetColumn(*arguments[0].column); - col = nullable->getNestedColumnPtr().get(); + const auto & nullable = checkAndGetColumn(*arguments[0].column); + col = nullable.getNestedColumnPtr().get(); } if (!((res = executeInternal(col)) || (res = executeInternal(col)) || (res = executeInternal(col)) diff --git a/src/Functions/arrayStringConcat.cpp b/src/Functions/arrayStringConcat.cpp index c186c0ca7e6..b787feeeca1 100644 --- a/src/Functions/arrayStringConcat.cpp +++ b/src/Functions/arrayStringConcat.cpp @@ -183,7 +183,7 @@ public: const ColumnString & col_string = assert_cast(*str_subcolumn.get()); auto col_res = ColumnString::create(); - if (const ColumnNullable * col_nullable = checkAndGetColumn(col_arr.getData())) + if (const ColumnNullable * col_nullable = checkAndGetColumn(&col_arr.getData())) executeInternal(col_string, col_arr, delimiter, *col_res, col_nullable->getNullMapData().data()); else executeInternal(col_string, col_arr, delimiter, *col_res); diff --git a/src/Functions/assumeNotNull.cpp b/src/Functions/assumeNotNull.cpp index 4dd88163ecb..be9d38f5af3 100644 --- a/src/Functions/assumeNotNull.cpp +++ b/src/Functions/assumeNotNull.cpp @@ -54,7 +54,7 @@ public: if (arguments[0].type->onlyNull() && !col->empty()) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot create non-empty column with type Nothing"); - if (const auto * nullable_col = checkAndGetColumn(*col)) + if (const auto * nullable_col = checkAndGetColumn(&*col)) return nullable_col->getNestedColumnPtr(); else return col; diff --git a/src/Functions/coalesce.cpp b/src/Functions/coalesce.cpp index 4ae90a9db13..722f32af523 100644 --- a/src/Functions/coalesce.cpp +++ b/src/Functions/coalesce.cpp @@ -157,12 +157,12 @@ public: /// if last argument is not nullable, result should be also not nullable if (!multi_if_args.back().column->isNullable() && res->isNullable()) { - if (const auto * column_lc = checkAndGetColumn(*res)) - res = 
checkAndGetColumn(*column_lc->convertToFullColumn())->getNestedColumnPtr(); - else if (const auto * column_const = checkAndGetColumn(*res)) - res = checkAndGetColumn(column_const->getDataColumn())->getNestedColumnPtr(); + if (const auto * column_lc = checkAndGetColumn(&*res)) + res = checkAndGetColumn(*column_lc->convertToFullColumn()).getNestedColumnPtr(); + else if (const auto * column_const = checkAndGetColumn(&*res)) + res = checkAndGetColumn(column_const->getDataColumn()).getNestedColumnPtr(); else - res = checkAndGetColumn(*res)->getNestedColumnPtr(); + res = checkAndGetColumn(&*res)->getNestedColumnPtr(); } return res; diff --git a/src/Functions/fromModifiedJulianDay.cpp b/src/Functions/fromModifiedJulianDay.cpp index 695d1b7d63c..a1a3102c70e 100644 --- a/src/Functions/fromModifiedJulianDay.cpp +++ b/src/Functions/fromModifiedJulianDay.cpp @@ -34,8 +34,8 @@ namespace DB ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { using ColVecType = typename FromDataType::ColumnType; - const ColVecType * col_from = checkAndGetColumn(arguments[0].column.get()); - const typename ColVecType::Container & vec_from = col_from->getData(); + const ColVecType & col_from = checkAndGetColumn(*arguments[0].column); + const typename ColVecType::Container & vec_from = col_from.getData(); auto col_to = ColumnString::create(); ColumnString::Chars & data_to = col_to->getChars(); diff --git a/src/Functions/grouping.h b/src/Functions/grouping.h index 830c509f1f5..85ba1967909 100644 --- a/src/Functions/grouping.h +++ b/src/Functions/grouping.h @@ -55,7 +55,7 @@ public: template ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, size_t input_rows_count, AggregationKeyChecker checker) const { - const auto * grouping_set_column = checkAndGetColumn(arguments[0].column.get()); + const auto & grouping_set_column = checkAndGetColumn(*arguments[0].column); auto result = ColumnUInt64::create(); auto & result_data = result->getData(); @@ -64,7 +64,7 @@ public: const auto * result_table = likely(force_compatibility) ? 
COMPATIBLE_MODE : INCOMPATIBLE_MODE; for (size_t i = 0; i < input_rows_count; ++i) { - UInt64 set_index = grouping_set_column->getElement(i); + UInt64 set_index = grouping_set_column.getElement(i); UInt64 value = 0; for (auto index : arguments_indexes) diff --git a/src/Functions/hasColumnInTable.cpp b/src/Functions/hasColumnInTable.cpp index 48783a672e2..8ea16f688ee 100644 --- a/src/Functions/hasColumnInTable.cpp +++ b/src/Functions/hasColumnInTable.cpp @@ -88,8 +88,8 @@ ColumnPtr FunctionHasColumnInTable::executeImpl(const ColumnsWithTypeAndName & a { auto get_string_from_columns = [&](const ColumnWithTypeAndName & column) -> String { - const ColumnConst * const_column = checkAndGetColumnConst(column.column.get()); - return const_column->getValue(); + const ColumnConst & const_column = checkAndGetColumnConst(*column.column); + return const_column.getValue(); }; size_t arg = 0; diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 4f75042ad8d..9766f34edca 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -948,12 +948,12 @@ private: bool cond_is_const = false; bool cond_is_true = false; bool cond_is_false = false; - if (const auto * const_arg = checkAndGetColumn(*arg_cond.column)) + if (const auto * const_arg = checkAndGetColumn(&*arg_cond.column)) { cond_is_const = true; not_const_condition = const_arg->getDataColumnPtr(); ColumnPtr data_column = const_arg->getDataColumnPtr(); - if (const auto * const_nullable_arg = checkAndGetColumn(*data_column)) + if (const auto * const_nullable_arg = checkAndGetColumn(&*data_column)) { data_column = const_nullable_arg->getNestedColumnPtr(); if (!data_column->empty()) @@ -962,7 +962,7 @@ private: if (!data_column->empty()) { - cond_is_true = !cond_is_null && checkAndGetColumn(*data_column)->getBool(0); + cond_is_true = !cond_is_null && checkAndGetColumn(*data_column).getBool(0); cond_is_false = !cond_is_null && !cond_is_true; } } @@ -975,12 +975,12 @@ private: else if (cond_is_false || cond_is_null) return castColumn(column2, result_type); - if (const auto * nullable = checkAndGetColumn(*not_const_condition)) + if (const auto * nullable = checkAndGetColumn(&*not_const_condition)) { ColumnPtr new_cond_column = nullable->getNestedColumnPtr(); size_t column_size = arg_cond.column->size(); - if (checkAndGetColumn(*new_cond_column)) + if (checkAndGetColumn(&*new_cond_column)) { auto nested_column_copy = new_cond_column->cloneResized(new_cond_column->size()); typeid_cast(nested_column_copy.get())->applyZeroMap(nullable->getNullMapData()); @@ -1027,12 +1027,12 @@ private: /// Const(size = 0, Int32(size = 1)) static ColumnPtr recursiveGetNestedColumnWithoutNullable(const ColumnPtr & column) { - if (const auto * nullable = checkAndGetColumn(*column)) + if (const auto * nullable = checkAndGetColumn(&*column)) { /// Nullable cannot contain Nullable return nullable->getNestedColumnPtr(); } - else if (const auto * column_const = checkAndGetColumn(*column)) + else if (const auto * column_const = checkAndGetColumn(&*column)) { /// Save Constant, but remove Nullable return ColumnConst::create(recursiveGetNestedColumnWithoutNullable(column_const->getDataColumnPtr()), column->size()); @@ -1051,8 +1051,8 @@ private: const ColumnWithTypeAndName & arg_then = arguments[1]; const ColumnWithTypeAndName & arg_else = arguments[2]; - const auto * then_is_nullable = checkAndGetColumn(*arg_then.column); - const auto * else_is_nullable = checkAndGetColumn(*arg_else.column); + const auto * then_is_nullable = checkAndGetColumn(&*arg_then.column); + const auto * 
else_is_nullable = checkAndGetColumn(&*arg_else.column); if (!then_is_nullable && !else_is_nullable) return nullptr; diff --git a/src/Functions/isNotNull.cpp b/src/Functions/isNotNull.cpp index dd5182aeade..dd53c700221 100644 --- a/src/Functions/isNotNull.cpp +++ b/src/Functions/isNotNull.cpp @@ -46,7 +46,7 @@ public: if (isVariant(elem.type)) { - const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + const auto & discriminators = checkAndGetColumn(*elem.column).getLocalDiscriminators(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); data.resize(discriminators.size()); @@ -57,17 +57,17 @@ public: if (elem.type->isLowCardinalityNullable()) { - const auto * low_cardinality_column = checkAndGetColumn(*elem.column); - const size_t null_index = low_cardinality_column->getDictionary().getNullValueIndex(); + const auto & low_cardinality_column = checkAndGetColumn(*elem.column); + const size_t null_index = low_cardinality_column.getDictionary().getNullValueIndex(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); - data.resize(low_cardinality_column->size()); - for (size_t i = 0; i != low_cardinality_column->size(); ++i) - data[i] = (low_cardinality_column->getIndexAt(i) != null_index); + data.resize(low_cardinality_column.size()); + for (size_t i = 0; i != low_cardinality_column.size(); ++i) + data[i] = (low_cardinality_column.getIndexAt(i) != null_index); return res; } - if (const auto * nullable = checkAndGetColumn(*elem.column)) + if (const auto * nullable = checkAndGetColumn(&*elem.column)) { /// Return the negated null map. auto res_column = ColumnUInt8::create(input_rows_count); diff --git a/src/Functions/isNull.cpp b/src/Functions/isNull.cpp index 4bf4e44f866..7a6dabab7af 100644 --- a/src/Functions/isNull.cpp +++ b/src/Functions/isNull.cpp @@ -48,7 +48,7 @@ public: if (isVariant(elem.type)) { - const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + const auto & discriminators = checkAndGetColumn(*elem.column).getLocalDiscriminators(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); data.reserve(discriminators.size()); @@ -59,17 +59,17 @@ public: if (elem.type->isLowCardinalityNullable()) { - const auto * low_cardinality_column = checkAndGetColumn(*elem.column); - size_t null_index = low_cardinality_column->getDictionary().getNullValueIndex(); + const auto & low_cardinality_column = checkAndGetColumn(*elem.column); + size_t null_index = low_cardinality_column.getDictionary().getNullValueIndex(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); - data.reserve(low_cardinality_column->size()); - for (size_t i = 0; i != low_cardinality_column->size(); ++i) - data.push_back(low_cardinality_column->getIndexAt(i) == null_index); + data.reserve(low_cardinality_column.size()); + for (size_t i = 0; i != low_cardinality_column.size(); ++i) + data.push_back(low_cardinality_column.getIndexAt(i) == null_index); return res; } - if (const auto * nullable = checkAndGetColumn(*elem.column)) + if (const auto * nullable = checkAndGetColumn(&*elem.column)) { /// Merely return the embedded null map. 
return nullable->getNullMapColumnPtr(); diff --git a/src/Functions/minSampleSize.cpp b/src/Functions/minSampleSize.cpp index a5826ef5c0e..f37b030c85a 100644 --- a/src/Functions/minSampleSize.cpp +++ b/src/Functions/minSampleSize.cpp @@ -102,14 +102,14 @@ struct ContinuousImpl auto baseline_argument = arguments[0]; baseline_argument.column = baseline_argument.column->convertToFullColumnIfConst(); auto baseline_column_untyped = castColumnAccurate(baseline_argument, float_64_type); - const auto * baseline_column = checkAndGetColumn>(*baseline_column_untyped); - const auto & baseline_column_data = baseline_column->getData(); + const auto & baseline_column = checkAndGetColumn>(*baseline_column_untyped); + const auto & baseline_column_data = baseline_column.getData(); auto sigma_argument = arguments[1]; sigma_argument.column = sigma_argument.column->convertToFullColumnIfConst(); auto sigma_column_untyped = castColumnAccurate(sigma_argument, float_64_type); - const auto * sigma_column = checkAndGetColumn>(*sigma_column_untyped); - const auto & sigma_column_data = sigma_column->getData(); + const auto & sigma_column = checkAndGetColumn>(*sigma_column_untyped); + const auto & sigma_column_data = sigma_column.getData(); const IColumn & col_mde = *arguments[2].column; const IColumn & col_power = *arguments[3].column; diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index 49c45d0c0be..8ea2a91f2de 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -198,7 +198,7 @@ public: if (cond_col->onlyNull()) continue; - if (const auto * column_const = checkAndGetColumn(*cond_col)) + if (const auto * column_const = checkAndGetColumn(&*cond_col)) { Field value = column_const->getField(); diff --git a/src/Functions/readWkt.cpp b/src/Functions/readWkt.cpp index 8dff297bcb1..ddc847b1ca5 100644 --- a/src/Functions/readWkt.cpp +++ b/src/Functions/readWkt.cpp @@ -51,14 +51,14 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override { - const auto * column_string = checkAndGetColumn(arguments[0].column.get()); + const auto & column_string = checkAndGetColumn(*arguments[0].column); Serializer serializer; Geometry geometry; for (size_t i = 0; i < input_rows_count; ++i) { - const auto & str = column_string->getDataAt(i).toString(); + const auto & str = column_string.getDataAt(i).toString(); boost::geometry::read_wkt(str, geometry); serializer.add(geometry); } diff --git a/src/Functions/repeat.cpp b/src/Functions/repeat.cpp index 6f2078b7e48..84597f4eadc 100644 --- a/src/Functions/repeat.cpp +++ b/src/Functions/repeat.cpp @@ -238,9 +238,9 @@ public: { using DataType = std::decay_t; using T = typename DataType::FieldType; - const ColumnVector * column = checkAndGetColumn>(col_num.get()); + const ColumnVector & column = checkAndGetColumn>(*col_num); auto col_res = ColumnString::create(); - RepeatImpl::vectorStrVectorRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), column->getData()); + RepeatImpl::vectorStrVectorRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), column.getData()); res = std::move(col_res); return true; })) @@ -258,9 +258,9 @@ public: { using DataType = std::decay_t; using T = typename DataType::FieldType; - const ColumnVector * column = checkAndGetColumn>(col_num.get()); + const ColumnVector & column = checkAndGetColumn>(*col_num); auto col_res = ColumnString::create(); - 
RepeatImpl::constStrVectorRepeat(copy_str, col_res->getChars(), col_res->getOffsets(), column->getData()); + RepeatImpl::constStrVectorRepeat(copy_str, col_res->getChars(), col_res->getOffsets(), column.getData()); res = std::move(col_res); return true; })) diff --git a/src/Functions/seriesOutliersDetectTukey.cpp b/src/Functions/seriesOutliersDetectTukey.cpp index da04d3b78d3..81fc904e16e 100644 --- a/src/Functions/seriesOutliersDetectTukey.cpp +++ b/src/Functions/seriesOutliersDetectTukey.cpp @@ -61,10 +61,10 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { ColumnPtr col = arguments[0].column; - const ColumnArray * col_arr = checkAndGetColumn(col.get()); + const ColumnArray & col_arr = checkAndGetColumn(*col); - const IColumn & arr_data = col_arr->getData(); - const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets(); + const IColumn & arr_data = col_arr.getData(); + const ColumnArray::Offsets & arr_offsets = col_arr.getOffsets(); ColumnPtr col_res; if (input_rows_count == 0) diff --git a/src/Functions/seriesPeriodDetectFFT.cpp b/src/Functions/seriesPeriodDetectFFT.cpp index fbaa2b14e64..e85b3a97c67 100644 --- a/src/Functions/seriesPeriodDetectFFT.cpp +++ b/src/Functions/seriesPeriodDetectFFT.cpp @@ -61,10 +61,10 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { ColumnPtr array_ptr = arguments[0].column; - const ColumnArray * array = checkAndGetColumn(array_ptr.get()); + const ColumnArray & array = checkAndGetColumn(*array_ptr); - const IColumn & src_data = array->getData(); - const ColumnArray::Offsets & offsets = array->getOffsets(); + const IColumn & src_data = array.getData(); + const ColumnArray::Offsets & offsets = array.getOffsets(); auto res = ColumnFloat64::create(input_rows_count); auto & res_data = res->getData(); diff --git a/src/Functions/space.cpp b/src/Functions/space.cpp index 03dc0d06719..4cfa629aa33 100644 --- a/src/Functions/space.cpp +++ b/src/Functions/space.cpp @@ -57,14 +57,14 @@ public: template bool executeConstant(ColumnPtr col_times, ColumnString::Offsets & res_offsets, ColumnString::Chars & res_chars) const { - const ColumnConst * col_times_const = checkAndGetColumn(col_times.get()); + const ColumnConst & col_times_const = checkAndGetColumn(*col_times); - const ColumnPtr & col_times_const_internal = col_times_const->getDataColumnPtr(); + const ColumnPtr & col_times_const_internal = col_times_const.getDataColumnPtr(); if (!checkAndGetColumn(col_times_const_internal.get())) return false; using T = typename DataType::FieldType; - T times = col_times_const->getValue(); + T times = col_times_const.getValue(); if (times < 1) times = 0; diff --git a/src/Functions/toStartOfInterval.cpp b/src/Functions/toStartOfInterval.cpp index 7f25a317466..54aa1205a35 100644 --- a/src/Functions/toStartOfInterval.cpp +++ b/src/Functions/toStartOfInterval.cpp @@ -164,7 +164,7 @@ private: if (isDateTime64(time_column_type)) { - const auto * time_column_vec = checkAndGetColumn(time_column_col); + const auto * time_column_vec = checkAndGetColumn(&time_column_col); auto scale = assert_cast(time_column_type).getScale(); if (time_column_vec) @@ -172,13 +172,13 @@ private: } else if (isDateTime(time_column_type)) { - const auto * time_column_vec = checkAndGetColumn(time_column_col); + const auto * time_column_vec = checkAndGetColumn(&time_column_col); if (time_column_vec) return 
dispatchForIntervalColumn(assert_cast(time_column_type), *time_column_vec, interval_column, result_type, time_zone); } else if (isDate(time_column_type)) { - const auto * time_column_vec = checkAndGetColumn(time_column_col); + const auto * time_column_vec = checkAndGetColumn(&time_column_col); if (time_column_vec) return dispatchForIntervalColumn(assert_cast(time_column_type), *time_column_vec, interval_column, result_type, time_zone); } diff --git a/src/Functions/ztest.cpp b/src/Functions/ztest.cpp index 55e1b59a897..c2579263674 100644 --- a/src/Functions/ztest.cpp +++ b/src/Functions/ztest.cpp @@ -98,23 +98,23 @@ public: static const auto uint64_data_type = std::make_shared>(); auto column_successes_x = castColumnAccurate(arguments[0], uint64_data_type); - const auto & data_successes_x = checkAndGetColumn>(column_successes_x.get())->getData(); + const auto & data_successes_x = checkAndGetColumn>(*column_successes_x).getData(); auto column_successes_y = castColumnAccurate(arguments[1], uint64_data_type); - const auto & data_successes_y = checkAndGetColumn>(column_successes_y.get())->getData(); + const auto & data_successes_y = checkAndGetColumn>(*column_successes_y).getData(); auto column_trials_x = castColumnAccurate(arguments[2], uint64_data_type); - const auto & data_trials_x = checkAndGetColumn>(column_trials_x.get())->getData(); + const auto & data_trials_x = checkAndGetColumn>(*column_trials_x).getData(); auto column_trials_y = castColumnAccurate(arguments[3], uint64_data_type); - const auto & data_trials_y = checkAndGetColumn>(column_trials_y.get())->getData(); + const auto & data_trials_y = checkAndGetColumn>(*column_trials_y).getData(); static const auto float64_data_type = std::make_shared>(); auto column_confidence_level = castColumnAccurate(arguments[4], float64_data_type); - const auto & data_confidence_level = checkAndGetColumn>(column_confidence_level.get())->getData(); + const auto & data_confidence_level = checkAndGetColumn>(*column_confidence_level).getData(); - String usevar = checkAndGetColumnConst(arguments[5].column.get())->getValue(); + String usevar = checkAndGetColumnConst(*arguments[5].column).getValue(); if (usevar != UNPOOLED && usevar != POOLED) throw Exception{ErrorCodes::BAD_ARGUMENTS, diff --git a/src/Interpreters/BloomFilterHash.h b/src/Interpreters/BloomFilterHash.h index 45098ecff99..8248e9e4469 100644 --- a/src/Interpreters/BloomFilterHash.h +++ b/src/Interpreters/BloomFilterHash.h @@ -108,7 +108,7 @@ struct BloomFilterHash { const auto * array_col = typeid_cast(column.get()); - if (checkAndGetColumn(array_col->getData())) + if (checkAndGetColumn(&array_col->getData())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type {} of bloom filter index.", data_type->getName()); const auto & offsets = array_col->getOffsets(); diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 9b05edbce36..0db998c14fc 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -216,7 +216,7 @@ static void correctNullabilityInplace(ColumnWithTypeAndName & column, bool nulla { /// We have to replace values masked by NULLs with defaults. 
if (column.column) - if (const auto * nullable_column = checkAndGetColumn(*column.column)) + if (const auto * nullable_column = checkAndGetColumn(&*column.column)) column.column = JoinCommon::filterWithBlanks(column.column, nullable_column->getNullMapColumn().getData(), true); JoinCommon::removeColumnNullability(column); diff --git a/src/Interpreters/InterpreterCheckQuery.cpp b/src/Interpreters/InterpreterCheckQuery.cpp index ae8cef3f102..4a84a7bf570 100644 --- a/src/Interpreters/InterpreterCheckQuery.cpp +++ b/src/Interpreters/InterpreterCheckQuery.cpp @@ -334,10 +334,10 @@ public: if ((columns.size() != 3 && columns.size() != 5) || column_position_to_check >= columns.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong number of columns: {}, position {}", columns.size(), column_position_to_check); - const auto * col = checkAndGetColumn(columns[column_position_to_check].get()); - for (size_t i = 0; i < col->size(); ++i) + const auto & col = checkAndGetColumn(*columns[column_position_to_check]); + for (size_t i = 0; i < col.size(); ++i) { - if (col->getElement(i) == 0) + if (col.getElement(i) == 0) { result_value = 0; return; diff --git a/src/Interpreters/JoinUtils.cpp b/src/Interpreters/JoinUtils.cpp index 0aee96ee9c4..1788c9aca48 100644 --- a/src/Interpreters/JoinUtils.cpp +++ b/src/Interpreters/JoinUtils.cpp @@ -162,7 +162,7 @@ static ColumnPtr tryConvertColumnToNullable(ColumnPtr col) return col_lc.cloneNullable(); } } - else if (const ColumnConst * col_const = checkAndGetColumn(*col)) + else if (const ColumnConst * col_const = checkAndGetColumn(&*col)) { const auto & nested = col_const->getDataColumnPtr(); if (nested->isNullable() || nested->canBeInsideNullable()) @@ -232,7 +232,7 @@ void removeColumnNullability(ColumnWithTypeAndName & column) if (column.column && column.column->isNullable()) { column.column = column.column->convertToFullColumnIfConst(); - const auto * nullable_col = checkAndGetColumn(*column.column); + const auto * nullable_col = checkAndGetColumn(column.column.get()); if (!nullable_col) { throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' is expected to be nullable", column.dumpStructure()); @@ -258,11 +258,11 @@ void changeColumnRepresentation(const ColumnPtr & src_column, ColumnPtr & dst_co if (nullable_src && !nullable_dst) { - const auto * nullable = checkAndGetColumn(*src_column); + const auto & nullable = checkAndGetColumn(*src_column); if (change_lowcard) - dst_column = changeLowCardinality(nullable->getNestedColumnPtr(), dst_column); + dst_column = changeLowCardinality(nullable.getNestedColumnPtr(), dst_column); else - dst_column = nullable->getNestedColumnPtr(); + dst_column = nullable.getNestedColumnPtr(); } else if (!nullable_src && nullable_dst) { @@ -275,7 +275,7 @@ void changeColumnRepresentation(const ColumnPtr & src_column, ColumnPtr & dst_co { if (change_lowcard) { - if (const auto * nullable = checkAndGetColumn(*src_column)) + if (const auto * nullable = checkAndGetColumn(&*src_column)) { dst_column = makeNullable(changeLowCardinality(nullable->getNestedColumnPtr(), dst_not_null)); assert_cast(*dst_column->assumeMutable()).applyNullMap(nullable->getNullMapColumn()); @@ -291,7 +291,7 @@ void changeColumnRepresentation(const ColumnPtr & src_column, ColumnPtr & dst_co ColumnPtr emptyNotNullableClone(const ColumnPtr & column) { if (column->isNullable()) - return checkAndGetColumn(*column)->getNestedColumnPtr()->cloneEmpty(); + return checkAndGetColumn(*column).getNestedColumnPtr()->cloneEmpty(); return column->cloneEmpty(); } @@ -374,10 
+374,10 @@ ColumnRawPtrs extractKeysForJoin(const Block & block_keys, const Names & key_nam key_columns[i] = block_keys.getByName(column_name).column.get(); /// We will join only keys, where all components are not NULL. - if (const auto * nullable = checkAndGetColumn(*key_columns[i])) + if (const auto * nullable = checkAndGetColumn(&*key_columns[i])) key_columns[i] = &nullable->getNestedColumn(); - if (const auto * sparse = checkAndGetColumn(*key_columns[i])) + if (const auto * sparse = checkAndGetColumn(&*key_columns[i])) key_columns[i] = &sparse->getValuesColumn(); } @@ -490,7 +490,7 @@ JoinMask getColumnAsMask(const Block & block, const String & column_name) if (isNothing(col_type)) return JoinMask(false, block.rows()); - if (const auto * const_cond = checkAndGetColumn(*src_col.column)) + if (const auto * const_cond = checkAndGetColumn(&*src_col.column)) { return JoinMask(const_cond->getBool(0), block.rows()); } diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index 5bd49b3c971..c6df06cfac6 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -80,8 +80,8 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, if constexpr (has_left_nulls && has_right_nulls) { - const auto * left_nullable = checkAndGetColumn(left_column); - const auto * right_nullable = checkAndGetColumn(right_column); + const auto * left_nullable = checkAndGetColumn(&left_column); + const auto * right_nullable = checkAndGetColumn(&right_column); if (left_nullable && right_nullable) { @@ -99,7 +99,7 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, if constexpr (has_left_nulls) { - if (const auto * left_nullable = checkAndGetColumn(left_column)) + if (const auto * left_nullable = checkAndGetColumn(&left_column)) { if (left_column.isNullAt(lhs_pos)) return null_direction_hint; @@ -109,7 +109,7 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, if constexpr (has_right_nulls) { - if (const auto * right_nullable = checkAndGetColumn(right_column)) + if (const auto * right_nullable = checkAndGetColumn(&right_column)) { if (right_column.isNullAt(rhs_pos)) return -null_direction_hint; diff --git a/src/Interpreters/NullableUtils.cpp b/src/Interpreters/NullableUtils.cpp index ce681b1d569..fa0ddae8c90 100644 --- a/src/Interpreters/NullableUtils.cpp +++ b/src/Interpreters/NullableUtils.cpp @@ -12,7 +12,7 @@ ColumnPtr extractNestedColumnsAndNullMap(ColumnRawPtrs & key_columns, ConstNullM if (key_columns.size() == 1) { auto & column = key_columns[0]; - if (const auto * column_nullable = checkAndGetColumn(*column)) + if (const auto * column_nullable = checkAndGetColumn(&*column)) { null_map_holder = column_nullable->getNullMapColumnPtr(); null_map = &column_nullable->getNullMapData(); @@ -23,7 +23,7 @@ ColumnPtr extractNestedColumnsAndNullMap(ColumnRawPtrs & key_columns, ConstNullM { for (auto & column : key_columns) { - if (const auto * column_nullable = checkAndGetColumn(*column)) + if (const auto * column_nullable = checkAndGetColumn(&*column)) { column = &column_nullable->getNestedColumn(); diff --git a/src/Interpreters/SetVariants.cpp b/src/Interpreters/SetVariants.cpp index 0fb2e5189d4..64796a013f1 100644 --- a/src/Interpreters/SetVariants.cpp +++ b/src/Interpreters/SetVariants.cpp @@ -74,7 +74,7 @@ typename SetVariantsTemplate::Type SetVariantsTemplate::choose for (const auto & col : key_columns) { - if (const auto * nullable = checkAndGetColumn(*col)) + if (const auto * 
nullable = checkAndGetColumn(&*col)) { nested_key_columns.push_back(&nullable->getNestedColumn()); has_nullable_key = true; diff --git a/src/Interpreters/SetVariants.h b/src/Interpreters/SetVariants.h index ff527102080..f6eac517349 100644 --- a/src/Interpreters/SetVariants.h +++ b/src/Interpreters/SetVariants.h @@ -80,7 +80,7 @@ protected: for (const auto & col : key_columns) { - if (const auto * nullable = checkAndGetColumn(*col)) + if (const auto * nullable = checkAndGetColumn(&*col)) { actual_columns.push_back(&nullable->getNestedColumn()); null_maps.push_back(&nullable->getNullMapColumn()); diff --git a/src/Processors/Transforms/CheckConstraintsTransform.cpp b/src/Processors/Transforms/CheckConstraintsTransform.cpp index 3a6595ea4fb..e43aa6028da 100644 --- a/src/Processors/Transforms/CheckConstraintsTransform.cpp +++ b/src/Processors/Transforms/CheckConstraintsTransform.cpp @@ -57,7 +57,7 @@ void CheckConstraintsTransform::onConsume(Chunk chunk) auto result_column = res_column.column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality(); - if (const auto * column_nullable = checkAndGetColumn(*result_column)) + if (const auto * column_nullable = checkAndGetColumn(&*result_column)) { const auto & nested_column = column_nullable->getNestedColumnPtr(); diff --git a/src/Processors/Transforms/MergeJoinTransform.cpp b/src/Processors/Transforms/MergeJoinTransform.cpp index 584125b046f..92f4110e9ed 100644 --- a/src/Processors/Transforms/MergeJoinTransform.cpp +++ b/src/Processors/Transforms/MergeJoinTransform.cpp @@ -48,8 +48,8 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, { if constexpr (has_left_nulls && has_right_nulls) { - const auto * left_nullable = checkAndGetColumn(left_column); - const auto * right_nullable = checkAndGetColumn(right_column); + const auto * left_nullable = checkAndGetColumn(&left_column); + const auto * right_nullable = checkAndGetColumn(&right_column); if (left_nullable && right_nullable) { @@ -67,7 +67,7 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, if constexpr (has_left_nulls) { - if (const auto * left_nullable = checkAndGetColumn(left_column)) + if (const auto * left_nullable = checkAndGetColumn(&left_column)) { if (left_nullable->isNullAt(lhs_pos)) return null_direction_hint; @@ -77,7 +77,7 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, if constexpr (has_right_nulls) { - if (const auto * right_nullable = checkAndGetColumn(right_column)) + if (const auto * right_nullable = checkAndGetColumn(&right_column)) { if (right_nullable->isNullAt(rhs_pos)) return -null_direction_hint; diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index f43b9a2e794..cf6a84db9e6 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -2516,7 +2516,7 @@ struct WindowFunctionNonNegativeDerivative final : public StatefulWindowFunction if (ts_scale_multiplier) { const auto & column = transform->blockAt(transform->current_row.block).input_columns[workspace.argument_column_indices[ARGUMENT_TIMESTAMP]]; - const auto & curr_timestamp = checkAndGetColumn(column.get())->getInt(transform->current_row.row); + const auto & curr_timestamp = checkAndGetColumn(*column).getInt(transform->current_row.row); Float64 time_elapsed = curr_timestamp - state.previous_timestamp; result = (time_elapsed > 0) ? 
(metric_diff * ts_scale_multiplier / time_elapsed * interval_duration) : 0; diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index dba2bc1e56c..1bd42518fdd 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -305,7 +305,7 @@ bool MergeTreeIndexConditionSet::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx const NullMap * null_map = nullptr; - if (const auto * col_nullable = checkAndGetColumn(*column)) + if (const auto * col_nullable = checkAndGetColumn(&*column)) { col_uint8 = typeid_cast(&col_nullable->getNestedColumn()); null_map = &col_nullable->getNullMapData(); From 694a8ca81992210dabfcf2da32004899263ae284 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 29 Mar 2024 23:33:12 +0000 Subject: [PATCH 093/289] Style --- src/Functions/FunctionHelpers.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Functions/FunctionHelpers.h b/src/Functions/FunctionHelpers.h index 89b224e79ef..9eabb9a0370 100644 --- a/src/Functions/FunctionHelpers.h +++ b/src/Functions/FunctionHelpers.h @@ -15,6 +15,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + class IFunction; /// Methods, that helps dispatching over real column types. From 4b9819a8f5d6b5407486a8d25bcf24dfdee8950f Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 6 May 2024 22:25:32 +0000 Subject: [PATCH 094/289] Update call sites added since last commit --- src/Functions/UTCTimestampTransform.cpp | 12 ++++++------ src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Functions/UTCTimestampTransform.cpp b/src/Functions/UTCTimestampTransform.cpp index 0fcba91f49f..6d301270d8e 100644 --- a/src/Functions/UTCTimestampTransform.cpp +++ b/src/Functions/UTCTimestampTransform.cpp @@ -80,14 +80,14 @@ namespace const DateLUTImpl & utc_time_zone = DateLUT::instance("UTC"); if (WhichDataType(arg1.type).isDateTime()) { - const auto * date_time_col = checkAndGetColumn(arg1.column.get()); - size_t col_size = date_time_col->size(); + const auto & date_time_col = checkAndGetColumn(*arg1.column); + size_t col_size = date_time_col.size(); using ColVecTo = DataTypeDateTime::ColumnType; typename ColVecTo::MutablePtr result_column = ColVecTo::create(col_size); typename ColVecTo::Container & result_data = result_column->getData(); for (size_t i = 0; i < col_size; ++i) { - UInt32 date_time_val = date_time_col->getElement(i); + UInt32 date_time_val = date_time_col.getElement(i); LocalDateTime date_time(date_time_val, Name::to ? utc_time_zone : DateLUT::instance(time_zone_val)); time_t time_val = date_time.to_time_t(Name::from ? 
utc_time_zone : DateLUT::instance(time_zone_val)); result_data[i] = static_cast(time_val); @@ -96,8 +96,8 @@ namespace } else if (WhichDataType(arg1.type).isDateTime64()) { - const auto * date_time_col = checkAndGetColumn(arg1.column.get()); - size_t col_size = date_time_col->size(); + const auto & date_time_col = checkAndGetColumn(*arg1.column); + size_t col_size = date_time_col.size(); const DataTypeDateTime64 * date_time_type = static_cast(arg1.type.get()); UInt32 col_scale = date_time_type->getScale(); Int64 scale_multiplier = DecimalUtils::scaleMultiplier(col_scale); @@ -106,7 +106,7 @@ namespace typename ColDecimalTo::Container & result_data = result_column->getData(); for (size_t i = 0; i < col_size; ++i) { - DateTime64 date_time_val = date_time_col->getElement(i); + DateTime64 date_time_val = date_time_col.getElement(i); Int64 seconds = date_time_val.value / scale_multiplier; Int64 micros = date_time_val.value % scale_multiplier; LocalDateTime date_time(seconds, Name::to ? utc_time_zone : DateLUT::instance(time_zone_val)); diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp index 4f25a014382..4b164f5ac42 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp @@ -865,8 +865,8 @@ void MergeTreeIndexAggregatorBloomFilter::update(const Block & block, size_t * p const auto & column_and_type = block.getByName(index_columns_name[column]); auto index_column = BloomFilterHash::hashWithColumn(column_and_type.type, column_and_type.column, *pos, max_read_rows); - const auto & index_col = checkAndGetColumn(index_column.get()); - const auto & index_data = index_col->getData(); + const auto & index_col = checkAndGetColumn(*index_column); + const auto & index_data = index_col.getData(); for (const auto & hash: index_data) column_hashes[column].insert(hash); } From a20ef2a3d07343ff914f01db39b8dbb7e02d7584 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Tue, 7 May 2024 02:06:13 +0300 Subject: [PATCH 095/289] Add allow without connection setting to MaterializedMySQLSettings --- .../database-engines/materialized-mysql.md | 3 +++ src/Databases/MySQL/DatabaseMaterializedMySQL.cpp | 6 +++++- src/Databases/MySQL/MaterializedMySQLSettings.h | 1 + .../materialized_with_ddl.py | 15 ++++++++++++++- 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index f32698f84f6..2b4d5fe04aa 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -51,6 +51,9 @@ ENGINE = MaterializedMySQL('host:port', ['database' | database], 'user', 'passwo ### allows_query_when_mysql_lost `allows_query_when_mysql_lost` — Allows to query a materialized table when MySQL is lost. Default: `0` (`false`). +### allow_startup_database_without_connection_to_mysql +`allow_startup_database_without_connection_to_mysql` — Allow to create and attach database without available connection to MySQL. Default: `0` (`false`). + ### materialized_mysql_tables_list `materialized_mysql_tables_list` — a comma-separated list of mysql database tables, which will be replicated by MaterializedMySQL database engine. Default value: empty list — means whole tables will be replicated. 
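The guard added below in DatabaseMaterializedMySQL::startupDatabaseAsync ties the new setting to the loading mode: MySQL availability is asserted only for a fresh CREATE or ATTACH, never for a forced attach on server restart, and never once the setting opts out. The following is a minimal sketch of that decision logic; the startupCheck helper and both types are hypothetical stand-ins (the real LoadingStrictnessLevel enum has more members), not the actual ClickHouse declarations.

#include <stdexcept>

// Hypothetical stand-ins for the real ClickHouse types, ordered so that a
// comparison against FORCE_ATTACH distinguishes fresh creation from restart.
enum class LoadingStrictnessLevel { CREATE, ATTACH, FORCE_RESTORE, FORCE_ATTACH };

struct Settings { bool allow_startup_database_without_connection_to_mysql = false; };

// Mirrors the guard in startupDatabaseAsync: fail fast on an unreachable MySQL
// only when the setting is off and the database is not force-attached.
void startupCheck(const Settings & settings, LoadingStrictnessLevel mode, bool mysql_reachable)
{
    if (!settings.allow_startup_database_without_connection_to_mysql
        && mode < LoadingStrictnessLevel::FORCE_ATTACH
        && !mysql_reachable)
        throw std::runtime_error("ConnectionFailed: MySQL is unavailable");
    // Otherwise synchronization starts and keeps retrying in the background.
}

In the real hunk the equivalent branch calls materialize_thread.assertMySQLAvailable(), which is what raises the ConnectionFailed error the updated integration test asserts on before retrying the CREATE with the setting enabled.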
diff --git a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp index 6d89cc23590..c81fe1b2b26 100644 --- a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp +++ b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp @@ -81,9 +81,13 @@ LoadTaskPtr DatabaseMaterializedMySQL::startupDatabaseAsync(AsyncLoader & async_ base->goals(), TablesLoaderBackgroundStartupPoolId, fmt::format("startup MaterializedMySQL database {}", getDatabaseName()), - [this] (AsyncLoader &, const LoadJobPtr &) + [this, mode] (AsyncLoader &, const LoadJobPtr &) { LOG_TRACE(log, "Starting MaterializeMySQL database"); + if (!settings->allow_startup_database_without_connection_to_mysql + && mode < LoadingStrictnessLevel::FORCE_ATTACH) + materialize_thread.assertMySQLAvailable(); + materialize_thread.startSynchronization(); started_up = true; }); diff --git a/src/Databases/MySQL/MaterializedMySQLSettings.h b/src/Databases/MySQL/MaterializedMySQLSettings.h index 557d48be85b..b481846afc1 100644 --- a/src/Databases/MySQL/MaterializedMySQLSettings.h +++ b/src/Databases/MySQL/MaterializedMySQLSettings.h @@ -22,6 +22,7 @@ class ASTStorage; M(UInt64, max_milliseconds_to_wait_in_binlog_queue, 10000, "Max milliseconds to wait when max bytes exceeded in a binlog queue.", 0) \ M(UInt64, max_bytes_in_binlog_dispatcher_buffer, DBMS_DEFAULT_BUFFER_SIZE, "Max bytes in the binlog dispatcher's buffer before it is flushed to attached binlogs.", 0) \ M(UInt64, max_flush_milliseconds_in_binlog_dispatcher, 1000, "Max milliseconds in the binlog dispatcher's buffer to wait before it is flushed to attached binlogs.", 0) \ + M(Bool, allow_startup_database_without_connection_to_mysql, false, "Allow to create and attach database without available connection to MySQL.", 0) \ DECLARE_SETTINGS_TRAITS(MaterializedMySQLSettingsTraits, LIST_OF_MATERIALIZE_MODE_SETTINGS) diff --git a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py index dcb2546bad3..86000799ae4 100644 --- a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py @@ -3425,8 +3425,21 @@ def mysql_create_database_without_connection(clickhouse_node, mysql_node, servic clickhouse_node.cluster.pause_container(service_name) + assert "ConnectionFailed:" in clickhouse_node.query_and_get_error( + """ + CREATE DATABASE create_without_connection + ENGINE = MaterializedMySQL('{}:3306', 'create_without_connection', 'root', 'clickhouse') + """.format( + service_name + ) + ) + clickhouse_node.query( - "CREATE DATABASE create_without_connection ENGINE = MaterializedMySQL('{}:3306', 'create_without_connection', 'root', 'clickhouse') SETTINGS max_wait_time_when_mysql_unavailable=-1".format( + """ + CREATE DATABASE create_without_connection + ENGINE = MaterializedMySQL('{}:3306', 'create_without_connection', 'root', 'clickhouse') + SETTINGS allow_startup_database_without_connection_to_mysql=1 + """.format( service_name ) ) From d37590aed68e12c5fd7664b1a21138dd428d1482 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 7 May 2024 10:36:44 +0800 Subject: [PATCH 096/289] [update] add test for RawWithNames, RawWithNamesAndTypes and *WithNames, *WithNamesAndTypes; add changelog --- CHANGELOG.md | 1 + .../00397_tsv_format_synonym.reference | 27 +++++++++++++++++++ .../0_stateless/00397_tsv_format_synonym.sql | 8 ++++++ 3 files changed, 36 insertions(+) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index f40c42c4462..955e2f5b72f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ * A mode for `topK`/`topkWeighed` support mode, which return count of values and its error. [#54508](https://github.com/ClickHouse/ClickHouse/pull/54508) ([UnamedRus](https://github.com/UnamedRus)). * Added function `toMillisecond` which returns the millisecond component for values of type`DateTime` or `DateTime64`. [#60281](https://github.com/ClickHouse/ClickHouse/pull/60281) ([Shaun Struwig](https://github.com/Blargian)). * Allow configuring HTTP redirect handlers for clickhouse-server. For example, you can make `/` redirect to the Play UI. [#60390](https://github.com/ClickHouse/ClickHouse/pull/60390) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Allow Raw as a synonym for TSVRaw. [#63394](https://github.com/ClickHouse/ClickHouse/pull/63394) ([Unalian](https://github.com/Unalian)) #### Performance Improvement * Optimized function `dotProduct` to omit unnecessary and expensive memory copies. [#60928](https://github.com/ClickHouse/ClickHouse/pull/60928) ([Robert Schulze](https://github.com/rschu1ze)). diff --git a/tests/queries/0_stateless/00397_tsv_format_synonym.reference b/tests/queries/0_stateless/00397_tsv_format_synonym.reference index c91169a06fa..3326b039b8d 100644 --- a/tests/queries/0_stateless/00397_tsv_format_synonym.reference +++ b/tests/queries/0_stateless/00397_tsv_format_synonym.reference @@ -31,3 +31,30 @@ UInt8 String String 1 hello world 2 hello world 3 hello world +arr s1 s2 +1 hello world +2 hello world +3 hello world +arr s1 s2 +1 hello world +2 hello world +3 hello world +arr s1 s2 +1 hello world +2 hello world +3 hello world +arr s1 s2 +UInt8 String String +1 hello world +2 hello world +3 hello world +arr s1 s2 +UInt8 String String +1 hello world +2 hello world +3 hello world +arr s1 s2 +UInt8 String String +1 hello world +2 hello world +3 hello world diff --git a/tests/queries/0_stateless/00397_tsv_format_synonym.sql b/tests/queries/0_stateless/00397_tsv_format_synonym.sql index 51283c6ced9..b3b231fbf3f 100644 --- a/tests/queries/0_stateless/00397_tsv_format_synonym.sql +++ b/tests/queries/0_stateless/00397_tsv_format_synonym.sql @@ -10,3 +10,11 @@ SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TSVWithN SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TabSeparatedRaw; SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TSVRaw; SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT Raw; + +SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TabSeparatedRawWithNames; +SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TSVRawWithNames; +SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT RawWithNames; + +SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TabSeparatedRawWithNamesAndTypes; +SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT TSVRawWithNamesAndTypes; +SELECT arrayJoin([1, 2, 3]) AS arr, 'hello' AS s1, 'world' AS s2 FORMAT RawWithNamesAndTypes; From a8ae0074aa5563b8e65ae110fa5dc71313a81a77 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 7 May 2024 10:40:46 +0800 Subject: [PATCH 097/289] [fix] name->names --- docs/en/interfaces/formats.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 937dfb52609..0b108edc17f 100644 --- 
a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -206,7 +206,7 @@ SELECT * FROM nestedt FORMAT TSV
 
 Differs from `TabSeparated` format in that the rows are written without escaping.
 When parsing with this format, tabs or linefeeds are not allowed in each field.
 
-This format is also available under the name `TSVRaw`, `Raw`.
+This format is also available under the names `TSVRaw`, `Raw`.
 
 ## TabSeparatedWithNames {#tabseparatedwithnames}
 
@@ -241,14 +241,14 @@ This format is also available under the name `TSVWithNamesAndTypes`.
 
 Differs from `TabSeparatedWithNames` format in that the rows are written without escaping.
 When parsing with this format, tabs or linefeeds are not allowed in each field.
 
-This format is also available under the name `TSVRawWithNames`, `RawWithNames`.
+This format is also available under the names `TSVRawWithNames`, `RawWithNames`.
 
 ## TabSeparatedRawWithNamesAndTypes {#tabseparatedrawwithnamesandtypes}
 
 Differs from `TabSeparatedWithNamesAndTypes` format in that the rows are written without escaping.
 When parsing with this format, tabs or linefeeds are not allowed in each field.
 
-This format is also available under the name `TSVRawWithNamesAndNames`, `RawWithNamesAndNames`.
+This format is also available under the names `TSVRawWithNamesAndTypes`, `RawWithNamesAndTypes`.
 
 ## Template {#format-template}

From 0a1d852dfd52cc88502a7699d249328edb041976 Mon Sep 17 00:00:00 2001
From: Julia Kartseva
Date: Thu, 2 May 2024 21:35:32 +0000
Subject: [PATCH 098/289] Enable plain_rewritable metadata for local and azure

Enable plain_rewritable support for local and azure (azure_blob_storage)
metadata type.

- HDFS object storage currently does not support iteration and does not
  implement listObjects method. It's a blocker for enabling
  plain_rewritable metadata type with HDFS.
- StaticWeb object storage is read-only and works with its dedicated
  metadata type.
---
 .../ObjectStorages/ObjectStorageFactory.cpp   | 14 +++++--
 .../PlainRewritableObjectStorage.h            | 39 ++++++++++++++++++-
 .../ObjectStorages/S3/S3ObjectStorage.cpp     |  7 ----
 src/Disks/ObjectStorages/S3/S3ObjectStorage.h |  3 --
 .../03008_local_plain_rewritable.reference    | 22 +++++++++++
 .../03008_local_plain_rewritable.sh           | 35 +++++++++++++++++
 6 files changed, 106 insertions(+), 14 deletions(-)
 create mode 100644 tests/queries/0_stateless/03008_local_plain_rewritable.reference
 create mode 100755 tests/queries/0_stateless/03008_local_plain_rewritable.sh

diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp
index 7b949db268b..264ec2b258e 100644
--- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp
+++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp
@@ -73,9 +73,17 @@ ObjectStoragePtr createObjectStorage(
         return std::make_shared<PlainObjectStorage<BaseObjectStorage>>(std::forward<Args>(args)...);
     else if (isPlainRewritableStorage(type, config, config_prefix))
     {
-        /// TODO(jkartseva@): Test support for generic disk type
-        if (type != ObjectStorageType::S3)
-            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "plain_rewritable metadata storage support is implemented only for S3");
+        /// HDFS object storage currently does not support iteration and does not implement listObjects method.
+        /// StaticWeb object storage is read-only and works with its dedicated metadata type.
+        constexpr auto supported_object_storage_types
+            = std::array{ObjectStorageType::S3, ObjectStorageType::Local, ObjectStorageType::Azure};
+        if (std::find(supported_object_storage_types.begin(), supported_object_storage_types.end(), type)
+            == supported_object_storage_types.end())
+            throw Exception(
+                ErrorCodes::NOT_IMPLEMENTED,
+                "plain_rewritable metadata storage support is not implemented for '{}' object storage",
+                DataSourceDescription{DataSourceType::ObjectStorage, type, MetadataStorageType::PlainRewritable, /*description*/ ""}
+                    .toString());
+
         return std::make_shared<PlainRewritableObjectStorage<BaseObjectStorage>>(std::forward<Args>(args)...);
     }

diff --git a/src/Disks/ObjectStorages/PlainRewritableObjectStorage.h b/src/Disks/ObjectStorages/PlainRewritableObjectStorage.h
index d71e995b490..2b116cff443 100644
--- a/src/Disks/ObjectStorages/PlainRewritableObjectStorage.h
+++ b/src/Disks/ObjectStorages/PlainRewritableObjectStorage.h
@@ -1,16 +1,26 @@
 #pragma once
 
 #include <Disks/ObjectStorages/IObjectStorage.h>
+#include <Common/ObjectStorageKeyGenerator.h>
+#include "CommonPathPrefixKeyGenerator.h"
 
 namespace DB
 {
 
+namespace ErrorCodes
+{
+extern const int LOGICAL_ERROR;
+}
+
 template <typename BaseObjectStorage>
 class PlainRewritableObjectStorage : public BaseObjectStorage
 {
 public:
     template <class... Args>
-    explicit PlainRewritableObjectStorage(Args &&... args) : BaseObjectStorage(std::forward<Args>(args)...)
+    explicit PlainRewritableObjectStorage(Args &&... args)
+        : BaseObjectStorage(std::forward<Args>(args)...)
+        /// A basic key generator is required for checking S3 capabilities,
+        /// it will be reset later by metadata storage.
+        , key_generator(createObjectStorageKeysGeneratorAsIsWithPrefix(BaseObjectStorage::getCommonKeyPrefix()))
     {
     }
 
@@ -19,6 +29,33 @@ public:
     bool isWriteOnce() const override { return false; }
 
     bool isPlain() const override { return true; }
+
+    ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override;
+
+    ObjectStorageKey generateObjectKeyPrefixForDirectoryPath(const std::string & path) const override;
+
+    void setKeysGenerator(ObjectStorageKeysGeneratorPtr gen) override { key_generator = gen; }
+
+private:
+    ObjectStorageKeysGeneratorPtr key_generator;
 };
+
+template <typename BaseObjectStorage>
+ObjectStorageKey PlainRewritableObjectStorage<BaseObjectStorage>::generateObjectKeyForPath(const std::string & path) const
+{
+    if (!key_generator)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Key generator is not set");
+
+    return key_generator->generate(path, /* is_directory */ false);
+}
+
+template <typename BaseObjectStorage>
+ObjectStorageKey PlainRewritableObjectStorage<BaseObjectStorage>::generateObjectKeyPrefixForDirectoryPath(const std::string & path) const
+{
+    if (!key_generator)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Key generator is not set");
+
+    return key_generator->generate(path, /* is_directory */ true);
+}
 }

diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
index 2eae8877f87..a58b37f1df9 100644
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@@ -574,13 +574,6 @@ ObjectStorageKey S3ObjectStorage::generateObjectKeyForPath(const std::string & p
     return key_generator->generate(path, /* is_directory */ false);
 }
 
-ObjectStorageKey S3ObjectStorage::generateObjectKeyPrefixForDirectoryPath(const std::string & path) const
-{
-    if (!key_generator)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Key generator is not set");
-
-    return key_generator->generate(path, /* is_directory */ true);
-}
 }
 
 #endif

diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h
index ff66b00e47c..5eaab4b585c 100644
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h
@@ -159,12 +159,9 @@ public:
 
     bool supportParallelWrite() const override { return true; }
 
     ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override;
-    ObjectStorageKey generateObjectKeyPrefixForDirectoryPath(const std::string & path) const override;
 
     bool isReadOnly() const override { return s3_settings.get()->read_only; }
 
-    void setKeysGenerator(ObjectStorageKeysGeneratorPtr gen) override { key_generator = gen; }
-
 private:
     void setNewSettings(std::unique_ptr<S3ObjectStorageSettings> && s3_settings_);

diff --git a/tests/queries/0_stateless/03008_local_plain_rewritable.reference b/tests/queries/0_stateless/03008_local_plain_rewritable.reference
new file mode 100644
index 00000000000..10fc932ca4d
--- /dev/null
+++ b/tests/queries/0_stateless/03008_local_plain_rewritable.reference
@@ -0,0 +1,22 @@
+10006
+0 0 0
+1 1 1
+1 2 0
+2 2 2
+2 2 2
+3 1 9
+3 3 3
+4 4 4
+4 7 7
+5 5 5
+10006
+0 0 0
+1 1 1
+1 2 0
+2 2 2
+2 2 2
+3 1 9
+3 3 3
+4 4 4
+4 7 7
+5 5 5

diff --git a/tests/queries/0_stateless/03008_local_plain_rewritable.sh b/tests/queries/0_stateless/03008_local_plain_rewritable.sh
new file mode 100755
index 00000000000..07fd013c911
--- /dev/null
+++ b/tests/queries/0_stateless/03008_local_plain_rewritable.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Tags: no-random-settings, no-replicated-database, no-shared-merge-tree
+# Tag no-random-settings: enable after root causing flakiness
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+${CLICKHOUSE_CLIENT} --query "drop table if exists test_mt sync"
+
+${CLICKHOUSE_CLIENT} -nm --query "
+create table test_mt (a Int32, b Int64, c Int64) engine = MergeTree() partition by intDiv(a, 1000) order by tuple(a, b)
+settings disk = disk(
+    type = object_storage,
+    object_storage_type = local,
+    metadata_type = plain_rewritable,
+    path = '/var/lib/clickhouse/disks/local_plain_rewritable/')
+"
+
+${CLICKHOUSE_CLIENT} -nm --query "
+insert into test_mt (*) values (1, 2, 0), (2, 2, 2), (3, 1, 9), (4, 7, 7), (5, 10, 2), (6, 12, 5);
+insert into test_mt (*) select number, number, number from numbers_mt(10000);
+"
+
+${CLICKHOUSE_CLIENT} -nm --query "
+select count(*) from test_mt;
+select (*) from test_mt order by tuple(a, b) limit 10;
+"
+
+${CLICKHOUSE_CLIENT} --query "optimize table test_mt final"
+
+${CLICKHOUSE_CLIENT} -nm --query "
+select count(*) from test_mt;
+select (*) from test_mt order by tuple(a, b) limit 10;
+"

From c6f17b25e47ffcf96ff49f869f5ecd6b67b910b8 Mon Sep 17 00:00:00 2001
From: Julia Kartseva
Date: Fri, 3 May 2024 03:59:50 +0000
Subject: [PATCH 099/289] plain_rewritable: add integration test for Azure

---
 .../__init__.py                               |   0
 .../test.py                                   | 153 ++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 tests/integration/test_azure_blob_storage_plain_rewritable/__init__.py
 create mode 100644 tests/integration/test_azure_blob_storage_plain_rewritable/test.py

diff --git a/tests/integration/test_azure_blob_storage_plain_rewritable/__init__.py b/tests/integration/test_azure_blob_storage_plain_rewritable/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_azure_blob_storage_plain_rewritable/test.py b/tests/integration/test_azure_blob_storage_plain_rewritable/test.py
new file mode 100644
index 00000000000..96d116ec6a2
--- /dev/null
+++ b/tests/integration/test_azure_blob_storage_plain_rewritable/test.py
@@ -0,0 +1,153 @@
+import logging
+import os
+import random
+import string
+
+import pytest
+
+from helpers.cluster import ClickHouseCluster
+from azure.storage.blob import BlobServiceClient
+from test_storage_azure_blob_storage.test import azure_query
+
+NODE_NAME = "node"
+
+
+def generate_cluster_def(port):
+    path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        "./_gen/disk_storage_conf.xml",
+    )
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, "w") as f:
+        f.write(
+            f"""
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <blob_storage_disk>
+                <type>object_storage</type>
+                <object_storage_type>azure_blob_storage</object_storage_type>
+                <metadata_type>plain_rewritable</metadata_type>
+                <storage_account_url>http://azurite1:{port}/devstoreaccount1</storage_account_url>
+                <container_name>cont</container_name>
+                <skip_access_check>true</skip_access_check>
+                <account_name>devstoreaccount1</account_name>
+                <account_key>Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==</account_key>
+                <max_single_part_upload_size>100000</max_single_part_upload_size>
+                <min_upload_part_size>100000</min_upload_part_size>
+                <max_single_download_retries>10</max_single_download_retries>
+                <max_single_read_retries>10</max_single_read_retries>
+            </blob_storage_disk>
+        </disks>
+        <policies>
+            <blob_storage_policy>
+                <volumes>
+                    <main>
+                        <disk>blob_storage_disk</disk>
+                    </main>
+                </volumes>
+            </blob_storage_policy>
+        </policies>
+    </storage_configuration>
+</clickhouse>
+""" + ) + return path + + +insert_values = [ + "(0,'data'),(1,'data')", + ",".join( + f"({i},'{''.join(random.choices(string.ascii_lowercase, k=5))}')" + for i in range(10) + ), +] + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + port = cluster.azurite_port + path = generate_cluster_def(port) + cluster.add_instance( + NODE_NAME, + main_configs=[ + path, + ], + with_azurite=True, + stay_alive=True, + ) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def test_insert_select(cluster): + node = cluster.instances[NODE_NAME] + + for index, value in enumerate(insert_values): + azure_query( + node, + """ + CREATE TABLE test_{} ( + id Int64, + data String + ) ENGINE=MergeTree() + ORDER BY id + SETTINGS storage_policy='blob_storage_policy' + """.format( + index + ), + ) + + azure_query(node, "INSERT INTO test_{} VALUES {}".format(index, value)) + assert ( + azure_query( + node, "SELECT * FROM test_{} ORDER BY id FORMAT Values".format(index) + ) + == value + ) + + +def test_restart_server(cluster): + node = cluster.instances[NODE_NAME] + + for index, value in enumerate(insert_values): + assert ( + azure_query( + node, "SELECT * FROM test_{} ORDER BY id FORMAT Values".format(index) + ) + == value + ) + node.restart_clickhouse() + + for index, value in enumerate(insert_values): + assert ( + azure_query( + node, "SELECT * FROM test_{} ORDER BY id FORMAT Values".format(index) + ) + == value + ) + + +def test_drop_table(cluster): + node = cluster.instances[NODE_NAME] + + for index, value in enumerate(insert_values): + node.query("DROP TABLE IF EXISTS test_{} SYNC".format(index)) + + port = cluster.env_variables["AZURITE_PORT"] + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + containers = blob_service_client.list_containers() + for container in containers: + container_client = blob_service_client.get_container_client(container) + assert len(list(container_client.list_blobs())) == 0 From fcad15ffc2b7c5d4d1c9e9ce201ba9eb86d4a3d4 Mon Sep 17 00:00:00 2001 From: Julia Kartseva Date: Sat, 4 May 2024 04:26:48 +0000 Subject: [PATCH 100/289] plain_rewritable: update docs --- docs/en/operations/storing-data.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 389c917d427..7005783dd60 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -371,6 +371,8 @@ is equal to ``` +Starting from `24.5` it is possible configure any object storage disk (`s3`, `azure`, `local`) using `plain_rewritable` metadata type. + ### Using Azure Blob Storage {#azure-blob-storage} `MergeTree` family table engines can store data to [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) using a disk with type `azure_blob_storage`. 
From 5e5477302a71e8959feb8daab1b9459e16b2e168 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 7 May 2024 09:31:39 +0200 Subject: [PATCH 101/289] Update src/Storages/MergeTree/MergeTreeReadPoolBase.cpp Co-authored-by: Alexander Tokmakov --- src/Storages/MergeTree/MergeTreeReadPoolBase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp b/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp index c759a12e151..36673238f3b 100644 --- a/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPoolBase.cpp @@ -130,7 +130,7 @@ MergeTreeReadTaskPtr MergeTreeReadPoolBase::createTask( parent_part_name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (!parent_part) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Did not find parent part {} for potentially broken projection part {}", + throw Exception(ErrorCodes::LOGICAL_ERROR, "Did not find parent part {} for projection part {}", parent_part_name, data_part->getDataPartStorage().getFullPath()); return parent_part_name; From da4f6f7b6ce4d7c46f3bd1955352656fd2826f19 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 30 Apr 2024 16:35:57 +0300 Subject: [PATCH 102/289] Added recursive CTE documentation --- .../sql-reference/statements/select/with.md | 235 +++++++++++++++++- 1 file changed, 229 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/statements/select/with.md b/docs/en/sql-reference/statements/select/with.md index a59ef463419..84d3c02eca1 100644 --- a/docs/en/sql-reference/statements/select/with.md +++ b/docs/en/sql-reference/statements/select/with.md @@ -5,21 +5,21 @@ sidebar_label: WITH # WITH Clause -ClickHouse supports Common Table Expressions ([CTE](https://en.wikipedia.org/wiki/Hierarchical_and_recursive_queries_in_SQL)) and substitutes the code defined in the `WITH` clause in all places of use for the rest of `SELECT` query. Named subqueries can be included to the current and child query context in places where table objects are allowed. Recursion is prevented by hiding the current level CTEs from the WITH expression. +ClickHouse supports Common Table Expressions ([CTE](https://en.wikipedia.org/wiki/Hierarchical_and_recursive_queries_in_SQL)) and substitutes the code defined in the `WITH` clause in all places of use for the rest of `SELECT` query. Named subqueries can be included to the current and child query context in places where table objects are allowed. Recursion is prevented by hiding the current level CTEs from the WITH expression. Please note that CTEs do not guarantee the same results in all places they are called because the query will be re-executed for each use case. An example of such behavior is below ``` sql -with cte_numbers as +with cte_numbers as ( - select - num - from generateRandom('num UInt64', NULL) + select + num + from generateRandom('num UInt64', NULL) limit 1000000 ) select - count() + count() from cte_numbers where num in (select num from cte_numbers) ``` @@ -87,3 +87,226 @@ LIMIT 10; WITH test1 AS (SELECT i + 1, j + 1 FROM test1) SELECT * FROM test1; ``` + +# Recursive Queries + +The optional RECURSIVE modifier allows for a WITH query to refer to its own output. 
Example:
+
+**Example:** Sum integers from 1 throught 100
+
+```sql
+WITH RECURSIVE test_table AS (
+    SELECT 1 AS number
+UNION ALL
+    SELECT number + 1 FROM test_table WHERE number < 100
+)
+SELECT sum(number) FROM test_table;
+```
+
+``` text
+┌─sum(number)─┐
+│        5050 │
+└─────────────┘
+```
+
+The general form of a recursive `WITH` query is always a non-recursive term, then `UNION ALL`, then a recursive term, where only the recursive term can contain a reference to the query's own output. A recursive CTE query is executed as follows:
+
+1. Evaluate the non-recursive term. Place the result of the non-recursive term query in a temporary working table.
+2. As long as the working table is not empty, repeat these steps:
+    1. Evaluate the recursive term, substituting the current contents of the working table for the recursive self-reference. Place the result of the recursive term query in a temporary intermediate table.
+    2. Replace the contents of the working table with the contents of the intermediate table, then empty the intermediate table.
+
+Recursive queries are typically used to work with hierarchical or tree-structured data. For example, we can write a query that performs tree traversal:
+
+**Example:** Tree traversal
+
+First let's create a tree table:
+
+```sql
+DROP TABLE IF EXISTS tree;
+CREATE TABLE tree
+(
+    id UInt64,
+    parent_id Nullable(UInt64),
+    data String
+) ENGINE = MergeTree ORDER BY id;
+
+INSERT INTO tree VALUES (0, NULL, 'ROOT'), (1, 0, 'Child_1'), (2, 0, 'Child_2'), (3, 1, 'Child_1_1');
+```
+
+We can traverse that tree with the following query:
+
+**Example:** Tree traversal
+```sql
+WITH RECURSIVE search_tree AS (
+    SELECT id, parent_id, data
+    FROM tree t
+    WHERE t.id = 0
+UNION ALL
+    SELECT t.id, t.parent_id, t.data
+    FROM tree t, search_tree st
+    WHERE t.parent_id = st.id
+)
+SELECT * FROM search_tree;
+```
+
+```text
+┌─id─┬─parent_id─┬─data──────┐
+│  0 │      ᴺᵁᴸᴸ │ ROOT      │
+│  1 │         0 │ Child_1   │
+│  2 │         0 │ Child_2   │
+│  3 │         1 │ Child_1_1 │
+└────┴───────────┴───────────┘
+```
+
+## Search order
+
+To create a depth-first order, we compute for each result row an array of rows that we have already visited:
+
+**Example:** Tree traversal depth-first order
+```sql
+WITH RECURSIVE search_tree AS (
+    SELECT id, parent_id, data, [t.id] AS path
+    FROM tree t
+    WHERE t.id = 0
+UNION ALL
+    SELECT t.id, t.parent_id, t.data, arrayConcat(path, [t.id])
+    FROM tree t, search_tree st
+    WHERE t.parent_id = st.id
+)
+SELECT * FROM search_tree ORDER BY path;
+```
+
+```text
+┌─id─┬─parent_id─┬─data──────┬─path────┐
+│  0 │      ᴺᵁᴸᴸ │ ROOT      │ [0]     │
+│  1 │         0 │ Child_1   │ [0,1]   │
+│  3 │         1 │ Child_1_1 │ [0,1,3] │
+│  2 │         0 │ Child_2   │ [0,2]   │
+└────┴───────────┴───────────┴─────────┘
+```
+
+To create a breadth-first order, the standard approach is to add a column that tracks the depth of the search:
+
+**Example:** Tree traversal breadth-first order
+```sql
+WITH RECURSIVE search_tree AS (
+    SELECT id, parent_id, data, [t.id] AS path, toUInt64(0) AS depth
+    FROM tree t
+    WHERE t.id = 0
+UNION ALL
+    SELECT t.id, t.parent_id, t.data, arrayConcat(path, [t.id]), depth + 1
+    FROM tree t, search_tree st
+    WHERE t.parent_id = st.id
+)
+SELECT * FROM search_tree ORDER BY depth;
+```
+
+```text
+┌─id─┬─link─┬─data──────┬─path────┬─depth─┐
+│  0 │ ᴺᵁᴸᴸ │ ROOT      │ [0]     │     0 │
+│  1 │    0 │ Child_1   │ [0,1]   │     1 │
+│  2 │    0 │ Child_2   │ [0,2]   │     1 │
+│  3 │    1 │ Child_1_1 │ [0,1,3] │     2 │
+└────┴──────┴───────────┴─────────┴───────┘
+```
+
+## Cycle detection
+
+First let's create a graph table:
+
+```sql
+DROP TABLE IF EXISTS graph;
+CREATE TABLE graph
+(
+    from UInt64,
+    to UInt64,
+    label String
+) ENGINE = MergeTree ORDER BY (from, to);
+
+INSERT INTO graph VALUES (1, 2, '1 -> 2'), (1, 3, '1 -> 3'), (2, 3, '2 -> 3'), (1, 4, '1 -> 4'), (4, 5, '4 -> 5');
+```
+
+We can traverse that graph with the following query:
+
+**Example:** Graph traversal without cycle detection
+```sql
+WITH RECURSIVE search_graph AS (
+    SELECT from, to, label FROM graph g
+    UNION ALL
+    SELECT g.from, g.to, g.label
+    FROM graph g, search_graph sg
+    WHERE g.from = sg.to
+)
+SELECT DISTINCT * FROM search_graph ORDER BY from;
+```
+```text
+┌─from─┬─to─┬─label──┐
+│    1 │  4 │ 1 -> 4 │
+│    1 │  2 │ 1 -> 2 │
+│    1 │  3 │ 1 -> 3 │
+│    2 │  3 │ 2 -> 3 │
+│    4 │  5 │ 4 -> 5 │
+└──────┴────┴────────┘
+```
+
+But if we add a cycle to that graph, the previous query will fail with a `Maximum recursive CTE evaluation depth` error:
+
+```sql
+INSERT INTO graph VALUES (5, 1, '5 -> 1');
+
+WITH RECURSIVE search_graph AS (
+    SELECT from, to, label FROM graph g
+UNION ALL
+    SELECT g.from, g.to, g.label
+    FROM graph g, search_graph sg
+    WHERE g.from = sg.to
+)
+SELECT DISTINCT * FROM search_graph ORDER BY from;
+```
+
+```text
+Code: 306. DB::Exception: Received from localhost:9000. DB::Exception: Maximum recursive CTE evaluation depth (1000) exceeded, during evaluation of search_graph AS (SELECT from, to, label FROM graph AS g UNION ALL SELECT g.from, g.to, g.label FROM graph AS g, search_graph AS sg WHERE g.from = sg.to). Consider raising max_recursive_cte_evaluation_depth setting.: While executing RecursiveCTESource. (TOO_DEEP_RECURSION)
+```
+
+The standard method for handling cycles is to compute an array of the already visited nodes:
+
+**Example:** Graph traversal with cycle detection
+```sql
+WITH RECURSIVE search_graph AS (
+    SELECT from, to, label, false AS is_cycle, [tuple(g.from, g.to)] AS path FROM graph g
+UNION ALL
+    SELECT g.from, g.to, g.label, has(path, tuple(g.from, g.to)), arrayConcat(sg.path, [tuple(g.from, g.to)])
+    FROM graph g, search_graph sg
+    WHERE g.from = sg.to AND NOT is_cycle
+)
+SELECT * FROM search_graph WHERE is_cycle ORDER BY from;
+```
+
+```text
+┌─from─┬─to─┬─label──┬─is_cycle─┬─path──────────────────────┐
+│    1 │  4 │ 1 -> 4 │ true     │ [(1,4),(4,5),(5,1),(1,4)] │
+│    4 │  5 │ 4 -> 5 │ true     │ [(4,5),(5,1),(1,4),(4,5)] │
+│    5 │  1 │ 5 -> 1 │ true     │ [(5,1),(1,4),(4,5),(5,1)] │
+└──────┴────┴────────┴──────────┴───────────────────────────┘
+```
+
+## Infinite queries
+
+It is also possible to use inifinite recursive CTE queries if `LIMIT` is used in outer query:
+
+**Example:** Infinite recursive CTE query
+```sql
+WITH RECURSIVE test_table AS (
+    SELECT 1 AS number
+UNION ALL
+    SELECT number + 1 FROM test_table
+)
+SELECT sum(number) FROM (SELECT number FROM test_table LIMIT 100);
+```
+
+```text
+┌─sum(number)─┐
+│        5050 │
+└─────────────┘
+```

From 0b59c24866a6e61989b907aed0219530d6503b30 Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Thu, 2 May 2024 18:50:38 +0300
Subject: [PATCH 103/289] Fixed style check

---
 docs/en/sql-reference/statements/select/with.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/en/sql-reference/statements/select/with.md b/docs/en/sql-reference/statements/select/with.md
index 84d3c02eca1..ffde7a3fe54 100644
--- a/docs/en/sql-reference/statements/select/with.md
+++ b/docs/en/sql-reference/statements/select/with.md
@@ -92,7 +92,7 @@ SELECT * FROM test1;
 
 The optional RECURSIVE modifier allows for a WITH query to refer to its own output. Example:
 
-**Example:** Sum integers from 1 throught 100
+**Example:** Sum integers from 1 through 100
 
 ```sql
 WITH RECURSIVE test_table AS (
     SELECT 1 AS number
 UNION ALL
     SELECT number + 1 FROM test_table WHERE number < 100
 )
@@ -293,7 +293,7 @@ SELECT * FROM search_graph WHERE is_cycle ORDER BY from;
 
 ## Infinite queries
 
-It is also possible to use inifinite recursive CTE queries if `LIMIT` is used in outer query:
+It is also possible to use infinite recursive CTE queries if `LIMIT` is used in the outer query:

From f50f28d413bf0ba8cd40dccdbb631c2283dc32f1 Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Tue, 7 May 2024 12:11:01 +0200
Subject: [PATCH 104/289] Update 03145_non_loaded_projection_backup.sh

---
 .../queries/0_stateless/03145_non_loaded_projection_backup.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh
index 721ed784fc2..36ae9843dc4 100755
--- a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh
+++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh
@@ -10,6 +10,7 @@ create table tp_1 (x Int32, y Int32, projection p (select x, y order by x)) engi
 insert into tp_1 select number, number from numbers(3);
 
 set mutations_sync = 2;
+system stop merges;
 
 alter table tp_1 add projection pp (select x, count() group by x);
 insert into tp_1 select number, number from numbers(4);
@@ -46,4 +47,4 @@ check table tp_1 settings check_query_single_value_result = 0;" | grep -o "Found
 $CLICKHOUSE_CLIENT -nm -q "
 set send_logs_level='fatal';
 check table tp_1"
-$CLICKHOUSE_CLIENT -q "drop table tp_1 sync"
+$CLICKHOUSE_CLIENT -q "drop table tp_1"

From 791278ba47676ef497c95a308eaca91698717f91 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Mon, 6 May 2024 21:45:22 +0200
Subject: [PATCH 105/289] Add logging after a failure with evaluating Replicated*MergeTree engine arguments.

---
 ...tractZooKeeperPathFromReplicatedTableDef.h |  3 +-
 .../MergeTree/registerStorageMergeTree.cpp    | 60 +++++++++++--------
 2 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h b/src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h
index 1bd58392201..5ef5e1db62e 100644
--- a/src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h
+++ b/src/Storages/MergeTree/extractZooKeeperPathFromReplicatedTableDef.h
@@ -11,8 +11,9 @@ class ASTCreateQuery;
 class Context;
 using ContextPtr = std::shared_ptr<const Context>;
 
-/// Extracts a zookeeper path from a specified CREATE TABLE query. Returns std::nullopt if fails.
+/// Extracts a zookeeper path from a specified CREATE TABLE query.
 /// The function checks the table engine and if it is Replicated*MergeTree then it takes the first argument and expands macros in it.
+/// Returns std::nullopt if the specified CREATE query doesn't describe a Replicated table or its arguments can't be evaluated.
 std::optional<String> extractZooKeeperPathFromReplicatedTableDef(const ASTCreateQuery & create_query, const ContextPtr & context);
 
 }

diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp
index 9b0200d5a1c..4244ccccfe0 100644
--- a/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -296,9 +296,6 @@ static void extractZooKeeperPathAndReplicaNameFromEngineArgs(
         else
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Replica name must be a string literal{}", verbose_help_message);
 
-        if (replica_name.empty())
-            throw Exception(ErrorCodes::NO_REPLICA_NAME_GIVEN, "No replica name in config{}", verbose_help_message);
-
         expand_macro(ast_zk_path, ast_replica_name);
     }
     else if (is_extended_storage_def
@@ -332,38 +329,45 @@ static void extractZooKeeperPathAndReplicaNameFromEngineArgs(
         throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected two string literal arguments: zookeeper_path and replica_name");
 }
 
-/// Extracts a zookeeper path from a specified CREATE TABLE query. Returns std::nullopt if fails.
+/// Extracts a zookeeper path from a specified CREATE TABLE query.
 std::optional<String> extractZooKeeperPathFromReplicatedTableDef(const ASTCreateQuery & query, const ContextPtr & context)
 {
+    if (!query.storage || !query.storage->engine)
+        return {};
+
+    const String & engine_name = query.storage->engine->name;
+    if (!isReplicated(engine_name))
+        return {};
+
+    StorageID table_id{query.getDatabase(), query.getTable(), query.uuid};
+
+    ASTs engine_args;
+    if (query.storage->engine->arguments)
+        engine_args = query.storage->engine->arguments->children;
+    for (auto & engine_arg : engine_args)
+        engine_arg = engine_arg->clone();
+
+    LoadingStrictnessLevel mode = LoadingStrictnessLevel::CREATE;
+    String zookeeper_path;
+    String replica_name;
+    RenamingRestrictions renaming_restrictions;
+
     try
     {
-        if (!query.storage || !query.storage->engine)
-            return {};
-
-        const String & engine_name = query.storage->engine->name;
-        if (!isReplicated(engine_name))
-            return {};
-
-        StorageID table_id{query.getDatabase(), query.getTable(), query.uuid};
-        ASTs engine_args;
-        if (query.storage->engine->arguments)
-            engine_args = query.storage->engine->arguments->children;
-        for (auto & engine_arg : engine_args)
-            engine_arg = engine_arg->clone();
-        LoadingStrictnessLevel mode = LoadingStrictnessLevel::CREATE;
-        String zookeeper_path;
-        String replica_name;
-        RenamingRestrictions renaming_restrictions;
-
         extractZooKeeperPathAndReplicaNameFromEngineArgs(query, table_id, engine_name, engine_args, mode, context, zookeeper_path, replica_name, renaming_restrictions);
-
-        return zookeeper_path;
     }
-    catch (...)
+ catch (Exception & e) { - return {}; + if (e.code() == ErrorCodes::BAD_ARGUMENTS) + { + tryLogCurrentException(__PRETTY_FUNCTION__, "Couldn't evaluate engine arguments"); + return {}; + } + throw; } + + return zookeeper_path; } static StoragePtr create(const StorageFactory::Arguments & args) @@ -539,6 +543,10 @@ static StoragePtr create(const StorageFactory::Arguments & args) { extractZooKeeperPathAndReplicaNameFromEngineArgs(args.query, args.table_id, args.engine_name, args.engine_args, args.mode, args.getLocalContext(), zookeeper_path, replica_name, renaming_restrictions); + + if (replica_name.empty()) + throw Exception(ErrorCodes::NO_REPLICA_NAME_GIVEN, "No replica name in config{}", verbose_help_message); + arg_cnt = engine_args.size(); /// Update `arg_cnt` here because extractZooKeeperPathAndReplicaNameFromEngineArgs() could add arguments. arg_num = 2; /// zookeeper_path and replica_name together are always two arguments. } From e6926dc65d37658f48540490190398c91ea22273 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 7 May 2024 12:32:34 +0200 Subject: [PATCH 106/289] Update 03145_non_loaded_projection_backup.sh --- tests/queries/0_stateless/03145_non_loaded_projection_backup.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh index 36ae9843dc4..d92b85a984f 100755 --- a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh +++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh @@ -10,7 +10,6 @@ create table tp_1 (x Int32, y Int32, projection p (select x, y order by x)) engi insert into tp_1 select number, number from numbers(3); set mutations_sync = 2; -system stop merges; alter table tp_1 add projection pp (select x, count() group by x); insert into tp_1 select number, number from numbers(4); From 0a580e84eaae0dff89ca5fd151b8c98d3b8f9adb Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 7 May 2024 12:41:06 +0200 Subject: [PATCH 107/289] Update 03145_non_loaded_projection_backup.sh --- tests/queries/0_stateless/03145_non_loaded_projection_backup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh index d92b85a984f..6f0e00ce3fc 100755 --- a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh +++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh @@ -7,6 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) $CLICKHOUSE_CLIENT -nm -q " drop table if exists tp_1; create table tp_1 (x Int32, y Int32, projection p (select x, y order by x)) engine = MergeTree order by y partition by intDiv(y, 100); +system stop merges tp_1; insert into tp_1 select number, number from numbers(3); set mutations_sync = 2; From fe18781bed4c26347cd7746dfc3be75745941eab Mon Sep 17 00:00:00 2001 From: skyoct Date: Tue, 7 May 2024 18:41:37 +0800 Subject: [PATCH 108/289] batter --- .../02415_all_new_functions_must_be_documented.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index 3ddf165dec0..a152066a460 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ 
b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
@@ -201,6 +201,7 @@ cbrt
 ceil
 char
 cityHash64
+clamp
 coalesce
 concat
 concatAssumeInjective

From 1bae2d9d4ffa6b4757dc2aeccb9eccf89bebc072 Mon Sep 17 00:00:00 2001
From: zvonand
Date: Tue, 7 May 2024 12:57:14 +0200
Subject: [PATCH 109/289] update comment

---
 src/Storages/StorageS3.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index 8a4e30fed1d..e65d0cb5be4 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -489,7 +489,8 @@ size_t StorageS3Source::DisclosedGlobIterator::estimatedKeysCount()
     {
         /// 1000 files were listed, and we cannot make any estimation of _how many more_ there are (because we list bucket lazily);
         /// If there are more objects in the bucket, limiting the number of streams is the last thing we may want to do
-        /// as it would lead to serious (up to <max_threads> times) reading performance degradation.
+        /// as it would lead to serious slow down of the execution, since objects are going
+        /// to be fetched sequentially rather than in-parallel with up to <max_threads> times.
         return std::numeric_limits<size_t>::max();
     }
     else

From 0609054e9849b915847cbd93cb3d76786eebe0cc Mon Sep 17 00:00:00 2001
From: unashi
Date: Tue, 7 May 2024 21:09:33 +0800
Subject: [PATCH 110/289] [update] update a stateless case

---
 .../02187_async_inserts_all_formats.          | 52 +++++++++++++++++++
 .../02187_async_inserts_all_formats.reference |  3 ++
 2 files changed, 55 insertions(+)
 create mode 100644 tests/queries/0_stateless/02187_async_inserts_all_formats.

diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats. b/tests/queries/0_stateless/02187_async_inserts_all_formats.
new file mode 100644
index 00000000000..f42a6d39d4f
--- /dev/null
+++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.
@@ -0,0 +1,52 @@
+Arrow
+ArrowStream
+Avro
+BSONEachRow
+CSV
+CSVWithNames
+CSVWithNamesAndTypes
+CustomSeparated
+CustomSeparatedWithNames
+CustomSeparatedWithNamesAndTypes
+JSON
+JSONColumns
+JSONColumnsWithMetadata
+JSONCompact
+JSONCompactColumns
+JSONCompactEachRow
+JSONCompactEachRowWithNames
+JSONCompactEachRowWithNamesAndTypes
+JSONCompactStringsEachRow
+JSONCompactStringsEachRowWithNames
+JSONCompactStringsEachRowWithNamesAndTypes
+JSONEachRow
+JSONLines
+JSONObjectEachRow
+JSONStringsEachRow
+MsgPack
+NDJSON
+Native
+ORC
+Parquet
+Raw
+RawWithNames
+RawWithNamesAndTypes
+RowBinary
+RowBinaryWithNames
+RowBinaryWithNamesAndTypes
+TSKV
+TSV
+TSVRaw
+TSVRawWithNames
+TSVRawWithNamesAndTypes
+TSVWithNames
+TSVWithNamesAndTypes
+TabSeparated
+TabSeparatedRaw
+TabSeparatedRawWithNames
+TabSeparatedRawWithNamesAndTypes
+TabSeparatedWithNames
+TabSeparatedWithNamesAndTypes
+Values
+LineAsString
+OK

diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.reference b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference
index 2de728b4cb4..f42a6d39d4f 100644
--- a/tests/queries/0_stateless/02187_async_inserts_all_formats.reference
+++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference
@@ -28,6 +28,9 @@ NDJSON
 Native
 ORC
 Parquet
+Raw
+RawWithNames
+RawWithNamesAndTypes
 RowBinary
 RowBinaryWithNames
 RowBinaryWithNamesAndTypes

From 85f766f27c0cbd49e267334c3d184627b554e853 Mon Sep 17 00:00:00 2001
From: unashi
Date: Tue, 7 May 2024 21:11:45 +0800
Subject: [PATCH 111/289] [update] update a stateless case

---
 .../02187_async_inserts_all_formats.
| 52 ------------------- 1 file changed, 52 deletions(-) delete mode 100644 tests/queries/0_stateless/02187_async_inserts_all_formats. diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats. b/tests/queries/0_stateless/02187_async_inserts_all_formats. deleted file mode 100644 index f42a6d39d4f..00000000000 --- a/tests/queries/0_stateless/02187_async_inserts_all_formats. +++ /dev/null @@ -1,52 +0,0 @@ -Arrow -ArrowStream -Avro -BSONEachRow -CSV -CSVWithNames -CSVWithNamesAndTypes -CustomSeparated -CustomSeparatedWithNames -CustomSeparatedWithNamesAndTypes -JSON -JSONColumns -JSONColumnsWithMetadata -JSONCompact -JSONCompactColumns -JSONCompactEachRow -JSONCompactEachRowWithNames -JSONCompactEachRowWithNamesAndTypes -JSONCompactStringsEachRow -JSONCompactStringsEachRowWithNames -JSONCompactStringsEachRowWithNamesAndTypes -JSONEachRow -JSONLines -JSONObjectEachRow -JSONStringsEachRow -MsgPack -NDJSON -Native -ORC -Parquet -Raw -RawWithNames -RawWithNamesAndTypes -RowBinary -RowBinaryWithNames -RowBinaryWithNamesAndTypes -TSKV -TSV -TSVRaw -TSVRawWithNames -TSVRawWithNamesAndTypes -TSVWithNames -TSVWithNamesAndTypes -TabSeparated -TabSeparatedRaw -TabSeparatedRawWithNames -TabSeparatedRawWithNamesAndTypes -TabSeparatedWithNames -TabSeparatedWithNamesAndTypes -Values -LineAsString -OK From 6f2a39b29131578acd10a79486f29f23e323e311 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 7 May 2024 15:32:10 +0200 Subject: [PATCH 112/289] Revert "Do in reverse order" This reverts commit 0b0e97917e3e4ab27a17cbf14d9e73163a20adbe. --- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 492e4065502..eb757e1d8c7 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -1007,10 +1007,6 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar filterColumns(columns, read_result.final_filter); } - /// If columns not empty, then apply on-fly alter conversions if any required - if (!prewhere_info || prewhere_info->perform_alter_conversions) - merge_tree_reader->performRequiredConversions(columns); - /// If some columns absent in part, then evaluate default values if (should_evaluate_missing_defaults) { @@ -1022,6 +1018,9 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar merge_tree_reader->evaluateMissingDefaults(additional_columns, columns); } + /// If columns not empty, then apply on-fly alter conversions if any required + if (!prewhere_info || prewhere_info->perform_alter_conversions) + merge_tree_reader->performRequiredConversions(columns); } read_result.columns.reserve(read_result.columns.size() + columns.size()); @@ -1047,14 +1046,14 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar bool should_evaluate_missing_defaults; merge_tree_reader->fillMissingColumns(columns, should_evaluate_missing_defaults, read_result.num_rows); - /// If result not empty, then apply on-fly alter conversions if any required - if (!prewhere_info || prewhere_info->perform_alter_conversions) - merge_tree_reader->performRequiredConversions(columns); - /// If some columns absent in part, then evaluate default values if (should_evaluate_missing_defaults) merge_tree_reader->evaluateMissingDefaults({}, columns); + /// If result not empty, then apply on-fly alter conversions if any required + 
if (!prewhere_info || prewhere_info->perform_alter_conversions) + merge_tree_reader->performRequiredConversions(columns); + for (size_t i = 0; i < columns.size(); ++i) read_result.columns[i] = std::move(columns[i]); } From 49284724ef4b34bbc5112bfa63a99625e8d35fe7 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 7 May 2024 16:10:21 +0200 Subject: [PATCH 113/289] fix errorCodes in fast tests --- tests/queries/0_stateless/00909_arrayEnumerateUniq.sql | 2 +- tests/queries/0_stateless/01045_array_zip.sql | 2 +- tests/queries/0_stateless/02354_parse_timedelta.sql | 4 ++-- tests/queries/0_stateless/02372_now_in_block.sql | 2 +- tests/queries/0_stateless/02718_array_fold.sql | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql b/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql index 0bdb338e9d2..fe01b2185c2 100644 --- a/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql +++ b/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql @@ -152,7 +152,7 @@ DROP TABLE arrays_test; select '---------BAD'; -SELECT arrayEnumerateUniqRanked(); -- { serverError 42 } +SELECT arrayEnumerateUniqRanked(); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION } SELECT arrayEnumerateUniqRanked([]); SELECT arrayEnumerateUniqRanked(1); -- { serverError 36 } SELECT arrayEnumerateUniqRanked(2,[]); -- { serverError 36 } diff --git a/tests/queries/0_stateless/01045_array_zip.sql b/tests/queries/0_stateless/01045_array_zip.sql index 1a85e6a0874..a2d54c8ae3f 100644 --- a/tests/queries/0_stateless/01045_array_zip.sql +++ b/tests/queries/0_stateless/01045_array_zip.sql @@ -2,7 +2,7 @@ SELECT arrayZip(['a', 'b', 'c'], ['d', 'e', 'f']); SELECT arrayZip(['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']); -SELECT arrayZip(); -- { serverError 42 } +SELECT arrayZip(); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION } SELECT arrayZip('a', 'b', 'c'); -- { serverError 43 } diff --git a/tests/queries/0_stateless/02354_parse_timedelta.sql b/tests/queries/0_stateless/02354_parse_timedelta.sql index 29f2bf9fdfc..e876de64374 100644 --- a/tests/queries/0_stateless/02354_parse_timedelta.sql +++ b/tests/queries/0_stateless/02354_parse_timedelta.sql @@ -11,8 +11,8 @@ SELECT parseTimeDelta('1s - 1ms : 1μs ; 1ns'); SELECT parseTimeDelta('1.11s1.11ms1.11us1.11ns'); -- invalid expressions -SELECT parseTimeDelta(); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} -SELECT parseTimeDelta('1yr', 1); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT parseTimeDelta(); -- {serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION} +SELECT parseTimeDelta('1yr', 1); -- {serverError TOO_MANY_ARGUMENTS_FOR_FUNCTION} SELECT parseTimeDelta(1); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} SELECT parseTimeDelta(' '); -- {serverError BAD_ARGUMENTS} SELECT parseTimeDelta('-1yr'); -- {serverError BAD_ARGUMENTS} diff --git a/tests/queries/0_stateless/02372_now_in_block.sql b/tests/queries/0_stateless/02372_now_in_block.sql index 7c884c0ba7d..815f74e5845 100644 --- a/tests/queries/0_stateless/02372_now_in_block.sql +++ b/tests/queries/0_stateless/02372_now_in_block.sql @@ -1,4 +1,4 @@ SELECT count() FROM (SELECT DISTINCT nowInBlock(), nowInBlock('Pacific/Pitcairn') FROM system.numbers LIMIT 2); SELECT nowInBlock(1); -- { serverError 43 } SELECT nowInBlock(NULL) IS NULL; -SELECT nowInBlock('UTC', 'UTC'); -- { serverError 42 } +SELECT nowInBlock('UTC', 'UTC'); -- { serverError TOO_MANY_ARGUMENTS_FOR_FUNCTION } diff --git 
a/tests/queries/0_stateless/02718_array_fold.sql b/tests/queries/0_stateless/02718_array_fold.sql index 7dee33c4705..73a3cebdda1 100644 --- a/tests/queries/0_stateless/02718_array_fold.sql +++ b/tests/queries/0_stateless/02718_array_fold.sql @@ -1,10 +1,10 @@ SELECT '-- Negative tests'; -SELECT arrayFold(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT arrayFold(1); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT arrayFold(1, toUInt64(0)); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT arrayFold(); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION } +SELECT arrayFold(1); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION } +SELECT arrayFold(1, toUInt64(0)); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION } SELECT arrayFold(1, emptyArrayUInt64(), toUInt64(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x -> x, emptyArrayString(), toInt8(0)); -- { serverError TYPE_MISMATCH } -SELECT arrayFold( acc,x -> x, 'not an array', toUInt8(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayFold( acc,x -> x, 'not an array', toUInt8(0)); -- { serverError I02718_array_foldLLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x,y -> x, [0, 1], 'not an array', toUInt8(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x -> x, [0, 1], [2, 3], toUInt8(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x,y -> x, [0, 1], [2, 3, 4], toUInt8(0)); -- { serverError SIZES_OF_ARRAYS_DONT_MATCH } From d3155707e9fdf56671af19dd38388861468378a1 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 6 May 2024 12:09:12 +0000 Subject: [PATCH 114/289] no filter pushdow for group_by_use_nulls --- src/Processors/QueryPlan/AggregatingStep.h | 1 + src/Processors/QueryPlan/Optimizations/filterPushDown.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/Processors/QueryPlan/AggregatingStep.h b/src/Processors/QueryPlan/AggregatingStep.h index f446ecec846..ae43295024a 100644 --- a/src/Processors/QueryPlan/AggregatingStep.h +++ b/src/Processors/QueryPlan/AggregatingStep.h @@ -59,6 +59,7 @@ public: const Aggregator::Params & getParams() const { return params; } const auto & getGroupingSetsParamsList() const { return grouping_sets_params; } + bool isGroupByUseNulls() const { return group_by_use_nulls; } bool inOrder() const { return !sort_description_for_merging.empty(); } bool explicitSortingRequired() const { return explicit_sorting_required_for_aggregation_in_order; } diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 7dd526cbe95..5b3bcfc4468 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -428,6 +428,9 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes /// of the grouping sets, we could not push the filter down. if (aggregating->isGroupingSets()) { + /// Cannot push down filter if type has been changed. 
+ if (aggregating->isGroupByUseNulls()) + return 0; const auto & actions = filter->getExpression(); const auto & filter_node = actions->findInOutputs(filter->getFilterColumnName()); From fd75522ea8c8f5ccd0dd6251ea5bb5480823f9c4 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 6 May 2024 12:09:47 +0000 Subject: [PATCH 115/289] analyzer: fix select * with grouping sets --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 52efee03ae4..5fd49aa4541 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -4815,6 +4815,19 @@ ProjectionNames QueryAnalyzer::resolveMatcher(QueryTreeNodePtr & matcher_node, I } } + if (!scope.expressions_in_resolve_process_stack.hasAggregateFunction()) + { + for (auto & [node, _] : matched_expression_nodes_with_names) + { + auto it = scope.nullable_group_by_keys.find(node); + if (it != scope.nullable_group_by_keys.end()) + { + node = it->node->clone(); + node->convertToNullable(); + } + } + } + std::unordered_map> strict_transformer_to_used_column_names; for (const auto & transformer : matcher_node_typed.getColumnTransformers().getNodes()) { From 063db335480331bf0003f220ab74d93797a8ab17 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 6 May 2024 12:10:11 +0000 Subject: [PATCH 116/289] verbose error --- src/Functions/FunctionHelpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionHelpers.cpp b/src/Functions/FunctionHelpers.cpp index ce83a489a3d..b31127d3896 100644 --- a/src/Functions/FunctionHelpers.cpp +++ b/src/Functions/FunctionHelpers.cpp @@ -80,7 +80,7 @@ ColumnWithTypeAndName columnGetNested(const ColumnWithTypeAndName & col) return ColumnWithTypeAndName{ nullable_res, nested_type, col.name }; } else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column for DataTypeNullable"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} for DataTypeNullable", col.dumpStructure()); } return col; } From cf9bc27c0b80c17b887170c4182439de1c6e8643 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 6 May 2024 12:12:11 +0000 Subject: [PATCH 117/289] add test --- ...grouping_sets_use_nulls_pushdown.reference | 8 +++++ ...03150_grouping_sets_use_nulls_pushdown.sql | 29 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference create mode 100644 tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql diff --git a/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference new file mode 100644 index 00000000000..e9f85e9052f --- /dev/null +++ b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference @@ -0,0 +1,8 @@ +2023-01-05 hello, world +2023-01-05 \N +2023-01-05 hello, world +2023-01-05 \N +2023-01-05 10 +2023-01-05 10 +2023-01-05 hello, world +2023-01-05 \N diff --git a/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql new file mode 100644 index 00000000000..0f057a1c99d --- /dev/null +++ b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql @@ -0,0 +1,29 @@ +DROP TABLE IF EXISTS test_grouping_sets_predicate; + +CREATE TABLE test_grouping_sets_predicate ( day_ Date, type_1 String ) ENGINE=MergeTree ORDER BY 
day_;
+
+INSERT INTO test_grouping_sets_predicate SELECT toDate('2023-01-05') AS day_, 'hello, world' FROM numbers (10);
+
+SET group_by_use_nulls = true;
+
+SELECT *
+FROM ( SELECT day_, type_1 FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (day_, type_1), (day_) ) )
+WHERE day_ = '2023-01-05'
+ORDER BY ALL;
+
+
+SELECT *
+FROM ( SELECT * FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (day_, type_1), (day_) ) )
+WHERE day_ = '2023-01-05'
+ORDER BY ALL;
+
+SELECT *
+FROM ( SELECT day_, COUNT(*) FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (day_, type_1), (day_) ) )
+WHERE day_ = '2023-01-05'
+ORDER BY ALL;
+
+
+SELECT t2.*
+FROM ( SELECT t1.* FROM test_grouping_sets_predicate t1 GROUP BY GROUPING SETS ( (day_, type_1), (day_) ) ) t2
+WHERE day_ = '2023-01-05'
+ORDER BY ALL;

From cad9c97725e4943730d0dabaa3df2cdf008be948 Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Tue, 7 May 2024 17:21:25 +0300
Subject: [PATCH 118/289] Analyzer setting max_streams_to_max_threads_ratio overflow fix

---
 .../AggregateFunctionSparkbar.cpp                   |  4 ++--
 src/Planner/PlannerJoinTree.cpp                     | 10 +++++++++-
 ...streams_to_max_threads_ratio_overflow.reference  |  0
 ...g_max_streams_to_max_threads_ratio_overflow.sql  | 14 ++++++++++++++
 4 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 tests/queries/0_stateless/03148_setting_max_streams_to_max_threads_ratio_overflow.reference
 create mode 100644 tests/queries/0_stateless/03148_setting_max_streams_to_max_threads_ratio_overflow.sql

diff --git a/src/AggregateFunctions/AggregateFunctionSparkbar.cpp b/src/AggregateFunctions/AggregateFunctionSparkbar.cpp
index b6e538520a8..362ffbe20d2 100644
--- a/src/AggregateFunctions/AggregateFunctionSparkbar.cpp
+++ b/src/AggregateFunctions/AggregateFunctionSparkbar.cpp
@@ -253,9 +253,9 @@ private:
         else
         {
             Y scaled;
-            bool has_overfllow = common::mulOverflow(y, levels_num, scaled);
+            bool has_overflow = common::mulOverflow(y, levels_num, scaled);
 
-            if (has_overfllow)
+            if (has_overflow)
                 y = y / (y_max / levels_num) + 1;
             else
                 y = scaled / y_max + 1;

diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp
index 85cad1dcd69..094cf73dbc6 100644
--- a/src/Planner/PlannerJoinTree.cpp
+++ b/src/Planner/PlannerJoinTree.cpp
@@ -708,7 +708,15 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres
 
     /// If necessary, we request more sources than the number of threads - to distribute the work evenly over the threads
     if (max_streams > 1 && !is_sync_remote)
-        max_streams = static_cast<size_t>(max_streams * settings.max_streams_to_max_threads_ratio);
+    {
+        if (auto streams_with_ratio = max_streams * settings.max_streams_to_max_threads_ratio; canConvertTo<size_t>(streams_with_ratio))
+            max_streams = static_cast<size_t>(streams_with_ratio);
+        else
+            throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND,
+                "Exceeded limit for `max_streams` with `max_streams_to_max_threads_ratio`. 
" + "Make sure that `max_streams * max_streams_to_max_threads_ratio` is in some reasonable boundaries, current value: {}", + streams_with_ratio); + } if (table_node) table_expression_query_info.table_expression_modifiers = table_node->getTableExpressionModifiers(); diff --git a/tests/queries/0_stateless/03148_setting_max_streams_to_max_threads_ratio_overflow.reference b/tests/queries/0_stateless/03148_setting_max_streams_to_max_threads_ratio_overflow.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03148_setting_max_streams_to_max_threads_ratio_overflow.sql b/tests/queries/0_stateless/03148_setting_max_streams_to_max_threads_ratio_overflow.sql new file mode 100644 index 00000000000..af326c15bd8 --- /dev/null +++ b/tests/queries/0_stateless/03148_setting_max_streams_to_max_threads_ratio_overflow.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS test_table; +CREATE TABLE test_table +( + id UInt64, + value String +) ENGINE = MergeTree ORDER BY id; + +INSERT INTO test_table VALUES (0, 'Value_0'); + +SELECT * FROM test_table SETTINGS max_threads = 1025, max_streams_to_max_threads_ratio = -9223372036854775808, allow_experimental_analyzer = 1; -- { serverError PARAMETER_OUT_OF_BOUND } + +SELECT * FROM test_table SETTINGS max_threads = 1025, max_streams_to_max_threads_ratio = -9223372036854775808, allow_experimental_analyzer = 0; -- { serverError PARAMETER_OUT_OF_BOUND } + +DROP TABLE test_table; From c67a43e3a7354c3db488df44c05e5a0d56a454c5 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 7 May 2024 14:37:39 +0000 Subject: [PATCH 119/289] better --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 3 +++ src/Analyzer/ValidationUtils.cpp | 4 ++++ ..._grouping_sets_use_nulls_pushdown.reference | 6 ++++++ .../03150_grouping_sets_use_nulls_pushdown.sql | 18 ++++++++++++++++++ .../0_stateless/03151_where_asterisk.reference | 1 + .../0_stateless/03151_where_asterisk.sql | 3 +++ 6 files changed, 35 insertions(+) create mode 100644 tests/queries/0_stateless/03151_where_asterisk.reference create mode 100644 tests/queries/0_stateless/03151_where_asterisk.sql diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 5fd49aa4541..518c1272761 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -5021,7 +5021,10 @@ ProjectionNames QueryAnalyzer::resolveMatcher(QueryTreeNodePtr & matcher_node, I scope.scope_node->formatASTForErrorMessage()); } + auto original_ast = matcher_node->getOriginalAST(); matcher_node = std::move(list); + if (original_ast) + matcher_node->setOriginalAST(original_ast); return result_projection_names; } diff --git a/src/Analyzer/ValidationUtils.cpp b/src/Analyzer/ValidationUtils.cpp index 60cc1dd521f..45a916016ce 100644 --- a/src/Analyzer/ValidationUtils.cpp +++ b/src/Analyzer/ValidationUtils.cpp @@ -26,6 +26,10 @@ namespace void validateFilter(const QueryTreeNodePtr & filter_node, std::string_view exception_place_message, const QueryTreeNodePtr & query_node) { + if (filter_node->getNodeType() == QueryTreeNodeType::LIST) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unsupported expression '{}' in filter", filter_node->formatASTForErrorMessage()); + auto filter_node_result_type = filter_node->getResultType(); if (!filter_node_result_type->canBeUsedInBooleanContext()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER, diff --git a/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference 
b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference index e9f85e9052f..209c455b6f5 100644 --- a/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference +++ b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.reference @@ -2,6 +2,12 @@ 2023-01-05 \N 2023-01-05 hello, world 2023-01-05 \N +2023-01-05 +2023-01-05 +2023-01-05 hello, world +2023-01-05 \N +2023-01-05 hello, world +2023-01-05 \N 2023-01-05 10 2023-01-05 10 2023-01-05 hello, world diff --git a/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql index 0f057a1c99d..6162f55ac8f 100644 --- a/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql +++ b/tests/queries/0_stateless/03150_grouping_sets_use_nulls_pushdown.sql @@ -17,6 +17,24 @@ FROM ( SELECT * FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (day_ WHERE day_ = '2023-01-05' ORDER BY ALL; +SELECT * +FROM ( SELECT day_ FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (day_, type_1), (day_) ) ) +WHERE day_ = '2023-01-05' +ORDER BY * +SETTINGS allow_experimental_analyzer=1; + +SELECT * +FROM ( SELECT * FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (day_, type_1), (day_) ) ) +WHERE day_ = '2023-01-05' +GROUP BY * +SETTINGS allow_experimental_analyzer=1; + +SELECT * +FROM ( SELECT * FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (*), (day_) ) ) +WHERE day_ = '2023-01-05' +GROUP BY GROUPING SETS (*) +SETTINGS allow_experimental_analyzer=1; + SELECT * FROM ( SELECT day_, COUNT(*) FROM test_grouping_sets_predicate GROUP BY GROUPING SETS ( (day_, type_1), (day_) ) ) WHERE day_ = '2023-01-05' diff --git a/tests/queries/0_stateless/03151_where_asterisk.reference b/tests/queries/0_stateless/03151_where_asterisk.reference new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/tests/queries/0_stateless/03151_where_asterisk.reference @@ -0,0 +1 @@ + diff --git a/tests/queries/0_stateless/03151_where_asterisk.sql b/tests/queries/0_stateless/03151_where_asterisk.sql new file mode 100644 index 00000000000..4b01d6e165c --- /dev/null +++ b/tests/queries/0_stateless/03151_where_asterisk.sql @@ -0,0 +1,3 @@ +SET allow_experimental_analyzer = 1; + +SELECT * FROM (SELECT 1) t1 WHERE *; -- { serverError BAD_ARGUMENTS } From c45c1444355b740ed01fe1849f18f17c689d826b Mon Sep 17 00:00:00 2001 From: Yohann Jardin Date: Tue, 7 May 2024 16:39:43 +0200 Subject: [PATCH 120/289] fix typo in fast tests --- tests/queries/0_stateless/02718_array_fold.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02718_array_fold.sql b/tests/queries/0_stateless/02718_array_fold.sql index 73a3cebdda1..e59eae87fdf 100644 --- a/tests/queries/0_stateless/02718_array_fold.sql +++ b/tests/queries/0_stateless/02718_array_fold.sql @@ -4,7 +4,7 @@ SELECT arrayFold(1); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION } SELECT arrayFold(1, toUInt64(0)); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION } SELECT arrayFold(1, emptyArrayUInt64(), toUInt64(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x -> x, emptyArrayString(), toInt8(0)); -- { serverError TYPE_MISMATCH } -SELECT arrayFold( acc,x -> x, 'not an array', toUInt8(0)); -- { serverError I02718_array_foldLLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayFold( acc,x -> x, 'not an array', toUInt8(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x,y -> x, [0, 1], 'not an array', 
toUInt8(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x -> x, [0, 1], [2, 3], toUInt8(0)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayFold( acc,x,y -> x, [0, 1], [2, 3, 4], toUInt8(0)); -- { serverError SIZES_OF_ARRAYS_DONT_MATCH } From 5117422c7bebd04e3511abff3b02fd26561cda13 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Tue, 7 May 2024 14:48:50 +0000 Subject: [PATCH 121/289] init --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../Formats/Impl/PrettyBlockOutputFormat.cpp | 10 +++++----- .../Impl/PrettyCompactBlockOutputFormat.cpp | 4 ++-- .../Impl/PrettySpaceBlockOutputFormat.cpp | 4 ++-- .../03132_pretty_format_break_line.reference | 19 +++++++++++++++++++ .../03132_pretty_format_break_line.sql | 7 +++++++ 8 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b4313d9af56..d640a556fb6 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1113,6 +1113,7 @@ class IColumn; \ M(String, format_json_object_each_row_column_for_object_name, "", "The name of column that will be used as object names in JSONObjectEachRow format. Column type should be String", 0) \ \ + M(Bool, output_format_pretty_preserve_border_for_multiline_string, true, "Applies better rendering for multiline strings.", 0) \ M(UInt64, output_format_pretty_max_rows, 10000, "Rows limit for Pretty formats.", 0) \ M(UInt64, output_format_pretty_max_column_pad_width, 250, "Maximum width to pad all values in a column in Pretty formats.", 0) \ M(UInt64, output_format_pretty_max_value_width, 10000, "Maximum width of value to display in Pretty formats. If greater - it will be cut.", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index b7e9899da46..cca4c30f5e0 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -173,6 +173,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.pretty.max_value_width_apply_for_single_value = settings.output_format_pretty_max_value_width_apply_for_single_value; format_settings.pretty.highlight_digit_groups = settings.output_format_pretty_highlight_digit_groups; format_settings.pretty.output_format_pretty_row_numbers = settings.output_format_pretty_row_numbers; + format_settings.pretty.preserve_border_for_multiline_string = settings.output_format_pretty_preserve_border_for_multiline_string; format_settings.pretty.output_format_pretty_single_large_number_tip_threshold = settings.output_format_pretty_single_large_number_tip_threshold; format_settings.protobuf.input_flatten_google_wrappers = settings.input_format_protobuf_flatten_google_wrappers; format_settings.protobuf.output_nullables_with_google_wrappers = settings.output_format_protobuf_nullables_with_google_wrappers; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index da225a39ec9..e320ea8e6b6 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -282,6 +282,7 @@ struct FormatSettings SettingFieldUInt64Auto color{"auto"}; bool output_format_pretty_row_numbers = false; + bool preserve_border_for_multiline_string = true; UInt64 output_format_pretty_single_large_number_tip_threshold = 1'000'000; enum class Charset diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp index 178d0b912e1..e23622bc2e5 100644 --- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp +++ 
b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp @@ -38,7 +38,7 @@ void PrettyBlockOutputFormat::calculateWidths( max_padded_widths.resize_fill(num_columns); name_widths.resize(num_columns); - const bool need_cut_to_width = format_settings.pretty.max_value_width_apply_for_single_value || num_rows != 1 || num_columns != 1 || total_rows != 0; + const bool need_cut_to_width = format_settings.pretty.preserve_border_for_multiline_string && (format_settings.pretty.max_value_width_apply_for_single_value || num_rows != 1 || num_columns != 1 || total_rows != 0); /// Calculate widths of all values. String serialized_value; @@ -333,7 +333,7 @@ void PrettyBlockOutputFormat::writeChunk(const Chunk & chunk, PortKind port_kind WriteBufferFromString out_serialize(serialized_value, AppendModeTag()); serializations[j]->serializeText(*columns[j], i, out_serialize, format_settings); } - if (cut_to_width) + if (cut_to_width && format_settings.pretty.preserve_border_for_multiline_string) splitValueAtBreakLine(serialized_value, transferred_row[j], cur_width); has_transferred_row |= !transferred_row[j].empty() && cur_width <= cut_to_width; @@ -345,7 +345,7 @@ void PrettyBlockOutputFormat::writeChunk(const Chunk & chunk, PortKind port_kind writeReadableNumberTip(chunk); writeCString("\n", out); - if (has_transferred_row) + if (has_transferred_row && format_settings.pretty.preserve_border_for_multiline_string) writeTransferredRow(max_widths, header, transferred_row, cut_to_width, false); } @@ -453,7 +453,7 @@ void PrettyBlockOutputFormat::writeValueWithPadding( value_width = format_settings.pretty.max_value_width; has_break_line = false; } - else if (!has_break_line) + else if (!has_break_line || !format_settings.pretty.preserve_border_for_multiline_string) value += ' '; auto write_padding = [&]() @@ -478,7 +478,7 @@ void PrettyBlockOutputFormat::writeValueWithPadding( write_padding(); } - if (has_break_line) + if (has_break_line && format_settings.pretty.preserve_border_for_multiline_string) writeString("…", out); } diff --git a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp index 345b6c84ecd..1ccb6d713d7 100644 --- a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp @@ -182,7 +182,7 @@ void PrettyCompactBlockOutputFormat::writeRow( WriteBufferFromString out_serialize(serialized_value, AppendModeTag()); serializations[j]->serializeText(*columns[j], row_num, out_serialize, format_settings); } - if (cut_to_width) + if (cut_to_width && format_settings.pretty.preserve_border_for_multiline_string) splitValueAtBreakLine(serialized_value, transferred_row[j], cur_width); has_transferred_row |= !transferred_row[j].empty() && cur_width <= cut_to_width; @@ -194,7 +194,7 @@ void PrettyCompactBlockOutputFormat::writeRow( writeReadableNumberTip(chunk); writeCString("\n", out); - if (has_transferred_row) + if (has_transferred_row && format_settings.pretty.preserve_border_for_multiline_string) writeTransferredRow(max_widths, header, transferred_row, cut_to_width, false); } diff --git a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp index 6940c20e25b..0ed8c4398e4 100644 --- a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp @@ -100,7 +100,7 @@ void PrettySpaceBlockOutputFormat::writeChunk(const Chunk & chunk, 
PortKind port WriteBufferFromString out_serialize(serialized_value, AppendModeTag()); serializations[column]->serializeText(*columns[column], row, out_serialize, format_settings); } - if (cut_to_width) + if (cut_to_width && format_settings.pretty.preserve_border_for_multiline_string) splitValueAtBreakLine(serialized_value, transferred_row[column], cur_width); has_transferred_row |= !transferred_row[column].empty() && cur_width <= cut_to_width; @@ -111,7 +111,7 @@ void PrettySpaceBlockOutputFormat::writeChunk(const Chunk & chunk, PortKind port writeReadableNumberTip(chunk); writeChar('\n', out); - if (has_transferred_row) + if (has_transferred_row && format_settings.pretty.preserve_border_for_multiline_string) writeTransferredRow(max_widths, header, transferred_row, cut_to_width, true); } diff --git a/tests/queries/0_stateless/03132_pretty_format_break_line.reference b/tests/queries/0_stateless/03132_pretty_format_break_line.reference index a5282f89327..b7b59df24fc 100644 --- a/tests/queries/0_stateless/03132_pretty_format_break_line.reference +++ b/tests/queries/0_stateless/03132_pretty_format_break_line.reference @@ -86,3 +86,22 @@ 2. │ 1 │ hello world │ hellow …│ │ │ │…мир │ └────┴─────────────┴─────────────┘ +┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ id ┃ value ┃ value1 ┃ +┡━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ 0 │ привет +world │ hello world │ +├────┼─────────────┼─────────────┤ +│ 1 │ hello world │ hellow +мир │ +└────┴─────────────┴─────────────┘ + ┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ + ┃ id ┃ value ┃ value1 ┃ + ┡━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +1. │ 0 │ привет +world │ hello world │ + ├────┼─────────────┼─────────────┤ +2. │ 1 │ hello world │ hellow +мир │ + └────┴─────────────┴─────────────┘ + \ No newline at end of file diff --git a/tests/queries/0_stateless/03132_pretty_format_break_line.sql b/tests/queries/0_stateless/03132_pretty_format_break_line.sql index ecf967c1836..5f002d8df67 100644 --- a/tests/queries/0_stateless/03132_pretty_format_break_line.sql +++ b/tests/queries/0_stateless/03132_pretty_format_break_line.sql @@ -1,5 +1,7 @@ DROP TABLE IF EXISTS t_break_line; +SET output_format_pretty_preserve_border_for_multiline_string=1; + CREATE TABLE t_break_line (id UInt64, value String, value1 String) ENGINE=MergeTree ORDER BY id; INSERT INTO t_break_line VALUES(0, 'hello\nworld', 'hello world'); @@ -30,4 +32,9 @@ INSERT INTO t_break_line VALUES(1, 'hello world', 'hellow\nмир'); SELECT * FROM t_break_line ORDER BY id FORMAT PrettyMonoBlock SETTINGS output_format_pretty_row_numbers = 0; SELECT * FROM t_break_line ORDER BY id FORMAT PrettyMonoBlock; +SET output_format_pretty_preserve_border_for_multiline_string=0; + +SELECT * FROM t_break_line ORDER BY id FORMAT PrettyMonoBlock SETTINGS output_format_pretty_row_numbers = 0; +SELECT * FROM t_break_line ORDER BY id FORMAT PrettyMonoBlock; + DROP TABLE t_break_line; \ No newline at end of file From ac485b612de0e7e79188fdb07cd63a3500f92132 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 7 May 2024 17:11:37 +0200 Subject: [PATCH 122/289] add setting to SettingsChangesHistory --- src/Core/SettingsChangesHistory.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index cd1cd341c29..9a3ed14dd7c 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -91,6 +91,7 @@ static std::map sett {"cross_join_min_rows_to_compress", 0, 10000000, "A new setting."}, 
{"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, + {"output_format_pretty_preserve_border_for_multiline_string", 1, 1, "Applies better rendering for multiline strings."}, }}, {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"}, {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"}, From 07472b3e95b8c0beceb8efc177872bb049faf6c6 Mon Sep 17 00:00:00 2001 From: Constantine Peresypkin Date: Wed, 10 Apr 2024 19:54:29 -0400 Subject: [PATCH 123/289] Add setting to force NULL for omitted fields Fixes #60884 --- src/Core/Settings.h | 1 + src/Core/SettingsChangesHistory.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../Impl/BSONEachRowRowInputFormat.cpp | 9 +++- .../Impl/JSONColumnsBlockInputFormatBase.cpp | 3 ++ .../Impl/JSONEachRowRowInputFormat.cpp | 10 ++++- .../Formats/Impl/TSKVRowInputFormat.cpp | 12 ++++- .../RowInputFormatWithNamesAndTypes.cpp | 20 +++++++++ .../03004_force_null_for_omitted.reference | 44 +++++++++++++++++++ .../03004_force_null_for_omitted.sql | 36 +++++++++++++++ 11 files changed, 135 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/03004_force_null_for_omitted.reference create mode 100644 tests/queries/0_stateless/03004_force_null_for_omitted.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b4313d9af56..f80bf1e4e3e 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1006,6 +1006,7 @@ class IColumn; M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices.", 0) \ M(Bool, input_format_null_as_default, true, "Initialize null fields with default values if the data type of this field is not nullable and it is supported by the input format", 0) \ + M(Bool, input_format_force_null_for_omitted_fields, false, "Force initialize omitted fields with null values", 0) \ M(Bool, input_format_arrow_case_insensitive_column_matching, false, "Ignore case when matching Arrow columns with CH columns.", 0) \ M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index cd1cd341c29..5ea99aa0192 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -91,6 +91,7 @@ static std::map sett {"cross_join_min_rows_to_compress", 0, 10000000, "A new setting."}, {"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, + {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"}, }}, {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"}, {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"}, diff --git 
a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index b7e9899da46..3199445864d 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -146,6 +146,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence; format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields; format_settings.null_as_default = settings.input_format_null_as_default; + format_settings.force_null_for_omitted_fields = settings.input_format_force_null_for_omitted_fields; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size; format_settings.parquet.row_group_bytes = settings.output_format_parquet_row_group_size_bytes; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index da225a39ec9..83b5c534297 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -32,6 +32,7 @@ struct FormatSettings bool write_statistics = true; bool import_nested_json = false; bool null_as_default = true; + bool force_null_for_omitted_fields = false; bool decimal_trailing_zeros = false; bool defaults_for_omitted_fields = true; bool is_writing_to_terminal = false; diff --git a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp index 340bcc8aae5..6a3475a1830 100644 --- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp @@ -39,6 +39,7 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int TOO_LARGE_STRING_SIZE; extern const int UNKNOWN_TYPE; + extern const int TYPE_MISMATCH; } namespace @@ -820,7 +821,13 @@ bool BSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi /// Fill non-visited columns with the default values. 
for (size_t i = 0; i < num_columns; ++i) if (!seen_columns[i]) - header.getByPosition(i).type->insertDefaultInto(*columns[i]); + { + const auto & type = header.getByPosition(i).type; + if (format_settings.force_null_for_omitted_fields && !isNullableOrLowCardinalityNullable(type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot insert NULL value into a column of type '{}' at index {}", type->getName(), i); + else + type->insertDefaultInto(*columns[i]); + } if (format_settings.defaults_for_omitted_fields) ext.read_columns = read_columns; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index faa4f36bbb0..e61e55efc8e 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -13,6 +13,7 @@ namespace ErrorCodes { extern const int INCORRECT_DATA; extern const int EMPTY_DATA_PASSED; + extern const int TYPE_MISMATCH; } @@ -194,6 +195,8 @@ Chunk JSONColumnsBlockInputFormatBase::read() { if (!seen_columns[i]) { + if (format_settings.force_null_for_omitted_fields && !isNullableOrLowCardinalityNullable(fields[i].type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot insert NULL value into a column `{}` of type '{}'", fields[i].name, fields[i].type->getName()); columns[i]->insertManyDefaults(rows); if (format_settings.defaults_for_omitted_fields) block_missing_values.setBits(i, rows); diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index a78d8d016cd..8855a1bc28d 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -18,6 +18,7 @@ namespace ErrorCodes extern const int INCORRECT_DATA; extern const int CANNOT_READ_ALL_DATA; extern const int LOGICAL_ERROR; + extern const int TYPE_MISMATCH; } namespace @@ -233,7 +234,14 @@ bool JSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi /// Fill non-visited columns with the default values. for (size_t i = 0; i < num_columns; ++i) if (!seen_columns[i]) - header.getByPosition(i).type->insertDefaultInto(*columns[i]); + { + const auto & type = header.getByPosition(i).type; + if (format_settings.force_null_for_omitted_fields && !isNullableOrLowCardinalityNullable(type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot insert NULL value into a column `{}` of type '{}'", columnName(i), type->getName()); + else + type->insertDefaultInto(*columns[i]); + } + /// Return info about defaults set. /// If defaults_for_omitted_fields is set to 0, we should just leave already inserted defaults. diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 29bc0012dc0..5382527fcdc 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -15,6 +15,7 @@ namespace ErrorCodes extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; extern const int CANNOT_READ_ALL_DATA; extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; + extern const int TYPE_MISMATCH; } @@ -190,7 +191,16 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex /// Fill in the not met columns with default values. 
for (size_t i = 0; i < num_columns; ++i) if (!seen_columns[i]) - header.getByPosition(i).type->insertDefaultInto(*columns[i]); + { + const auto & type = header.getByPosition(i).type; + if (format_settings.force_null_for_omitted_fields && !isNullableOrLowCardinalityNullable(type)) + throw Exception( + ErrorCodes::TYPE_MISMATCH, + "Cannot insert NULL value into a column `{}` of type '{}'", + header.getByPosition(i).name, + type->getName()); + type->insertDefaultInto(*columns[i]); + } /// return info about defaults set if (format_settings.defaults_for_omitted_fields) diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 2ad6a825c8f..ae30d741c2f 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -18,6 +18,7 @@ namespace ErrorCodes { extern const int INCORRECT_DATA; extern const int LOGICAL_ERROR; + extern const int TYPE_MISMATCH; } namespace @@ -124,6 +125,17 @@ void RowInputFormatWithNamesAndTypes::readPrefix() } } } + + if (format_settings.force_null_for_omitted_fields) + { + for (auto index : column_mapping->not_presented_columns) + if (!isNullableOrLowCardinalityNullable(data_types[index])) + throw Exception( + ErrorCodes::TYPE_MISMATCH, + "Cannot insert NULL value into a column type '{}' at index {}", + data_types[index]->getName(), + index); + } } void RowInputFormatWithNamesAndTypes::tryDetectHeader(std::vector & column_names_out, std::vector & type_names_out) @@ -217,7 +229,15 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE { const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; if (rem_column_index) + { + if (format_settings.force_null_for_omitted_fields && !isNullableOrLowCardinalityNullable(data_types[*rem_column_index])) + throw Exception( + ErrorCodes::TYPE_MISMATCH, + "Cannot insert NULL value into a column type '{}' at index {}", + data_types[*rem_column_index]->getName(), + *rem_column_index); columns[*rem_column_index]->insertDefault(); + } ++file_column; } break; diff --git a/tests/queries/0_stateless/03004_force_null_for_omitted.reference b/tests/queries/0_stateless/03004_force_null_for_omitted.reference new file mode 100644 index 00000000000..a4c928aae8c --- /dev/null +++ b/tests/queries/0_stateless/03004_force_null_for_omitted.reference @@ -0,0 +1,44 @@ +0 0 +0 0 +2 0 +0 0 +4 0 +0 \N +0 \N +2 \N +0 \N +4 \N +0 \N +0 \N +2 \N +0 \N +4 \N +0 \N +0 \N +2 \N +0 \N +4 \N +0 \N +0 \N +2 \N +0 \N +4 \N +0 +0 \N +1 \N +1 \N +1 \N +1 \N +1 0 +1 \N +1 \N +1 2 +3 0 +1 0 +1 \N +1 \N +1 2 +3 0 +1 0 +1 \N +1 \N diff --git a/tests/queries/0_stateless/03004_force_null_for_omitted.sql b/tests/queries/0_stateless/03004_force_null_for_omitted.sql new file mode 100644 index 00000000000..43ba2568acb --- /dev/null +++ b/tests/queries/0_stateless/03004_force_null_for_omitted.sql @@ -0,0 +1,36 @@ +set allow_suspicious_low_cardinality_types = 1; +insert into function file(concat(currentDatabase(), '.03004_data.bsonEachRow'), auto, 'null Nullable(UInt32)') select number % 2 ? 
NULL : number from numbers(5) settings engine_file_truncate_on_insert=1; +select * from file(concat(currentDatabase(), '.03004_data.bsonEachRow'), auto, 'null UInt32, foo UInt32'); +select * from file(concat(currentDatabase(), '.03004_data.bsonEachRow'), auto, 'null UInt32, foo UInt32') settings input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } +select * from file(concat(currentDatabase(), '.03004_data.bsonEachRow'), auto, 'null UInt32, foo Nullable(UInt32)'); +select * from file(concat(currentDatabase(), '.03004_data.bsonEachRow'), auto, 'null UInt32, foo Nullable(UInt32)') settings input_format_force_null_for_omitted_fields = 1; +select * from file(concat(currentDatabase(), '.03004_data.bsonEachRow'), auto, 'null UInt32, foo LowCardinality(Nullable(UInt32))'); +select * from file(concat(currentDatabase(), '.03004_data.bsonEachRow'), auto, 'null UInt32, foo LowCardinality(Nullable(UInt32))') settings input_format_force_null_for_omitted_fields = 1; + +select * from format(JSONEachRow, 'foo UInt32', '{}'); +select * from format(JSONEachRow, 'foo UInt32', '{}') settings input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } +select * from format(JSONEachRow, 'foo UInt32, bar Nullable(UInt32)', '{}'); +select * from format(JSONEachRow, 'foo UInt32, bar Nullable(UInt32)', '{\"foo\":1}'); +select * from format(JSONEachRow, 'foo UInt32, bar Nullable(UInt32)', '{}') settings input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } +select * from format(JSONEachRow, 'foo UInt32, bar Nullable(UInt32)', '{\"foo\":1}') settings input_format_force_null_for_omitted_fields = 1; +select * from format(JSONEachRow, 'foo UInt32, bar LowCardinality(Nullable(UInt32))', '{\"foo\":1}'); +select * from format(JSONEachRow, 'foo UInt32, bar LowCardinality(Nullable(UInt32))', '{\"foo\":1}') settings input_format_force_null_for_omitted_fields = 1; + +select * from format(CSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo\nUInt32\n1'); +select * from format(CSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo\nUInt32\n1') settings input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } +select * from format(CSVWithNamesAndTypes, 'foo UInt32, bar Nullable(UInt32)', 'foo\nUInt32\n1') settings input_format_force_null_for_omitted_fields = 1; +select * from format(CSVWithNamesAndTypes, 'foo UInt32, bar LowCardinality(Nullable(UInt32))', 'foo\nUInt32\n1') settings input_format_force_null_for_omitted_fields = 1; +select * from format(CSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo,bar\nUInt32,UInt32\n1,2\n3\n') settings input_format_csv_allow_variable_number_of_columns = 1; +select * from format(CSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo,bar\nUInt32,UInt32\n1,2\n3\n') settings input_format_csv_allow_variable_number_of_columns = 1, input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } + +select * from format(TSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo\nUInt32\n1'); +select * from format(TSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo\nUInt32\n1') settings input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } +select * from format(TSVWithNamesAndTypes, 'foo UInt32, bar Nullable(UInt32)', 'foo\nUInt32\n1') settings input_format_force_null_for_omitted_fields = 1; +select * from format(TSVWithNamesAndTypes, 'foo UInt32, bar LowCardinality(Nullable(UInt32))', 'foo\nUInt32\n1') settings input_format_force_null_for_omitted_fields = 1; +select * 
from format(TSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo\tbar\nUInt32\tUInt32\n1\t2\n3\n') settings input_format_tsv_allow_variable_number_of_columns = 1; +select * from format(TSVWithNamesAndTypes, 'foo UInt32, bar UInt32', 'foo\tbar\nUInt32\tUInt32\n1\t2\n3\n') settings input_format_tsv_allow_variable_number_of_columns = 1, input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } + +select * from format(TSKV, 'foo UInt32, bar UInt32', 'foo=1\n'); +select * from format(TSKV, 'foo UInt32, bar UInt32', 'foo=1\n') settings input_format_force_null_for_omitted_fields = 1; -- { serverError TYPE_MISMATCH } +select * from format(TSKV, 'foo UInt32, bar Nullable(UInt32)', 'foo=1\n') settings input_format_force_null_for_omitted_fields = 1; +select * from format(TSKV, 'foo UInt32, bar LowCardinality(Nullable(UInt32))', 'foo=1\n') settings input_format_force_null_for_omitted_fields = 1; From b2377c3fefe8951158de201ea399485f6805f955 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 7 May 2024 15:31:35 +0000 Subject: [PATCH 124/289] Fix mysql dictionary source --- src/Dictionaries/ExternalQueryBuilder.cpp | 2 +- .../test_dictionaries_mysql/test.py | 38 ++++++++++++++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/Dictionaries/ExternalQueryBuilder.cpp b/src/Dictionaries/ExternalQueryBuilder.cpp index 792c4e3e907..a31301cd0f3 100644 --- a/src/Dictionaries/ExternalQueryBuilder.cpp +++ b/src/Dictionaries/ExternalQueryBuilder.cpp @@ -401,7 +401,7 @@ std::string ExternalQueryBuilder::composeLoadKeysQuery( { writeString("SELECT * FROM (", out); writeString(query, out); - writeString(") WHERE ", out); + writeString(") AS subquery WHERE ", out); composeKeysCondition(key_columns, requested_rows, method, partition_key_prefix, out); writeString(";", out); diff --git a/tests/integration/test_dictionaries_mysql/test.py b/tests/integration/test_dictionaries_mysql/test.py index 360456b2046..332f4ca11bb 100644 --- a/tests/integration/test_dictionaries_mysql/test.py +++ b/tests/integration/test_dictionaries_mysql/test.py @@ -76,7 +76,7 @@ def test_mysql_dictionaries_custom_query_full_load(started_cluster): query = instance.query query( - """ + f""" CREATE DICTIONARY test_dictionary_custom_query ( id UInt64, @@ -95,12 +95,46 @@ def test_mysql_dictionaries_custom_query_full_load(started_cluster): """ ) - result = query("SELECT id, value_1, value_2 FROM test_dictionary_custom_query") + result = query( + "SELECT dictGetString('test_dictionary_custom_query', 'value_1', toUInt64(1))" + ) + assert result == "Value_1\n" + result = query("SELECT id, value_1, value_2 FROM test_dictionary_custom_query") assert result == "1\tValue_1\tValue_2\n" query("DROP DICTIONARY test_dictionary_custom_query;") + query( + f""" + CREATE DICTIONARY test_cache_dictionary_custom_query + ( + id1 UInt64, + id2 UInt64, + value_concat String + ) + PRIMARY KEY id1, id2 + LAYOUT(COMPLEX_KEY_CACHE(SIZE_IN_CELLS 10)) + SOURCE(MYSQL( + HOST 'mysql80' + PORT 3306 + USER 'root' + PASSWORD 'clickhouse' + QUERY 'SELECT id AS id1, id + 1 AS id2, CONCAT_WS(" ", "The", value_1) AS value_concat FROM test.test_table_1')) + LIFETIME(0) + """ + ) + + result = query( + "SELECT dictGetString('test_cache_dictionary_custom_query', 'value_concat', (1, 2))" + ) + assert result == "The Value_1\n" + + result = query("SELECT id1, value_concat FROM test_cache_dictionary_custom_query") + assert result == "1\tThe Value_1\n" + + query("DROP DICTIONARY test_cache_dictionary_custom_query;") + 
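For context on the ExternalQueryBuilder change above: MySQL rejects a derived table that has no alias (error 1248, "Every derived table must have its own alias"), so the keys-lookup wrapper composed for a custom-query dictionary source has to name its subquery. A rough sketch of the composed SQL, where the inner query and the key condition are illustrative placeholders, not what the builder emits verbatim:

-- previously composed form; MySQL fails it with error 1248
SELECT * FROM (SELECT id, value_1 FROM test.test_table_1) WHERE id IN (1, 2);
-- form composed after this fix: the derived table is named
SELECT * FROM (SELECT id, value_1 FROM test.test_table_1) AS subquery WHERE id IN (1, 2);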
execute_mysql_query(mysql_connection, "DROP TABLE test.test_table_1;") execute_mysql_query(mysql_connection, "DROP TABLE test.test_table_2;") From 8e072f6b8d567ea82aa0c44bad6f26228b7bb96f Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 7 May 2024 18:58:51 +0200 Subject: [PATCH 125/289] Newline at the end of .reference --- .../queries/0_stateless/03132_pretty_format_break_line.reference | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/03132_pretty_format_break_line.reference b/tests/queries/0_stateless/03132_pretty_format_break_line.reference index b7b59df24fc..06b17ce4e12 100644 --- a/tests/queries/0_stateless/03132_pretty_format_break_line.reference +++ b/tests/queries/0_stateless/03132_pretty_format_break_line.reference @@ -104,4 +104,3 @@ world │ hello world │ 2. │ 1 │ hello world │ hellow мир │ └────┴─────────────┴─────────────┘ - \ No newline at end of file From 836cf150b5b4a9625aee0d440a0d64a966b4c4e0 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 7 May 2024 17:39:04 +0200 Subject: [PATCH 126/289] Fix --- src/Disks/StoragePolicy.cpp | 13 ++++++---- .../test_disk_over_web_server/test.py | 24 +++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/Disks/StoragePolicy.cpp b/src/Disks/StoragePolicy.cpp index 390afb368f8..ccdc34d5d06 100644 --- a/src/Disks/StoragePolicy.cpp +++ b/src/Disks/StoragePolicy.cpp @@ -462,15 +462,18 @@ StoragePolicySelectorPtr StoragePolicySelector::updateFromConfig(const Poco::Uti /// First pass, check. for (const auto & [name, policy] : policies) { - if (name.starts_with(TMP_STORAGE_POLICY_PREFIX)) - continue; + if (!name.starts_with(TMP_STORAGE_POLICY_PREFIX)) + { + if (!result->policies.contains(name)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Storage policy {} is missing in new configuration", backQuote(name)); - if (!result->policies.contains(name)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Storage policy {} is missing in new configuration", backQuote(name)); + policy->checkCompatibleWith(result->policies[name]); + } - policy->checkCompatibleWith(result->policies[name]); for (const auto & disk : policy->getDisks()) + { disks_before_reload.insert(disk->getName()); + } } /// Second pass, load. 
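The integration test below exercises the scenario this StoragePolicy change targets: a disk defined inline in a query registers a temporary storage policy (the TMP_STORAGE_POLICY_PREFIX kind whose compatibility check is skipped above), and its disks must still be collected into disks_before_reload so the second pass sees them on a config reload. A minimal sketch of the pattern, assuming a reachable web-disk endpoint; the table name and endpoint are illustrative:

-- an inline disk definition creates a temporary storage policy under the hood
CREATE TABLE web_backed (id Int32) ENGINE = MergeTree ORDER BY id
SETTINGS disk = disk(type = web, endpoint = 'http://nginx:80/test1/');
SYSTEM RELOAD CONFIG; -- must keep working while such a table exists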
diff --git a/tests/integration/test_disk_over_web_server/test.py b/tests/integration/test_disk_over_web_server/test.py index 8ddc1ff3c31..15b26c6b09e 100644 --- a/tests/integration/test_disk_over_web_server/test.py +++ b/tests/integration/test_disk_over_web_server/test.py @@ -40,6 +40,12 @@ def cluster(): image="clickhouse/clickhouse-server", tag=CLICKHOUSE_CI_MIN_TESTED_VERSION, ) + cluster.add_instance( + "node5", + main_configs=["configs/storage_conf.xml"], + with_nginx=True, + allow_analyzer=False, + ) cluster.start() @@ -390,3 +396,21 @@ def test_page_cache(cluster): node.query("DROP TABLE test{} SYNC".format(i)) print(f"Ok {i}") + + +def test_config_reload(cluster): + node1 = cluster.instances["node5"] + table_name = "config_reload" + + global uuids + node1.query( + f""" + DROP TABLE IF EXISTS {table_name}; + CREATE TABLE {table_name} UUID '{uuids[0]}' + (id Int32) ENGINE = MergeTree() ORDER BY id + SETTINGS disk = disk(type=web, endpoint='http://nginx:80/test1/'); + """ + ) + + node1.query("SYSTEM RELOAD CONFIG") + node1.query(f"DROP TABLE {table_name} SYNC") From 9ecbff2573adc3f335ba60c0a2e487a89339a852 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 7 May 2024 19:03:17 +0200 Subject: [PATCH 127/289] Update 03145_non_loaded_projection_backup.sh --- .../0_stateless/03145_non_loaded_projection_backup.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh index 6f0e00ce3fc..b542c9fff9a 100755 --- a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh +++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh @@ -36,6 +36,7 @@ backup table tp_1 to Disk('backups', '$backup_id'); " | grep -o "BACKUP_CREATED" $CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; drop table tp_1; restore table tp_1 from Disk('backups', '$backup_id'); " | grep -o "RESTORED" @@ -47,4 +48,6 @@ check table tp_1 settings check_query_single_value_result = 0;" | grep -o "Found $CLICKHOUSE_CLIENT -nm -q " set send_logs_level='fatal'; check table tp_1" -$CLICKHOUSE_CLIENT -q "drop table tp_1" +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +drop table tp_1" From ed59a5c248e613aeab40f98dfedd2e9cd5c88dee Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 7 May 2024 19:17:00 +0200 Subject: [PATCH 128/289] Update PrettyBlockOutputFormat.cpp --- src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp index e23622bc2e5..d84ffe800e7 100644 --- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp @@ -345,7 +345,7 @@ void PrettyBlockOutputFormat::writeChunk(const Chunk & chunk, PortKind port_kind writeReadableNumberTip(chunk); writeCString("\n", out); - if (has_transferred_row && format_settings.pretty.preserve_border_for_multiline_string) + if (has_transferred_row) writeTransferredRow(max_widths, header, transferred_row, cut_to_width, false); } From cdcdb4cf71420aced463b01869da25be8015b75c Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 7 May 2024 19:35:09 +0200 Subject: [PATCH 129/289] remove unnecessary checks --- 
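A note on the setting the Pretty* checks below are simplified around: with output_format_pretty_preserve_border_for_multiline_string = 1, a value containing a line break keeps the table border and the cell continues on the next frame row, marked with "…"; with 0, the raw newline simply splits the frame, as the updated reference file shows. A sketch, with the abridged, approximate output implied rather than shown:

-- renders 'hello' and 'world' as one bordered cell spanning two frame rows
SELECT 'hello\nworld' AS s
FORMAT PrettyMonoBlock
SETTINGS output_format_pretty_preserve_border_for_multiline_string = 1;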
src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp | 4 ++-- .../Formats/Impl/PrettyCompactBlockOutputFormat.cpp | 2 +- src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp index d84ffe800e7..41c7bfa316b 100644 --- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp @@ -453,7 +453,7 @@ void PrettyBlockOutputFormat::writeValueWithPadding( value_width = format_settings.pretty.max_value_width; has_break_line = false; } - else if (!has_break_line || !format_settings.pretty.preserve_border_for_multiline_string) + else if (!has_break_line) value += ' '; auto write_padding = [&]() @@ -478,7 +478,7 @@ void PrettyBlockOutputFormat::writeValueWithPadding( write_padding(); } - if (has_break_line && format_settings.pretty.preserve_border_for_multiline_string) + if (has_break_line) writeString("…", out); } diff --git a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp index 1ccb6d713d7..ce22a3b2864 100644 --- a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp @@ -194,7 +194,7 @@ void PrettyCompactBlockOutputFormat::writeRow( writeReadableNumberTip(chunk); writeCString("\n", out); - if (has_transferred_row && format_settings.pretty.preserve_border_for_multiline_string) + if (has_transferred_row) writeTransferredRow(max_widths, header, transferred_row, cut_to_width, false); } diff --git a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp index 0ed8c4398e4..d311f005173 100644 --- a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp @@ -111,7 +111,7 @@ void PrettySpaceBlockOutputFormat::writeChunk(const Chunk & chunk, PortKind port writeReadableNumberTip(chunk); writeChar('\n', out); - if (has_transferred_row && format_settings.pretty.preserve_border_for_multiline_string) + if (has_transferred_row) writeTransferredRow(max_widths, header, transferred_row, cut_to_width, true); } From a2e9b6f4c61de96a12edee926c757c3d776d01a7 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 7 May 2024 17:42:43 +0000 Subject: [PATCH 130/289] Another attempt. 
--- src/Planner/PlannerExpressionAnalysis.cpp | 49 ++++++++++++++++++++--- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index d7fa270a643..6e194b2c03e 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -51,6 +51,41 @@ FilterAnalysisResult analyzeFilter(const QueryTreeNodePtr & filter_expression_no return result; } +bool isDeterministicConstant(const ConstantNode & root) +{ + const auto & source_expression = root.getSourceExpression(); + if (!source_expression) + return true; + + std::stack nodes; + nodes.push(source_expression.get()); + while (!nodes.empty()) + { + const auto * node = nodes.top(); + nodes.pop(); + + const auto * constant_node = node->as(); + const auto * function_node = node->as(); + if (constant_node) + { + if (!isDeterministicConstant(*constant_node)) + return false; + } + else if (function_node) + { + if (!function_node->getFunctionOrThrow()->isDeterministic()) + return false; + + for (const auto & child : function_node->getArguments()) + nodes.push(child.get()); + } + else + return false; + } + + return true; +} + /** Construct aggregation analysis result if query tree has GROUP BY or aggregates. * Actions before aggregation are added into actions chain, if result is not null optional. */ @@ -86,6 +121,8 @@ std::optional analyzeAggregation(const QueryTreeNodeP (query_node.isGroupByWithGroupingSets() || query_node.isGroupByWithRollup() || query_node.isGroupByWithCube()); bool is_secondary_query = planner_context->getQueryContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; + bool is_distributed_query = planner_context->getQueryContext()->isDistributed(); + bool check_deterministic_constants = is_secondary_query || is_distributed_query; if (query_node.hasGroupBy()) { @@ -99,10 +136,10 @@ std::optional analyzeAggregation(const QueryTreeNodeP for (auto & grouping_set_key_node : grouping_set_keys_list_node_typed.getNodes()) { - auto is_constant_key = grouping_set_key_node->as() != nullptr; - group_by_with_constant_keys |= is_constant_key; + const auto * constant_key = grouping_set_key_node->as(); + group_by_with_constant_keys |= (constant_key != nullptr); - if (!is_secondary_query && is_constant_key && !aggregates_descriptions.empty()) + if (constant_key && !aggregates_descriptions.empty() && (!check_deterministic_constants || isDeterministicConstant(*constant_key))) continue; auto expression_dag_nodes = actions_visitor.visit(before_aggregation_actions, grouping_set_key_node); @@ -151,10 +188,10 @@ std::optional analyzeAggregation(const QueryTreeNodeP { for (auto & group_by_key_node : query_node.getGroupBy().getNodes()) { - auto is_constant_key = group_by_key_node->as() != nullptr; - group_by_with_constant_keys |= is_constant_key; + const auto * constant_key = group_by_key_node->as(); + group_by_with_constant_keys |= (constant_key != nullptr); - if (!is_secondary_query && is_constant_key && !aggregates_descriptions.empty()) + if (constant_key && !aggregates_descriptions.empty() && (!check_deterministic_constants || isDeterministicConstant(*constant_key))) continue; auto expression_dag_nodes = actions_visitor.visit(before_aggregation_actions, group_by_key_node); From 511146c99c7d0c92802052643ae71e6f3f4c6dad Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 7 May 2024 19:51:47 +0200 Subject: [PATCH 131/289] Update CHANGELOG.md --- 
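One step back to the isDeterministicConstant change in the preceding patch: a GROUP BY key folded to a constant may only be elided on secondary or distributed queries when every function feeding it is deterministic, since otherwise the initiator and the shards could fold different values. An illustration under that reading, where the hosts are placeholders and randConstant() stands in for any non-deterministic source (it is constant within one query but varies between queries):

-- the constant-folded key must stay in the aggregation on each shard
SELECT count()
FROM remote('127.0.0.{1,2}', system.one)
GROUP BY randConstant() + 1;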
CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 955e2f5b72f..f40c42c4462 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,6 @@ * A mode for `topK`/`topkWeighed` support mode, which return count of values and its error. [#54508](https://github.com/ClickHouse/ClickHouse/pull/54508) ([UnamedRus](https://github.com/UnamedRus)). * Added function `toMillisecond` which returns the millisecond component for values of type`DateTime` or `DateTime64`. [#60281](https://github.com/ClickHouse/ClickHouse/pull/60281) ([Shaun Struwig](https://github.com/Blargian)). * Allow configuring HTTP redirect handlers for clickhouse-server. For example, you can make `/` redirect to the Play UI. [#60390](https://github.com/ClickHouse/ClickHouse/pull/60390) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Allow Raw as a synonym for TSVRaw. [#63394](https://github.com/ClickHouse/ClickHouse/pull/63394) ([Unalian](https://github.com/Unalian)) #### Performance Improvement * Optimized function `dotProduct` to omit unnecessary and expensive memory copies. [#60928](https://github.com/ClickHouse/ClickHouse/pull/60928) ([Robert Schulze](https://github.com/rschu1ze)). From f52dfd98aa0ff7d1c037da02fdf2cf402e7ad3a6 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 7 May 2024 16:31:21 +0000 Subject: [PATCH 132/289] add test for 49307 --- .../03148_mutations_virtual_columns.reference | 1 + .../03148_mutations_virtual_columns.sql | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 tests/queries/0_stateless/03148_mutations_virtual_columns.reference create mode 100644 tests/queries/0_stateless/03148_mutations_virtual_columns.sql diff --git a/tests/queries/0_stateless/03148_mutations_virtual_columns.reference b/tests/queries/0_stateless/03148_mutations_virtual_columns.reference new file mode 100644 index 00000000000..7c5e8041147 --- /dev/null +++ b/tests/queries/0_stateless/03148_mutations_virtual_columns.reference @@ -0,0 +1 @@ +2 all_2_2_0 diff --git a/tests/queries/0_stateless/03148_mutations_virtual_columns.sql b/tests/queries/0_stateless/03148_mutations_virtual_columns.sql new file mode 100644 index 00000000000..045869b224a --- /dev/null +++ b/tests/queries/0_stateless/03148_mutations_virtual_columns.sql @@ -0,0 +1,16 @@ +DROP TABLE IF EXISTS t_mut_virtuals; + +CREATE TABLE t_mut_virtuals (id UInt64, s String) ENGINE = MergeTree ORDER BY id; + +INSERT INTO t_mut_virtuals VALUES (1, 'a'); +INSERT INTO t_mut_virtuals VALUES (2, 'b'); + +SET insert_keeper_fault_injection_probability = 0; +SET mutations_sync = 2; + +ALTER TABLE t_mut_virtuals UPDATE s = _part WHERE 1; +ALTER TABLE t_mut_virtuals DELETE WHERE _part LIKE 'all_1_1_0%'; + +SELECT * FROM t_mut_virtuals ORDER BY id; + +DROP TABLE t_mut_virtuals; From 134b468d917fd8b243b5b39e15872057acbc3015 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 7 May 2024 18:14:32 +0000 Subject: [PATCH 133/289] Another case. 
--- src/Analyzer/Passes/QueryAnalysisPass.cpp | 14 +++++++++----- ...3_group_by_use_nulls_analyzer_crashes.reference | 10 ++++++++++ .../03023_group_by_use_nulls_analyzer_crashes.sql | 2 ++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 80294b7a428..426f3b6bde8 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -4120,8 +4120,8 @@ IdentifierResolveResult QueryAnalyzer::tryResolveIdentifier(const IdentifierLook * SELECT id FROM ( SELECT ... ) AS subquery ARRAY JOIN [0] AS id INNER JOIN second_table USING (id) * In the example, identifier `id` should be resolved into one from USING (id) column. */ - auto alias_it = scope.alias_name_to_expression_node.find(identifier_lookup.identifier.getFullName()); - if (alias_it != scope.alias_name_to_expression_node.end() && alias_it->second->getNodeType() == QueryTreeNodeType::COLUMN) + auto alias_it = scope.alias_name_to_expression_node->find(identifier_lookup.identifier.getFullName()); + if (alias_it != scope.alias_name_to_expression_node->end() && alias_it->second->getNodeType() == QueryTreeNodeType::COLUMN) { const auto & column_node = alias_it->second->as(); if (column_node.getColumnSource()->getNodeType() == QueryTreeNodeType::ARRAY_JOIN) @@ -5225,8 +5225,12 @@ ProjectionNames QueryAnalyzer::resolveLambda(const QueryTreeNodePtr & lambda_nod for (size_t i = 0; i < lambda_arguments_nodes_size; ++i) { auto & lambda_argument_node = lambda_arguments_nodes[i]; - auto & lambda_argument_node_typed = lambda_argument_node->as(); - const auto & lambda_argument_name = lambda_argument_node_typed.getIdentifier().getFullName(); + const auto * lambda_argument_identifier = lambda_argument_node->as(); + const auto * lambda_argument_column = lambda_argument_node->as(); + if (!lambda_argument_identifier && !lambda_argument_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected IDENTIFIER or COLUMN as lambda argument, got {}", lambda_node->dumpTree()); + const auto & lambda_argument_name = lambda_argument_identifier ? lambda_argument_identifier->getIdentifier().getFullName() + : lambda_argument_column->getColumnName(); bool has_expression_node = scope.alias_name_to_expression_node->contains(lambda_argument_name); bool has_alias_node = scope.alias_name_to_lambda_node.contains(lambda_argument_name); @@ -5236,7 +5240,7 @@ ProjectionNames QueryAnalyzer::resolveLambda(const QueryTreeNodePtr & lambda_nod throw Exception(ErrorCodes::BAD_ARGUMENTS, "Alias name '{}' inside lambda {} cannot have same name as lambda argument. 
In scope {}", lambda_argument_name, - lambda_argument_node_typed.formatASTForErrorMessage(), + lambda_argument_node->formatASTForErrorMessage(), scope.scope_node->formatASTForErrorMessage()); } diff --git a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference index 0eb9d94e85a..273e1567a9b 100644 --- a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference +++ b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.reference @@ -86,3 +86,13 @@ a a \N 0 nan \N 4 nan \N \N nan +[] +['.'] +['.','.'] +['.','.','.'] +['.','.','.','.'] +['.','.','.','.','.'] +['.','.','.','.','.','.'] +['.','.','.','.','.','.','.'] +['.','.','.','.','.','.','.','.'] +['.','.','.','.','.','.','.','.','.'] diff --git a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql index 7311ce54e39..bba0a27560f 100644 --- a/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql +++ b/tests/queries/0_stateless/03023_group_by_use_nulls_analyzer_crashes.sql @@ -43,3 +43,5 @@ GROUP BY number + number AS b WITH CUBE SETTINGS group_by_use_nulls = 1; + +SELECT arrayMap(x -> '.', range(number % 10)) AS k FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY GROUPING SETS ((k)) ORDER BY k settings group_by_use_nulls=1; From 95b76bf6a47f0e23d41ce33c2223cee93066ad3e Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 7 May 2024 20:58:19 +0200 Subject: [PATCH 134/289] Remove leftovers of GCC support in cmake rules Signed-off-by: Azat Khuzhin --- CMakeLists.txt | 119 +++++++++------------ cmake/linux/default_libs.cmake | 16 ++- cmake/sanitize.cmake | 8 +- cmake/tools.cmake | 85 +++++---------- cmake/warnings.cmake | 66 ++++++------ contrib/capnproto-cmake/CMakeLists.txt | 4 +- contrib/openssl-cmake/CMakeLists.txt | 10 +- contrib/sentry-native-cmake/CMakeLists.txt | 2 +- 8 files changed, 125 insertions(+), 185 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index be804a14765..abbc48ab23a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,23 +135,21 @@ endif () include (cmake/check_flags.cmake) include (cmake/add_warning.cmake) -if (COMPILER_CLANG) - # generate ranges for fast "addr2line" search - if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") - # NOTE: that clang has a bug because of it does not emit .debug_aranges - # with ThinLTO, so custom ld.lld wrapper is shipped in docker images. - set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges") - endif () - - # See https://blog.llvm.org/posts/2021-04-05-constructor-homing-for-debug-info/ - if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fuse-ctor-homing") - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xclang -fuse-ctor-homing") - endif() - - no_warning(enum-constexpr-conversion) # breaks Protobuf in clang-16 +# generate ranges for fast "addr2line" search +if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") + # NOTE: that clang has a bug because of it does not emit .debug_aranges + # with ThinLTO, so custom ld.lld wrapper is shipped in docker images. 
+ set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges") endif () +# See https://blog.llvm.org/posts/2021-04-05-constructor-homing-for-debug-info/ +if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fuse-ctor-homing") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xclang -fuse-ctor-homing") +endif() + +no_warning(enum-constexpr-conversion) # breaks Protobuf in clang-16 + option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" ON) option(ENABLE_EXAMPLES "Build all example programs in 'examples' subdirectories" OFF) option(ENABLE_BENCHMARKS "Build all benchmark programs in 'benchmarks' subdirectories" OFF) @@ -284,16 +282,12 @@ endif () option (ENABLE_BUILD_PROFILING "Enable profiling of build time" OFF) if (ENABLE_BUILD_PROFILING) - if (COMPILER_CLANG) - set (COMPILER_FLAGS "${COMPILER_FLAGS} -ftime-trace") + set (COMPILER_FLAGS "${COMPILER_FLAGS} -ftime-trace") - if (LINKER_NAME MATCHES "lld") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--time-trace") - set (CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,--time-trace") - endif () - else () - message (${RECONFIGURE_MESSAGE_LEVEL} "Build profiling is only available with CLang") - endif () + if (LINKER_NAME MATCHES "lld") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--time-trace") + set (CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,--time-trace") + endif () endif () set (CMAKE_CXX_STANDARD 23) @@ -304,22 +298,20 @@ set (CMAKE_C_STANDARD 11) set (CMAKE_C_EXTENSIONS ON) # required by most contribs written in C set (CMAKE_C_STANDARD_REQUIRED ON) -if (COMPILER_CLANG) - # Enable C++14 sized global deallocation functions. It should be enabled by setting -std=c++14 but I'm not sure. - # See https://reviews.llvm.org/D112921 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsized-deallocation") +# Enable C++14 sized global deallocation functions. It should be enabled by setting -std=c++14 but I'm not sure. +# See https://reviews.llvm.org/D112921 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsized-deallocation") - # falign-functions=32 prevents from random performance regressions with the code change. Thus, providing more stable - # benchmarks. - set(COMPILER_FLAGS "${COMPILER_FLAGS} -falign-functions=32") +# falign-functions=32 prevents from random performance regressions with the code change. Thus, providing more stable +# benchmarks. +set(COMPILER_FLAGS "${COMPILER_FLAGS} -falign-functions=32") - if (ARCH_AMD64) - # align branches within a 32-Byte boundary to avoid the potential performance loss when code layout change, - # which makes benchmark results more stable. - set(BRANCHES_WITHIN_32B_BOUNDARIES "-mbranches-within-32B-boundaries") - set(COMPILER_FLAGS "${COMPILER_FLAGS} ${BRANCHES_WITHIN_32B_BOUNDARIES}") - endif() -endif () +if (ARCH_AMD64) + # align branches within a 32-Byte boundary to avoid the potential performance loss when code layout change, + # which makes benchmark results more stable. 
diff --git a/cmake/linux/default_libs.cmake b/cmake/linux/default_libs.cmake
index e5ca8e296fc..4a06243243e 100644
--- a/cmake/linux/default_libs.cmake
+++ b/cmake/linux/default_libs.cmake
@@ -5,17 +5,15 @@ set (DEFAULT_LIBS "-nodefaultlibs")

 # We need builtins from Clang's RT even without libcxx - for ubsan+int128.
 # See https://bugs.llvm.org/show_bug.cgi?id=16404
-if (COMPILER_CLANG)
-    execute_process (COMMAND ${CMAKE_CXX_COMPILER} --target=${CMAKE_CXX_COMPILER_TARGET} --print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE)
+execute_process (COMMAND ${CMAKE_CXX_COMPILER} --target=${CMAKE_CXX_COMPILER_TARGET} --print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE)

-    # Apparently, in clang-19, the UBSan support library for C++ was moved out into ubsan_standalone_cxx.a, so we have to include both.
-    if (SANITIZE STREQUAL undefined)
-        string(REPLACE "builtins.a" "ubsan_standalone_cxx.a" EXTRA_BUILTINS_LIBRARY "${BUILTINS_LIBRARY}")
-    endif ()
+# Apparently, in clang-19, the UBSan support library for C++ was moved out into ubsan_standalone_cxx.a, so we have to include both.
+if (SANITIZE STREQUAL undefined)
+    string(REPLACE "builtins.a" "ubsan_standalone_cxx.a" EXTRA_BUILTINS_LIBRARY "${BUILTINS_LIBRARY}")
+endif ()

-    if (NOT EXISTS "${BUILTINS_LIBRARY}")
-        set (BUILTINS_LIBRARY "-lgcc")
-    endif ()
+if (NOT EXISTS "${BUILTINS_LIBRARY}")
+    set (BUILTINS_LIBRARY "-lgcc")
 endif ()

 if (OS_ANDROID)
diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake
index a3523203912..08716c1196b 100644
--- a/cmake/sanitize.cmake
+++ b/cmake/sanitize.cmake
@@ -26,9 +26,7 @@ if (SANITIZE)
     elseif (SANITIZE STREQUAL "thread")
         set (TSAN_FLAGS "-fsanitize=thread")
-        if (COMPILER_CLANG)
-            set (TSAN_FLAGS "${TSAN_FLAGS} -fsanitize-ignorelist=${PROJECT_SOURCE_DIR}/tests/tsan_ignorelist.txt")
-        endif()
+        set (TSAN_FLAGS "${TSAN_FLAGS} -fsanitize-ignorelist=${PROJECT_SOURCE_DIR}/tests/tsan_ignorelist.txt")

         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${TSAN_FLAGS}")
         set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${TSAN_FLAGS}")
@@ -44,9 +42,7 @@ if (SANITIZE)
             # that's why we often receive reports about UIO. The simplest way to avoid this is just set this flag here.
             set(UBSAN_FLAGS "${UBSAN_FLAGS} -fno-sanitize=unsigned-integer-overflow")
         endif()
-        if (COMPILER_CLANG)
-            set (UBSAN_FLAGS "${UBSAN_FLAGS} -fsanitize-ignorelist=${PROJECT_SOURCE_DIR}/tests/ubsan_ignorelist.txt")
-        endif()
+        set (UBSAN_FLAGS "${UBSAN_FLAGS} -fsanitize-ignorelist=${PROJECT_SOURCE_DIR}/tests/ubsan_ignorelist.txt")

         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
         set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
diff --git a/cmake/tools.cmake b/cmake/tools.cmake
index 1ba3007b0f3..024505411a3 100644
--- a/cmake/tools.cmake
+++ b/cmake/tools.cmake
@@ -1,10 +1,6 @@
 # Compiler

-if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
-    set (COMPILER_CLANG 1) # Safe to treat AppleClang as a regular Clang, in general.
-elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    set (COMPILER_CLANG 1)
-else ()
+if (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     message (FATAL_ERROR "Compiler ${CMAKE_CXX_COMPILER_ID} is not supported")
 endif ()

@@ -17,30 +13,26 @@ set (CLANG_MINIMUM_VERSION 16)
 set (XCODE_MINIMUM_VERSION 12.0)
 set (APPLE_CLANG_MINIMUM_VERSION 12.0.0)

-if (COMPILER_CLANG)
-    if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
-        # (Experimental!) Specify "-DALLOW_APPLECLANG=ON" when running CMake configuration step, if you want to experiment with using it.
-        if (NOT ALLOW_APPLECLANG AND NOT DEFINED ENV{ALLOW_APPLECLANG})
-            message (FATAL_ERROR "Compilation with AppleClang is unsupported. Please use vanilla Clang, e.g. from Homebrew.")
-        endif ()
+if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
+    # (Experimental!) Specify "-DALLOW_APPLECLANG=ON" when running CMake configuration step, if you want to experiment with using it.
+    if (NOT ALLOW_APPLECLANG AND NOT DEFINED ENV{ALLOW_APPLECLANG})
+        message (FATAL_ERROR "Compilation with AppleClang is unsupported. Please use vanilla Clang, e.g. from Homebrew.")
+    endif ()

-        # For a mapping between XCode / AppleClang / vanilla Clang versions, see https://en.wikipedia.org/wiki/Xcode
-        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${APPLE_CLANG_MINIMUM_VERSION})
-            message (FATAL_ERROR "Compilation with AppleClang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${APPLE_CLANG_MINIMUM_VERSION} (Xcode ${XCODE_MINIMUM_VERSION}).")
-        endif ()
-    else ()
-        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${CLANG_MINIMUM_VERSION})
-            message (FATAL_ERROR "Compilation with Clang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${CLANG_MINIMUM_VERSION}.")
-        endif ()
+    # For a mapping between XCode / AppleClang / vanilla Clang versions, see https://en.wikipedia.org/wiki/Xcode
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${APPLE_CLANG_MINIMUM_VERSION})
+        message (FATAL_ERROR "Compilation with AppleClang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${APPLE_CLANG_MINIMUM_VERSION} (Xcode ${XCODE_MINIMUM_VERSION}).")
+    endif ()
+else ()
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${CLANG_MINIMUM_VERSION})
+        message (FATAL_ERROR "Compilation with Clang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${CLANG_MINIMUM_VERSION}.")
     endif ()
 endif ()

-# Linker
-
 string (REGEX MATCHALL "[0-9]+" COMPILER_VERSION_LIST ${CMAKE_CXX_COMPILER_VERSION})
 list (GET COMPILER_VERSION_LIST 0 COMPILER_VERSION_MAJOR)

-# Example values: `lld-10`
+# Linker
 option (LINKER_NAME "Linker name or full path")

 if (LINKER_NAME MATCHES "gold")
@@ -48,19 +40,15 @@ if (LINKER_NAME MATCHES "gold")
 endif ()

 if (NOT LINKER_NAME)
-    if (COMPILER_CLANG)
-        if (OS_LINUX AND NOT ARCH_S390X)
-            find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "ld.lld")
-        elseif (OS_DARWIN)
-            find_program (LLD_PATH NAMES "ld")
-        endif ()
+    if (OS_LINUX AND NOT ARCH_S390X)
+        find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "ld.lld")
+    elseif (OS_DARWIN)
+        find_program (LLD_PATH NAMES "ld")
     endif ()
     if (LLD_PATH)
         if (OS_LINUX OR OS_DARWIN)
-            if (COMPILER_CLANG)
-                # Clang driver simply allows full linker path.
-                set (LINKER_NAME ${LLD_PATH})
-            endif ()
+            # Clang driver simply allows full linker path.
+            set (LINKER_NAME ${LLD_PATH})
         endif ()
     endif()
 endif()
@@ -82,47 +70,28 @@ else ()
 endif ()

 # Archiver
-
-if (COMPILER_CLANG)
-    find_program (LLVM_AR_PATH NAMES "llvm-ar-${COMPILER_VERSION_MAJOR}" "llvm-ar")
-endif ()
-
+find_program (LLVM_AR_PATH NAMES "llvm-ar-${COMPILER_VERSION_MAJOR}" "llvm-ar")
 if (LLVM_AR_PATH)
     set (CMAKE_AR "${LLVM_AR_PATH}")
 endif ()
-
 message(STATUS "Using archiver: ${CMAKE_AR}")

 # Ranlib
-
-if (COMPILER_CLANG)
-    find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib-${COMPILER_VERSION_MAJOR}" "llvm-ranlib")
-endif ()
-
+find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib-${COMPILER_VERSION_MAJOR}" "llvm-ranlib")
 if (LLVM_RANLIB_PATH)
     set (CMAKE_RANLIB "${LLVM_RANLIB_PATH}")
 endif ()
-
 message(STATUS "Using ranlib: ${CMAKE_RANLIB}")

 # Install Name Tool
-
-if (COMPILER_CLANG)
-    find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool-${COMPILER_VERSION_MAJOR}" "llvm-install-name-tool")
-endif ()
-
+find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool-${COMPILER_VERSION_MAJOR}" "llvm-install-name-tool")
 if (LLVM_INSTALL_NAME_TOOL_PATH)
     set (CMAKE_INSTALL_NAME_TOOL "${LLVM_INSTALL_NAME_TOOL_PATH}")
 endif ()
-
 message(STATUS "Using install-name-tool: ${CMAKE_INSTALL_NAME_TOOL}")

 # Objcopy
-
-if (COMPILER_CLANG)
-    find_program (OBJCOPY_PATH NAMES "llvm-objcopy-${COMPILER_VERSION_MAJOR}" "llvm-objcopy" "objcopy")
-endif ()
-
+find_program (OBJCOPY_PATH NAMES "llvm-objcopy-${COMPILER_VERSION_MAJOR}" "llvm-objcopy" "objcopy")
 if (OBJCOPY_PATH)
     message (STATUS "Using objcopy: ${OBJCOPY_PATH}")
 else ()
@@ -130,11 +99,7 @@ else ()
 endif ()

 # Strip
-
-if (COMPILER_CLANG)
-    find_program (STRIP_PATH NAMES "llvm-strip-${COMPILER_VERSION_MAJOR}" "llvm-strip" "strip")
-endif ()
-
+find_program (STRIP_PATH NAMES "llvm-strip-${COMPILER_VERSION_MAJOR}" "llvm-strip" "strip")
 if (STRIP_PATH)
     message (STATUS "Using strip: ${STRIP_PATH}")
 else ()
diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake
index 455e4f09939..807d92d9077 100644
--- a/cmake/warnings.cmake
+++ b/cmake/warnings.cmake
@@ -15,37 +15,35 @@ if ((NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") AND (NOT SANITIZE) AND (NOT CMAKE
     add_warning(frame-larger-than=65536)
 endif ()

-if (COMPILER_CLANG)
-    # Add some warnings that are not available even with -Wall -Wextra -Wpedantic.
-    # We want to get everything out of the compiler for code quality.
-    add_warning(everything)
-    add_warning(pedantic)
-    no_warning(zero-length-array)
-    no_warning(c++98-compat-pedantic)
-    no_warning(c++98-compat)
-    no_warning(c++20-compat) # Use constinit in C++20 without warnings
-    no_warning(sign-conversion)
-    no_warning(implicit-int-conversion)
-    no_warning(implicit-int-float-conversion)
-    no_warning(ctad-maybe-unsupported) # clang 9+, linux-only
-    no_warning(disabled-macro-expansion)
-    no_warning(documentation-unknown-command)
-    no_warning(double-promotion)
-    no_warning(exit-time-destructors)
-    no_warning(float-equal)
-    no_warning(global-constructors)
-    no_warning(missing-prototypes)
-    no_warning(missing-variable-declarations)
-    no_warning(padded)
-    no_warning(switch-enum)
-    no_warning(undefined-func-template)
-    no_warning(unused-template)
-    no_warning(vla)
-    no_warning(weak-template-vtables)
-    no_warning(weak-vtables)
-    no_warning(thread-safety-negative) # experimental flag, too many false positives
-    no_warning(enum-constexpr-conversion) # breaks magic-enum library in clang-16
-    no_warning(unsafe-buffer-usage) # too aggressive
-    no_warning(switch-default) # conflicts with "defaults in a switch covering all enum values"
-    # TODO Enable conversion, sign-conversion, double-promotion warnings.
-endif ()
+# Add some warnings that are not available even with -Wall -Wextra -Wpedantic.
+# We want to get everything out of the compiler for code quality.
+add_warning(everything)
+add_warning(pedantic)
+no_warning(zero-length-array)
+no_warning(c++98-compat-pedantic)
+no_warning(c++98-compat)
+no_warning(c++20-compat) # Use constinit in C++20 without warnings
+no_warning(sign-conversion)
+no_warning(implicit-int-conversion)
+no_warning(implicit-int-float-conversion)
+no_warning(ctad-maybe-unsupported) # clang 9+, linux-only
+no_warning(disabled-macro-expansion)
+no_warning(documentation-unknown-command)
+no_warning(double-promotion)
+no_warning(exit-time-destructors)
+no_warning(float-equal)
+no_warning(global-constructors)
+no_warning(missing-prototypes)
+no_warning(missing-variable-declarations)
+no_warning(padded)
+no_warning(switch-enum)
+no_warning(undefined-func-template)
+no_warning(unused-template)
+no_warning(vla)
+no_warning(weak-template-vtables)
+no_warning(weak-vtables)
+no_warning(thread-safety-negative) # experimental flag, too many false positives
+no_warning(enum-constexpr-conversion) # breaks magic-enum library in clang-16
+no_warning(unsafe-buffer-usage) # too aggressive
+no_warning(switch-default) # conflicts with "defaults in a switch covering all enum values"
+# TODO Enable conversion, sign-conversion, double-promotion warnings.
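The add_warning()/no_warning() helpers used above come from cmake/add_warning.cmake, which CMakeLists.txt includes early on; their definition is not part of this patch. Conceptually they are thin wrappers along these lines (a rough sketch under that assumption, not the project's actual implementation, which may also probe whether the compiler accepts the flag first):

```cmake
# Hypothetical sketch: map a bare warning name onto -W<name> / -Wno-<name>
# and append it to both the C and C++ flags.
macro (add_warning flag)
    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W${flag}")
    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W${flag}")
endmacro ()

macro (no_warning flag)
    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-${flag}")
    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-${flag}")
endmacro ()
```

Because the build now rejects anything but Clang at configure time (see the cmake/tools.cmake hunk above), these Clang-only warning names no longer need a COMPILER_CLANG guard.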
from Homebrew.") + endif () - # For a mapping between XCode / AppleClang / vanilla Clang versions, see https://en.wikipedia.org/wiki/Xcode - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${APPLE_CLANG_MINIMUM_VERSION}) - message (FATAL_ERROR "Compilation with AppleClang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${APPLE_CLANG_MINIMUM_VERSION} (Xcode ${XCODE_MINIMUM_VERSION}).") - endif () - else () - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${CLANG_MINIMUM_VERSION}) - message (FATAL_ERROR "Compilation with Clang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${CLANG_MINIMUM_VERSION}.") - endif () + # For a mapping between XCode / AppleClang / vanilla Clang versions, see https://en.wikipedia.org/wiki/Xcode + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${APPLE_CLANG_MINIMUM_VERSION}) + message (FATAL_ERROR "Compilation with AppleClang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${APPLE_CLANG_MINIMUM_VERSION} (Xcode ${XCODE_MINIMUM_VERSION}).") + endif () +else () + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${CLANG_MINIMUM_VERSION}) + message (FATAL_ERROR "Compilation with Clang version ${CMAKE_CXX_COMPILER_VERSION} is unsupported, the minimum required version is ${CLANG_MINIMUM_VERSION}.") endif () endif () -# Linker - string (REGEX MATCHALL "[0-9]+" COMPILER_VERSION_LIST ${CMAKE_CXX_COMPILER_VERSION}) list (GET COMPILER_VERSION_LIST 0 COMPILER_VERSION_MAJOR) -# Example values: `lld-10` +# Linker option (LINKER_NAME "Linker name or full path") if (LINKER_NAME MATCHES "gold") @@ -48,19 +40,15 @@ if (LINKER_NAME MATCHES "gold") endif () if (NOT LINKER_NAME) - if (COMPILER_CLANG) - if (OS_LINUX AND NOT ARCH_S390X) - find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "ld.lld") - elseif (OS_DARWIN) - find_program (LLD_PATH NAMES "ld") - endif () + if (OS_LINUX AND NOT ARCH_S390X) + find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "ld.lld") + elseif (OS_DARWIN) + find_program (LLD_PATH NAMES "ld") endif () if (LLD_PATH) if (OS_LINUX OR OS_DARWIN) - if (COMPILER_CLANG) - # Clang driver simply allows full linker path. - set (LINKER_NAME ${LLD_PATH}) - endif () + # Clang driver simply allows full linker path. 
From f45e032715dcc04d3ef78d84750c0f5f835b5008 Mon Sep 17 00:00:00 2001
From: Yohann Jardin
Date: Tue, 7 May 2024 21:11:04 +0200
Subject: [PATCH 135/289] fix errorCodes in stateless tests

---
 tests/queries/0_stateless/00727_concat.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/queries/0_stateless/00727_concat.sql b/tests/queries/0_stateless/00727_concat.sql
index f5048dcaaae..01792545b5a 100644
--- a/tests/queries/0_stateless/00727_concat.sql
+++ b/tests/queries/0_stateless/00727_concat.sql
@@ -93,4 +93,4 @@ SELECT concat(materialize(NULL :: Nullable(UInt64)));

 SELECT CONCAT('Testing the ', 'alias');

-SELECT concat(); -- { serverError 42 }
+SELECT concat(); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION }
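Stateless tests mark expected failures with a `-- { serverError ... }` annotation, and the harness accepts either the numeric error code or its symbolic name from src/Common/ErrorCodes.cpp; the names are preferred because a bare number like 42 says nothing on its own. A couple of illustrative uses of the pattern (the second error name is an assumption about the usual ClickHouse error-code set, not something this patch touches):

```sql
-- Symbolic error names make the expectation self-describing:
SELECT concat();           -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION }
SELECT throwIf(1, 'boom'); -- { serverError FUNCTION_THROW_IF_VALUE_IS_NON_ZERO }
```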
From 577dccd47ff70af55140b673a436354d289c1344 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 7 May 2024 21:09:19 +0200
Subject: [PATCH 136/289] Fix ProfileEventTimeIncrement code

Signed-off-by: Azat Khuzhin
---
 src/Common/ElapsedTimeProfileEventIncrement.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/Common/ElapsedTimeProfileEventIncrement.h b/src/Common/ElapsedTimeProfileEventIncrement.h
index 731295a4cfd..aa944beeaa9 100644
--- a/src/Common/ElapsedTimeProfileEventIncrement.h
+++ b/src/Common/ElapsedTimeProfileEventIncrement.h
@@ -17,19 +17,18 @@ enum Time
 template
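The header patched above implements a small RAII helper: construct it at the top of a scope, and on destruction it adds the scope's elapsed wall-clock time to a ProfileEvents counter at the resolution selected by the Time template parameter. A self-contained sketch of that pattern (illustrative only; the scoped enum, member names, and the ProfileEvents stand-ins are assumptions based on the visible header, not the actual ClickHouse code):

```cpp
#include <chrono>
#include <cstddef>
#include <cstdint>

// Stand-ins for ClickHouse's ProfileEvents machinery.
namespace ProfileEvents
{
    using Event = size_t;
    void increment(Event event, uint64_t amount); // defined elsewhere
}

enum class Time { Nanoseconds, Microseconds, Milliseconds, Seconds };

// On destruction, add the time elapsed since construction to `event`,
// scaled to the requested resolution.
template <Time unit = Time::Microseconds>
struct ProfileEventTimeIncrement
{
    using Clock = std::chrono::steady_clock;

    explicit ProfileEventTimeIncrement(ProfileEvents::Event event_)
        : event(event_), start(Clock::now()) {}

    ~ProfileEventTimeIncrement()
    {
        const auto elapsed = Clock::now() - start;
        uint64_t amount = 0;
        switch (unit)
        {
            case Time::Nanoseconds:
                amount = std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
                break;
            case Time::Microseconds:
                amount = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
                break;
            case Time::Milliseconds:
                amount = std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
                break;
            case Time::Seconds:
                amount = std::chrono::duration_cast<std::chrono::seconds>(elapsed).count();
                break;
        }
        ProfileEvents::increment(event, amount);
    }

    ProfileEvents::Event event;
    Clock::time_point start;
};
```

Typical use would be `ProfileEventTimeIncrement<Time::Microseconds> watch(ProfileEvents::SomeCounter);` at the start of the code being measured, with SomeCounter being a hypothetical event name.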