diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index 0d81a7b303c..d69168b01ee 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -349,6 +349,13 @@ jobs: with: clear-repository: true submodules: true + - name: Apply sparse checkout for contrib # in order to check that it doesn't break the build + run: | + rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' + git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' + "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' + du -hs "$GITHUB_WORKSPACE/contrib" ||: + find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - name: Build run: | sudo rm -fr "$TEMP_PATH" diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index ecd5b85d320..1182481c897 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -487,6 +487,13 @@ jobs: with: clear-repository: true submodules: true + - name: Apply sparse checkout for contrib # in order to check that it doesn't break the build + run: | + rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' + git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' + "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' + du -hs "$GITHUB_WORKSPACE/contrib" ||: + find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - name: Build run: | sudo rm -fr "$TEMP_PATH" diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index ab0cbbb7ec1..6fccc0542b7 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -550,6 +550,13 @@ jobs: with: clear-repository: true submodules: true + - name: Apply sparse checkout for contrib # in order to check that it doesn't break the build + run: | + rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' + git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' + "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' + du -hs "$GITHUB_WORKSPACE/contrib" ||: + find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - name: Build run: | sudo rm -fr "$TEMP_PATH" diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 1282dbef50b..21284815583 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -406,6 +406,13 @@ jobs: with: clear-repository: true submodules: true + - name: Apply sparse checkout for contrib # in order to check that it doesn't break the build + run: | + rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' + git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' + "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' + du -hs "$GITHUB_WORKSPACE/contrib" ||: + find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - name: Build run: | sudo rm -fr "$TEMP_PATH" diff --git a/README.md b/README.md index 61d840ecd34..cee3a945262 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![ClickHouse — open source distributed column-oriented DBMS](https://github.com/ClickHouse/clickhouse-presentations/raw/master/images/logo-400x240.png)](https://clickhouse.com) +[ClickHouse — open source distributed column-oriented DBMS](https://clickhouse.com?utm_source=github) ClickHouse® is an open-source column-oriented database management system that allows generating analytical data reports in real-time.
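The step added identically to all four workflows above is a smoke test for the sparse-checkout machinery introduced under `contrib/` in this change: it wipes the fully checked-out submodules, restores the tracked `contrib` contents from the index, re-fetches the submodules through the new sparse-checkout scripts, and prints the resulting size and file count. A minimal local reproduction, assuming the current directory is the root of a full ClickHouse checkout (a sketch, not part of the workflow files):

```sh
#!/bin/sh
set -e
rm -rf contrib                   # drop the fully materialized submodules
git checkout .                   # restore tracked contrib/ files, including the update scripts
./contrib/update-submodules.sh   # re-checkout the submodules with the sparse patterns applied
du -hs contrib                   # on-disk size after sparse checkout
find contrib -type f | wc -l     # file count after sparse checkout
```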
diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index 7cdb527f9cf..ed4570d5e3f 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -155,13 +155,13 @@ struct common_type, Arithmetic> std::is_floating_point_v, Arithmetic, std::conditional_t< - sizeof(Arithmetic) < Bits * sizeof(long), + sizeof(Arithmetic) * 8 < Bits, wide::integer, std::conditional_t< - Bits * sizeof(long) < sizeof(Arithmetic), + Bits < sizeof(Arithmetic) * 8, Arithmetic, std::conditional_t< - Bits * sizeof(long) == sizeof(Arithmetic) && (std::is_same_v || std::is_signed_v), + Bits == sizeof(Arithmetic) * 8 && (std::is_same_v || std::is_signed_v), Arithmetic, wide::integer>>>>; }; diff --git a/contrib/cctz b/contrib/cctz index 7c78edd52b4..5e05432420f 160000 --- a/contrib/cctz +++ b/contrib/cctz @@ -1 +1 @@ -Subproject commit 7c78edd52b4d65acc103c2f195818ffcabe6fe0d +Subproject commit 5e05432420f9692418e2e12aff09859e420b14a2 diff --git a/contrib/krb5 b/contrib/krb5 index 9453aec0d50..b56ce6ba690 160000 --- a/contrib/krb5 +++ b/contrib/krb5 @@ -1 +1 @@ -Subproject commit 9453aec0d50e5aff9b189051611b321b40935d02 +Subproject commit b56ce6ba690e1f320df1a64afa34980c3e462617 diff --git a/contrib/krb5-cmake/CMakeLists.txt b/contrib/krb5-cmake/CMakeLists.txt index 93b90c15201..44058456ed4 100644 --- a/contrib/krb5-cmake/CMakeLists.txt +++ b/contrib/krb5-cmake/CMakeLists.txt @@ -15,10 +15,6 @@ if(NOT AWK_PROGRAM) message(FATAL_ERROR "You need the awk program to build ClickHouse with krb5 enabled.") endif() -if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC)) - add_compile_definitions(USE_BORINGSSL=1) -endif () - set(KRB5_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/krb5/src") set(KRB5_ET_BIN_DIR "${CMAKE_CURRENT_BINARY_DIR}/include_private") @@ -162,6 +158,11 @@ set(ALL_SRCS "${KRB5_SOURCE_DIR}/lib/crypto/builtin/kdf.c" "${KRB5_SOURCE_DIR}/lib/crypto/builtin/cmac.c" + "${KRB5_SOURCE_DIR}/lib/crypto/builtin/des/des_keys.c" + "${KRB5_SOURCE_DIR}/lib/crypto/builtin/des/f_parity.c" + "${KRB5_SOURCE_DIR}/lib/crypto/builtin/enc_provider/rc4.c" + "${KRB5_SOURCE_DIR}/lib/crypto/builtin/hash_provider/hash_md4.c" + "${KRB5_SOURCE_DIR}/lib/crypto/builtin/md4/md4.c" "${KRB5_SOURCE_DIR}/lib/crypto/krb/prng.c" "${KRB5_SOURCE_DIR}/lib/crypto/krb/enc_dk_cmac.c" # "${KRB5_SOURCE_DIR}/lib/crypto/krb/crc32.c" @@ -226,7 +227,6 @@ set(ALL_SRCS # "${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/des.c" "${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/rc4.c" "${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/des3.c" - #"${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/camellia.c" "${KRB5_SOURCE_DIR}/lib/crypto/openssl/cmac.c" "${KRB5_SOURCE_DIR}/lib/crypto/openssl/sha256.c" "${KRB5_SOURCE_DIR}/lib/crypto/openssl/hmac.c" @@ -474,6 +474,14 @@ set(ALL_SRCS "${KRB5_SOURCE_DIR}/lib/krb5/krb5_libinit.c" ) +if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC)) + add_compile_definitions(USE_BORINGSSL=1) +else() + set(ALL_SRCS ${ALL_SRCS} + "${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/camellia.c" + ) +endif() + add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/compile_et" COMMAND /bin/sh @@ -673,6 +681,7 @@ target_include_directories(_krb5 PRIVATE "${KRB5_SOURCE_DIR}/lib/gssapi/krb5" "${KRB5_SOURCE_DIR}/lib/gssapi/spnego" "${KRB5_SOURCE_DIR}/util/et" + "${KRB5_SOURCE_DIR}/lib/crypto/builtin/md4" "${KRB5_SOURCE_DIR}/lib/crypto/openssl" "${KRB5_SOURCE_DIR}/lib/crypto/krb" "${KRB5_SOURCE_DIR}/util/profile" diff --git a/contrib/sparse-checkout/setup-sparse-checkout.sh 
b/contrib/sparse-checkout/setup-sparse-checkout.sh new file mode 100755 index 00000000000..3feba6c5adf --- /dev/null +++ b/contrib/sparse-checkout/setup-sparse-checkout.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +set -e + +git config submodule."contrib/llvm-project".update '!../sparse-checkout/update-llvm-project.sh' +git config submodule."contrib/croaring".update '!../sparse-checkout/update-croaring.sh' +git config submodule."contrib/aws".update '!../sparse-checkout/update-aws.sh' +git config submodule."contrib/openssl".update '!../sparse-checkout/update-openssl.sh' +git config submodule."contrib/boringssl".update '!../sparse-checkout/update-boringssl.sh' +git config submodule."contrib/arrow".update '!../sparse-checkout/update-arrow.sh' +git config submodule."contrib/grpc".update '!../sparse-checkout/update-grpc.sh' +git config submodule."contrib/orc".update '!../sparse-checkout/update-orc.sh' +git config submodule."contrib/h3".update '!../sparse-checkout/update-h3.sh' +git config submodule."contrib/icu".update '!../sparse-checkout/update-icu.sh' +git config submodule."contrib/boost".update '!../sparse-checkout/update-boost.sh' +git config submodule."contrib/aws-s2n-tls".update '!../sparse-checkout/update-aws-s2n-tls.sh' +git config submodule."contrib/protobuf".update '!../sparse-checkout/update-protobuf.sh' +git config submodule."contrib/libxml2".update '!../sparse-checkout/update-libxml2.sh' +git config submodule."contrib/brotli".update '!../sparse-checkout/update-brotli.sh' diff --git a/contrib/sparse-checkout/update-arrow.sh b/contrib/sparse-checkout/update-arrow.sh new file mode 100755 index 00000000000..e004b60da02 --- /dev/null +++ b/contrib/sparse-checkout/update-arrow.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "Using sparse checkout for arrow" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/*/*' >> $FILES_TO_CHECKOUT +echo '/cpp/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-aws-s2n-tls.sh b/contrib/sparse-checkout/update-aws-s2n-tls.sh new file mode 100755 index 00000000000..4d65dc4b81d --- /dev/null +++ b/contrib/sparse-checkout/update-aws-s2n-tls.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +echo "Using sparse checkout for aws-s2n-tls" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/test/*' >> $FILES_TO_CHECKOUT +echo '!/docs/*' >> $FILES_TO_CHECKOUT +echo '!/compliance/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-aws.sh b/contrib/sparse-checkout/update-aws.sh new file mode 100755 index 00000000000..c8d4c5a89c2 --- /dev/null +++ b/contrib/sparse-checkout/update-aws.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +echo "Using sparse checkout for aws" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/*/*' >> $FILES_TO_CHECKOUT +echo '/aws-cpp-sdk-core/*' >> $FILES_TO_CHECKOUT +echo '/aws-cpp-sdk-s3/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-boost.sh b/contrib/sparse-checkout/update-boost.sh new file mode 100755 index 00000000000..9bd1f6c1796 --- /dev/null +++ b/contrib/sparse-checkout/update-boost.sh @@ -0,0 +1,85 @@ +#!/bin/sh + +echo "Using sparse checkout for boost" + +FILES_TO_CHECKOUT=$(git rev-parse 
--git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/*/*' >> $FILES_TO_CHECKOUT +echo '/boost/*' >> $FILES_TO_CHECKOUT +echo '!/boost/*/*' >> $FILES_TO_CHECKOUT +echo '/boost/algorithm/*' >> $FILES_TO_CHECKOUT +echo '/boost/any/*' >> $FILES_TO_CHECKOUT +echo '/boost/atomic/*' >> $FILES_TO_CHECKOUT +echo '/boost/assert/*' >> $FILES_TO_CHECKOUT +echo '/boost/bind/*' >> $FILES_TO_CHECKOUT +echo '/boost/concept/*' >> $FILES_TO_CHECKOUT +echo '/boost/config/*' >> $FILES_TO_CHECKOUT +echo '/boost/container/*' >> $FILES_TO_CHECKOUT +echo '/boost/container_hash/*' >> $FILES_TO_CHECKOUT +echo '/boost/context/*' >> $FILES_TO_CHECKOUT +echo '/boost/convert/*' >> $FILES_TO_CHECKOUT +echo '/boost/coroutine/*' >> $FILES_TO_CHECKOUT +echo '/boost/core/*' >> $FILES_TO_CHECKOUT +echo '/boost/detail/*' >> $FILES_TO_CHECKOUT +echo '/boost/dynamic_bitset/*' >> $FILES_TO_CHECKOUT +echo '/boost/exception/*' >> $FILES_TO_CHECKOUT +echo '/boost/filesystem/*' >> $FILES_TO_CHECKOUT +echo '/boost/functional/*' >> $FILES_TO_CHECKOUT +echo '/boost/function/*' >> $FILES_TO_CHECKOUT +echo '/boost/geometry/*' >> $FILES_TO_CHECKOUT +echo '/boost/graph/*' >> $FILES_TO_CHECKOUT +echo '/boost/heap/*' >> $FILES_TO_CHECKOUT +echo '/boost/integer/*' >> $FILES_TO_CHECKOUT +echo '/boost/intrusive/*' >> $FILES_TO_CHECKOUT +echo '/boost/iostreams/*' >> $FILES_TO_CHECKOUT +echo '/boost/io/*' >> $FILES_TO_CHECKOUT +echo '/boost/iterator/*' >> $FILES_TO_CHECKOUT +echo '/boost/math/*' >> $FILES_TO_CHECKOUT +echo '/boost/move/*' >> $FILES_TO_CHECKOUT +echo '/boost/mpl/*' >> $FILES_TO_CHECKOUT +echo '/boost/multi_index/*' >> $FILES_TO_CHECKOUT +echo '/boost/multiprecision/*' >> $FILES_TO_CHECKOUT +echo '/boost/numeric/*' >> $FILES_TO_CHECKOUT +echo '/boost/predef/*' >> $FILES_TO_CHECKOUT +echo '/boost/preprocessor/*' >> $FILES_TO_CHECKOUT +echo '/boost/program_options/*' >> $FILES_TO_CHECKOUT +echo '/boost/range/*' >> $FILES_TO_CHECKOUT +echo '/boost/regex/*' >> $FILES_TO_CHECKOUT +echo '/boost/smart_ptr/*' >> $FILES_TO_CHECKOUT +echo '/boost/type_index/*' >> $FILES_TO_CHECKOUT +echo '/boost/type_traits/*' >> $FILES_TO_CHECKOUT +echo '/boost/system/*' >> $FILES_TO_CHECKOUT +echo '/boost/tti/*' >> $FILES_TO_CHECKOUT +echo '/boost/utility/*' >> $FILES_TO_CHECKOUT +echo '/boost/lexical_cast/*' >> $FILES_TO_CHECKOUT +echo '/boost/optional/*' >> $FILES_TO_CHECKOUT +echo '/boost/property_map/*' >> $FILES_TO_CHECKOUT +echo '/boost/pending/*' >> $FILES_TO_CHECKOUT +echo '/boost/multi_array/*' >> $FILES_TO_CHECKOUT +echo '/boost/tuple/*' >> $FILES_TO_CHECKOUT +echo '/boost/icl/*' >> $FILES_TO_CHECKOUT +echo '/boost/unordered/*' >> $FILES_TO_CHECKOUT +echo '/boost/typeof/*' >> $FILES_TO_CHECKOUT +echo '/boost/parameter/*' >> $FILES_TO_CHECKOUT +echo '/boost/mp11/*' >> $FILES_TO_CHECKOUT +echo '/boost/archive/*' >> $FILES_TO_CHECKOUT +echo '/boost/function_types/*' >> $FILES_TO_CHECKOUT +echo '/boost/serialization/*' >> $FILES_TO_CHECKOUT +echo '/boost/fusion/*' >> $FILES_TO_CHECKOUT +echo '/boost/variant/*' >> $FILES_TO_CHECKOUT +echo '/boost/format/*' >> $FILES_TO_CHECKOUT +echo '/boost/locale/*' >> $FILES_TO_CHECKOUT +echo '/boost/random/*' >> $FILES_TO_CHECKOUT +echo '/boost/spirit/*' >> $FILES_TO_CHECKOUT +echo '/boost/uuid/*' >> $FILES_TO_CHECKOUT +echo '/boost/xpressive/*' >> $FILES_TO_CHECKOUT +echo '/boost/asio/*' >> $FILES_TO_CHECKOUT +echo '/boost/circular_buffer/*' >> $FILES_TO_CHECKOUT +echo '/boost/proto/*' >> $FILES_TO_CHECKOUT +echo '/boost/qvm/*' >> $FILES_TO_CHECKOUT +echo '/boost/property_tree/*' >> $FILES_TO_CHECKOUT +echo '/libs/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD \ No newline at end of file diff --git a/contrib/sparse-checkout/update-boringssl.sh b/contrib/sparse-checkout/update-boringssl.sh new file mode 100755 index 00000000000..f877a78afed --- /dev/null +++ b/contrib/sparse-checkout/update-boringssl.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +echo "Using sparse checkout for boringssl" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/fuzz/*' >> $FILES_TO_CHECKOUT +echo '!/crypto/cipher_extra/test/*' >> $FILES_TO_CHECKOUT +echo '!/third_party/wycheproof_testvectors/*' >> $FILES_TO_CHECKOUT +echo '!/third_party/googletest/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-brotli.sh b/contrib/sparse-checkout/update-brotli.sh new file mode 100755 index 00000000000..8784f5e4125 --- /dev/null +++ b/contrib/sparse-checkout/update-brotli.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "Using sparse checkout for brotli" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/*/*' >> $FILES_TO_CHECKOUT +echo '/c/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-croaring.sh b/contrib/sparse-checkout/update-croaring.sh new file mode 100755 index 00000000000..9b7bba19df4 --- /dev/null +++ b/contrib/sparse-checkout/update-croaring.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "Using sparse checkout for croaring" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/benchmarks/*' >> $FILES_TO_CHECKOUT +echo '!/tests/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-grpc.sh b/contrib/sparse-checkout/update-grpc.sh new file mode 100755 index 00000000000..38934fdbc1b --- /dev/null +++ b/contrib/sparse-checkout/update-grpc.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +echo "Using sparse checkout for grpc" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/test/*' >> $FILES_TO_CHECKOUT +echo '/test/build/*' >> $FILES_TO_CHECKOUT +echo '!/tools/*' >> $FILES_TO_CHECKOUT +echo '/tools/codegen/*' >> $FILES_TO_CHECKOUT +echo '!/examples/*' >> $FILES_TO_CHECKOUT +echo '!/doc/*' >> $FILES_TO_CHECKOUT +# FIXME why do we need csharp?
+#echo '!/src/csharp/*' >> $FILES_TO_CHECKOUT +echo '!/src/python/*' >> $FILES_TO_CHECKOUT +echo '!/src/objective-c/*' >> $FILES_TO_CHECKOUT +echo '!/src/php/*' >> $FILES_TO_CHECKOUT +echo '!/src/ruby/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-h3.sh b/contrib/sparse-checkout/update-h3.sh new file mode 100755 index 00000000000..127885f89cc --- /dev/null +++ b/contrib/sparse-checkout/update-h3.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "Using sparse checkout for h3" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/tests/*' >> $FILES_TO_CHECKOUT +echo '!/website/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-icu.sh b/contrib/sparse-checkout/update-icu.sh new file mode 100755 index 00000000000..76af39f07a4 --- /dev/null +++ b/contrib/sparse-checkout/update-icu.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "Using sparse checkout for icu" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/*/*' >> $FILES_TO_CHECKOUT +echo '/icu4c/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD \ No newline at end of file diff --git a/contrib/sparse-checkout/update-libxml2.sh b/contrib/sparse-checkout/update-libxml2.sh new file mode 100755 index 00000000000..24faf11eec9 --- /dev/null +++ b/contrib/sparse-checkout/update-libxml2.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +echo "Using sparse checkout for libxml2" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/result/*' >> $FILES_TO_CHECKOUT +echo '!/test/*' >> $FILES_TO_CHECKOUT +echo '!/doc/*' >> $FILES_TO_CHECKOUT +echo '!/os400/*' >> $FILES_TO_CHECKOUT +echo '!/fuzz/*' >> $FILES_TO_CHECKOUT +echo '!/python/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-llvm-project.sh b/contrib/sparse-checkout/update-llvm-project.sh new file mode 100755 index 00000000000..53c3b691d3a --- /dev/null +++ b/contrib/sparse-checkout/update-llvm-project.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +echo "Using sparse checkout for llvm-project" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/*/*' >> $FILES_TO_CHECKOUT +echo '/llvm/*' >> $FILES_TO_CHECKOUT +echo '!/llvm/*/*' >> $FILES_TO_CHECKOUT +echo '/llvm/cmake/*' >> $FILES_TO_CHECKOUT +echo '/llvm/projects/*' >> $FILES_TO_CHECKOUT +echo '/llvm/include/*' >> $FILES_TO_CHECKOUT +echo '/llvm/lib/*' >> $FILES_TO_CHECKOUT +echo '/llvm/utils/TableGen/*' >> $FILES_TO_CHECKOUT +echo '/libcxxabi/*' >> $FILES_TO_CHECKOUT +echo '!/libcxxabi/test/*' >> $FILES_TO_CHECKOUT +echo '/libcxx/*' >> $FILES_TO_CHECKOUT +echo '!/libcxx/test/*' >> $FILES_TO_CHECKOUT +echo '/libunwind/*' >> $FILES_TO_CHECKOUT +echo '!/libunwind/test/*' >> $FILES_TO_CHECKOUT +echo '/compiler-rt/*' >> $FILES_TO_CHECKOUT +echo '!/compiler-rt/test/*' >> $FILES_TO_CHECKOUT +echo '/cmake/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-openssl.sh b/contrib/sparse-checkout/update-openssl.sh new file mode 100755 index 00000000000..33e19f43cb7 --- /dev/null +++ b/contrib/sparse-checkout/update-openssl.sh @@ -0,0 
+1,15 @@ +#!/bin/sh + +echo "Using sparse checkout for openssl" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/fuzz/*' >> $FILES_TO_CHECKOUT +echo '!/test/*' >> $FILES_TO_CHECKOUT +echo '!/doc/*' >> $FILES_TO_CHECKOUT +echo '!/providers/*' >> $FILES_TO_CHECKOUT +echo '!/apps/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-orc.sh b/contrib/sparse-checkout/update-orc.sh new file mode 100755 index 00000000000..57ab57a8d52 --- /dev/null +++ b/contrib/sparse-checkout/update-orc.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +echo "Using sparse checkout for orc" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '/*' > $FILES_TO_CHECKOUT +echo '!/*/*' >> $FILES_TO_CHECKOUT +echo '/c++/*' >> $FILES_TO_CHECKOUT +echo '/proto/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/sparse-checkout/update-protobuf.sh b/contrib/sparse-checkout/update-protobuf.sh new file mode 100755 index 00000000000..31c037c2cf5 --- /dev/null +++ b/contrib/sparse-checkout/update-protobuf.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +echo "Using sparse checkout for protobuf" + +FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout +echo '!/*' > $FILES_TO_CHECKOUT +echo '/*/*' >> $FILES_TO_CHECKOUT +echo '/src/*' >> $FILES_TO_CHECKOUT +echo '/cmake/*' >> $FILES_TO_CHECKOUT + +git config core.sparsecheckout true +git checkout $1 +git read-tree -mu HEAD diff --git a/contrib/update-submodules.sh b/contrib/update-submodules.sh new file mode 100755 index 00000000000..c94681e6240 --- /dev/null +++ b/contrib/update-submodules.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +set -e + +WORKDIR=$(dirname "$0") +WORKDIR=$(readlink -f "${WORKDIR}") + +"$WORKDIR/sparse-checkout/setup-sparse-checkout.sh" +git submodule init +git submodule sync +git submodule update --depth=1 diff --git a/docs/en/development/developer-instruction.md b/docs/en/development/developer-instruction.md index ace5ab79bb4..6bcdadeb1eb 100644 --- a/docs/en/development/developer-instruction.md +++ b/docs/en/development/developer-instruction.md @@ -39,9 +39,15 @@ Next, you need to download the source files onto your working machine. This is c In the command line terminal run: - git clone --recursive --shallow-submodules git@github.com:your_github_username/ClickHouse.git + git clone --shallow-submodules git@github.com:your_github_username/ClickHouse.git cd ClickHouse +Or (if you'd like to use sparse checkout for submodules and avoid checking out unneeded files): + + git clone git@github.com:your_github_username/ClickHouse.git + cd ClickHouse + ./contrib/update-submodules.sh + Note: please, substitute *your_github_username* with what is appropriate! This command will create a directory `ClickHouse` containing the working copy of the project. diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index e9ca87916a0..e14ba5699e4 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -8,11 +8,18 @@ sidebar_label: Data Replication :::note In ClickHouse Cloud replication is managed for you. Please create your tables without adding arguments. 
For example, in the text below you would replace: + +```sql +ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/table_name', + '{replica}', + ver +) ``` -ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/table_name', '{replica}', ver) -``` + with: -``` + +```sql ENGINE = ReplicatedReplacingMergeTree ``` ::: diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index c18c63d13c2..b4823d5ebaf 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -78,7 +78,7 @@ The supported formats are: | [Null](#null) | ✗ | ✔ | | [XML](#xml) | ✗ | ✔ | | [CapnProto](#capnproto) | ✔ | ✔ | -| [LineAsString](#lineasstring) | ✔ | ✗ | +| [LineAsString](#lineasstring) | ✔ | ✔ | | [Regexp](#data-format-regexp) | ✔ | ✗ | | [RawBLOB](#rawblob) | ✔ | ✔ | | [MsgPack](#msgpack) | ✔ | ✔ | @@ -1235,8 +1235,8 @@ For output it uses the following correspondence between ClickHouse types and BSO | ClickHouse type | BSON Type | |-----------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------| | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `\x08` boolean | -| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 | -| [Int16UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 | +| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 | +| [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 | | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 | | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 | | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 | @@ -1255,30 +1255,30 @@ For output it uses the following correspondence between ClickHouse types and BSO | [Array](/docs/en/sql-reference/data-types/array.md) | `\x04` array | | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x04` array | | [Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x03` document | -| [Map](/docs/en/sql-reference/data-types/map.md) (with String keys) | `\x03` document | +| [Map](/docs/en/sql-reference/data-types/map.md) | `\x03` document | | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `\x10` int32 | | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `\x05` binary, `\x00` binary subtype | For input it uses the following correspondence between BSON types and ClickHouse types: -| BSON Type | ClickHouse Type | -|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) | -| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | -| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | -| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) | -| `\x05` binary, `\x00` binary subtype | 
[String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | -| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | -| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) | -| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) | -| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | -| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) | -| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | -| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) | -| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | -| `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | -| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | -| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | +| BSON Type | ClickHouse Type | +|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) | +| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | +| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | +| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) | +| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | +| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | +| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) | +| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) | +| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | +| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) | +| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | +| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) | +| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | +| `\x0E` symbol | 
[String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | +| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md)/[Enum8/Enum16](/docs/en/sql-reference/data-types/enum.md) | +| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | Other BSON types are not supported. Also, it performs conversion between different integer types (for example, you can insert a BSON int32 value into ClickHouse UInt8). Big integers and decimals (Int128/UInt128/Int256/UInt256/Decimal128/Decimal256) can be parsed from a BSON Binary value with `\x00` binary subtype. In this case this format will validate that the size of the binary data equals the size of the expected value. @@ -1877,6 +1877,13 @@ Column names must: Output Avro file compression and sync interval can be configured with [output_format_avro_codec](/docs/en/operations/settings/settings-formats.md/#output_format_avro_codec) and [output_format_avro_sync_interval](/docs/en/operations/settings/settings-formats.md/#output_format_avro_sync_interval) respectively. +### Example Data {#example-data-avro} + +Using the ClickHouse [DESCRIBE](/docs/en/sql-reference/statements/describe-table) statement, you can quickly view the inferred schema of an Avro file, as in the following example. The example uses the URL of a publicly accessible Avro file in the ClickHouse S3 public bucket: + +```sql +DESCRIBE url('https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/hits.avro','Avro'); +``` + ## AvroConfluent {#data-format-avro-confluent} AvroConfluent supports decoding single-object Avro messages commonly used with [Kafka](https://kafka.apache.org/) and [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html). @@ -1936,30 +1943,31 @@ Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
-| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | -|----------------------------------------------------|-----------------------------------------------------------------|------------------------------| -| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` | -| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` | -| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` | -| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` | -| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` | -| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` | -| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` | -| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` | -| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` | -| `FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT` | -| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `DOUBLE` | -| `DATE` | [Date32](/docs/en/sql-reference/data-types/date.md) | `DATE` | -| `TIME (ms)` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` | -| `TIMESTAMP`, `TIME (us, ns)` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `TIMESTAMP` | -| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` | -| `STRING`, `BINARY`, `FIXED_LENGTH_BYTE_ARRAY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_LENGTH_BYTE_ARRAY` | -| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` | -| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` | -| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` | -| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` | -| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` | -| `FIXED_LENGTH_BYTE_ARRAY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` | +| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | +|-----------------------------------------------|------------------------------------------------------------------------------------------------------------|-------------------------------| +| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` | +| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` | +| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` | +| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE` | [Date32](/docs/en/sql-reference/data-types/date.md) | `DATE` | +| `TIME (ms)` | 
[DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` | +| `TIMESTAMP`, `TIME (us, ns)` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` | +| `STRING`, `BINARY`, `FIXED_LENGTH_BYTE_ARRAY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_LENGTH_BYTE_ARRAY` | +| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` | +| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` | +| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` | +| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` | +| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` | +| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` | +| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_LENGTH_BYTE_ARRAY` | Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested. @@ -2005,31 +2013,32 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. -| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) | -|-----------------------------------------|-----------------------------------------------------------------|----------------------------| -| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` | -| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` | -| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` | -| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` | -| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` | -| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` | -| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` | -| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` | -| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` | -| `FLOAT`, `HALF_FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` | -| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` | -| `DATE32` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `UINT16` | -| `DATE64` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` | -| `TIMESTAMP`, `TIME32`, `TIME64` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `UINT32` | -| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` | -| `STRING`, `BINARY`, `FIXED_SIZE_BINARY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_SIZE_BINARY` | -| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` | -| `DECIMAL256` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL256` | -| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` | -| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` | -| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` | -| `UINT32` | 
[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` | -| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` | +| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) | +|-----------------------------------------|------------------------------------------------------------------------------------------------------------|----------------------------| +| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` | +| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` | +| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` | +| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` | +| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` | +| `DATE32` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `UINT16` | +| `DATE64` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` | +| `TIMESTAMP`, `TIME32`, `TIME64` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `UINT32` | +| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` | +| `STRING`, `BINARY`, `FIXED_SIZE_BINARY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_SIZE_BINARY` | +| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` | +| `DECIMAL256` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL256` | +| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` | +| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` | +| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` | +| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` | +| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` | +| `FIXED_SIZE_BINARY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_SIZE_BINARY` | Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested. @@ -2078,23 +2087,26 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Arrow" > {filenam The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. 
-| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) | -|---------------------------------------|---------------------------------------------------------------|--------------------------| -| `Boolean` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `Boolean` | -| `Tinyint` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `Tinyint` | -| `Smallint` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `Smallint` | -| `Int` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `Int` | -| `Bigint` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `Bigint` | -| `Float` | [Float32](/docs/en/sql-reference/data-types/float.md) | `Float` | -| `Double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `Double` | -| `Decimal` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `Decimal` | -| `Date` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `Date` | -| `Timestamp` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `Timestamp` | -| `String`, `Char`, `Varchar`, `Binary` | [String](/docs/en/sql-reference/data-types/string.md) | `Binary` | -| `List` | [Array](/docs/en/sql-reference/data-types/array.md) | `List` | -| `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` | -| `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` | -| `-` | [IPv4](/docs/en/sql-reference/data-types/int-uint.md) | `Int` | +| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) | +|---------------------------------------|-------------------------------------------------------------------------------------------------------------------|--------------------------| +| `Boolean` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `Boolean` | +| `Tinyint` | [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `Tinyint` | +| `Smallint` | [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `Smallint` | +| `Int` | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `Int` | +| `Bigint` | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `Bigint` | +| `Float` | [Float32](/docs/en/sql-reference/data-types/float.md) | `Float` | +| `Double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `Double` | +| `Decimal` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `Decimal` | +| `Date` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `Date` | +| `Timestamp` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `Timestamp` | +| `String`, `Char`, `Varchar`, `Binary` | [String](/docs/en/sql-reference/data-types/string.md) | `Binary` | +| `List` | [Array](/docs/en/sql-reference/data-types/array.md) | `List` | +| `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` | +| `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` | +| `Int` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `Int` | +| `Binary` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `Binary` | +| `Binary` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `Binary` | +| `Binary` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `Binary` | Other types are not supported.
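To see the effect of these mappings, a round trip through a file is enough; below is a sketch using `clickhouse-client` and `clickhouse-local` (the table and file names are placeholders):

```sh
# Export a table to ORC, then let ClickHouse describe the schema it infers when reading the file back.
clickhouse-client --query="SELECT * FROM some_table FORMAT ORC" > some_table.orc
clickhouse-local --query="DESCRIBE file('some_table.orc', 'ORC')"
```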
diff --git a/docs/en/operations/system-tables/replicas.md b/docs/en/operations/system-tables/replicas.md index e711d9a7784..15426eefbcc 100644 --- a/docs/en/operations/system-tables/replicas.md +++ b/docs/en/operations/system-tables/replicas.md @@ -50,6 +50,7 @@ last_queue_update: 2021-10-12 14:50:08 absolute_delay: 99 total_replicas: 5 active_replicas: 5 +lost_part_count: 0 last_queue_update_exception: zookeeper_exception: replica_is_active: {'r1':1,'r2':1} @@ -90,6 +91,7 @@ The next 4 columns have a non-zero value only where there is an active session w - `absolute_delay` (`UInt64`) - How big a lag in seconds the current replica has. - `total_replicas` (`UInt8`) - The total number of known replicas of this table. - `active_replicas` (`UInt8`) - The number of replicas of this table that have a session in ClickHouse Keeper (i.e., the number of functioning replicas). +- `lost_part_count` (`UInt64`) - The number of data parts lost in the table by all replicas in total since table creation. The value is persisted in ClickHouse Keeper and can only increase. - `last_queue_update_exception` (`String`) - When the queue contains broken entries. Especially important when ClickHouse breaks backward compatibility between versions and log entries written by newer versions aren't parseable by old versions. - `zookeeper_exception` (`String`) - The last exception message, received if an error happened when fetching the info from ClickHouse Keeper. - `replica_is_active` ([Map(String, UInt8)](../../sql-reference/data-types/map.md)) — Map between replica name and whether the replica is active. diff --git a/docs/en/operations/system-tables/storage_policies.md b/docs/en/operations/system-tables/storage_policies.md index 966b677c7e3..69e0f7f0a55 100644 --- a/docs/en/operations/system-tables/storage_policies.md +++ b/docs/en/operations/system-tables/storage_policies.md @@ -11,8 +11,16 @@ Columns: - `volume_name` ([String](../../sql-reference/data-types/string.md)) — Volume name defined in the storage policy. - `volume_priority` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Volume order number in the configuration, the data fills the volumes according to this priority, i.e. data during inserts and merges is written to volumes with a lower priority (taking into account other rules: TTL, `max_data_part_size`, `move_factor`). - `disks` ([Array(String)](../../sql-reference/data-types/array.md)) — Disk names, defined in the storage policy. +- `volume_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of volume. Can have one of the following values: + - `JBOD` + - `SINGLE_DISK` + - `UNKNOWN` - `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit). - `move_factor` ([Float64](../../sql-reference/data-types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of the configuration parameter, ClickHouse starts to move data to the next volume in order. - `prefer_not_to_merge` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Value of the `prefer_not_to_merge` setting. When this setting is enabled, merging data on this volume is not allowed. This allows controlling how ClickHouse works with slow disks. +- `perform_ttl_move_on_insert` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Value of the `perform_ttl_move_on_insert` setting, which disables TTL move on data part INSERT. By default, if we insert a data part that has already expired by the TTL move rule, it immediately goes to the volume/disk declared in the move rule. This can significantly slow down inserts when the destination volume/disk is slow (e.g. S3). +- `load_balancing` ([Enum8](../../sql-reference/data-types/enum.md)) — Policy for disk balancing. Can have one of the following values: + - `ROUND_ROBIN` + - `LEAST_USED` If the storage policy contains more than one volume, then information for each volume is stored in an individual row of the table. diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index a23e0745dec..6363d9cab27 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -6,7 +6,13 @@ sidebar_label: clickhouse-local # clickhouse-local -The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. It accepts data that represent tables and queries them using [ClickHouse SQL dialect](../../sql-reference/index.md). `clickhouse-local` uses the same core as ClickHouse server, so it supports most of the features and the same set of formats and table engines. +## When to use clickhouse-local vs. ClickHouse + +`clickhouse-local` is an easy-to-use version of ClickHouse that is ideal for developers who need to perform fast processing on local and remote files using SQL without having to install a full database server. With `clickhouse-local`, developers can use SQL commands (using the [ClickHouse SQL dialect](../../sql-reference/index.md)) directly from the command line, providing a simple and efficient way to access ClickHouse features without the need for a full ClickHouse installation. One of the main benefits of `clickhouse-local` is that it is already included when installing [clickhouse-client](https://clickhouse.com/docs/en/integrations/sql-clients/clickhouse-client-local). This means that developers can get started with `clickhouse-local` quickly, without the need for a complex installation process. + +While `clickhouse-local` is a great tool for development and testing purposes, and for processing files, it is not suitable for serving end users or applications. In these scenarios, it is recommended to use the open-source [ClickHouse](https://clickhouse.com/docs/en/install). ClickHouse is a powerful OLAP database that is designed to handle large-scale analytical workloads. It provides fast and efficient processing of complex queries on large datasets, making it ideal for use in production environments where high performance is critical. Additionally, ClickHouse offers a wide range of features such as replication, sharding, and high availability, which are essential for scaling up to handle large datasets and serving applications. If you need to handle larger datasets or serve end users or applications, we recommend using open-source ClickHouse instead of `clickhouse-local`. + +Please read the docs below that show example use cases for `clickhouse-local`, such as [querying local CSVs](#query-data-in-a-csv-file-using-sql) or [reading a parquet file in S3](#query-data-in-a-parquet-file-in-aws-s3).
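Of the use cases referenced above, the CSV one needs nothing beyond the binary itself; a sketch (the file name is a placeholder):

```sh
# Query a local CSV file without deploying a server; the schema is inferred from the header row.
clickhouse-local --query "SELECT count(*) FROM file('data.csv', CSVWithNames)"
```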
## Download clickhouse-local diff --git a/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md b/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md index e08e69b7cf6..afcf2a48c23 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md @@ -6,7 +6,7 @@ title: deltaSumTimestamp Adds the difference between consecutive rows. If the difference is negative, it is ignored. -This function is primarily for [materialized views](../../../sql-reference/statements/create/view.md#materialized) that are ordered by some time bucket-aligned timestamp, for example, a `toStartOfMinute` bucket. Because the rows in such a materialized view will all have the same timestamp, it is impossible for them to be merged in the "right" order. This function keeps track of the `timestamp` of the values it's seen, so it's possible to order the states correctly during merging. +This function is primarily for [materialized views](../../../sql-reference/statements/create/view.md#materialized) that store data ordered by some time bucket-aligned timestamp, for example, a `toStartOfMinute` bucket. Because the rows in such a materialized view will all have the same timestamp, it is impossible for them to be merged in the correct order, without storing the original, unrounded timestamp value. The `deltaSumTimestamp` function keeps track of the original `timestamp` of the values it's seen, so the values (states) of the function are correctly computed during merging of parts. To calculate the delta sum across an ordered collection you can simply use the [deltaSum](../../../sql-reference/aggregate-functions/reference/deltasum.md#agg_functions-deltasum) function. diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 71b7fa07f18..903654c2f0a 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1264,7 +1264,7 @@ Using replacement fields, you can define a pattern for the resulting string. “ | %d | day of the month, zero-padded (01-31) | 02 | | %D | Short MM/DD/YY date, equivalent to %m/%d/%y | 01/02/18 | | %e | day of the month, space-padded (1-31) |   2 | -| %f | fractional second from the fractional part of DateTime64 | 1234560 | +| %f | fractional second, see 'Note 1' below | 1234560 | | %F | short YYYY-MM-DD date, equivalent to %Y-%m-%d | 2018-01-02 | | %g | two-digit year format, aligned to ISO 8601, abbreviated from four-digit notation | 18 | | %G | four-digit year format for ISO week number, calculated from the week-based year [defined by the ISO 8601](https://en.wikipedia.org/wiki/ISO_8601#Week_dates) standard, normally useful only with %V | 2018 | @@ -1276,16 +1276,16 @@ Using replacement fields, you can define a pattern for the resulting string. 
“ | %k | hour in 24h format (00-23) | 22 | | %l | hour in 12h format (01-12) | 09 | | %m | month as an integer number (01-12) | 01 | -| %M | minute (00-59) | 33 | +| %M | full month name (January-December), see 'Note 2' below | January | | %n | new-line character (‘\n’) | | | %p | AM or PM designation | PM | | %Q | Quarter (1-4) | 1 | -| %r | 12-hour HH:MM AM/PM time, equivalent to %H:%M %p | 10:30 PM | -| %R | 24-hour HH:MM time, equivalent to %H:%M | 22:33 | +| %r | 12-hour HH:MM AM/PM time, equivalent to %H:%i %p | 10:30 PM | +| %R | 24-hour HH:MM time, equivalent to %H:%i | 22:33 | | %s | second (00-59) | 44 | | %S | second (00-59) | 44 | | %t | horizontal-tab character (‘\t’) | | -| %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S | 22:33:44 | +| %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%i:%S | 22:33:44 | | %u | ISO 8601 weekday as number with Monday as 1 (1-7) | 2 | | %V | ISO 8601 week number (01-53) | 01 | | %w | weekday as an integer number with Sunday as 0 (0-6) | 2 | @@ -1295,6 +1295,10 @@ Using replacement fields, you can define a pattern for the resulting string. “ | %z | Time offset from UTC as +HHMM or -HHMM | -0500 | | %% | a % sign | % | +Note 1: In ClickHouse versions earlier than v23.4, `%f` prints a single zero (0) if the formatted value is a Date, Date32 or DateTime (which have no fractional seconds) or a DateTime64 with a precision of 0. The previous behavior can be restored using the setting `formatdatetime_f_prints_single_zero = 1`. + +Note 2: In ClickHouse versions earlier than v23.4, `%M` prints the minute (00-59) instead of the full month name (January-December). The previous behavior can be restored using the setting `formatdatetime_parsedatetime_m_is_month_name = 0`. + **Example** Query: diff --git a/docs/en/sql-reference/functions/hash-functions.md index 2943ba13861..5fcf6a2d1df 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -441,11 +441,11 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 ## javaHash -Calculates JavaHash from a [string](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452), -[Byte](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Byte.java#l405), -[Short](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Short.java#l410), -[Integer](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Integer.java#l959), -[Long](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Long.java#l1060). +Calculates JavaHash from a [string](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452), +[Byte](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Byte.java#l405), +[Short](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Short.java#l410), +[Integer](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Integer.java#l959), +[Long](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Long.java#l1060). This hash function is neither fast nor of good quality.
**Example** Query: diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 2943ba13861..5fcf6a2d1df 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -441,11 +441,11 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 ## javaHash -Calculates JavaHash from a [string](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452), -[Byte](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Byte.java#l405), -[Short](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Short.java#l410), -[Integer](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Integer.java#l959), -[Long](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Long.java#l1060). +Calculates JavaHash from a [string](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452), +[Byte](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Byte.java#l405), +[Short](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Short.java#l410), +[Integer](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Integer.java#l959), +[Long](https://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/Long.java#l1060). This hash function is neither fast nor of good quality. The only reason to use it is when this algorithm is already used in another system and you have to calculate exactly the same result. Note that Java only supports calculating the hash of signed integers, so if you want to calculate the hash of unsigned integers you must cast them to the proper signed ClickHouse types. @@ -660,6 +660,45 @@ Result: └──────────────────────┴─────────────────────┘ ``` + +## kafkaMurmurHash + +Calculates a 32-bit [MurmurHash2](https://github.com/aappleby/smhasher) hash value using the same hash seed as [Kafka](https://github.com/apache/kafka/blob/461c5cfe056db0951d9b74f5adc45973670404d7/clients/src/main/java/org/apache/kafka/common/utils/Utils.java#L482), with the highest bit dropped for compatibility with [Default Partitioner](https://github.com/apache/kafka/blob/139f7709bd3f5926901a21e55043388728ccca78/clients/src/main/java/org/apache/kafka/clients/producer/internals/BuiltInPartitioner.java#L328). + +**Syntax** + +```sql +kafkaMurmurHash(par1, ...) +``` + +**Arguments** + +- `par1, ...` — A variable number of parameters that can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md/#data_types). + +**Returned value** + +- Calculated hash value. + +Type: [UInt32](/docs/en/sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +```sql +SELECT + kafkaMurmurHash('foobar') AS res1, + kafkaMurmurHash(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS res2 +``` + +Result: + +```response +┌───────res1─┬─────res2─┐ │ 1357151166 │ 85479775 │ └────────────┴──────────┘ ``` + ## murmurHash3_32, murmurHash3_64 Produces a [MurmurHash3](https://github.com/aappleby/smhasher) hash value. diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 50e15f70f5d..d4c7c451af2 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -13,17 +13,18 @@ Functions for [searching](../../sql-reference/functions/string-search-functions. ## replaceOne(haystack, pattern, replacement) Replaces the first occurrence of the substring ‘pattern’ (if it exists) in ‘haystack’ by the ‘replacement’ string. -‘pattern’ and ‘replacement’ must be constants. ## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement) Replaces all occurrences of the substring ‘pattern’ in ‘haystack’ by the ‘replacement’ string. +Alias: `replace`. + ## replaceRegexpOne(haystack, pattern, replacement) Replaces the first occurrence of the substring matching the regular expression ‘pattern’ in ‘haystack‘ by the ‘replacement‘ string. -‘pattern‘ must be a constant [re2 regular expression](https://github.com/google/re2/wiki/Syntax). -‘replacement’ must be a plain constant string or a constant string containing substitutions `\0-\9`. +‘pattern‘ must be a [re2 regular expression](https://github.com/google/re2/wiki/Syntax). +‘replacement’ must be a plain string or a string containing substitutions `\0-\9`. Substitutions `\1-\9` correspond to the 1st to 9th capturing group (submatch), substitution `\0` corresponds to the entire match. To use a verbatim `\` character in the ‘pattern‘ or ‘replacement‘ string, escape it using `\`. Also keep in mind that string literals require extra escaping. @@ -88,6 +89,8 @@ SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res └─────────────────────┘ ``` +Alias: `REGEXP_REPLACE`.
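+
+For example, the alias accepts the same capture-group substitutions (a sketch; the output shown is illustrative):
+
+```sql
+SELECT REGEXP_REPLACE('2023-04-05', '(\\d{4})-(\\d{2})-(\\d{2})', '\\3/\\2/\\1') AS res
+```
+
+```response
+┌─res────────┐
+│ 05/04/2023 │
+└────────────┘
+```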
+ ## regexpQuoteMeta(s) The function adds a backslash before some predefined characters in the string. diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 213ed187f15..5ce72caa3b9 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1245,7 +1245,6 @@ Returns DateTime values parsed from input string according to a MySQL style form **Supported format specifiers** All format specifiers listed in [formatDateTime](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime) except: -- %f: fractional second - %Q: Quarter (1-4) **Example** diff --git a/docs/en/sql-reference/statements/drop.md b/docs/en/sql-reference/statements/drop.md index 8a83a8fae1d..b6208c2fd52 100644 --- a/docs/en/sql-reference/statements/drop.md +++ b/docs/en/sql-reference/statements/drop.md @@ -22,6 +22,10 @@ DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] [SYNC] Deletes the table. +:::tip +Also see [UNDROP TABLE](/docs/en/sql-reference/statements/undrop.md) +::: + Syntax: ``` sql diff --git a/docs/en/sql-reference/statements/undrop.md b/docs/en/sql-reference/statements/undrop.md new file mode 100644 index 00000000000..40ac1ab4f99 --- /dev/null +++ b/docs/en/sql-reference/statements/undrop.md @@ -0,0 +1,99 @@ +--- +slug: /en/sql-reference/statements/undrop +sidebar_label: UNDROP +--- + +# UNDROP TABLE + +Cancels the dropping of the table. + +Beginning with ClickHouse version 23.3, it is possible to UNDROP a table in an Atomic database +within `database_atomic_delay_before_drop_table_sec` (8 minutes by default) of issuing the DROP TABLE statement. Dropped tables are listed in +a system table called `system.dropped_tables`. + +If you have a materialized view without a `TO` clause associated with the dropped table, then you will also have to UNDROP the inner table of that view. + +:::note +UNDROP TABLE is experimental. To use it, add this setting: +```sql +set allow_experimental_undrop_table_query = 1; +``` +::: + +:::tip +Also see [DROP TABLE](/docs/en/sql-reference/statements/drop.md) +::: + +Syntax: + +``` sql +UNDROP TABLE [db.]name [UUID '<uuid>'] [ON CLUSTER cluster] +``` + +**Example** + +``` sql +set allow_experimental_undrop_table_query = 1; +``` + +```sql +CREATE TABLE undropMe +( +    `id` UInt8 +) +ENGINE = MergeTree +ORDER BY id +``` + +```sql +DROP TABLE undropMe +``` +```sql +SELECT * +FROM system.dropped_tables +FORMAT Vertical +``` +```response +Row 1: +────── +index: 0 +database: default +table: undropMe +uuid: aa696a1a-1d70-4e60-a841-4c80827706cc +engine: MergeTree +metadata_dropped_path: /var/lib/clickhouse/metadata_dropped/default.undropMe.aa696a1a-1d70-4e60-a841-4c80827706cc.sql +table_dropped_time: 2023-04-05 14:12:12 + +1 row in set. Elapsed: 0.001 sec. +``` +```sql +UNDROP TABLE undropMe +``` +```response +Ok. +``` +```sql +SELECT * +FROM system.dropped_tables +FORMAT Vertical +``` +```response +Ok. + +0 rows in set. Elapsed: 0.001 sec.
+``` +```sql +DESCRIBE TABLE undropMe +FORMAT Vertical +``` +```response +Row 1: +────── +name: id +type: UInt8 +default_type: +default_expression: +comment: +codec_expression: +ttl_expression: +``` diff --git a/docs/en/sql-reference/syntax.md b/docs/en/sql-reference/syntax.md index 63c5042f9e8..ea2df235c1a 100644 --- a/docs/en/sql-reference/syntax.md +++ b/docs/en/sql-reference/syntax.md @@ -14,7 +14,7 @@ The `INSERT` query uses both parsers: INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def') ``` -The `INSERT INTO t VALUES` fragment is parsed by the full parser, and the data `(1, 'Hello, world'), (2, 'abc'), (3, 'def')` is parsed by the fast stream parser. You can also turn on the full parser for the data by using the [input_format_values_interpret_expressions](../operations/settings/settings-formats.md#settings-input_format_values_interpret_expressions) setting. When `input_format_values_interpret_expressions = 1`, ClickHouse first tries to parse values with the fast stream parser. If it fails, ClickHouse tries to use the full parser for the data, treating it like an SQL [expression](#syntax-expressions). +The `INSERT INTO t VALUES` fragment is parsed by the full parser, and the data `(1, 'Hello, world'), (2, 'abc'), (3, 'def')` is parsed by the fast stream parser. You can also turn on the full parser for the data by using the [input_format_values_interpret_expressions](../operations/settings/settings-formats.md#input_format_values_interpret_expressions) setting. When `input_format_values_interpret_expressions = 1`, ClickHouse first tries to parse values with the fast stream parser. If it fails, ClickHouse tries to use the full parser for the data, treating it like an SQL [expression](#expressions). Data can have any format. When a query is received, the server calculates no more than [max_query_size](../operations/settings/settings.md#settings-max_query_size) bytes of the request in RAM (by default, 1 MB), and the rest is stream parsed. It allows for avoiding issues with large `INSERT` queries. @@ -45,7 +45,7 @@ You can check whether a data type name is case-sensitive in the [system.data_typ In contrast to standard SQL, all other keywords (including functions names) are **case-sensitive**. -Keywords are not reserved; they are treated as such only in the corresponding context. If you use [identifiers](#syntax-identifiers) with the same name as the keywords, enclose them into double-quotes or backticks. For example, the query `SELECT "FROM" FROM table_name` is valid if the table `table_name` has column with the name `"FROM"`. +Keywords are not reserved; they are treated as such only in the corresponding context. If you use [identifiers](#identifiers) with the same name as the keywords, enclose them into double-quotes or backticks. For example, the query `SELECT "FROM" FROM table_name` is valid if the table `table_name` has column with the name `"FROM"`. ## Identifiers @@ -54,7 +54,7 @@ Identifiers are: - Cluster, database, table, partition, and column names. - Functions. - Data types. -- [Expression aliases](#syntax-expression_aliases). +- [Expression aliases](#expression_aliases). Identifiers can be quoted or non-quoted. The latter is preferred. @@ -108,7 +108,7 @@ Depending on the data format (input or output), `NULL` may have a different repr There are many nuances to processing `NULL`. For example, if at least one of the arguments of a comparison operation is `NULL`, the result of this operation is also `NULL`. 
The same is true for multiplication, addition, and other operations. For more information, read the documentation for each operation. -In queries, you can check `NULL` using the [IS NULL](../sql-reference/operators/index.md#operator-is-null) and [IS NOT NULL](../sql-reference/operators/index.md) operators and the related functions `isNull` and `isNotNull`. +In queries, you can check `NULL` using the [IS NULL](../sql-reference/operators/index.md#is-null) and [IS NOT NULL](../sql-reference/operators/index.md#is-not-null) operators and the related functions `isNull` and `isNotNull`. ### Heredoc @@ -149,7 +149,7 @@ For example, the following SQL defines parameters named `a`, `b`, `c` and `d` - SET param_a = 13; SET param_b = 'str'; SET param_c = '2022-08-04 18:30:53'; -SET param_d = {'10': [11, 12], '13': [14, 15]}'; +SET param_d = {'10': [11, 12], '13': [14, 15]}; SELECT {a: UInt32}, @@ -166,7 +166,7 @@ Result: If you are using `clickhouse-client`, the parameters are specified as `--param_name=value`. For example, the following parameter has the name `message` and it is retrieved as a `String`: -```sql +```bash clickhouse-client --param_message='hello' --query="SELECT {message: String}" ``` @@ -190,7 +190,7 @@ Query parameters are not general text substitutions which can be used in arbitra ## Functions Function calls are written like an identifier with a list of arguments (possibly empty) in round brackets. In contrast to standard SQL, the brackets are required, even for an empty argument list. Example: `now()`. -There are regular and aggregate functions (see the section “Aggregate functions”). Some aggregate functions can contain two lists of arguments in brackets. Example: `quantile (0.9) (x)`. These aggregate functions are called “parametric” functions, and the arguments in the first list are called “parameters”. The syntax of aggregate functions without parameters is the same as for regular functions. +There are regular and aggregate functions (see the section [Aggregate functions](/docs/en/sql-reference/aggregate-functions/index.md)). Some aggregate functions can contain two lists of arguments in brackets. Example: `quantile (0.9) (x)`. These aggregate functions are called “parametric” functions, and the arguments in the first list are called “parameters”. The syntax of aggregate functions without parameters is the same as for regular functions. ## Operators @@ -199,7 +199,7 @@ For example, the expression `1 + 2 * 3 + 4` is transformed to `plus(plus(1, mult ## Data Types and Database Table Engines -Data types and table engines in the `CREATE` query are written the same way as identifiers or functions. In other words, they may or may not contain an argument list in brackets. For more information, see the sections “Data types,” “Table engines,” and “CREATE”. +Data types and table engines in the `CREATE` query are written the same way as identifiers or functions. In other words, they may or may not contain an argument list in brackets. For more information, see the sections [Data types](/docs/en/sql-reference/data-types/index.md), [Table engines](/docs/en/engines/table-engines/index.md), and [CREATE](/docs/en/sql-reference/statements/create/index.md). ## Expression Aliases @@ -211,17 +211,17 @@ expr AS alias - `AS` — The keyword for defining aliases. You can define the alias for a table name or a column name in a `SELECT` clause without using the `AS` keyword. - For example, `SELECT table_name_alias.column_name FROM table_name table_name_alias`. 
+ For example, `SELECT table_name_alias.column_name FROM table_name table_name_alias`. - In the [CAST](./functions/type-conversion-functions.md#type_conversion_function-cast) function, the `AS` keyword has another meaning. See the description of the function. + In the [CAST](./functions/type-conversion-functions.md#castx-t) function, the `AS` keyword has another meaning. See the description of the function. - `expr` — Any expression supported by ClickHouse. - For example, `SELECT column_name * 2 AS double FROM some_table`. + For example, `SELECT column_name * 2 AS double FROM some_table`. -- `alias` — Name for `expr`. Aliases should comply with the [identifiers](#syntax-identifiers) syntax. +- `alias` — Name for `expr`. Aliases should comply with the [identifiers](#identifiers) syntax. - For example, `SELECT "table t".column_name FROM table_name AS "table t"`. + For example, `SELECT "table t".column_name FROM table_name AS "table t"`. ### Notes on Usage @@ -254,11 +254,11 @@ Received exception from server (version 18.14.17): Code: 184. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: Aggregate function sum(b) is found inside another aggregate function in query. ``` -In this example, we declared table `t` with column `b`. Then, when selecting data, we defined the `sum(b) AS b` alias. As aliases are global, ClickHouse substituted the literal `b` in the expression `argMax(a, b)` with the expression `sum(b)`. This substitution caused the exception. You can change this default behavior by setting [prefer_column_name_to_alias](../operations/settings/settings.md#prefer_column_name_to_alias) to `1`. +In this example, we declared table `t` with column `b`. Then, when selecting data, we defined the `sum(b) AS b` alias. As aliases are global, ClickHouse substituted the literal `b` in the expression `argMax(a, b)` with the expression `sum(b)`. This substitution caused the exception. You can change this default behavior by setting [prefer_column_name_to_alias](../operations/settings/settings.md#prefer-column-name-to-alias) to `1`. ## Asterisk -In a `SELECT` query, an asterisk can replace the expression. For more information, see the section “SELECT”. +In a `SELECT` query, an asterisk can replace the expression. For more information, see the section [SELECT](/docs/en/sql-reference/statements/select/index.md#asterisk). ## Expressions diff --git a/docs/en/sql-reference/table-functions/executable.md b/docs/en/sql-reference/table-functions/executable.md index 22c74eb8cfa..5a24c3ab11d 100644 --- a/docs/en/sql-reference/table-functions/executable.md +++ b/docs/en/sql-reference/table-functions/executable.md @@ -20,7 +20,7 @@ A key advantage between ordinary UDF functions and the `executable` table functi The `executable` table function requires three parameters and accepts an optional list of input queries: ```sql -executable(script_name, format, structure, [input_query...]) +executable(script_name, format, structure, [input_query...] [,SETTINGS ...]) ``` - `script_name`: the file name of the script, saved in the `user_scripts` folder (the default folder of the `user_scripts_path` setting) @@ -83,6 +83,15 @@ The response looks like: └────┴────────────┘ ``` +## Settings + +- `send_chunk_header` - controls whether to send a row count before sending a chunk of data to process. Default value is `false`. +- `pool_size` — Size of the pool. If `0` is specified as `pool_size`, there are no pool size restrictions. Default value is `16`. +- `max_command_execution_time` — Maximum executable script command execution time for processing a block of data. Specified in seconds. Default value is 10. +- `command_termination_timeout` — The executable script should contain a main read-write loop. After the table function is destroyed, the pipe is closed, and the executable will have `command_termination_timeout` seconds to shut down before ClickHouse sends a SIGTERM signal to the child process. Specified in seconds. Default value is 10. +- `command_read_timeout` - Timeout for reading data from the command's stdout, in milliseconds. Default value is 10000. +- `command_write_timeout` - Timeout for writing data to the command's stdin, in milliseconds. Default value is 10000. +
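+For example, settings can be appended after the input queries (a sketch; the script name and schema are hypothetical):
+
+```sql
+SELECT * FROM executable(
+    'my_script.py', TabSeparated, 'id UInt64, value String',
+    SETTINGS send_chunk_header = 1, pool_size = 4, command_read_timeout = 30000
+);
+```
+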
## Passing Query Results to a Script Be sure to check out the example in the `Executable` table engine on [how to pass query results to a script](../../engines/table-engines/special/executable.md#passing-query-results-to-a-script). Here is how you execute the same script in that example using the `executable` table function: ```sql SELECT * FROM executable( 'sentiment.py', TabSeparated, 'id UInt64, sentiment Float32', (SELECT id, comment FROM hackernews WHERE id > 0 AND comment != '' LIMIT 20) ); -``` \ No newline at end of file +``` diff --git a/docs/ru/development/developer-instruction.md b/docs/ru/development/developer-instruction.md index 80472178ae2..7294bc2ae87 100644 --- a/docs/ru/development/developer-instruction.md +++ b/docs/ru/development/developer-instruction.md @@ -41,9 +41,15 @@ ClickHouse does not run or build on 32-bit systems. Run in the terminal: - git clone git@github.com:your_github_username/ClickHouse.git --recursive + git clone --shallow-submodules git@github.com:your_github_username/ClickHouse.git cd ClickHouse +Or (if you want to use sparse checkout for submodules): + + git clone git@github.com:your_github_username/ClickHouse.git + cd ClickHouse + ./contrib/update-submodules.sh + Replace `your_github_username` in the git command with your GitHub account name. This command creates a ClickHouse directory containing a working copy of the project. diff --git a/docs/ru/sql-reference/aggregate-functions/reference/deltasumtimestamp.md b/docs/ru/sql-reference/aggregate-functions/reference/deltasumtimestamp.md index 7be933d67d7..50434419651 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/deltasumtimestamp.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/deltasumtimestamp.md @@ -7,7 +7,7 @@ sidebar_position: 141 Sums the difference between consecutive rows. If the difference is negative, it is ignored. -This function is primarily intended for [materialized views](../../../sql-reference/statements/create/view.md#materialized) ordered by some time bucket according to a timestamp, for example, a `toStartOfMinute` bucket. Because rows in such a materialized view all have the same timestamp, it is impossible to merge them in the "right" order. The function tracks the `timestamp` of the observed values, so the states can be ordered correctly during merging. +This function is primarily intended for [materialized views](../../../sql-reference/statements/create/view.md#materialized) that store data ordered by some rounded time interval according to a timestamp, for example, a `toStartOfMinute` bucket. Because rows in such a materialized view all have the same timestamp, it is impossible to merge them in the correct order without storing the original, unrounded timestamp value. The `deltaSumTimestamp` function tracks the original `timestamp` values it has observed, so the values (states) of the function are computed correctly during the merging of parts. To compute the difference between ordered consecutive rows, you can use the [deltaSum](../../../sql-reference/aggregate-functions/reference/deltasum.md#agg_functions-deltasum) function instead of `deltaSumTimestamp`.
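+
+For example (a sketch; the data is synthetic and the result is illustrative):
+
+```sql
+SELECT deltaSumTimestamp(value, timestamp)
+FROM (SELECT number AS timestamp, [0, 4, 8, 3, 0, 0, 0, 1, 3, 5][number + 1] AS value FROM numbers(10));
+```
+
+```response
+┌─deltaSumTimestamp(value, timestamp)─┐
+│                                  13 │
+└─────────────────────────────────────┘
+```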
diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 660b8d7c00a..df0abceb8c6 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -277,11 +277,11 @@ void Client::initialize(Poco::Util::Application & self) */ const char * env_user = getenv("CLICKHOUSE_USER"); // NOLINT(concurrency-mt-unsafe) - if (env_user) + if (env_user && !config().has("user")) config().setString("user", env_user); const char * env_password = getenv("CLICKHOUSE_PASSWORD"); // NOLINT(concurrency-mt-unsafe) - if (env_password) + if (env_password && !config().has("password")) config().setString("password", env_password); parseConnectionsCredentials(); diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index b142159fbdf..d83e189f7ef 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -375,15 +375,22 @@ int mainEntryClickHouseInstall(int argc, char ** argv) try { - ReadBufferFromFile in(binary_self_path.string()); - WriteBufferFromFile out(main_bin_tmp_path.string()); - copyData(in, out); - out.sync(); + String source = binary_self_path.string(); + String destination = main_bin_tmp_path.string(); - if (0 != fchmod(out.getFD(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) + /// Try to make a hard link first, as an optimization. + /// It is possible if the source and the destination are on the same filesystem. + if (0 != link(source.c_str(), destination.c_str())) + { + ReadBufferFromFile in(binary_self_path.string()); + WriteBufferFromFile out(main_bin_tmp_path.string()); + copyData(in, out); + out.sync(); + out.finalize(); + } + + if (0 != chmod(destination.c_str(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) throwFromErrno(fmt::format("Cannot chmod {}", main_bin_tmp_path.string()), ErrorCodes::SYSTEM_ERROR); - - out.finalize(); } catch (const Exception & e) { diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 266b363eb47..3853c955171 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 9ef9f704f61..164e1ce14e5 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -981,7 +981,7 @@ try StatusFile status{path / "status", StatusFile::write_full_info}; - DB::ServerUUID::load(path / "uuid", log); + ServerUUID::load(path / "uuid", log); /// Try to increase limit on number of open files.
{ diff --git a/src/Access/DiskAccessStorage.cpp b/src/Access/DiskAccessStorage.cpp index ef88e8a225f..710cf257b95 100644 --- a/src/Access/DiskAccessStorage.cpp +++ b/src/Access/DiskAccessStorage.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,7 @@ #include #include #include +#include namespace DB @@ -317,15 +319,15 @@ void DiskAccessStorage::scheduleWriteLists(AccessEntityType type) return; /// If the lists' writing thread is still waiting we can update `types_of_lists_to_write` easily, /// without restarting that thread. - if (lists_writing_thread.joinable()) - lists_writing_thread.join(); + if (lists_writing_thread && lists_writing_thread->joinable()) + lists_writing_thread->join(); /// Create the 'need_rebuild_lists.mark' file. /// This file will be used later to find out if writing lists is successful or not. std::ofstream out{getNeedRebuildListsMarkFilePath(directory_path)}; out.close(); - lists_writing_thread = ThreadFromGlobalPool{&DiskAccessStorage::listsWritingThreadFunc, this}; + lists_writing_thread = std::make_unique<ThreadFromGlobalPool>(&DiskAccessStorage::listsWritingThreadFunc, this); lists_writing_thread_is_waiting = true; } @@ -349,10 +351,10 @@ void DiskAccessStorage::listsWritingThreadFunc() void DiskAccessStorage::stopListsWritingThread() { - if (lists_writing_thread.joinable()) + if (lists_writing_thread && lists_writing_thread->joinable()) { lists_writing_thread_should_exit.notify_one(); - lists_writing_thread.join(); + lists_writing_thread->join(); } } diff --git a/src/Access/DiskAccessStorage.h b/src/Access/DiskAccessStorage.h index b1ef1d10ba7..069a966c8e9 100644 --- a/src/Access/DiskAccessStorage.h +++ b/src/Access/DiskAccessStorage.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include @@ -81,7 +81,7 @@ private: bool failed_to_write_lists TSA_GUARDED_BY(mutex) = false; /// List files are written in a separate thread. - ThreadFromGlobalPool lists_writing_thread; + std::unique_ptr<ThreadFromGlobalPool> lists_writing_thread; /// Signals `lists_writing_thread` to exit. std::condition_variable lists_writing_thread_should_exit; diff --git a/src/Access/ReplicatedAccessStorage.cpp b/src/Access/ReplicatedAccessStorage.cpp index ddc5e8bfed1..f34e6728ab3 100644 --- a/src/Access/ReplicatedAccessStorage.cpp +++ b/src/Access/ReplicatedAccessStorage.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -15,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +74,7 @@ void ReplicatedAccessStorage::startWatchingThread() { bool prev_watching_flag = watching.exchange(true); if (!prev_watching_flag) - watching_thread = ThreadFromGlobalPool(&ReplicatedAccessStorage::runWatchingThread, this); + watching_thread = std::make_unique<ThreadFromGlobalPool>(&ReplicatedAccessStorage::runWatchingThread, this); } void ReplicatedAccessStorage::stopWatchingThread() @@ -81,8 +83,8 @@ void ReplicatedAccessStorage::stopWatchingThread() if (prev_watching_flag) { watched_queue->finish(); - if (watching_thread.joinable()) - watching_thread.join(); + if (watching_thread && watching_thread->joinable()) + watching_thread->join(); } } diff --git a/src/Access/ReplicatedAccessStorage.h b/src/Access/ReplicatedAccessStorage.h index d9d4b628f8d..555d58e6b04 100644 --- a/src/Access/ReplicatedAccessStorage.h +++ b/src/Access/ReplicatedAccessStorage.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include #include @@ -21,7 +21,7 @@ public: static constexpr char STORAGE_TYPE[] = "replicated"; ReplicatedAccessStorage(const String & storage_name, const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper, AccessChangesNotifier & changes_notifier_, bool allow_backup); - virtual ~ReplicatedAccessStorage() override; + ~ReplicatedAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } @@ -43,7 +43,7 @@ private: std::mutex cached_zookeeper_mutex; std::atomic watching = false; - ThreadFromGlobalPool watching_thread; + std::unique_ptr<ThreadFromGlobalPool> watching_thread; std::shared_ptr<ConcurrentBoundedQueue<UUID>> watched_queue; std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; diff --git a/src/AggregateFunctions/AggregateFunctionMaxIntersections.h b/src/AggregateFunctions/AggregateFunctionMaxIntersections.h index 2c54293eeec..5074e491f60 100644 --- a/src/AggregateFunctions/AggregateFunctionMaxIntersections.h +++ b/src/AggregateFunctions/AggregateFunctionMaxIntersections.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include diff --git a/src/AggregateFunctions/AggregateFunctionSparkbar.h b/src/AggregateFunctions/AggregateFunctionSparkbar.h index 78f7e9fcefa..30e107bc4db 100644 --- a/src/AggregateFunctions/AggregateFunctionSparkbar.h +++ b/src/AggregateFunctions/AggregateFunctionSparkbar.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index f51ec423c69..b30f5ff5220 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/src/AggregateFunctions/IAggregateFunction.h b/src/AggregateFunctions/IAggregateFunction.h index 4a050a58600..ddc0535d0e4 100644 --- a/src/AggregateFunctions/IAggregateFunction.h +++ b/src/AggregateFunctions/IAggregateFunction.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include "config.h" diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index
3b27561450b..d62c9af366c 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -32,6 +32,7 @@ #include #include +#include #include @@ -75,6 +76,7 @@ #include #include #include +#include namespace ProfileEvents { @@ -112,6 +114,8 @@ namespace ErrorCodes extern const int ALIAS_REQUIRED; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int UNKNOWN_TABLE; + extern const int ILLEGAL_COLUMN; + extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; } /** Query analyzer implementation overview. Please check documentation in QueryAnalysisPass.h before. @@ -6079,6 +6083,18 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table scope.table_expression_node_to_data.emplace(table_expression_node, std::move(table_expression_data)); } +bool findIdentifier(const FunctionNode & function) +{ + for (const auto & argument : function.getArguments()) + { + if (argument->as()) + return true; + if (const auto * f = argument->as(); f && findIdentifier(*f)) + return true; + } + return false; +} + /// Resolve table function node in scope void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, IdentifierResolveScope & scope, @@ -6090,12 +6106,11 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, if (!nested_table_function) expressions_visitor.visit(table_function_node_typed.getArgumentsNode()); - const auto & table_function_factory = TableFunctionFactory::instance(); const auto & table_function_name = table_function_node_typed.getTableFunctionName(); auto & scope_context = scope.context; - TableFunctionPtr table_function_ptr = table_function_factory.tryGet(table_function_name, scope_context); + TableFunctionPtr table_function_ptr = TableFunctionFactory::instance().tryGet(table_function_name, scope_context); if (!table_function_ptr) { auto hints = TableFunctionFactory::instance().getHints(table_function_name); @@ -6110,17 +6125,131 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, table_function_name); } + uint64_t use_structure_from_insertion_table_in_table_functions = scope_context->getSettingsRef().use_structure_from_insertion_table_in_table_functions; if (!nested_table_function && - scope_context->getSettingsRef().use_structure_from_insertion_table_in_table_functions && + use_structure_from_insertion_table_in_table_functions && scope_context->hasInsertionTable() && table_function_ptr->needStructureHint()) { const auto & insertion_table = scope_context->getInsertionTable(); if (!insertion_table.empty()) { - auto insertion_table_storage = DatabaseCatalog::instance().getTable(insertion_table, scope_context); - const auto & structure_hint = insertion_table_storage->getInMemoryMetadataPtr()->columns; - table_function_ptr->setStructureHint(structure_hint); + const auto & insert_structure = DatabaseCatalog::instance().getTable(insertion_table, scope_context)->getInMemoryMetadataPtr()->getColumns(); + DB::ColumnsDescription structure_hint; + + bool use_columns_from_insert_query = true; + + /// Insert table matches columns against SELECT expression by position, so we want to map + /// insert table columns to table function columns through names from SELECT expression. 
+ + auto insert_column = insert_structure.begin(); + auto insert_structure_end = insert_structure.end(); /// end iterator of the range covered by possible asterisk + auto virtual_column_names = table_function_ptr->getVirtualsToCheckBeforeUsingStructureHint(); + bool asterisk = false; + const auto & expression_list = scope.scope_node->as().getProjection(); + auto expression = expression_list.begin(); + + /// We want to go through SELECT expression list and correspond each expression to column in insert table + /// which type will be used as a hint for the file structure inference. + for (; expression != expression_list.end() && insert_column != insert_structure_end; ++expression) + { + if (auto * identifier_node = (*expression)->as()) + { + + if (!virtual_column_names.contains(identifier_node->getIdentifier().getFullName())) + { + if (asterisk) + { + if (use_structure_from_insertion_table_in_table_functions == 1) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Asterisk cannot be mixed with column list in INSERT SELECT query."); + + use_columns_from_insert_query = false; + break; + } + + structure_hint.add({ identifier_node->getIdentifier().getFullName(), insert_column->type }); + } + + /// Once we hit asterisk we want to find end of the range covered by asterisk + /// contributing every further SELECT expression to the tail of insert structure + if (asterisk) + --insert_structure_end; + else + ++insert_column; + } + else if (auto * matcher_node = (*expression)->as(); matcher_node && matcher_node->getMatcherType() == MatcherNodeType::ASTERISK) + { + if (asterisk) + { + if (use_structure_from_insertion_table_in_table_functions == 1) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Only one asterisk can be used in INSERT SELECT query."); + + use_columns_from_insert_query = false; + break; + } + if (!structure_hint.empty()) + { + if (use_structure_from_insertion_table_in_table_functions == 1) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Asterisk cannot be mixed with column list in INSERT SELECT query."); + + use_columns_from_insert_query = false; + break; + } + + asterisk = true; + } + else if (auto * function = (*expression)->as()) + { + if (use_structure_from_insertion_table_in_table_functions == 2 && findIdentifier(*function)) + { + use_columns_from_insert_query = false; + break; + } + + /// Once we hit asterisk we want to find end of the range covered by asterisk + /// contributing every further SELECT expression to the tail of insert structure + if (asterisk) + --insert_structure_end; + else + ++insert_column; + } + else + { + /// Once we hit asterisk we want to find end of the range covered by asterisk + /// contributing every further SELECT expression to the tail of insert structure + if (asterisk) + --insert_structure_end; + else + ++insert_column; + } + } + + if (use_structure_from_insertion_table_in_table_functions == 2 && !asterisk) + { + /// For input function we should check if input format supports reading subset of columns. 
+ if (table_function_ptr->getName() == "input") + use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(scope.context->getInsertFormat()); + else + use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns(); + } + + if (use_columns_from_insert_query) + { + if (expression == expression_list.end()) + { + /// Append tail of insert structure to the hint + if (asterisk) + { + for (; insert_column != insert_structure_end; ++insert_column) + structure_hint.add({ insert_column->name, insert_column->type }); + } + + if (!structure_hint.empty()) + table_function_ptr->setStructureHint(structure_hint); + + } else if (use_structure_from_insertion_table_in_table_functions == 1) + throw Exception(ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH, "Number of columns in insert table less than required by SELECT expression."); + } } } diff --git a/src/Backups/BackupCoordinationFileInfos.cpp b/src/Backups/BackupCoordinationFileInfos.cpp index 44f00f6c543..eead742b510 100644 --- a/src/Backups/BackupCoordinationFileInfos.cpp +++ b/src/Backups/BackupCoordinationFileInfos.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index f885cd0e6b4..8e6b5db91b1 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -771,16 +771,19 @@ bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic &) String existing_backup_uuid = existing_backup_path; existing_backup_uuid.erase(0, String("backup-").size()); - if (existing_backup_uuid == toString(backup_uuid)) continue; - const auto status = zk->get(root_zookeeper_path + "/" + existing_backup_path + "/stage"); - if (status != Stage::COMPLETED) + String status; + if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status)) { - LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid)); - result = true; - return; + /// If status is not COMPLETED it could be because the backup failed, check if 'error' exists + if (status != Stage::COMPLETED && !zk->exists(root_zookeeper_path + "/" + existing_backup_path + "/error")) + { + LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid)); + result = true; + return; + } } } diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index 1adc4d41fee..ab836487ec0 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace fs = std::filesystem; diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.cpp b/src/Backups/BackupEntryFromAppendOnlyFile.cpp index 9bab101bc35..5384a69d890 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.cpp +++ b/src/Backups/BackupEntryFromAppendOnlyFile.cpp @@ -8,10 +8,11 @@ namespace DB BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile( const DiskPtr & disk_, const String & file_path_, + const ReadSettings & settings_, const std::optional & file_size_, const std::optional & checksum_, const std::shared_ptr & temporary_file_) - : BackupEntryFromImmutableFile(disk_, file_path_, file_size_, checksum_, temporary_file_) + : BackupEntryFromImmutableFile(disk_, file_path_, settings_, file_size_, checksum_, temporary_file_) , limit(BackupEntryFromImmutableFile::getSize()) { } diff --git 
a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index c6055b86268..b0cee38c6be 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -16,6 +16,7 @@ public: BackupEntryFromAppendOnlyFile( const DiskPtr & disk_, const String & file_path_, + const ReadSettings & settings_, const std::optional & file_size_ = {}, const std::optional & checksum_ = {}, const std::shared_ptr & temporary_file_ = {}); diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp b/src/Backups/BackupEntryFromImmutableFile.cpp index 86b9c13fb9a..48783a3bb63 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -11,10 +11,16 @@ namespace DB BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( const DiskPtr & disk_, const String & file_path_, + const ReadSettings & settings_, const std::optional & file_size_, const std::optional & checksum_, const std::shared_ptr & temporary_file_) - : disk(disk_), file_path(file_path_), file_size(file_size_), checksum(checksum_), temporary_file_on_disk(temporary_file_) + : disk(disk_) + , file_path(file_path_) + , settings(settings_) + , file_size(file_size_) + , checksum(checksum_) + , temporary_file_on_disk(temporary_file_) { } @@ -30,7 +36,7 @@ UInt64 BackupEntryFromImmutableFile::getSize() const std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer() const { - return disk->readFile(file_path); + return disk->readFile(file_path, settings); } diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index 99241c691cb..66f1fade294 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -19,6 +20,7 @@ public: BackupEntryFromImmutableFile( const DiskPtr & disk_, const String & file_path_, + const ReadSettings & settings_, const std::optional & file_size_ = {}, const std::optional & checksum_ = {}, const std::shared_ptr & temporary_file_ = {}); @@ -37,6 +39,7 @@ public: private: const DiskPtr disk; const String file_path; + ReadSettings settings; mutable std::optional file_size TSA_GUARDED_BY(get_file_size_mutex); mutable std::mutex get_file_size_mutex; const std::optional checksum; diff --git a/src/Backups/BackupFileInfo.cpp b/src/Backups/BackupFileInfo.cpp index 24548ca05fe..5a3076d1647 100644 --- a/src/Backups/BackupFileInfo.cpp +++ b/src/Backups/BackupFileInfo.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include diff --git a/src/Backups/BackupFileInfo.h b/src/Backups/BackupFileInfo.h index 96df8ab2e0b..ae6ec83a37b 100644 --- a/src/Backups/BackupFileInfo.h +++ b/src/Backups/BackupFileInfo.h @@ -1,8 +1,9 @@ #pragma once #include -#include +#include +namespace Poco { class Logger; } namespace DB { diff --git a/src/Backups/BackupIO.cpp b/src/Backups/BackupIO.cpp index cc252c2f1bd..f78e6df23a8 100644 --- a/src/Backups/BackupIO.cpp +++ b/src/Backups/BackupIO.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -22,6 +23,11 @@ void IBackupReader::copyFileToDisk(const String & file_name, size_t size, DiskPt write_buffer->finalize(); } +IBackupWriter::IBackupWriter(const ContextPtr & context_) + : read_settings(context_->getBackupReadSettings()) + , has_throttling(static_cast(context_->getBackupsThrottler())) +{} + void IBackupWriter::copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const 
String & dest_file_name) { auto read_buffer = create_read_buffer(); diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h index cf3d29ee51e..aef9c14e83e 100644 --- a/src/Backups/BackupIO.h +++ b/src/Backups/BackupIO.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include namespace DB { @@ -28,6 +30,8 @@ class IBackupWriter /// BackupWriterFile, BackupWriterDisk public: using CreateReadBufferFunction = std::function()>; + explicit IBackupWriter(const ContextPtr & context_); + virtual ~IBackupWriter() = default; virtual bool fileExists(const String & file_name) = 0; virtual UInt64 getFileSize(const String & file_name) = 0; @@ -38,7 +42,17 @@ public: virtual DataSourceDescription getDataSourceDescription() const = 0; virtual void copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name); virtual bool supportNativeCopy(DataSourceDescription /* data_source_description */) const { return false; } + + /// Copy file using native copy (optimized for S3 to use CopyObject) + /// + /// NOTE: It still may fall back to copyDataToFile() if native copy is not possible: + /// - different buckets + /// - throttling had been requested virtual void copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name); + +protected: + const ReadSettings read_settings; + const bool has_throttling; }; } diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index cc6076541d0..10d7a572f6b 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -50,7 +50,10 @@ void BackupReaderDisk::copyFileToDisk(const String & file_name, size_t size, Dis } -BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_) : disk(disk_), path(path_) +BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_, const ContextPtr & context_) + : IBackupWriter(context_) + , disk(disk_) + , path(path_) { } @@ -127,9 +130,9 @@ void BackupWriterDisk::copyFileNative(DiskPtr src_disk, const String & src_file_ if (!src_disk) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot natively copy data to disk without source disk"); - if ((src_offset != 0) || (src_size != src_disk->getFileSize(src_file_name))) + if (has_throttling || (src_offset != 0) || (src_size != src_disk->getFileSize(src_file_name))) { - auto create_read_buffer = [src_disk, src_file_name] { return src_disk->readFile(src_file_name); }; + auto create_read_buffer = [this, src_disk, src_file_name] { return src_disk->readFile(src_file_name, read_settings); }; copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); return; } diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h index 600e4f8ff39..be34847000d 100644 --- a/src/Backups/BackupIO_Disk.h +++ b/src/Backups/BackupIO_Disk.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -30,7 +31,7 @@ private: class BackupWriterDisk : public IBackupWriter { public: - BackupWriterDisk(const DiskPtr & disk_, const String & path_); + BackupWriterDisk(const DiskPtr & disk_, const String & path_, const ContextPtr & context_); ~BackupWriterDisk() override; bool fileExists(const String & file_name) override; diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 5bf6d54928d..d4c9d0cb210 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -49,7 +49,9 @@ void BackupReaderFile::copyFileToDisk(const String & file_name, size_t 
size, Dis } -BackupWriterFile::BackupWriterFile(const String & path_) : path(path_) +BackupWriterFile::BackupWriterFile(const String & path_, const ContextPtr & context_) + : IBackupWriter(context_) + , path(path_) { } @@ -152,9 +154,9 @@ void BackupWriterFile::copyFileNative(DiskPtr src_disk, const String & src_file_ else abs_source_path = fs::absolute(src_file_name); - if ((src_offset != 0) || (src_size != fs::file_size(abs_source_path))) + if (has_throttling || (src_offset != 0) || (src_size != fs::file_size(abs_source_path))) { - auto create_read_buffer = [abs_source_path] { return createReadBufferFromFileBase(abs_source_path, {}); }; + auto create_read_buffer = [this, abs_source_path] { return createReadBufferFromFileBase(abs_source_path, read_settings); }; copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); return; } diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h index e1f4324a39f..d4b6e13d546 100644 --- a/src/Backups/BackupIO_File.h +++ b/src/Backups/BackupIO_File.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -27,7 +28,7 @@ private: class BackupWriterFile : public IBackupWriter { public: - explicit BackupWriterFile(const String & path_); + explicit BackupWriterFile(const String & path_, const ContextPtr & context_); ~BackupWriterFile() override; bool fileExists(const String & file_name) override; diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index f7d518b064d..ba5ba170427 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -161,9 +161,9 @@ void BackupReaderS3::copyFileToDisk(const String & file_name, size_t size, DiskP BackupWriterS3::BackupWriterS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) - : s3_uri(s3_uri_) + : IBackupWriter(context_) + , s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) - , read_settings(context_->getReadSettings()) , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) , log(&Poco::Logger::get("BackupWriterS3")) { @@ -189,7 +189,7 @@ void BackupWriterS3::copyFileNative(DiskPtr src_disk, const String & src_file_na auto objects = src_disk->getStorageObjects(src_file_name); if (objects.size() > 1) { - auto create_read_buffer = [src_disk, src_file_name] { return src_disk->readFile(src_file_name); }; + auto create_read_buffer = [this, src_disk, src_file_name] { return src_disk->readFile(src_file_name, read_settings); }; copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); } else diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index 94e61248428..c32a6b48660 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB @@ -76,7 +77,6 @@ private: S3::URI s3_uri; std::shared_ptr client; - ReadSettings read_settings; S3Settings::RequestSettings request_settings; Poco::Logger * log; std::optional supports_batch_delete; diff --git a/src/Backups/BackupUtils.h b/src/Backups/BackupUtils.h index f451b003652..3dc0a58d304 100644 --- a/src/Backups/BackupUtils.h +++ b/src/Backups/BackupUtils.h @@ -1,7 +1,6 @@ #pragma once #include -#include namespace DB diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 58f0b3effc5..4b17174a8de 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -23,6 +23,7 @@ #include #include #include 
+#include namespace CurrentMetrics @@ -182,8 +183,8 @@ namespace BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_) - : backups_thread_pool(CurrentMetrics::BackupsThreads, CurrentMetrics::BackupsThreadsActive, num_backup_threads, /* max_free_threads = */ 0, num_backup_threads) - , restores_thread_pool(CurrentMetrics::RestoreThreads, CurrentMetrics::RestoreThreadsActive, num_restore_threads, /* max_free_threads = */ 0, num_restore_threads) + : backups_thread_pool(std::make_unique<ThreadPool>(CurrentMetrics::BackupsThreads, CurrentMetrics::BackupsThreadsActive, num_backup_threads, /* max_free_threads = */ 0, num_backup_threads)) + , restores_thread_pool(std::make_unique<ThreadPool>(CurrentMetrics::RestoreThreads, CurrentMetrics::RestoreThreadsActive, num_restore_threads, /* max_free_threads = */ 0, num_restore_threads)) , log(&Poco::Logger::get("BackupsWorker")) , allow_concurrent_backups(allow_concurrent_backups_) , allow_concurrent_restores(allow_concurrent_restores_) @@ -248,7 +249,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context if (backup_settings.async) { - backups_thread_pool.scheduleOrThrowOnError( + backups_thread_pool->scheduleOrThrowOnError( [this, backup_query, backup_id, backup_name_for_logging, backup_info, backup_settings, backup_coordination, context_in_use, mutable_context] { doBackup( @@ -435,7 +436,7 @@ void BackupsWorker::buildFileInfosForBackupEntries(const BackupPtr & backup, con LOG_TRACE(log, "{}", Stage::BUILDING_FILE_INFOS); backup_coordination->setStage(Stage::BUILDING_FILE_INFOS, ""); backup_coordination->waitForStage(Stage::BUILDING_FILE_INFOS); - backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), backups_thread_pool)); + backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), *backups_thread_pool)); } @@ -522,7 +523,7 @@ void BackupsWorker::writeBackupEntries(BackupMutablePtr backup, BackupEntries && } }; - if (always_single_threaded || !backups_thread_pool.trySchedule([job] { job(true); })) + if (always_single_threaded || !backups_thread_pool->trySchedule([job] { job(true); })) job(false); } @@ -581,7 +582,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt if (restore_settings.async) { - restores_thread_pool.scheduleOrThrowOnError( + restores_thread_pool->scheduleOrThrowOnError( [this, restore_query, restore_id, backup_name_for_logging, backup_info, restore_settings, restore_coordination, context_in_use] { doRestore( @@ -716,7 +717,7 @@ void BackupsWorker::doRestore( } /// Execute the data restoring tasks. - restoreTablesData(restore_id, backup, std::move(data_restore_tasks), restores_thread_pool); + restoreTablesData(restore_id, backup, std::move(data_restore_tasks), *restores_thread_pool); /// We have restored everything, we need to tell other hosts (they could be waiting for it).
restore_coordination->setStage(Stage::COMPLETED, ""); @@ -941,8 +942,8 @@ void BackupsWorker::shutdown() if (has_active_backups_and_restores) LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores); - backups_thread_pool.wait(); - restores_thread_pool.wait(); + backups_thread_pool->wait(); + restores_thread_pool->wait(); if (has_active_backups_and_restores) LOG_INFO(log, "All backup and restore tasks have finished"); diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index d319daf42bd..cbfadc24b7b 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -1,7 +1,8 @@ #pragma once #include -#include +#include +#include #include #include #include @@ -132,8 +133,8 @@ private: void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 total_size, size_t num_entries, UInt64 uncompressed_size, UInt64 compressed_size, size_t num_read_files, UInt64 num_read_bytes); - ThreadPool backups_thread_pool; - ThreadPool restores_thread_pool; + std::unique_ptr<ThreadPool> backups_thread_pool; + std::unique_ptr<ThreadPool> restores_thread_pool; std::unordered_map infos; std::condition_variable status_changed; diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp index 0187ad0e7e2..cc03f0c4a2a 100644 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ b/src/Backups/RestoreCoordinationRemote.cpp @@ -279,12 +279,16 @@ bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic if (existing_restore_uuid == toString(restore_uuid)) continue; - const auto status = zk->get(root_zookeeper_path + "/" + existing_restore_path + "/stage"); - if (status != Stage::COMPLETED) + String status; + if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status)) { - LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid)); - result = true; - return; + /// If status is not COMPLETED it could be because the restore failed, check if 'error' exists + if (status != Stage::COMPLETED && !zk->exists(root_zookeeper_path + "/" + existing_restore_path + "/error")) + { + LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid)); + result = true; + return; + } } } diff --git a/src/Backups/registerBackupEnginesFileAndDisk.cpp b/src/Backups/registerBackupEnginesFileAndDisk.cpp index 51b14fbc1d8..46f44471e6f 100644 --- a/src/Backups/registerBackupEnginesFileAndDisk.cpp +++ b/src/Backups/registerBackupEnginesFileAndDisk.cpp @@ -178,9 +178,9 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory) { std::shared_ptr<IBackupWriter> writer; if (engine_name == "File") - writer = std::make_shared<BackupWriterFile>(path); + writer = std::make_shared<BackupWriterFile>(path, params.context); else - writer = std::make_shared<BackupWriterDisk>(disk, path); + writer = std::make_shared<BackupWriterDisk>(disk, path, params.context); return std::make_unique( backup_name_for_logging, archive_params, diff --git a/src/Bridge/CMakeLists.txt b/src/Bridge/CMakeLists.txt index daf38bd6cbc..5f0e97fc630 100644 --- a/src/Bridge/CMakeLists.txt +++ b/src/Bridge/CMakeLists.txt @@ -2,4 +2,4 @@ add_library (bridge IBridge.cpp ) -target_link_libraries (bridge PRIVATE daemon dbms Poco::Data Poco::Data::ODBC) +target_link_libraries (bridge PRIVATE daemon dbms) diff --git a/src/Bridge/IBridge.cpp b/src/Bridge/IBridge.cpp index 1ea77573e5f..4ba53fd4435 100644 --- a/src/Bridge/IBridge.cpp +++ b/src/Bridge/IBridge.cpp @@ -14,17 +14,13 @@ #include #include #include +#include
#include #include #include "config.h" -#if USE_ODBC -# include -#endif - - namespace DB { diff --git a/src/BridgeHelper/IBridgeHelper.h b/src/BridgeHelper/IBridgeHelper.h index d4762087cc1..272d97c8a78 100644 --- a/src/BridgeHelper/IBridgeHelper.h +++ b/src/BridgeHelper/IBridgeHelper.h @@ -5,7 +5,6 @@ #include #include #include -#include namespace DB diff --git a/src/BridgeHelper/XDBCBridgeHelper.h b/src/BridgeHelper/XDBCBridgeHelper.h index 00a661a1fc4..44104f26f63 100644 --- a/src/BridgeHelper/XDBCBridgeHelper.h +++ b/src/BridgeHelper/XDBCBridgeHelper.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 120d273aa62..a5296a143e1 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -2219,9 +2220,6 @@ void ClientBase::runInteractive() LineReader lr(history_file, config().has("multiline"), query_extenders, query_delimiters); #endif - /// Enable bracketed-paste-mode so that we are able to paste multiline queries as a whole. - lr.enableBracketedPaste(); - static const std::initializer_list> backslash_aliases = { { "\\l", "SHOW DATABASES" }, @@ -2239,7 +2237,18 @@ void ClientBase::runInteractive() do { - auto input = lr.readLine(prompt(), ":-] "); + String input; + { + /// Enable bracketed-paste-mode so that we are able to paste multiline queries as a whole. + /// But keep it disabled outside of query input, because it breaks password input + /// (e.g. if we need to reconnect and show a password prompt). + /// (Alternatively, we could make the password input ignore the control sequences.) + lr.enableBracketedPaste(); + SCOPE_EXIT({ lr.disableBracketedPaste(); }); + + input = lr.readLine(prompt(), ":-] "); + } + if (input.empty()) break; diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 6643a94c3bc..11bba4f1448 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -22,7 +22,8 @@ #include #include #include -#include "Core/Block.h" +#include +#include #include #include #include diff --git a/src/Client/Connection.h b/src/Client/Connection.h index b86567e2ed0..5f79b365199 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -1,6 +1,5 @@ #pragma once -#include #include diff --git a/src/Client/LineReader.h b/src/Client/LineReader.h index 321cf41b77e..df64a3a85a9 100644 --- a/src/Client/LineReader.h +++ b/src/Client/LineReader.h @@ -46,7 +46,10 @@ public: /// clickhouse-client so that without -m flag, one can still paste multiline queries, and /// possibly get better pasting performance. See https://cirw.in/blog/bracketed-paste for /// more details. + /// These methods (if implemented) emit the control characters immediately, without waiting + /// for the next readLine() call. 
virtual void enableBracketedPaste() {} + virtual void disableBracketedPaste() {} protected: enum InputStatus diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp index 712ff5f5a31..4fb80f0ea04 100644 --- a/src/Client/LocalConnection.cpp +++ b/src/Client/LocalConnection.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include diff --git a/src/Client/LocalConnection.h b/src/Client/LocalConnection.h index 3e6fc007fb9..fb8f9003364 100644 --- a/src/Client/LocalConnection.h +++ b/src/Client/LocalConnection.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB diff --git a/src/Client/ReplxxLineReader.cpp b/src/Client/ReplxxLineReader.cpp index 1979b37a94b..180be77ca1c 100644 --- a/src/Client/ReplxxLineReader.cpp +++ b/src/Client/ReplxxLineReader.cpp @@ -519,4 +519,10 @@ void ReplxxLineReader::enableBracketedPaste() rx.enable_bracketed_paste(); } +void ReplxxLineReader::disableBracketedPaste() +{ + bracketed_paste_enabled = false; + rx.disable_bracketed_paste(); +} + } diff --git a/src/Client/ReplxxLineReader.h b/src/Client/ReplxxLineReader.h index d36a1d0f42c..5cb8e48eb86 100644 --- a/src/Client/ReplxxLineReader.h +++ b/src/Client/ReplxxLineReader.h @@ -19,6 +19,7 @@ public: ~ReplxxLineReader() override; void enableBracketedPaste() override; + void disableBracketedPaste() override; /// If highlight is on, we will set a flag to denote whether the last token is a delimiter. /// This is useful to determine the behavior of key when multiline is enabled. diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index 99073d79bcd..32d54b7644e 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Common/CacheBase.h b/src/Common/CacheBase.h index 4ae313d7ecf..b176170cc1f 100644 --- a/src/Common/CacheBase.h +++ b/src/Common/CacheBase.h @@ -12,7 +12,6 @@ #include #include -#include #include diff --git a/src/Common/Concepts.h b/src/Common/Concepts.h index b1bf591024d..927f42aa4be 100644 --- a/src/Common/Concepts.h +++ b/src/Common/Concepts.h @@ -5,6 +5,10 @@ namespace DB { +template +concept is_any_of = (std::same_as || ...); + + template concept OptionalArgument = requires(T &&...) 
{ diff --git a/src/Common/Config/ConfigProcessor.cpp b/src/Common/Config/ConfigProcessor.cpp index b632ea95928..5bbc8eae0de 100644 --- a/src/Common/Config/ConfigProcessor.cpp +++ b/src/Common/Config/ConfigProcessor.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Common/Config/ConfigProcessor.h b/src/Common/Config/ConfigProcessor.h index aa8ac71446f..0ca3e46db88 100644 --- a/src/Common/Config/ConfigProcessor.h +++ b/src/Common/Config/ConfigProcessor.h @@ -16,9 +16,10 @@ #include #include #include -#include +namespace Poco { class Logger; } + namespace zkutil { class ZooKeeperNodeCache; diff --git a/src/Common/Config/YAMLParser.h b/src/Common/Config/YAMLParser.h index b986fc2d895..a00972b813c 100644 --- a/src/Common/Config/YAMLParser.h +++ b/src/Common/Config/YAMLParser.h @@ -2,11 +2,11 @@ #include "config.h" -#include - +#include +#include +#include #include -#include "Poco/DOM/AutoPtr.h" -#include +#include #if USE_YAML_CPP diff --git a/src/Common/Config/configReadClient.cpp b/src/Common/Config/configReadClient.cpp index e5308bc3bc7..44d338c07af 100644 --- a/src/Common/Config/configReadClient.cpp +++ b/src/Common/Config/configReadClient.cpp @@ -4,6 +4,7 @@ #include "ConfigProcessor.h" #include #include +#include namespace fs = std::filesystem; diff --git a/src/Common/CurrentThread.cpp b/src/Common/CurrentThread.cpp index 6ec46d6508c..fd2ad0bbaf1 100644 --- a/src/Common/CurrentThread.cpp +++ b/src/Common/CurrentThread.cpp @@ -90,7 +90,7 @@ void CurrentThread::attachInternalTextLogsQueue(const std::shared_ptr & logs_queue, @@ -69,9 +69,9 @@ public: /// You must call one of these methods when create a query child thread: /// Add current thread to a group associated with the thread group - static void attachToGroup(const ThreadGroupStatusPtr & thread_group); + static void attachToGroup(const ThreadGroupPtr & thread_group); /// Is useful for a ThreadPool tasks - static void attachToGroupIfDetached(const ThreadGroupStatusPtr & thread_group); + static void attachToGroupIfDetached(const ThreadGroupPtr & thread_group); /// Non-master threads call this method in destructor automatically static void detachFromGroupIfNotDetached(); diff --git a/src/Common/DNSResolver.cpp b/src/Common/DNSResolver.cpp index 81e2624d6db..b6a68bdfb45 100644 --- a/src/Common/DNSResolver.cpp +++ b/src/Common/DNSResolver.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Common/DNSResolver.h b/src/Common/DNSResolver.h index a05456d3de8..84715b392a8 100644 --- a/src/Common/DNSResolver.h +++ b/src/Common/DNSResolver.h @@ -5,9 +5,10 @@ #include #include #include -#include +namespace Poco { class Logger; } + namespace DB { diff --git a/src/Common/ErrorHandlers.h b/src/Common/ErrorHandlers.h index f55b6c83a69..301377bff83 100644 --- a/src/Common/ErrorHandlers.h +++ b/src/Common/ErrorHandlers.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 8e50c1114f4..170e0d32b3c 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -4,7 +4,6 @@ #include #include -#include #include #include diff --git a/src/Common/FileChecker.cpp b/src/Common/FileChecker.cpp index aa6b9c90a4c..a6e37654ff1 100644 --- a/src/Common/FileChecker.cpp +++ b/src/Common/FileChecker.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include #include @@ -25,7 +27,9 @@ FileChecker::FileChecker(const String & file_info_path_) : FileChecker(nullptr, { } 
-FileChecker::FileChecker(DiskPtr disk_, const String & file_info_path_) : disk(std::move(disk_)) +FileChecker::FileChecker(DiskPtr disk_, const String & file_info_path_) + : disk(std::move(disk_)) + , log(&Poco::Logger::get("FileChecker")) { setPath(file_info_path_); try diff --git a/src/Common/FileChecker.h b/src/Common/FileChecker.h index 1beab31ec8f..bb0383e4b56 100644 --- a/src/Common/FileChecker.h +++ b/src/Common/FileChecker.h @@ -1,8 +1,10 @@ #pragma once -#include #include +#include +#include +namespace Poco { class Logger; } namespace DB { @@ -46,7 +48,7 @@ private: size_t getRealFileSize(const String & path_) const; const DiskPtr disk; - const Poco::Logger * log = &Poco::Logger::get("FileChecker"); + const Poco::Logger * log; String files_info_path; std::map map; diff --git a/src/Common/LRUCachePolicy.h b/src/Common/LRUCachePolicy.h index 4aee2135af7..49c2fc4541b 100644 --- a/src/Common/LRUCachePolicy.h +++ b/src/Common/LRUCachePolicy.h @@ -5,8 +5,6 @@ #include #include -#include - namespace DB { /// Cache policy LRU evicts entries which are not used for a long time. @@ -174,7 +172,7 @@ private: auto it = cells.find(key); if (it == cells.end()) { - LOG_ERROR(&Poco::Logger::get("LRUCache"), "LRUCache became inconsistent. There must be a bug in it."); + // Queue became inconsistent abort(); } @@ -192,7 +190,7 @@ private: if (current_size_in_bytes > (1ull << 63)) { - LOG_ERROR(&Poco::Logger::get("LRUCache"), "LRUCache became inconsistent. There must be a bug in it."); + // Queue became inconsistent abort(); } } diff --git a/src/Common/Macros.cpp b/src/Common/Macros.cpp index e5d4be446c1..f43fed6c499 100644 --- a/src/Common/Macros.cpp +++ b/src/Common/Macros.cpp @@ -1,8 +1,9 @@ #include #include #include -#include #include +#include +#include namespace DB @@ -11,6 +12,8 @@ namespace DB namespace ErrorCodes { extern const int SYNTAX_ERROR; + extern const int BAD_ARGUMENTS; + extern const int NO_ELEMENTS_IN_CONFIG; } Macros::Macros(const Poco::Util::AbstractConfiguration & config, const String & root_key, Poco::Logger * log) @@ -95,7 +98,7 @@ String Macros::expand(const String & s, else if (macro_name == "uuid" && !info.expand_special_macros_only) { if (info.table_id.uuid == UUIDHelpers::Nil) - throw Exception(ErrorCodes::SYNTAX_ERROR, "Macro 'uuid' and empty arguments of ReplicatedMergeTree " + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Macro 'uuid' and empty arguments of ReplicatedMergeTree " "are supported only for ON CLUSTER queries with Atomic database engine"); /// For ON CLUSTER queries we don't want to require all macros definitions in initiator's config. /// However, initiator must check that for cross-replication cluster zookeeper_path does not contain {uuid} macro. 
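The FileChecker change above follows a header-hygiene pattern this patch applies in several places (ConfigProcessor.h, DNSResolver.h, ZooKeeperLock.h): forward-declare Poco::Logger in the header and obtain the logger in the .cpp, so that including the class header no longer pulls in <Poco/Logger.h>. A minimal sketch with a hypothetical class:

```cpp
// some_checker.h -- a forward declaration is enough for a pointer member.
namespace Poco { class Logger; }

class SomeChecker /// hypothetical
{
public:
    SomeChecker();
private:
    const Poco::Logger * log; /// Initialized in the .cpp, where Logger is complete.
};

// some_checker.cpp -- only here is the full definition required.
#include <Poco/Logger.h>

SomeChecker::SomeChecker() : log(&Poco::Logger::get("SomeChecker")) {}
```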
@@ -105,6 +108,15 @@ String Macros::expand(const String & s, res += toString(info.table_id.uuid); info.expanded_uuid = true; } + else if (macro_name == "server_uuid") + { + auto uuid = ServerUUID::get(); + if (UUIDHelpers::Nil == uuid) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Macro {server_uuid} expanded to zero, which means the UUID is not initialized (most likely it's not a server application)"); + res += toString(uuid); + info.expanded_other = true; + } else if (info.shard && macro_name == "shard") { res += *info.shard; @@ -125,7 +137,7 @@ String Macros::expand(const String & s, info.has_unknown = true; } else - throw Exception(ErrorCodes::SYNTAX_ERROR, "No macro '{}' in config while processing substitutions in " + throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "No macro '{}' in config while processing substitutions in " "'{}' at '{}' or macro is not supported here", macro_name, s, toString(begin)); pos = end + 1; @@ -142,7 +154,7 @@ String Macros::getValue(const String & key) const { if (auto it = macros.find(key); it != macros.end()) return it->second; - throw Exception(ErrorCodes::SYNTAX_ERROR, "No macro {} in config", key); + throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "No macro {} in config", key); } diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index e2129e1013e..674d8d469af 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -118,7 +118,6 @@ MemoryTracker::~MemoryTracker() } } - void MemoryTracker::logPeakMemoryUsage() { log_peak_memory_usage_in_destructor = false; @@ -156,6 +155,26 @@ void MemoryTracker::injectFault() const description ? description : ""); } +void MemoryTracker::debugLogBigAllocationWithoutCheck(Int64 size [[maybe_unused]]) +{ + /// Big allocations through allocNoThrow (without checking memory limits) may easily lead to OOM (and it's hard to debug). + /// Let's find them. +#ifdef ABORT_ON_LOGICAL_ERROR + if (size < 0) + return; + + constexpr Int64 threshold = 16 * 1024 * 1024; /// The choice is arbitrary (maybe we should decrease it) + if (size < threshold) + return; + + MemoryTrackerBlockerInThread blocker(VariableContext::Global); + LOG_TEST(&Poco::Logger::get("MemoryTracker"), "Too big allocation ({} bytes) without checking memory limits, " + "it may lead to OOM. 
Stack trace: {}", size, StackTrace().toString()); +#else + return; /// Avoid trash logging in release builds +#endif +} + void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker) { if (size < 0) @@ -235,7 +254,10 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT formatReadableSizeWithBinarySuffix(current_hard_limit)); } else + { memory_limit_exceeded_ignored = true; + debugLogBigAllocationWithoutCheck(size); + } } Int64 limit_to_check = current_hard_limit; @@ -303,7 +325,10 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT } } else + { memory_limit_exceeded_ignored = true; + debugLogBigAllocationWithoutCheck(size); + } } bool peak_updated = false; @@ -323,6 +348,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT { bool log_memory_usage = false; peak_updated = updatePeak(will_be, log_memory_usage); + debugLogBigAllocationWithoutCheck(size); } } diff --git a/src/Common/MemoryTracker.h b/src/Common/MemoryTracker.h index 66b56730b75..0d7748856bd 100644 --- a/src/Common/MemoryTracker.h +++ b/src/Common/MemoryTracker.h @@ -215,6 +215,8 @@ public: /// Prints info about peak memory consumption into log. void logPeakMemoryUsage(); + + void debugLogBigAllocationWithoutCheck(Int64 size [[maybe_unused]]); }; extern MemoryTracker total_memory_tracker; diff --git a/src/Common/OvercommitTracker.h b/src/Common/OvercommitTracker.h index 598b877ef3c..f40a70fe7cd 100644 --- a/src/Common/OvercommitTracker.h +++ b/src/Common/OvercommitTracker.h @@ -1,7 +1,7 @@ #pragma once -#include #include +#include #include #include #include diff --git a/src/Common/PoolBase.h b/src/Common/PoolBase.h index 96a18ee6591..a9c595c440c 100644 --- a/src/Common/PoolBase.h +++ b/src/Common/PoolBase.h @@ -144,12 +144,17 @@ public: return Entry(*items.back()); } - LOG_INFO(log, "No free connections in pool. Waiting."); - if (timeout < 0) + { + LOG_INFO(log, "No free connections in pool. Waiting undefinitelly."); available.wait(lock); + } else - available.wait_for(lock, std::chrono::microseconds(timeout)); + { + auto timeout_ms = std::chrono::microseconds(timeout); + LOG_INFO(log, "No free connections in pool. 
Waiting {} ms.", timeout_ms.count()); + available.wait_for(lock, timeout_ms); + } } } diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 1d035952f13..a17d73e1673 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -75,10 +75,14 @@ M(S3GetRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform S3 GET and SELECT request throttling.") \ M(S3PutRequestThrottlerCount, "Number of S3 PUT, COPY, POST and LIST requests passed through throttler.") \ M(S3PutRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform S3 PUT, COPY, POST and LIST request throttling.") \ - M(RemoteReadThrottlerBytes, "Bytes passed through 'max_remote_read_network_bandwidth_for_server' throttler.") \ - M(RemoteReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_read_network_bandwidth_for_server' throttling.") \ - M(RemoteWriteThrottlerBytes, "Bytes passed through 'max_remote_write_network_bandwidth_for_server' throttler.") \ - M(RemoteWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_write_network_bandwidth_for_server' throttling.") \ + M(RemoteReadThrottlerBytes, "Bytes passed through 'max_remote_read_network_bandwidth_for_server'/'max_remote_read_network_bandwidth' throttler.") \ + M(RemoteReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_read_network_bandwidth_for_server'/'max_remote_read_network_bandwidth' throttling.") \ + M(RemoteWriteThrottlerBytes, "Bytes passed through 'max_remote_write_network_bandwidth_for_server'/'max_remote_write_network_bandwidth' throttler.") \ + M(RemoteWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_write_network_bandwidth_for_server'/'max_remote_write_network_bandwidth' throttling.") \ + M(LocalReadThrottlerBytes, "Bytes passed through 'max_local_read_bandwidth_for_server'/'max_local_read_bandwidth' throttler.") \ + M(LocalReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_read_bandwidth_for_server'/'max_local_read_bandwidth' throttling.") \ + M(LocalWriteThrottlerBytes, "Bytes passed through 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttler.") \ + M(LocalWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttling.") \ M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform all throttling settings.") \ \ M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.") \ diff --git a/src/Common/SLRUCachePolicy.h b/src/Common/SLRUCachePolicy.h index e36bca83c61..810c8c335a0 100644 --- a/src/Common/SLRUCachePolicy.h +++ b/src/Common/SLRUCachePolicy.h @@ -5,8 +5,6 @@ #include #include -#include - namespace DB { @@ -236,7 +234,7 @@ private: auto it = cells.find(key); if (it == cells.end()) { - LOG_ERROR(&Poco::Logger::get("SLRUCache"), "SLRUCache became inconsistent. There must be a bug in it."); + // Queue became inconsistent abort(); } @@ -264,7 +262,7 @@ private: if (current_size_in_bytes > (1ull << 63)) { - LOG_ERROR(&Poco::Logger::get("SLRUCache"), "SLRUCache became inconsistent. 
There must be a bug in it."); + // Queue became inconsistent abort(); } } diff --git a/src/Common/SpaceSaving.h b/src/Common/SpaceSaving.h index 476e107067b..f5f66e41307 100644 --- a/src/Common/SpaceSaving.h +++ b/src/Common/SpaceSaving.h @@ -51,6 +51,9 @@ struct SpaceSavingArena { StringRef emplace(StringRef key) { + if (!key.data) + return key; + return copyStringInArena(arena, key); } @@ -94,8 +97,8 @@ public: void write(WriteBuffer & wb) const { writeBinary(key, wb); - writeVarUInt(count, wb); - writeVarUInt(error, wb); + writeVarUIntOverflow(count, wb); + writeVarUIntOverflow(error, wb); } void read(ReadBuffer & rb) diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index 13150194df2..86adcbbd31b 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -35,20 +36,18 @@ namespace constexpr size_t DBMS_SYSTEM_LOG_QUEUE_SIZE = 1048576; } +ISystemLog::~ISystemLog() = default; + void ISystemLog::stopFlushThread() { { std::lock_guard lock(mutex); - if (!saving_thread.joinable()) - { + if (!saving_thread || !saving_thread->joinable()) return; - } if (is_shutdown) - { return; - } is_shutdown = true; @@ -56,13 +55,13 @@ void ISystemLog::stopFlushThread() flush_event.notify_all(); } - saving_thread.join(); + saving_thread->join(); } void ISystemLog::startup() { std::lock_guard lock(mutex); - saving_thread = ThreadFromGlobalPool([this] { savingThreadFunction(); }); + saving_thread = std::make_unique([this] { savingThreadFunction(); }); } static thread_local bool recursive_add_call = false; diff --git a/src/Common/SystemLogBase.h b/src/Common/SystemLogBase.h index c2cedb2ae39..8ac731c34f7 100644 --- a/src/Common/SystemLogBase.h +++ b/src/Common/SystemLogBase.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #define SYSTEM_LOG_ELEMENTS(M) \ M(AsynchronousMetricLogElement) \ @@ -60,12 +60,12 @@ public: /// Stop the background flush thread before destructor. No more data will be written. virtual void shutdown() = 0; - virtual ~ISystemLog() = default; + virtual ~ISystemLog(); virtual void savingThreadFunction() = 0; protected: - ThreadFromGlobalPool saving_thread; + std::unique_ptr saving_thread; /// Data shared between callers of add()/flush()/shutdown(), and the saving thread std::mutex mutex; diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index b2f77f9693c..68023c8a410 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -17,6 +17,7 @@ #include #include #include +#include #include /** Very simple thread pool similar to boost::threadpool. 
diff --git a/src/Common/ThreadPool_fwd.h b/src/Common/ThreadPool_fwd.h new file mode 100644 index 00000000000..2782acc9c51 --- /dev/null +++ b/src/Common/ThreadPool_fwd.h @@ -0,0 +1,13 @@ +#pragma once + +template <typename Thread> +class ThreadPoolImpl; + +template <bool propagate_opentelemetry_context> +class ThreadFromGlobalPoolImpl; + +using ThreadFromGlobalPoolNoTracingContextPropagation = ThreadFromGlobalPoolImpl<false>; + +using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl<true>; + +using ThreadPool = ThreadPoolImpl<ThreadFromGlobalPoolNoTracingContextPropagation>; diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp index 76a4d8b1adf..a94fd81559a 100644 --- a/src/Common/ThreadProfileEvents.cpp +++ b/src/Common/ThreadProfileEvents.cpp @@ -23,6 +23,7 @@ #include #include +#include namespace ProfileEvents diff --git a/src/Common/ThreadProfileEvents.h b/src/Common/ThreadProfileEvents.h index 5a03a9e8555..c4703a69998 100644 --- a/src/Common/ThreadProfileEvents.h +++ b/src/Common/ThreadProfileEvents.h @@ -2,11 +2,13 @@ #include #include +#include +#include #include #include #include #include -#include +#include #if defined(OS_LINUX) diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 1b783aa9ec4..9b0743d89c3 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include @@ -61,7 +63,7 @@ static thread_local ThreadStack alt_stack; static thread_local bool has_alt_stack = false; #endif -ThreadGroupStatus::ThreadGroupStatus() +ThreadGroup::ThreadGroup() : master_thread_id(CurrentThread::get().thread_id) {} @@ -119,7 +121,7 @@ ThreadStatus::ThreadStatus() #endif } -ThreadGroupStatusPtr ThreadStatus::getThreadGroup() const +ThreadGroupPtr ThreadStatus::getThreadGroup() const { return thread_group; } @@ -139,7 +141,7 @@ ContextPtr ThreadStatus::getGlobalContext() const return global_context.lock(); } -void ThreadGroupStatus::attachInternalTextLogsQueue(const InternalTextLogsQueuePtr & logs_queue, LogsLevel logs_level) +void ThreadGroup::attachInternalTextLogsQueue(const InternalTextLogsQueuePtr & logs_queue, LogsLevel logs_level) { std::lock_guard lock(mutex); shared_data.logs_queue_ptr = logs_queue; diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 79474f292ec..600dfc56d2b 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -41,7 +41,6 @@ class TaskStatsInfoGetter; class InternalTextLogsQueue; struct ViewRuntimeData; class QueryViewsLog; -class MemoryTrackerThreadSwitcher; using InternalTextLogsQueuePtr = std::shared_ptr<InternalTextLogsQueue>; using InternalTextLogsQueueWeakPtr = std::weak_ptr<InternalTextLogsQueue>; @@ -58,15 +57,15 @@ using ThreadStatusPtr = ThreadStatus *; * Create via CurrentThread::initializeQuery (for queries) or directly (for various background tasks). * Use via CurrentThread::getGroup.
*/ -class ThreadGroupStatus; -using ThreadGroupStatusPtr = std::shared_ptr; +class ThreadGroup; +using ThreadGroupPtr = std::shared_ptr; -class ThreadGroupStatus +class ThreadGroup { public: - ThreadGroupStatus(); + ThreadGroup(); using FatalErrorCallback = std::function; - ThreadGroupStatus(ContextPtr query_context_, FatalErrorCallback fatal_error_callback_ = {}); + ThreadGroup(ContextPtr query_context_, FatalErrorCallback fatal_error_callback_ = {}); /// The first thread created this thread group const UInt64 master_thread_id; @@ -104,7 +103,9 @@ public: void attachInternalProfileEventsQueue(const InternalProfileEventsQueuePtr & profile_queue); /// When new query starts, new thread group is created for it, current thread becomes master thread of the query - static ThreadGroupStatusPtr createForQuery(ContextPtr query_context_, FatalErrorCallback fatal_error_callback_ = {}); + static ThreadGroupPtr createForQuery(ContextPtr query_context_, FatalErrorCallback fatal_error_callback_ = {}); + + static ThreadGroupPtr createForBackgroundProcess(ContextPtr storage_context); std::vector getInvolvedThreadIds() const; void linkThread(UInt64 thread_it); @@ -118,6 +119,21 @@ private: std::unordered_set thread_ids; }; +/** + * Since merge is executed with multiple threads, this class + * switches the parent MemoryTracker as part of the thread group to account all the memory used. + */ +class ThreadGroupSwitcher : private boost::noncopyable +{ +public: + explicit ThreadGroupSwitcher(ThreadGroupPtr thread_group); + ~ThreadGroupSwitcher(); + +private: + ThreadGroupPtr prev_thread_group; +}; + + /** * We use **constinit** here to tell the compiler the current_thread variable is initialized. * If we didn't help the compiler, then it would most likely add a check before every use of the variable to initialize it if needed. @@ -161,7 +177,7 @@ public: private: /// Group of threads, to which this thread attached - ThreadGroupStatusPtr thread_group; + ThreadGroupPtr thread_group; /// Is set once ContextWeakPtr global_context; @@ -172,17 +188,11 @@ private: using FatalErrorCallback = std::function; FatalErrorCallback fatal_error_callback; - ThreadGroupStatus::SharedData local_data; + ThreadGroup::SharedData local_data; bool performance_counters_finalized = false; String query_id_from_query_context; - /// Requires access to query_id. 
- friend class MemoryTrackerThreadSwitcher; - void setQueryId(const String & query_id_) - { - query_id_from_query_context = query_id_; - } struct TimePoint { @@ -219,7 +229,7 @@ public: ThreadStatus(); ~ThreadStatus(); - ThreadGroupStatusPtr getThreadGroup() const; + ThreadGroupPtr getThreadGroup() const; const String & getQueryId() const; @@ -243,7 +253,7 @@ public: void setInternalThread(); /// Attaches slave thread to existing thread group - void attachToGroup(const ThreadGroupStatusPtr & thread_group_, bool check_detached = true); + void attachToGroup(const ThreadGroupPtr & thread_group_, bool check_detached = true); /// Detaches thread from the thread group and the query, dumps performance counters if they have not been dumped void detachFromGroup(); @@ -291,7 +301,7 @@ private: void logToQueryThreadLog(QueryThreadLog & thread_log, const String & current_database); - void attachToGroupImpl(const ThreadGroupStatusPtr & thread_group_); + void attachToGroupImpl(const ThreadGroupPtr & thread_group_); }; /** diff --git a/src/Common/ZooKeeper/Common.h b/src/Common/ZooKeeper/Common.h index a2956706c8f..1a1328588e3 100644 --- a/src/Common/ZooKeeper/Common.h +++ b/src/Common/ZooKeeper/Common.h @@ -3,12 +3,10 @@ #include #include -#include namespace zkutil { using GetZooKeeper = std::function; -using GetZooKeeperWithFaultInjection = std::function; } diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index a94e367cd70..172714fe04f 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -273,7 +273,7 @@ struct SetRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } - size_t bytesSize() const override { return data.size() + data.size() + sizeof(version); } + size_t bytesSize() const override { return path.size() + data.size() + sizeof(version); } }; struct SetResponse : virtual Response diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 78f9e3da83b..a8da0dff0cc 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -15,6 +15,7 @@ #include "Common/ZooKeeper/IKeeper.h" #include #include +#include #include #include diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index fc4fe95d9b6..8e7639b8cc1 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 8183569a718..79a975e683f 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -669,8 +669,8 @@ void ZooKeeper::receiveThread() earliest_operation = operations.begin()->second; auto earliest_operation_deadline = earliest_operation->time + std::chrono::microseconds(args.operation_timeout_ms * 1000); if (now > earliest_operation_deadline) - throw Exception(Error::ZOPERATIONTIMEOUT, "Operation timeout (deadline already expired) for path: {}", - earliest_operation->request->getPath()); + throw Exception(Error::ZOPERATIONTIMEOUT, "Operation timeout (deadline of {} ms already expired) for path: {}", + args.operation_timeout_ms, earliest_operation->request->getPath()); max_wait_us = std::chrono::duration_cast(earliest_operation_deadline - now).count(); } } @@ -687,12 +687,12 @@ void ZooKeeper::receiveThread() { if (earliest_operation) { - throw Exception(Error::ZOPERATIONTIMEOUT, 
"Operation timeout (no response) for request {} for path: {}", - toString(earliest_operation->request->getOpNum()), earliest_operation->request->getPath()); + throw Exception(Error::ZOPERATIONTIMEOUT, "Operation timeout (no response in {} ms) for request {} for path: {}", + args.operation_timeout_ms, toString(earliest_operation->request->getOpNum()), earliest_operation->request->getPath()); } waited_us += max_wait_us; if (waited_us >= args.session_timeout_ms * 1000) - throw Exception(Error::ZOPERATIONTIMEOUT, "Nothing is received in session timeout"); + throw Exception(Error::ZOPERATIONTIMEOUT, "Nothing is received in session timeout of {} ms", args.session_timeout_ms); } @@ -1080,7 +1080,7 @@ void ZooKeeper::pushRequest(RequestInfo && info) if (requests_queue.isFinished()) throw Exception(Error::ZSESSIONEXPIRED, "Session expired"); - throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push request to queue within operation timeout"); + throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push request to queue within operation timeout of {} ms", args.operation_timeout_ms); } } catch (...) @@ -1332,7 +1332,7 @@ void ZooKeeper::close() request_info.request = std::make_shared(std::move(request)); if (!requests_queue.tryPush(std::move(request_info), args.operation_timeout_ms)) - throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push close request to queue within operation timeout"); + throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push close request to queue within operation timeout of {} ms", args.operation_timeout_ms); ProfileEvents::increment(ProfileEvents::ZooKeeperClose); } diff --git a/src/Common/ZooKeeper/ZooKeeperLock.cpp b/src/Common/ZooKeeper/ZooKeeperLock.cpp index a52c942a35f..6ee1c380efb 100644 --- a/src/Common/ZooKeeper/ZooKeeperLock.cpp +++ b/src/Common/ZooKeeper/ZooKeeperLock.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include namespace DB diff --git a/src/Common/ZooKeeper/ZooKeeperLock.h b/src/Common/ZooKeeper/ZooKeeperLock.h index 755ca1333b8..146527c6c94 100644 --- a/src/Common/ZooKeeper/ZooKeeperLock.h +++ b/src/Common/ZooKeeper/ZooKeeperLock.h @@ -3,7 +3,8 @@ #include #include #include -#include + +namespace Poco { class Logger; } namespace zkutil { diff --git a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h index a39a083cf33..130590ceb40 100644 --- a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h +++ b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace DB diff --git a/src/Common/mysqlxx/PoolFactory.cpp b/src/Common/mysqlxx/PoolFactory.cpp index 5fae934a400..9479273e36e 100644 --- a/src/Common/mysqlxx/PoolFactory.cpp +++ b/src/Common/mysqlxx/PoolFactory.cpp @@ -23,12 +23,6 @@ PoolWithFailover PoolFactory::get(const std::string & config_name, unsigned defa return get(Poco::Util::Application::instance().config(), config_name, default_connections, max_connections, max_tries); } -/// Duplicate of code from StringUtils.h. Copied here for less dependencies. 
-static bool startsWith(const std::string & s, const char * prefix) -{ - return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix)); -} - static std::string getPoolEntryName(const Poco::Util::AbstractConfiguration & config, const std::string & config_name) { @@ -55,7 +49,7 @@ static std::string getPoolEntryName(const Poco::Util::AbstractConfiguration & co for (const auto & replica_config_key : replica_keys) { /// There could be another elements in the same level in configuration file, like "user", "port"... - if (startsWith(replica_config_key, "replica")) + if (replica_config_key.starts_with("replica")) { std::string replica_name = config_name + "." + replica_config_key; std::string tmp_host = config.getString(replica_name + ".host", host); diff --git a/src/Common/scope_guard_safe.h b/src/Common/scope_guard_safe.h index f098fd95f00..2befb58870a 100644 --- a/src/Common/scope_guard_safe.h +++ b/src/Common/scope_guard_safe.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include /// Same as SCOPE_EXIT() but block the MEMORY_LIMIT_EXCEEDED errors. diff --git a/src/Common/typeid_cast.h b/src/Common/typeid_cast.h index 1568d380938..baee3aaf632 100644 --- a/src/Common/typeid_cast.h +++ b/src/Common/typeid_cast.h @@ -18,9 +18,6 @@ namespace DB } } -template -concept is_any_of = (std::same_as || ...); - /** Checks type by comparing typeid. * The exact match of the type is checked. That is, cast to the ancestor will be unsuccessful. diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 288f71bb915..56b0475ba8b 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB { diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 1828182751d..2cde7c2465e 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index 77b5510cbb3..4b8b134cf8f 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Coordination/KeeperLogStore.cpp b/src/Coordination/KeeperLogStore.cpp index d1bd2f9db18..10619a44517 100644 --- a/src/Coordination/KeeperLogStore.cpp +++ b/src/Coordination/KeeperLogStore.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB { diff --git a/src/Coordination/KeeperLogStore.h b/src/Coordination/KeeperLogStore.h index 108241e024e..e48e3a32463 100644 --- a/src/Coordination/KeeperLogStore.h +++ b/src/Coordination/KeeperLogStore.h @@ -4,7 +4,6 @@ #include #include #include -#include #include namespace DB diff --git a/src/Coordination/KeeperSnapshotManagerS3.h b/src/Coordination/KeeperSnapshotManagerS3.h index 197f528b192..eff7868bba9 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.h +++ b/src/Coordination/KeeperSnapshotManagerS3.h @@ -9,7 +9,6 @@ #if USE_AWS_S3 #include #include -#include #include #endif diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 632aaec6b54..7c55739a96f 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -8,9 +8,10 @@ #include #include #include -#include "Common/ZooKeeper/ZooKeeperCommon.h" +#include #include #include +#include #include "Coordination/KeeperStorage.h" 
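The typeid_cast.h hunk above removes the local definition of is_any_of in favour of the copy that the Concepts.h hunk earlier adds to Common/Concepts.h. For reference, a usage sketch of the concept; the constrained function and the static_asserts are illustrative only:

```cpp
#include <concepts>

template <typename T, typename... U>
concept is_any_of = (std::same_as<T, U> || ...); /// as in Common/Concepts.h

/// Hypothetical use: the template accepts only the listed character types.
template <is_any_of<char, signed char, unsigned char> C>
size_t countBytes(const C * /*data*/, size_t n) { return n * sizeof(C); }

static_assert(is_any_of<int, short, int, long>);
static_assert(!is_any_of<float, short, int, long>);
```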
diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h index e4f0295db99..5af5bc05b0f 100644 --- a/src/Coordination/KeeperStateMachine.h +++ b/src/Coordination/KeeperStateMachine.h @@ -8,7 +8,6 @@ #include #include -#include namespace DB diff --git a/src/Coordination/KeeperStateManager.cpp b/src/Coordination/KeeperStateManager.cpp index cfb3519e597..70687ba471c 100644 --- a/src/Coordination/KeeperStateManager.cpp +++ b/src/Coordination/KeeperStateManager.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 41a6af54204..cfc1c2bd12b 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -61,16 +62,10 @@ String getSHA1(const String & userdata) return String{digest_id.begin(), digest_id.end()}; } -String generateDigest(const String & userdata) -{ - std::vector user_password; - boost::split(user_password, userdata, [](char character) { return character == ':'; }); - return user_password[0] + ":" + base64Encode(getSHA1(userdata)); -} - bool fixupACL( const std::vector & request_acls, - const std::vector & current_ids, + int64_t session_id, + const KeeperStorage::UncommittedState & uncommitted_state, std::vector & result_acls) { if (request_acls.empty()) @@ -81,14 +76,18 @@ bool fixupACL( { if (request_acl.scheme == "auth") { - for (const auto & current_id : current_ids) - { - valid_found = true; - Coordination::ACL new_acl = request_acl; - new_acl.scheme = current_id.scheme; - new_acl.id = current_id.id; - result_acls.push_back(new_acl); - } + uncommitted_state.forEachAuthInSession( + session_id, + [&](const KeeperStorage::AuthID & auth_id) + { + valid_found = true; + Coordination::ACL new_acl = request_acl; + + new_acl.scheme = auth_id.scheme; + new_acl.id = auth_id.id; + + result_acls.push_back(new_acl); + }); } else if (request_acl.scheme == "world" && request_acl.id == "anyone") { @@ -564,6 +563,32 @@ Coordination::ACLs KeeperStorage::UncommittedState::getACLs(StringRef path) cons return storage.acl_map.convertNumber(node_it->value.acl_id); } +void KeeperStorage::UncommittedState::forEachAuthInSession(int64_t session_id, std::function func) const +{ + const auto call_for_each_auth = [&func](const auto & auth_ids) + { + for (const auto & auth : auth_ids) + { + using TAuth = std::remove_reference_t; + + const AuthID * auth_ptr = nullptr; + if constexpr (std::is_pointer_v) + auth_ptr = auth; + else + auth_ptr = &auth; + + func(*auth_ptr); + } + }; + + // for committed + if (storage.session_and_auth.contains(session_id)) + call_for_each_auth(storage.session_and_auth.at(session_id)); + // for uncommitted + if (session_and_auth.contains(session_id)) + call_for_each_auth(session_and_auth.at(session_id)); +} + namespace { @@ -927,7 +952,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; Coordination::ACLs node_acls; - if (!fixupACL(request.acls, storage.session_and_auth[session_id], node_acls)) + if (!fixupACL(request.acls, session_id, storage.uncommitted_state, node_acls)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZINVALIDACL}}; if (request.is_ephemeral) @@ -1533,10 +1558,8 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr return {KeeperStorage::Delta{zxid, 
Coordination::Error::ZBADVERSION}}; - auto & session_auth_ids = storage.session_and_auth[session_id]; Coordination::ACLs node_acls; - - if (!fixupACL(request.acls, session_auth_ids, node_acls)) + if (!fixupACL(request.acls, session_id, uncommitted_state, node_acls)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZINVALIDACL}}; std::vector new_deltas @@ -1840,7 +1863,7 @@ struct KeeperStorageAuthRequestProcessor final : public KeeperStorageRequestProc return {KeeperStorage::Delta{zxid, Coordination::Error::ZAUTHFAILED}}; std::vector new_deltas; - auto auth_digest = generateDigest(auth_request.data); + auto auth_digest = KeeperStorage::generateDigest(auth_request.data); if (auth_digest == storage.superdigest) { KeeperStorage::AuthID auth{"super", ""}; @@ -2420,5 +2443,12 @@ void KeeperStorage::recalculateStats() container.recalculateDataSize(); } +String KeeperStorage::generateDigest(const String & userdata) +{ + std::vector user_password; + boost::split(user_password, userdata, [](char character) { return character == ':'; }); + return user_password[0] + ":" + base64Encode(getSHA1(userdata)); +} + } diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index be528072df4..cfacdfc84de 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -105,6 +105,8 @@ public: return first.value == second.value; } + static String generateDigest(const String & userdata); + struct RequestForSession { int64_t session_id; @@ -263,6 +265,8 @@ public: return check_auth(auth_it->second); } + void forEachAuthInSession(int64_t session_id, std::function func) const; + std::shared_ptr tryGetNodeFromStorage(StringRef path) const; std::unordered_map> session_and_auth; diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index 5fa67a60b4b..2d4f64e033f 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/src/Coordination/ZooKeeperDataReader.h b/src/Coordination/ZooKeeperDataReader.h index 6da6fd498af..8fd86ba99e2 100644 --- a/src/Coordination/ZooKeeperDataReader.h +++ b/src/Coordination/ZooKeeperDataReader.h @@ -1,7 +1,6 @@ #pragma once #include #include -#include namespace DB { diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 895d563327e..b1bea8ddf24 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1579,6 +1579,113 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove) } +TEST_P(CoordinationTest, TestCreateNodeWithAuthSchemeForAclWhenAuthIsPrecommitted) +{ + using namespace Coordination; + using namespace DB; + + ChangelogDirTest snapshots("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); + ResponsesQueue queue(std::numeric_limits::max()); + SnapshotsQueue snapshots_queue{1}; + + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context, nullptr); + state_machine->init(); + + String user_auth_data = "test_user:test_password"; + String digest = KeeperStorage::generateDigest(user_auth_data); + + std::shared_ptr auth_req = std::make_shared(); + auth_req->scheme = "digest"; + auth_req->data = user_auth_data; + + // Add auth data to the session + auto auth_entry = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), auth_req); + state_machine->pre_commit(1, auth_entry->get_buf()); + + // Create 
a node with 'auth' scheme for ACL + String node_path = "/hello"; + std::shared_ptr create_req = std::make_shared(); + create_req->path = node_path; + // When 'auth' scheme is used the creator must have been authenticated by the server (for example, using 'digest' scheme) before it can + // create nodes with this ACL. + create_req->acls = {{.permissions = 31, .scheme = "auth", .id = ""}}; + auto create_entry = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), create_req); + state_machine->pre_commit(2, create_entry->get_buf()); + + const auto & uncommitted_state = state_machine->getStorage().uncommitted_state; + ASSERT_TRUE(uncommitted_state.nodes.contains(node_path)); + + // commit log entries + state_machine->commit(1, auth_entry->get_buf()); + state_machine->commit(2, create_entry->get_buf()); + + auto node = uncommitted_state.getNode(node_path); + ASSERT_NE(node, nullptr); + auto acls = uncommitted_state.getACLs(node_path); + ASSERT_EQ(acls.size(), 1); + EXPECT_EQ(acls[0].scheme, "digest"); + EXPECT_EQ(acls[0].id, digest); + EXPECT_EQ(acls[0].permissions, 31); +} + +TEST_P(CoordinationTest, TestSetACLWithAuthSchemeForAclWhenAuthIsPrecommitted) +{ + using namespace Coordination; + using namespace DB; + + ChangelogDirTest snapshots("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); + ResponsesQueue queue(std::numeric_limits::max()); + SnapshotsQueue snapshots_queue{1}; + + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context, nullptr); + state_machine->init(); + + String user_auth_data = "test_user:test_password"; + String digest = KeeperStorage::generateDigest(user_auth_data); + + std::shared_ptr auth_req = std::make_shared(); + auth_req->scheme = "digest"; + auth_req->data = user_auth_data; + + // Add auth data to the session + auto auth_entry = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), auth_req); + state_machine->pre_commit(1, auth_entry->get_buf()); + + // Create a node + String node_path = "/hello"; + std::shared_ptr create_req = std::make_shared(); + create_req->path = node_path; + auto create_entry = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), create_req); + state_machine->pre_commit(2, create_entry->get_buf()); + + // Set ACL with 'auth' scheme for ACL + std::shared_ptr set_acl_req = std::make_shared(); + set_acl_req->path = node_path; + // When 'auth' scheme is used the creator must have been authenticated by the server (for example, using 'digest' scheme) before it can + // set this ACL. 
+ set_acl_req->acls = {{.permissions = 31, .scheme = "auth", .id = ""}}; + auto set_acl_entry = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), set_acl_req); + state_machine->pre_commit(3, set_acl_entry->get_buf()); + + // commit all entries + state_machine->commit(1, auth_entry->get_buf()); + state_machine->commit(2, create_entry->get_buf()); + state_machine->commit(3, set_acl_entry->get_buf()); + + const auto & uncommitted_state = state_machine->getStorage().uncommitted_state; + auto node = uncommitted_state.getNode(node_path); + + ASSERT_NE(node, nullptr); + auto acls = uncommitted_state.getACLs(node_path); + ASSERT_EQ(acls.size(), 1); + EXPECT_EQ(acls[0].scheme, "digest"); + EXPECT_EQ(acls[0].id, digest); + EXPECT_EQ(acls[0].permissions, 31); +} + + TEST_P(CoordinationTest, TestRotateIntervalChanges) { using namespace Coordination; diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp index 5384ee7f961..39724ec07fa 100644 --- a/src/Core/BackgroundSchedulePool.cpp +++ b/src/Core/BackgroundSchedulePool.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -160,7 +161,7 @@ BackgroundSchedulePool::BackgroundSchedulePool(size_t size_, CurrentMetrics::Met for (auto & thread : threads) thread = ThreadFromGlobalPoolNoTracingContextPropagation([this] { threadFunction(); }); - delayed_thread = ThreadFromGlobalPoolNoTracingContextPropagation([this] { delayExecutionThreadFunction(); }); + delayed_thread = std::make_unique([this] { delayExecutionThreadFunction(); }); } @@ -198,7 +199,7 @@ BackgroundSchedulePool::~BackgroundSchedulePool() delayed_tasks_cond_var.notify_all(); LOG_TRACE(&Poco::Logger::get("BackgroundSchedulePool/" + thread_name), "Waiting for threads to finish."); - delayed_thread.join(); + delayed_thread->join(); for (auto & thread : threads) thread.join(); diff --git a/src/Core/BackgroundSchedulePool.h b/src/Core/BackgroundSchedulePool.h index ef6fbfa68e9..e97b02e976f 100644 --- a/src/Core/BackgroundSchedulePool.h +++ b/src/Core/BackgroundSchedulePool.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include @@ -86,7 +86,7 @@ private: std::condition_variable delayed_tasks_cond_var; std::mutex delayed_tasks_mutex; /// Thread waiting for next delayed task. - ThreadFromGlobalPoolNoTracingContextPropagation delayed_thread; + std::unique_ptr delayed_thread; /// Tasks ordered by scheduled time. 
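The forEachAuthInSession() hunk earlier (exercised by the two precommitted-auth tests above) has to visit two differently shaped containers: committed auth entries are stored by value, uncommitted ones by pointer. The generic lambda plus if constexpr unifies the two; a self-contained sketch of that dispatch with simplified types:

```cpp
#include <string>
#include <type_traits>
#include <vector>

struct AuthID { std::string scheme; std::string id; };

/// Works for both std::vector<AuthID> and std::vector<const AuthID *>,
/// mirroring the call_for_each_auth helper in the hunk.
template <typename Container, typename Func>
void forEachAuth(const Container & auth_ids, Func && func)
{
    for (const auto & auth : auth_ids)
    {
        using TAuth = std::remove_reference_t<decltype(auth)>;
        if constexpr (std::is_pointer_v<std::remove_cv_t<TAuth>>)
            func(*auth); /// element is a pointer, dereference it
        else
            func(auth);  /// element is a value, pass it through
    }
}
```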
DelayedTasks delayed_tasks; diff --git a/src/Core/PostgreSQL/PoolWithFailover.cpp b/src/Core/PostgreSQL/PoolWithFailover.cpp index 22cd88c0764..3655681c515 100644 --- a/src/Core/PostgreSQL/PoolWithFailover.cpp +++ b/src/Core/PostgreSQL/PoolWithFailover.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/src/Core/PostgreSQL/PoolWithFailover.h b/src/Core/PostgreSQL/PoolWithFailover.h index afef2933d29..bf9c34e6723 100644 --- a/src/Core/PostgreSQL/PoolWithFailover.h +++ b/src/Core/PostgreSQL/PoolWithFailover.h @@ -8,7 +8,6 @@ #include "ConnectionHolder.h" #include #include -#include #include #include diff --git a/src/Core/ServerSettings.cpp b/src/Core/ServerSettings.cpp index 0a94b0dffcc..2b4ee6485bc 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -19,7 +19,10 @@ void ServerSettings::loadSettingsFromConfig(const Poco::Util::AbstractConfigurat "background_buffer_flush_schedule_pool_size", "background_schedule_pool_size", "background_message_broker_schedule_pool_size", - "background_distributed_schedule_pool_size" + "background_distributed_schedule_pool_size", + + "max_remote_read_network_bandwidth_for_server", + "max_remote_write_network_bandwidth_for_server", }; for (auto setting : all()) diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 753a70a7c25..aabc89cc6d7 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -21,10 +21,15 @@ namespace DB M(UInt64, max_io_thread_pool_size, 100, "The maximum number of threads that would be used for IO operations", 0) \ M(UInt64, max_io_thread_pool_free_size, 0, "Max free size for IO thread pool.", 0) \ M(UInt64, io_thread_pool_queue_size, 10000, "Queue size for IO thread pool.", 0) \ + M(UInt64, max_remote_read_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for read. Zero means unlimited.", 0) \ + M(UInt64, max_remote_write_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for write. Zero means unlimited.", 0) \ + M(UInt64, max_local_read_bandwidth_for_server, 0, "The maximum speed of local reads in bytes per second. Zero means unlimited.", 0) \ + M(UInt64, max_local_write_bandwidth_for_server, 0, "The maximum speed of local writes in bytes per second. Zero means unlimited.", 0) \ M(UInt64, max_backups_io_thread_pool_size, 1000, "The maximum number of threads that would be used for IO operations for BACKUP queries", 0) \ M(UInt64, max_backups_io_thread_pool_free_size, 0, "Max free size for backups IO thread pool.", 0) \ M(UInt64, backups_io_thread_pool_queue_size, 0, "Queue size for backups IO thread pool.", 0) \ M(UInt64, backup_threads, 16, "The maximum number of threads to execute BACKUP requests.", 0) \ + M(UInt64, max_backup_bandwidth_for_server, 0, "The maximum read speed in bytes per second for all backups on server. 
Zero means unlimited.", 0) \ M(UInt64, restore_threads, 16, "The maximum number of threads to execute RESTORE requests.", 0) \ M(Int32, max_connections, 1024, "Max server connections.", 0) \ M(UInt32, asynchronous_metrics_update_period_s, 1, "Period in seconds for updating asynchronous metrics.", 0) \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 44b68f459bd..c47432ae14a 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -100,8 +100,10 @@ class IColumn; M(Bool, replace_running_query, false, "Whether the running request should be canceled with the same id as the new one.", 0) \ M(UInt64, max_replicated_fetches_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited. Only has meaning at server startup.", 0) \ M(UInt64, max_replicated_sends_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited. Only has meaning at server startup.", 0) \ - M(UInt64, max_remote_read_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for read. Zero means unlimited. Only has meaning at server startup.", 0) \ - M(UInt64, max_remote_write_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for write. Zero means unlimited. Only has meaning at server startup.", 0) \ + M(UInt64, max_remote_read_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for read.", 0) \ + M(UInt64, max_remote_write_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for write.", 0) \ + M(UInt64, max_local_read_bandwidth, 0, "The maximum speed of local reads in bytes per second.", 0) \ + M(UInt64, max_local_write_bandwidth, 0, "The maximum speed of local writes in bytes per second.", 0) \ M(Bool, stream_like_engine_allow_direct_select, false, "Allow direct SELECT query for Kafka, RabbitMQ, FileLog, Redis Streams and NATS engines. In case there are attached materialized views, SELECT query is not allowed even if this setting is enabled.", 0) \ M(String, stream_like_engine_insert_queue, "", "When stream like engine reads from multiple queues, user will need to select one queue to insert into when writing. Used by Redis Streams and NATS.", 0) \ \ @@ -129,7 +131,7 @@ class IColumn; M(Bool, allow_suspicious_fixed_string_types, false, "In CREATE TABLE statement allows creating columns of type FixedString(n) with n > 256. FixedString with length >= 256 is suspicious and most likely indicates misusage", 0) \ M(Bool, compile_expressions, true, "Compile some scalar functions and operators to native code.", 0) \ M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \ - M(Bool, compile_aggregate_expressions, true, "Compile aggregate functions to native code.", 0) \ + M(Bool, compile_aggregate_expressions, false, "Compile aggregate functions to native code. 
This feature has a bug and should not be used.", 0) \ M(UInt64, min_count_to_compile_aggregate_expression, 3, "The number of identical aggregate expressions before they are JIT-compiled", 0) \ M(Bool, compile_sort_description, true, "Compile sort description to native code.", 0) \ M(UInt64, min_count_to_compile_sort_description, 3, "The number of identical sort descriptions before they are JIT-compiled", 0) \ @@ -422,6 +424,7 @@ class IColumn; M(UInt64, backup_restore_keeper_fault_injection_seed, 0, "0 - random seed, otherwise the setting value", 0) \ M(UInt64, backup_restore_keeper_value_max_size, 1048576, "Maximum size of data of a [Zoo]Keeper's node during backup", 0) \ M(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, "Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore", 0) \ + M(UInt64, max_backup_bandwidth, 0, "The maximum read speed in bytes per second for particular backup on server. Zero means unlimited.", 0) \ \ M(Bool, log_profile_events, true, "Log query performance statistics into the query_log, query_thread_log and query_views_log.", 0) \ M(Bool, log_query_settings, true, "Log query settings into the query_log.", 0) \ @@ -464,6 +467,8 @@ class IColumn; M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \ \ M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \ + M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \ + M(Bool, formatdatetime_parsedatetime_m_is_month_name, true, "Formatter '%M' in functions 'formatDateTime()' and 'parseDateTime()' produces the month name instead of minutes.", 0) \ \ M(UInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in single INSERTed block. Zero means unlimited. Throw exception if the block contains too many partitions. This setting is a safety threshold, because using large number of partitions is a common misconception.", 0) \ M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. 
<= 0 means unlimited.", 0) \ @@ -735,6 +740,7 @@ class IColumn; #define MAKE_OBSOLETE(M, TYPE, NAME, DEFAULT) \ M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", BaseSettingsHelpers::Flags::OBSOLETE) +/// NOTE: ServerSettings::loadSettingsFromConfig() should be updated to include these settings #define MAKE_DEPRECATED_BY_SERVER_CONFIG(M, TYPE, NAME, DEFAULT) \ M(TYPE, NAME, DEFAULT, "User-level setting is deprecated, and it must be defined in the server configuration instead.", BaseSettingsHelpers::Flags::OBSOLETE) @@ -768,6 +774,8 @@ class IColumn; MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_schedule_pool_size, 128) \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_message_broker_schedule_pool_size, 16) \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_distributed_schedule_pool_size, 16) \ + MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, max_remote_read_network_bandwidth_for_server, 0) \ + MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, max_remote_write_network_bandwidth_for_server, 0) \ /* ---- */ \ MAKE_OBSOLETE(M, DefaultDatabaseEngine, default_database_engine, DefaultDatabaseEngine::Atomic) \ MAKE_OBSOLETE(M, UInt64, max_pipeline_depth, 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index caf18cf8fb8..d7f80cc7a49 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -101,6 +101,8 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett {"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"}, {"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}}, {"22.11", {{"use_structure_from_insertion_table_in_table_functions", 0, 2, "Improve using structure from insertion table in table functions"}}}, + {"23.4", {{"formatdatetime_f_prints_single_zero", true, false, "Improved compatibility with MySQL DATE_FORMAT()/STR_TO_DATE()"}, + {"formatdatetime_parsedatetime_m_is_month_name", false, true, "Improved compatibility with MySQL DATE_FORMAT/STR_TO_DATE"}}}, {"22.9", {{"force_grouping_standard_compatibility", false, true, "Make GROUPING function output the same as in SQL standard and other DBMS"}}}, {"22.7", {{"cross_to_inner_join_rewrite", 1, 2, "Force rewrite comma join to inner"}, {"enable_positional_arguments", false, true, "Enable positional arguments feature by default"}, diff --git a/src/Core/SortDescription.cpp b/src/Core/SortDescription.cpp index 66ca1539b71..ae6aedf202d 100644 --- a/src/Core/SortDescription.cpp +++ b/src/Core/SortDescription.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #if USE_EMBEDDED_COMPILER #include diff --git a/src/Daemon/BaseDaemon.h b/src/Daemon/BaseDaemon.h index d28f9403c16..7aa1e8ad1a0 100644 --- a/src/Daemon/BaseDaemon.h +++ b/src/Daemon/BaseDaemon.h @@ -15,9 +15,7 @@ #include #include #include -#include #include -#include #include #include #include diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index edda0235bcc..28f000b6f0d 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -30,6 +30,7 @@ namespace ErrorCodes extern const int TYPE_MISMATCH; extern const int LOGICAL_ERROR; extern const int INCOMPATIBLE_COLUMNS; + extern const int NOT_IMPLEMENTED; } size_t getNumberOfDimensions(const IDataType & type) @@ -121,7 +122,7 @@ DataTypePtr getDataTypeByColumn(const IColumn & column) return makeNullable(getDataTypeByColumn(column_nullable->getNestedColumn())); /// TODO: add more types.
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get data type of column {}", column.getFamilyName()); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot get data type of column {}", column.getFamilyName()); } template diff --git a/src/DataTypes/Serializations/SerializationBool.cpp b/src/DataTypes/Serializations/SerializationBool.cpp index 81ad0ec46b1..41b5bf806e5 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -238,12 +238,15 @@ void SerializationBool::deserializeTextJSON(IColumn &column, ReadBuffer &istr, c ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); bool value = false; - if (*istr.position() == 't' || *istr.position() == 'f') + char first_char = *istr.position(); + if (first_char == 't' || first_char == 'f') readBoolTextWord(value, istr); - else if (*istr.position() == '1' || *istr.position() == '0') + else if (first_char == '1' || first_char == '0') readBoolText(value, istr); else - throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Invalid boolean value, should be true/false, 1/0."); + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, + "Invalid boolean value, should be true/false, 1/0, but it starts with the '{}' character.", first_char); + col->insert(value); } diff --git a/src/Databases/DatabaseFactory.h b/src/Databases/DatabaseFactory.h index 8992ea27093..cb631cd76d0 100644 --- a/src/Databases/DatabaseFactory.h +++ b/src/Databases/DatabaseFactory.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index efac04d9e15..5f5cd2667cb 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1388,25 +1389,31 @@ bool DatabaseReplicated::shouldReplicateQuery(const ContextPtr & query_context, if (query_context->getClientInfo().is_replicated_database_internal) return false; - /// Some ALTERs are not replicated on database level - if (const auto * alter = query_ptr->as()) + /// we never replicate KeeperMap operations for some types of queries because it doesn't make sense + const auto is_keeper_map_table = [&](const ASTPtr & ast) { - auto table_id = query_context->resolveStorageID(*alter, Context::ResolveOrdinary); + auto table_id = query_context->resolveStorageID(ast, Context::ResolveOrdinary); StoragePtr table = DatabaseCatalog::instance().getTable(table_id, query_context); - /// we never replicate KeeperMap operations because it doesn't make sense - if (auto * keeper_map = table->as()) - return false; + return table->as() != nullptr; + }; - return !alter->isAttachAlter() && !alter->isFetchAlter() && !alter->isDropPartitionAlter(); - } + /// Some ALTERs are not replicated on database level + if (const auto * alter = query_ptr->as()) + return !alter->isAttachAlter() && !alter->isFetchAlter() && !alter->isDropPartitionAlter() && !is_keeper_map_table(query_ptr); /// DROP DATABASE is not replicated if (const auto * drop = query_ptr->as()) { - return drop->table.get(); + if (drop->table.get()) + return drop->kind != ASTDropQuery::Truncate || !is_keeper_map_table(query_ptr); + + return false; } + if (query_ptr->as() != nullptr) + return !is_keeper_map_table(query_ptr); + return true; } diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index b8880c4c4cc..53a2f372814 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -7,7 +7,7 @@ #include 
#include #include -#include +#include #include #include diff --git a/src/Dictionaries/CMakeLists.txt b/src/Dictionaries/CMakeLists.txt index 0260804ab5b..c9dd554a6f1 100644 --- a/src/Dictionaries/CMakeLists.txt +++ b/src/Dictionaries/CMakeLists.txt @@ -26,7 +26,6 @@ target_link_libraries(clickhouse_dictionaries clickhouse_common_io dbms Poco::Data - Poco::Data::ODBC Poco::MongoDB Poco::Redis string_utils diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index e19c4a66b1f..5203415005f 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -9,7 +9,6 @@ #include -#include #include #include diff --git a/src/Dictionaries/ExecutableDictionarySource.h b/src/Dictionaries/ExecutableDictionarySource.h index 0456d3cafef..c7067a62893 100644 --- a/src/Dictionaries/ExecutableDictionarySource.h +++ b/src/Dictionaries/ExecutableDictionarySource.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.h b/src/Dictionaries/ExecutablePoolDictionarySource.h index 1fc10d18b76..e8cc6e83406 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.h +++ b/src/Dictionaries/ExecutablePoolDictionarySource.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index 0e5d18363e9..5cfac20e572 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -114,9 +114,18 @@ public: ~ParallelDictionaryLoader() { - for (auto & queue : shards_queues) - queue->clearAndFinish(); - pool.wait(); + try + { + for (auto & queue : shards_queues) + queue->clearAndFinish(); + + /// NOTE: It is OK not to rethrow the exception here, since on success finish() should be called, which calls wait() + pool.wait(); + } + catch (...)
+ { + tryLogCurrentException(dictionary.log, "Exception was thrown during parallel load of the dictionary"); + } } private: diff --git a/src/Dictionaries/IPAddressDictionary.cpp b/src/Dictionaries/IPAddressDictionary.cpp index 803f607a3a7..ff1c784750b 100644 --- a/src/Dictionaries/IPAddressDictionary.cpp +++ b/src/Dictionaries/IPAddressDictionary.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index e1fabb89a7e..67827c6524e 100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -11,7 +11,6 @@ #include #include #include -#include #include "DictionaryStructure.h" #include "IDictionary.h" #include "IDictionarySource.h" diff --git a/src/Dictionaries/MongoDBDictionarySource.cpp b/src/Dictionaries/MongoDBDictionarySource.cpp index a9555a94304..922e1e71bbb 100644 --- a/src/Dictionaries/MongoDBDictionarySource.cpp +++ b/src/Dictionaries/MongoDBDictionarySource.cpp @@ -67,7 +67,6 @@ void registerDictionarySourceMongoDB(DictionarySourceFactory & factory) #include #include #include -#include // only after poco // naming conflict: diff --git a/src/Dictionaries/PostgreSQLDictionarySource.cpp b/src/Dictionaries/PostgreSQLDictionarySource.cpp index 9f254da0b11..8ec78308392 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.cpp +++ b/src/Dictionaries/PostgreSQLDictionarySource.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #endif diff --git a/src/Dictionaries/PostgreSQLDictionarySource.h b/src/Dictionaries/PostgreSQLDictionarySource.h index 8ecf56a9430..1305333458b 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.h +++ b/src/Dictionaries/PostgreSQLDictionarySource.h @@ -8,7 +8,6 @@ #include "ExternalQueryBuilder.h" #include #include -#include #include diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index c072ba78d46..4db88631a2c 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -272,7 +272,7 @@ void RegExpTreeDictionary::initGraph() if (value->parent_id == 0) // this is root node. initTopologyOrder(id, visited, topology_id); if (topology_order.size() != regex_nodes.size()) - throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Invalid Regex tree"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "The topology order does not match the number of regex nodes. This is likely an internal bug."); } void RegExpTreeDictionary::initTopologyOrder(UInt64 node_idx, std::set & visited, UInt64 & topology_id) @@ -280,7 +280,7 @@ void RegExpTreeDictionary::initTopologyOrder(UInt64 node_idx, std::set & visited.insert(node_idx); for (UInt64 child_idx : regex_nodes[node_idx]->children) if (visited.contains(child_idx)) - throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Invalid Regex tree. The input tree is cyclical"); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "The regexp tree is cyclical.
Please check your config."); else initTopologyOrder(child_idx, visited, topology_id); topology_order[node_idx] = topology_id++; diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index a3b7e413014..160fcb5732c 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -1,4 +1,5 @@ #include "DiskLocal.h" +#include #include #include "DiskFactory.h" @@ -367,10 +368,11 @@ std::unique_ptr DiskLocal::readFile(const String & path, } std::unique_ptr -DiskLocal::writeFile(const String & path, size_t buf_size, WriteMode mode, const WriteSettings &) +DiskLocal::writeFile(const String & path, size_t buf_size, WriteMode mode, const WriteSettings & settings) { int flags = (mode == WriteMode::Append) ? (O_APPEND | O_CREAT | O_WRONLY) : -1; - return std::make_unique(fs::path(disk_path) / path, buf_size, flags); + return std::make_unique( + fs::path(disk_path) / path, buf_size, flags, settings.local_throttler); } void DiskLocal::removeFile(const String & path) diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 14e29904422..d6182463ebf 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 4e488bbb39a..797235b5fb8 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -254,8 +254,8 @@ public: virtual NameSet getCacheLayersNames() const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Method `getCacheLayersNames()` is not implemented for disk: {}", - getDataSourceDescription().type); + "Method `getCacheLayersNames()` is not implemented for disk: {}", + toString(getDataSourceDescription().type)); } /// Returns a list of storage objects (contains path, size, ...). @@ -263,7 +263,9 @@ public: /// be multiple files in remote fs for single clickhouse file. virtual StoredObjects getStorageObjects(const String &) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method `getStorageObjects() not implemented for disk: {}`", getDataSourceDescription().type); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method `getStorageObjects()` not implemented for disk: {}", + toString(getDataSourceDescription().type)); } /// For one local path there might be multiple remote paths in case of Log family engines. @@ -281,8 +283,8 @@ public: virtual void getRemotePathsRecursive(const String &, std::vector &) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Method `getRemotePathsRecursive() not implemented for disk: {}`", - getDataSourceDescription().type); + "Method `getRemotePathsRecursive() not implemented for disk: {}`", + toString(getDataSourceDescription().type)); } /// Batch request to remove multiple files. @@ -398,7 +400,7 @@ public: throw Exception( ErrorCodes::NOT_IMPLEMENTED, "Method getObjectStorage() is not implemented for disk type: {}", - getDataSourceDescription().type); + toString(getDataSourceDescription().type)); } /// Create disk object storage according to disk type. 
@@ -409,7 +411,7 @@ public: throw Exception( ErrorCodes::NOT_IMPLEMENTED, "Method createDiskObjectStorage() is not implemented for disk type: {}", - getDataSourceDescription().type); + toString(getDataSourceDescription().type)); } virtual bool supportsStat() const { return false; } diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 72346787cfb..f48935da7bf 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -4,8 +4,9 @@ #include #include #include -#include #include +#include +#include #include @@ -1184,7 +1185,7 @@ String CachedOnDiskReadBufferFromFile::getInfoForLog() implementation_buffer_read_range_str = "None"; String current_file_segment_info; - if (current_file_segment_it == file_segments_holder->file_segments.end()) + if (current_file_segment_it != file_segments_holder->file_segments.end()) current_file_segment_info = (*current_file_segment_it)->getInfoForLog(); else current_file_segment_info = "None"; diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h index 14e8ea6c7e7..d3c265a522b 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include diff --git a/src/Disks/IO/IOUringReader.cpp b/src/Disks/IO/IOUringReader.cpp index 7bf1982d515..7b68e0ee2de 100644 --- a/src/Disks/IO/IOUringReader.cpp +++ b/src/Disks/IO/IOUringReader.cpp @@ -1,15 +1,16 @@ #include "IOUringReader.h" +#include #if USE_LIBURING #include #include -#include #include #include #include #include #include +#include #include #include @@ -44,7 +45,7 @@ namespace ErrorCodes } IOUringReader::IOUringReader(uint32_t entries_) - : log(&Poco::Logger::get("IOUringReader")) + : log(&Poco::Logger::get("IOUringReader")) { struct io_uring_probe * probe = io_uring_get_probe(); if (!probe) @@ -70,7 +71,7 @@ IOUringReader::IOUringReader(uint32_t entries_) throwFromErrno("Failed initializing io_uring", ErrorCodes::IO_URING_INIT_FAILED, -ret); cq_entries = params.cq_entries; - ring_completion_monitor = ThreadFromGlobalPool([this] { monitorRing(); }); + ring_completion_monitor = std::make_unique([this] { monitorRing(); }); } std::future IOUringReader::submit(Request request) @@ -333,7 +334,7 @@ IOUringReader::~IOUringReader() io_uring_submit(&ring); } - ring_completion_monitor.join(); + ring_completion_monitor->join(); io_uring_queue_exit(&ring); } diff --git a/src/Disks/IO/IOUringReader.h b/src/Disks/IO/IOUringReader.h index e3fcf116448..9b80ac6e5e0 100644 --- a/src/Disks/IO/IOUringReader.h +++ b/src/Disks/IO/IOUringReader.h @@ -4,15 +4,20 @@ #if USE_LIBURING -#include +#include +#include #include #include #include #include +namespace Poco { class Logger; } + namespace DB { +class Exception; + /** Perform reads using the io_uring Linux subsystem. 
* * The class sets up a single io_uring that clients submit read requests to, they are @@ -30,7 +35,7 @@ private: uint32_t cq_entries; std::atomic cancelled{false}; - ThreadFromGlobalPool ring_completion_monitor; + std::unique_ptr ring_completion_monitor; struct EnqueuedRequest { @@ -74,7 +79,7 @@ public: void wait() override {} - virtual ~IOUringReader() override; + ~IOUringReader() override; }; } diff --git a/src/Disks/IO/ThreadPoolReader.cpp b/src/Disks/IO/ThreadPoolReader.cpp index 3a071d13122..de57fa157da 100644 --- a/src/Disks/IO/ThreadPoolReader.cpp +++ b/src/Disks/IO/ThreadPoolReader.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,7 @@ static bool hasBugInPreadV2() #endif ThreadPoolReader::ThreadPoolReader(size_t pool_size, size_t queue_size_) - : pool(CurrentMetrics::ThreadPoolFSReaderThreads, CurrentMetrics::ThreadPoolFSReaderThreadsActive, pool_size, pool_size, queue_size_) + : pool(std::make_unique(CurrentMetrics::ThreadPoolFSReaderThreads, CurrentMetrics::ThreadPoolFSReaderThreadsActive, pool_size, pool_size, queue_size_)) { } @@ -200,7 +201,7 @@ std::future ThreadPoolReader::submit(Request reques ProfileEvents::increment(ProfileEvents::ThreadPoolReaderPageCacheMiss); - auto schedule = threadPoolCallbackRunner(pool, "ThreadPoolRead"); + auto schedule = threadPoolCallbackRunner(*pool, "ThreadPoolRead"); return schedule([request, fd]() -> Result { @@ -244,4 +245,9 @@ std::future ThreadPoolReader::submit(Request reques }, request.priority); } +void ThreadPoolReader::wait() +{ + pool->wait(); +} + } diff --git a/src/Disks/IO/ThreadPoolReader.h b/src/Disks/IO/ThreadPoolReader.h index dc754e0a81c..4c55be29bf9 100644 --- a/src/Disks/IO/ThreadPoolReader.h +++ b/src/Disks/IO/ThreadPoolReader.h @@ -1,7 +1,8 @@ #pragma once +#include #include -#include +#include #include @@ -28,14 +29,14 @@ namespace DB class ThreadPoolReader final : public IAsynchronousReader { private: - ThreadPool pool; + std::unique_ptr pool; public: ThreadPoolReader(size_t pool_size, size_t queue_size_); std::future submit(Request request) override; - void wait() override { pool.wait(); } + void wait() override; /// pool automatically waits for all tasks in destructor. 
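Note on the reader changes above: IOUringReader, ThreadPoolReader and (in the next hunks) ThreadPoolRemoteFSReader all stop holding their pool or thread members by value and switch to std::unique_ptr, with wait() moved out of line. A plausible motivation is that a smart-pointer member needs only a forward declaration, so the headers can include a lightweight forward-declaration header instead of the full ThreadPool.h. A self-contained sketch of the pattern, with illustrative names and the header/source split marked by comments:

    #include <memory>

    /// "reader.h" part: a forward declaration suffices, because the member is
    /// a std::unique_ptr; the full ThreadPool definition stays out of the header.
    class ThreadPool;

    class Reader
    {
    public:
        Reader();
        ~Reader();   /// must be defined where ThreadPool is a complete type
        void wait(); /// kept out of line for the same reason
    private:
        std::unique_ptr<ThreadPool> pool;
    };

    /// "reader.cpp" part: only here is the full definition required.
    class ThreadPool
    {
    public:
        void wait() {}
    };

    Reader::Reader() : pool(std::make_unique<ThreadPool>()) {}
    Reader::~Reader() = default;
    void Reader::wait() { pool->wait(); }

    int main()
    {
        Reader reader;
        reader.wait();
    }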
}; diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp index 1980f57c876..4d0f39357ab 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp @@ -1,6 +1,7 @@ #include "ThreadPoolRemoteFSReader.h" #include "config.h" +#include #include #include #include @@ -14,6 +15,7 @@ #include #include +#include namespace ProfileEvents @@ -62,7 +64,7 @@ IAsynchronousReader::Result RemoteFSFileDescriptor::readInto(char * data, size_t ThreadPoolRemoteFSReader::ThreadPoolRemoteFSReader(size_t pool_size, size_t queue_size_) - : pool(CurrentMetrics::ThreadPoolRemoteFSReaderThreads, CurrentMetrics::ThreadPoolRemoteFSReaderThreadsActive, pool_size, pool_size, queue_size_) + : pool(std::make_unique(CurrentMetrics::ThreadPoolRemoteFSReaderThreads, CurrentMetrics::ThreadPoolRemoteFSReaderThreadsActive, pool_size, pool_size, queue_size_)) { } @@ -92,7 +94,12 @@ std::future ThreadPoolRemoteFSReader::submit(Reques ProfileEvents::increment(ProfileEvents::ThreadpoolReaderReadBytes, result.size); return Result{ .size = result.size, .offset = result.offset, .execution_watch = std::move(watch) }; - }, pool, "VFSRead", request.priority); + }, *pool, "VFSRead", request.priority); +} + +void ThreadPoolRemoteFSReader::wait() +{ + pool->wait(); } } diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h index cd289150ba1..3a765993292 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.h +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace DB @@ -15,10 +15,10 @@ public: std::future submit(Request request) override; - void wait() override { pool.wait(); } + void wait() override; private: - ThreadPool pool; + std::unique_ptr pool; }; class RemoteFSFileDescriptor : public IAsynchronousReader::IFileDescriptor diff --git a/src/Disks/IO/createReadBufferFromFileBase.cpp b/src/Disks/IO/createReadBufferFromFileBase.cpp index 04496e36826..8e9a1d86628 100644 --- a/src/Disks/IO/createReadBufferFromFileBase.cpp +++ b/src/Disks/IO/createReadBufferFromFileBase.cpp @@ -76,11 +76,25 @@ std::unique_ptr createReadBufferFromFileBase( if (settings.local_fs_method == LocalFSReadMethod::read) { - res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, buffer_alignment, file_size); + res = std::make_unique( + filename, + buffer_size, + actual_flags, + existing_memory, + buffer_alignment, + file_size, + settings.local_throttler); } else if (settings.local_fs_method == LocalFSReadMethod::pread || settings.local_fs_method == LocalFSReadMethod::mmap) { - res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, buffer_alignment, file_size); + res = std::make_unique( + filename, + buffer_size, + actual_flags, + existing_memory, + buffer_alignment, + file_size, + settings.local_throttler); } else if (settings.local_fs_method == LocalFSReadMethod::io_uring) { @@ -90,7 +104,15 @@ std::unique_ptr createReadBufferFromFileBase( throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "io_uring is not supported by this system"); res = std::make_unique( - *reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, buffer_alignment, file_size); + *reader, + settings.priority, + filename, + buffer_size, + actual_flags, + existing_memory, + buffer_alignment, + file_size, + settings.local_throttler); #else throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Read method io_uring is only supported in Linux"); #endif @@ -103,7 
+125,15 @@ std::unique_ptr createReadBufferFromFileBase( auto & reader = context->getThreadPoolReader(Context::FilesystemReaderType::SYNCHRONOUS_LOCAL_FS_READER); res = std::make_unique( - reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, buffer_alignment, file_size); + reader, + settings.priority, + filename, + buffer_size, + actual_flags, + existing_memory, + buffer_alignment, + file_size, + settings.local_throttler); } else if (settings.local_fs_method == LocalFSReadMethod::pread_threadpool) { @@ -113,7 +143,15 @@ std::unique_ptr createReadBufferFromFileBase( auto & reader = context->getThreadPoolReader(Context::FilesystemReaderType::ASYNCHRONOUS_LOCAL_FS_READER); res = std::make_unique( - reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, buffer_alignment, file_size); + reader, + settings.priority, + filename, + buffer_size, + actual_flags, + existing_memory, + buffer_alignment, + file_size, + settings.local_throttler); } else throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown read method"); diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 2cfb4d43a43..2f27dc18e4b 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index bcdc97983be..b49f4dafef0 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -9,7 +9,6 @@ #include #include #include -#include namespace DB diff --git a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp index 1c192a0d89c..70f39d893f7 100644 --- a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp +++ b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp @@ -8,7 +8,6 @@ #if USE_AWS_S3 -#include #include #include @@ -19,9 +18,7 @@ #include #include #include -#include -#include #include #include @@ -87,10 +84,10 @@ public: private: static String getServerUUID() { - DB::UUID server_uuid = DB::ServerUUID::get(); - if (server_uuid == DB::UUIDHelpers::Nil) + UUID server_uuid = ServerUUID::get(); + if (server_uuid == UUIDHelpers::Nil) throw Exception(ErrorCodes::LOGICAL_ERROR, "Server UUID is not initialized"); - return DB::toString(server_uuid); + return toString(server_uuid); } }; diff --git a/src/Disks/StoragePolicy.cpp b/src/Disks/StoragePolicy.cpp index ec0f201b801..f4be8b8fe86 100644 --- a/src/Disks/StoragePolicy.cpp +++ b/src/Disks/StoragePolicy.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include diff --git a/src/Disks/StoragePolicy.h b/src/Disks/StoragePolicy.h index 7e6aff7bbda..69cfb830818 100644 --- a/src/Disks/StoragePolicy.h +++ b/src/Disks/StoragePolicy.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include diff --git a/src/Disks/VolumeJBOD.h b/src/Disks/VolumeJBOD.h index 81da64c488d..ef6f215bf18 100644 --- a/src/Disks/VolumeJBOD.h +++ b/src/Disks/VolumeJBOD.h @@ -2,6 +2,7 @@ #include #include +#include #include diff --git a/src/Formats/ProtobufSchemas.cpp b/src/Formats/ProtobufSchemas.cpp index efc0a4e694f..86c81e1a3c3 100644 --- a/src/Formats/ProtobufSchemas.cpp +++ b/src/Formats/ProtobufSchemas.cpp @@ -41,8 +41,19 @@ public: return descriptor; const auto * file_descriptor = importer.Import(schema_path); - // If there are parsing errors, AddError() throws an exception and in this case 
the following line - // isn't executed. + if (error) + { + auto info = error.value(); + error.reset(); + throw Exception( + ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, + "Cannot parse '{}' file, found an error at line {}, column {}, {}", + info.filename, + std::to_string(info.line), + std::to_string(info.column), + info.message); + } + assert(file_descriptor); if (with_envelope == WithEnvelope::No) @@ -74,14 +85,24 @@ private: // Overrides google::protobuf::compiler::MultiFileErrorCollector: void AddError(const String & filename, int line, int column, const String & message) override { - throw Exception(ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, - "Cannot parse '{}' file, found an error at line {}, column {}, {}", - filename, std::to_string(line), std::to_string(column), message); + /// Protobuf library code is not exception safe, we should + /// remember the error and throw it later from our side. + error = ErrorInfo{filename, line, column, message}; } google::protobuf::compiler::DiskSourceTree disk_source_tree; google::protobuf::compiler::Importer importer; const WithEnvelope with_envelope; + + struct ErrorInfo + { + String filename; + int line; + int column; + String message; + }; + + std::optional error; }; diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 4f3b19ac803..c712a1083d9 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -3453,15 +3453,35 @@ namespace const auto & tuple_data_type = assert_cast(*data_type); size_t size_of_tuple = tuple_data_type.getElements().size(); - if (tuple_data_type.haveExplicitNames() && field_descriptor.message_type()) + if (const auto * message_type = field_descriptor.message_type()) { + bool have_explicit_names = tuple_data_type.haveExplicitNames(); + Names element_names; + if (have_explicit_names) + { + element_names = tuple_data_type.getElementNames(); + } + else + { + /// Match unnamed Tuple elements and Message fields by position. + size_t field_count = message_type->field_count(); + if (field_count != size_of_tuple) + throw Exception( + ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS, + "The number of fields in Protobuf message ({}) is not equal to the number of elements in unnamed Tuple ({})", + field_count, + size_of_tuple); + for (size_t i = 0; i != field_count; ++i) + element_names.push_back(message_type->field(static_cast(i))->name()); + } + /// Try to serialize as a nested message. 
std::vector used_column_indices; auto message_serializer = buildMessageSerializerImpl( size_of_tuple, - tuple_data_type.getElementNames().data(), + element_names.data(), tuple_data_type.getElements().data(), - *field_descriptor.message_type(), + *message_type, /* with_length_delimiter = */ false, google_wrappers_special_treatment, &field_descriptor, diff --git a/src/Functions/FunctionShowCertificate.h b/src/Functions/FunctionShowCertificate.h index 3c30d8138e5..5061a198614 100644 --- a/src/Functions/FunctionShowCertificate.h +++ b/src/Functions/FunctionShowCertificate.h @@ -15,6 +15,7 @@ #include #include #include +#include #if USE_SSL #include diff --git a/src/Functions/FunctionStringReplace.h b/src/Functions/FunctionStringReplace.h index f90eac2e7f3..6199e146210 100644 --- a/src/Functions/FunctionStringReplace.h +++ b/src/Functions/FunctionStringReplace.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -13,16 +14,14 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; - extern const int ARGUMENT_OUT_OF_BOUND; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; } - template class FunctionStringReplace : public IFunction { public: static constexpr auto name = Name::name; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } @@ -32,65 +31,80 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!isStringOrFixedString(arguments[0])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of first argument of function {}", - arguments[0]->getName(), getName()); + FunctionArgumentDescriptors args{ + {"haystack", &isStringOrFixedString, nullptr, "String or FixedString"}, + {"pattern", &isString, nullptr, "String"}, + {"replacement", &isString, nullptr, "String"} + }; - if (!isStringOrFixedString(arguments[1])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of second argument of function {}", - arguments[1]->getName(), getName()); - - if (!isStringOrFixedString(arguments[2])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of third argument of function {}", - arguments[2]->getName(), getName()); + validateFunctionArgumentTypes(*this, arguments, args); return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - const ColumnPtr column_src = arguments[0].column; + const ColumnPtr column_haystack = arguments[0].column; const ColumnPtr column_needle = arguments[1].column; const ColumnPtr column_replacement = arguments[2].column; - if (!isColumnConst(*column_needle) || !isColumnConst(*column_replacement)) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "2nd and 3rd arguments of function {} must be constants.", - getName()); + const ColumnString * col_haystack = checkAndGetColumn(column_haystack.get()); + const ColumnFixedString * col_haystack_fixed = checkAndGetColumn(column_haystack.get()); - const IColumn * c1 = arguments[1].column.get(); - const IColumn * c2 = arguments[2].column.get(); - const ColumnConst 
* c1_const = typeid_cast(c1); - const ColumnConst * c2_const = typeid_cast(c2); - String needle = c1_const->getValue(); - String replacement = c2_const->getValue(); + const ColumnString * col_needle_vector = checkAndGetColumn(column_needle.get()); + const ColumnConst * col_needle_const = checkAndGetColumn(column_needle.get()); - if (needle.empty()) - throw Exception( - ErrorCodes::ARGUMENT_OUT_OF_BOUND, - "Length of the second argument of function replace must be greater than 0."); + const ColumnString * col_replacement_vector = checkAndGetColumn(column_replacement.get()); + const ColumnConst * col_replacement_const = checkAndGetColumn(column_replacement.get()); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + auto col_res = ColumnString::create(); + + if (col_haystack && col_needle_const && col_replacement_const) { - auto col_res = ColumnString::create(); - Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, col_res->getChars(), col_res->getOffsets()); + Impl::vectorConstantConstant( + col_haystack->getChars(), col_haystack->getOffsets(), + col_needle_const->getValue(), + col_replacement_const->getValue(), + col_res->getChars(), col_res->getOffsets()); return col_res; } - else if (const ColumnFixedString * col_fixed = checkAndGetColumn(column_src.get())) + else if (col_haystack && col_needle_vector && col_replacement_const) { - auto col_res = ColumnString::create(); - Impl::vectorFixed(col_fixed->getChars(), col_fixed->getN(), needle, replacement, col_res->getChars(), col_res->getOffsets()); + Impl::vectorVectorConstant( + col_haystack->getChars(), col_haystack->getOffsets(), + col_needle_vector->getChars(), col_needle_vector->getOffsets(), + col_replacement_const->getValue(), + col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else if (col_haystack && col_needle_const && col_replacement_vector) + { + Impl::vectorConstantVector( + col_haystack->getChars(), col_haystack->getOffsets(), + col_needle_const->getValue(), + col_replacement_vector->getChars(), col_replacement_vector->getOffsets(), + col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else if (col_haystack && col_needle_vector && col_replacement_vector) + { + Impl::vectorVectorVector( + col_haystack->getChars(), col_haystack->getOffsets(), + col_needle_vector->getChars(), col_needle_vector->getOffsets(), + col_replacement_vector->getChars(), col_replacement_vector->getOffsets(), + col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else if (col_haystack_fixed && col_needle_const && col_replacement_const) + { + Impl::vectorFixedConstantConstant( + col_haystack_fixed->getChars(), col_haystack_fixed->getN(), + col_needle_const->getValue(), + col_replacement_const->getValue(), + col_res->getChars(), col_res->getOffsets()); return col_res; } else diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index f832bf404a8..28002d34acc 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Functions/FunctionsDecimalArithmetics.h b/src/Functions/FunctionsDecimalArithmetics.h index aa4afc68707..79e10d215a9 100644 --- a/src/Functions/FunctionsDecimalArithmetics.h +++ b/src/Functions/FunctionsDecimalArithmetics.h @@ -10,7 +10,6 @@ #include #include -#include #include #include diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 3357fca00bb..034ef868cc7 
100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -494,6 +494,28 @@ struct GccMurmurHashImpl static constexpr bool use_int_hash_for_pods = false; }; +/// To be compatible with Default Partitioner in Kafka: +/// murmur2: https://github.com/apache/kafka/blob/461c5cfe056db0951d9b74f5adc45973670404d7/clients/src/main/java/org/apache/kafka/common/utils/Utils.java#L480 +/// Default Partitioner: https://github.com/apache/kafka/blob/139f7709bd3f5926901a21e55043388728ccca78/clients/src/main/java/org/apache/kafka/clients/producer/internals/BuiltInPartitioner.java#L328 +struct KafkaMurmurHashImpl +{ + static constexpr auto name = "kafkaMurmurHash"; + + using ReturnType = UInt32; + + static UInt32 apply(const char * data, const size_t size) + { + return MurmurHash2(data, size, 0x9747b28cU) & 0x7fffffff; + } + + static UInt32 combineHashes(UInt32 h1, UInt32 h2) + { + return IntHash32Impl::apply(h1) ^ h2; + } + + static constexpr bool use_int_hash_for_pods = false; +}; + struct MurmurHash3Impl32 { static constexpr auto name = "murmurHash3_32"; @@ -1727,6 +1749,7 @@ using FunctionMetroHash64 = FunctionAnyHash; using FunctionMurmurHash2_32 = FunctionAnyHash; using FunctionMurmurHash2_64 = FunctionAnyHash; using FunctionGccMurmurHash = FunctionAnyHash; +using FunctionKafkaMurmurHash = FunctionAnyHash; using FunctionMurmurHash3_32 = FunctionAnyHash; using FunctionMurmurHash3_64 = FunctionAnyHash; using FunctionMurmurHash3_128 = FunctionAnyHash; diff --git a/src/Functions/FunctionsHashingMurmur.cpp b/src/Functions/FunctionsHashingMurmur.cpp index 9648c21dbf0..df1a945b967 100644 --- a/src/Functions/FunctionsHashingMurmur.cpp +++ b/src/Functions/FunctionsHashingMurmur.cpp @@ -17,5 +17,6 @@ REGISTER_FUNCTION(HashingMurmur) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index 8bcb1f4d849..fbd987577e9 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -1,1624 +1,10 @@ -#include -#include - -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include #include -#include -#include -#include -#include -#include - -#include -#include - - -#include "config.h" namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -} - -template -concept HasIndexOperator = requires (T t) -{ - t[0]; -}; - -/// Functions to parse JSONs and extract values from it. -/// The first argument of all these functions gets a JSON, -/// after that there are any number of arguments specifying path to a desired part from the JSON's root. 
-/// For example, -/// select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 - -class FunctionJSONHelpers -{ -public: - template typename Impl, class JSONParser> - class Executor - { - public: - static ColumnPtr run(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) - { - MutableColumnPtr to{result_type->createColumn()}; - to->reserve(input_rows_count); - - if (arguments.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least one argument", String(Name::name)); - - const auto & first_column = arguments[0]; - if (!isString(first_column.type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "The first argument of function {} should be a string containing JSON, illegal type: " - "{}", String(Name::name), first_column.type->getName()); - - const ColumnPtr & arg_json = first_column.column; - const auto * col_json_const = typeid_cast(arg_json.get()); - const auto * col_json_string - = typeid_cast(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get()); - - if (!col_json_string) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arg_json->getName()); - - const ColumnString::Chars & chars = col_json_string->getChars(); - const ColumnString::Offsets & offsets = col_json_string->getOffsets(); - - size_t num_index_arguments = Impl::getNumberOfIndexArguments(arguments); - std::vector moves = prepareMoves(Name::name, arguments, 1, num_index_arguments); - - /// Preallocate memory in parser if necessary. - JSONParser parser; - if constexpr (has_member_function_reserve::value) - { - size_t max_size = calculateMaxSize(offsets); - if (max_size) - parser.reserve(max_size); - } - - Impl impl; - - /// prepare() does Impl-specific preparation before handling each row. - if constexpr (has_member_function_prepare::*)(const char *, const ColumnsWithTypeAndName &, const DataTypePtr &)>::value) - impl.prepare(Name::name, arguments, result_type); - - using Element = typename JSONParser::Element; - - Element document; - bool document_ok = false; - if (col_json_const) - { - std::string_view json{reinterpret_cast(chars.data()), offsets[0] - 1}; - document_ok = parser.parse(json, document); - } - - for (const auto i : collections::range(0, input_rows_count)) - { - if (!col_json_const) - { - std::string_view json{reinterpret_cast(&chars[offsets[i - 1]]), offsets[i] - offsets[i - 1] - 1}; - document_ok = parser.parse(json, document); - } - - bool added_to_column = false; - if (document_ok) - { - /// Perform moves. - Element element; - std::string_view last_key; - bool moves_ok = performMoves(arguments, i, document, moves, element, last_key); - - if (moves_ok) - added_to_column = impl.insertResultToColumn(*to, element, last_key); - } - - /// We add default value (=null or zero) if something goes wrong, we don't throw exceptions in these JSON functions. - if (!added_to_column) - to->insertDefault(); - } - return to; - } - }; - -private: - BOOST_TTI_HAS_MEMBER_FUNCTION(reserve) - BOOST_TTI_HAS_MEMBER_FUNCTION(prepare) - - /// Represents a move of a JSON iterator described by a single argument passed to a JSON function. - /// For example, the call JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) - /// contains two moves: {MoveType::ConstKey, "b"} and {MoveType::ConstIndex, 1}. - /// Keys and indices can be nonconst, in this case they are calculated for each row. 
- enum class MoveType - { - Key, - Index, - ConstKey, - ConstIndex, - }; - - struct Move - { - explicit Move(MoveType type_, size_t index_ = 0) : type(type_), index(index_) {} - Move(MoveType type_, const String & key_) : type(type_), key(key_) {} - MoveType type; - size_t index = 0; - String key; - }; - - static std::vector prepareMoves( - const char * function_name, - const ColumnsWithTypeAndName & columns, - size_t first_index_argument, - size_t num_index_arguments) - { - std::vector moves; - moves.reserve(num_index_arguments); - for (const auto i : collections::range(first_index_argument, first_index_argument + num_index_arguments)) - { - const auto & column = columns[i]; - if (!isString(column.type) && !isNativeInteger(column.type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "The argument {} of function {} should be a string specifying key " - "or an integer specifying index, illegal type: {}", - std::to_string(i + 1), String(function_name), column.type->getName()); - - if (column.column && isColumnConst(*column.column)) - { - const auto & column_const = assert_cast(*column.column); - if (isString(column.type)) - moves.emplace_back(MoveType::ConstKey, column_const.getValue()); - else - moves.emplace_back(MoveType::ConstIndex, column_const.getInt(0)); - } - else - { - if (isString(column.type)) - moves.emplace_back(MoveType::Key, ""); - else - moves.emplace_back(MoveType::Index, 0); - } - } - return moves; - } - - - /// Performs moves of types MoveType::Index and MoveType::ConstIndex. - template - static bool performMoves(const ColumnsWithTypeAndName & arguments, size_t row, - const typename JSONParser::Element & document, const std::vector & moves, - typename JSONParser::Element & element, std::string_view & last_key) - { - typename JSONParser::Element res_element = document; - std::string_view key; - - for (size_t j = 0; j != moves.size(); ++j) - { - switch (moves[j].type) - { - case MoveType::ConstIndex: - { - if (!moveToElementByIndex(res_element, static_cast(moves[j].index), key)) - return false; - break; - } - case MoveType::ConstKey: - { - key = moves[j].key; - if (!moveToElementByKey(res_element, key)) - return false; - break; - } - case MoveType::Index: - { - Int64 index = (*arguments[j + 1].column)[row].get(); - if (!moveToElementByIndex(res_element, static_cast(index), key)) - return false; - break; - } - case MoveType::Key: - { - key = (*arguments[j + 1].column).getDataAt(row).toView(); - if (!moveToElementByKey(res_element, key)) - return false; - break; - } - } - } - - element = res_element; - last_key = key; - return true; - } - - template - static bool moveToElementByIndex(typename JSONParser::Element & element, int index, std::string_view & out_key) - { - if (element.isArray()) - { - auto array = element.getArray(); - if (index >= 0) - --index; - else - index += array.size(); - - if (static_cast(index) >= array.size()) - return false; - element = array[index]; - out_key = {}; - return true; - } - - if constexpr (HasIndexOperator) - { - if (element.isObject()) - { - auto object = element.getObject(); - if (index >= 0) - --index; - else - index += object.size(); - - if (static_cast(index) >= object.size()) - return false; - std::tie(out_key, element) = object[index]; - return true; - } - } - - return {}; - } - - /// Performs moves of types MoveType::Key and MoveType::ConstKey. 
- template - static bool moveToElementByKey(typename JSONParser::Element & element, std::string_view key) - { - if (!element.isObject()) - return false; - auto object = element.getObject(); - return object.find(key, element); - } - - static size_t calculateMaxSize(const ColumnString::Offsets & offsets) - { - size_t max_size = 0; - for (const auto i : collections::range(0, offsets.size())) - { - size_t size = offsets[i] - offsets[i - 1]; - if (max_size < size) - max_size = size; - } - if (max_size) - --max_size; - return max_size; - } - -}; - - -template typename Impl> -class ExecutableFunctionJSON : public IExecutableFunction, WithContext -{ - -public: - explicit ExecutableFunctionJSON(const NullPresence & null_presence_, bool allow_simdjson_, const DataTypePtr & json_return_type_) - : null_presence(null_presence_), allow_simdjson(allow_simdjson_), json_return_type(json_return_type_) - { - } - - String getName() const override { return Name::name; } - bool useDefaultImplementationForNulls() const override { return false; } - bool useDefaultImplementationForConstants() const override { return true; } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override - { - if (null_presence.has_null_constant) - return result_type->createColumnConstWithDefaultValue(input_rows_count); - - ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? createBlockWithNestedColumns(arguments) : arguments; - ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); - if (null_presence.has_nullable) - return wrapInNullable(temporary_result, arguments, result_type, input_rows_count); - return temporary_result; - } - -private: - - ColumnPtr - chooseAndRunJSONParser(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const - { -#if USE_SIMDJSON - if (allow_simdjson) - return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -#endif - -#if USE_RAPIDJSON - return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -#else - return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -#endif - } - - NullPresence null_presence; - bool allow_simdjson; - DataTypePtr json_return_type; -}; - - -template typename Impl> -class FunctionBaseFunctionJSON : public IFunctionBase -{ -public: - explicit FunctionBaseFunctionJSON( - const NullPresence & null_presence_, - bool allow_simdjson_, - DataTypes argument_types_, - DataTypePtr return_type_, - DataTypePtr json_return_type_) - : null_presence(null_presence_) - , allow_simdjson(allow_simdjson_) - , argument_types(std::move(argument_types_)) - , return_type(std::move(return_type_)) - , json_return_type(std::move(json_return_type_)) - { - } - - String getName() const override { return Name::name; } - - const DataTypes & getArgumentTypes() const override - { - return argument_types; - } - - const DataTypePtr & getResultType() const override - { - return return_type; - } - - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - - ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override - { - return std::make_unique>(null_presence, allow_simdjson, json_return_type); - } - -private: - NullPresence null_presence; - bool allow_simdjson; - DataTypes argument_types; - DataTypePtr return_type; - DataTypePtr json_return_type; -}; 
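Note on the FunctionsJSON.cpp deletion running through this region: the new version of the file keeps only a couple of includes and the DB namespace, so the template machinery being removed here has presumably moved into a header. A sketch of what the resulting registration-only translation unit could look like; the header name and the exact registration line are assumptions, though the REGISTER_FUNCTION style matches the FunctionsHashingMurmur.cpp hunk earlier in this patch:

    #include <Functions/FunctionsJSON.h> /// assumed new home of the templates
    #include <Functions/FunctionFactory.h>

    namespace DB
    {

    REGISTER_FUNCTION(JSON)
    {
        /// One registration per JSON function; JSONOverloadResolver,
        /// NameJSONHas and JSONHasImpl are among the entities deleted
        /// from this .cpp in the hunks above.
        factory.registerFunction<JSONOverloadResolver<NameJSONHas, JSONHasImpl>>();
    }

    }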
- - -/// We use IFunctionOverloadResolver instead of IFunction to handle non-default NULL processing. -/// Both NULL and JSON NULL should generate NULL value. If any argument is NULL, return NULL. -template typename Impl> -class JSONOverloadResolver : public IFunctionOverloadResolver, WithContext -{ -public: - static constexpr auto name = Name::name; - - String getName() const override { return name; } - - static FunctionOverloadResolverPtr create(ContextPtr context_) - { - return std::make_unique(context_); - } - - explicit JSONOverloadResolver(ContextPtr context_) : WithContext(context_) {} - - bool isVariadic() const override { return true; } - size_t getNumberOfArguments() const override { return 0; } - bool useDefaultImplementationForNulls() const override { return false; } - - FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override - { - bool has_nothing_argument = false; - for (const auto & arg : arguments) - has_nothing_argument |= isNothing(arg.type); - - DataTypePtr json_return_type = Impl::getReturnType(Name::name, createBlockWithNestedColumns(arguments)); - NullPresence null_presence = getNullPresense(arguments); - DataTypePtr return_type; - if (has_nothing_argument) - return_type = std::make_shared(); - else if (null_presence.has_null_constant) - return_type = makeNullable(std::make_shared()); - else if (null_presence.has_nullable) - return_type = makeNullable(json_return_type); - else - return_type = json_return_type; - - /// Top-level LowCardinality columns are processed outside JSON parser. - json_return_type = removeLowCardinality(json_return_type); - - DataTypes argument_types; - argument_types.reserve(arguments.size()); - for (const auto & argument : arguments) - argument_types.emplace_back(argument.type); - return std::make_unique>( - null_presence, getContext()->getSettingsRef().allow_simdjson, argument_types, return_type, json_return_type); - } -}; - - -struct NameJSONHas { static constexpr auto name{"JSONHas"}; }; -struct NameIsValidJSON { static constexpr auto name{"isValidJSON"}; }; -struct NameJSONLength { static constexpr auto name{"JSONLength"}; }; -struct NameJSONKey { static constexpr auto name{"JSONKey"}; }; -struct NameJSONType { static constexpr auto name{"JSONType"}; }; -struct NameJSONExtractInt { static constexpr auto name{"JSONExtractInt"}; }; -struct NameJSONExtractUInt { static constexpr auto name{"JSONExtractUInt"}; }; -struct NameJSONExtractFloat { static constexpr auto name{"JSONExtractFloat"}; }; -struct NameJSONExtractBool { static constexpr auto name{"JSONExtractBool"}; }; -struct NameJSONExtractString { static constexpr auto name{"JSONExtractString"}; }; -struct NameJSONExtract { static constexpr auto name{"JSONExtract"}; }; -struct NameJSONExtractKeysAndValues { static constexpr auto name{"JSONExtractKeysAndValues"}; }; -struct NameJSONExtractRaw { static constexpr auto name{"JSONExtractRaw"}; }; -struct NameJSONExtractArrayRaw { static constexpr auto name{"JSONExtractArrayRaw"}; }; -struct NameJSONExtractKeysAndValuesRaw { static constexpr auto name{"JSONExtractKeysAndValuesRaw"}; }; -struct NameJSONExtractKeys { static constexpr auto name{"JSONExtractKeys"}; }; - - -template -class JSONHasImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) { return std::make_shared(); } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool 
insertResultToColumn(IColumn & dest, const Element &, std::string_view) - { - ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(1); - return true; - } -}; - - -template -class IsValidJSONImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) - { - if (arguments.size() != 1) - { - /// IsValidJSON() shouldn't get parameters other than JSON. - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs exactly one argument", - String(function_name)); - } - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName &) { return 0; } - - static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view) - { - /// This function is called only if JSON is valid. - /// If JSON isn't valid then `FunctionJSON::Executor::run()` adds default value (=zero) to `dest` without calling this function. - ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(1); - return true; - } -}; - - -template -class JSONLengthImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - size_t size; - if (element.isArray()) - size = element.getArray().size(); - else if (element.isObject()) - size = element.getObject().size(); - else - return false; - - ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(size); - return true; - } -}; - - -template -class JSONKeyImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view last_key) - { - if (last_key.empty()) - return false; - ColumnString & col_str = assert_cast(dest); - col_str.insertData(last_key.data(), last_key.size()); - return true; - } -}; - - -template -class JSONTypeImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - static const std::vector> values = { - {"Array", '['}, - {"Object", '{'}, - {"String", '"'}, - {"Int64", 'i'}, - {"UInt64", 'u'}, - {"Double", 'd'}, - {"Bool", 'b'}, - {"Null", 0}, /// the default value for the column. 
- }; - return std::make_shared>(values); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - UInt8 type; - switch (element.type()) - { - case ElementType::INT64: - type = 'i'; - break; - case ElementType::UINT64: - type = 'u'; - break; - case ElementType::DOUBLE: - type = 'd'; - break; - case ElementType::STRING: - type = '"'; - break; - case ElementType::ARRAY: - type = '['; - break; - case ElementType::OBJECT: - type = '{'; - break; - case ElementType::BOOL: - type = 'b'; - break; - case ElementType::NULL_VALUE: - type = 0; - break; - } - - ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(type); - return true; - } -}; - - -template -class JSONExtractNumericImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared>(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - NumberType value; - - switch (element.type()) - { - case ElementType::DOUBLE: - if constexpr (std::is_floating_point_v) - { - /// We permit inaccurate conversion of double to float. - /// Example: double 0.1 from JSON is not representable in float. - /// But it will be more convenient for user to perform conversion. - value = static_cast(element.getDouble()); - } - else if (!accurate::convertNumeric(element.getDouble(), value)) - return false; - break; - case ElementType::UINT64: - if (!accurate::convertNumeric(element.getUInt64(), value)) - return false; - break; - case ElementType::INT64: - if (!accurate::convertNumeric(element.getInt64(), value)) - return false; - break; - case ElementType::BOOL: - if constexpr (is_integer && convert_bool_to_integer) - { - value = static_cast(element.getBool()); - break; - } - return false; - case ElementType::STRING: - { - auto rb = ReadBufferFromMemory{element.getString()}; - if constexpr (std::is_floating_point_v) - { - if (!tryReadFloatText(value, rb) || !rb.eof()) - return false; - } - else - { - if (tryReadIntText(value, rb) && rb.eof()) - break; - - /// Try to parse float and convert it to integer. 
- Float64 tmp_float; - rb.position() = rb.buffer().begin(); - if (!tryReadFloatText(tmp_float, rb) || !rb.eof()) - return false; - - if (!accurate::convertNumeric(tmp_float, value)) - return false; - } - break; - } - default: - return false; - } - - if (dest.getDataType() == TypeIndex::LowCardinality) - { - ColumnLowCardinality & col_low = assert_cast(dest); - col_low.insertData(reinterpret_cast(&value), sizeof(value)); - } - else - { - auto & col_vec = assert_cast &>(dest); - col_vec.insertValue(value); - } - return true; - } -}; - - -template -using JSONExtractInt64Impl = JSONExtractNumericImpl; -template -using JSONExtractUInt64Impl = JSONExtractNumericImpl; -template -using JSONExtractFloat64Impl = JSONExtractNumericImpl; - - -template -class JSONExtractBoolImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - bool value; - switch (element.type()) - { - case ElementType::BOOL: - value = element.getBool(); - break; - case ElementType::INT64: - value = element.getInt64() != 0; - break; - case ElementType::UINT64: - value = element.getUInt64() != 0; - break; - default: - return false; - } - - auto & col_vec = assert_cast &>(dest); - col_vec.insertValue(static_cast(value)); - return true; - } -}; - -template -class JSONExtractRawImpl; - -template -class JSONExtractStringImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - if (element.isNull()) - return false; - - if (!element.isString()) - return JSONExtractRawImpl::insertResultToColumn(dest, element, {}); - - auto str = element.getString(); - - if (dest.getDataType() == TypeIndex::LowCardinality) - { - ColumnLowCardinality & col_low = assert_cast(dest); - col_low.insertData(str.data(), str.size()); - } - else - { - ColumnString & col_str = assert_cast(dest); - col_str.insertData(str.data(), str.size()); - } - return true; - } -}; - -/// Nodes of the extract tree. We need the extract tree to extract from JSON complex values containing array, tuples or nullables. 
-template <typename JSONParser>
-struct JSONExtractTree
-{
-    using Element = typename JSONParser::Element;
-
-    class Node
-    {
-    public:
-        Node() = default;
-        virtual ~Node() = default;
-        virtual bool insertResultToColumn(IColumn &, const Element &) = 0;
-    };
-
-    template <typename NumberType>
-    class NumericNode : public Node
-    {
-    public:
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            return JSONExtractNumericImpl<JSONParser, NumberType, true>::insertResultToColumn(dest, element, {});
-        }
-    };
-
-    class LowCardinalityFixedStringNode : public Node
-    {
-    public:
-        explicit LowCardinalityFixedStringNode(const size_t fixed_length_) : fixed_length(fixed_length_) { }
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            // If element is an object we delegate the insertion to JSONExtractRawImpl
-            if (element.isObject())
-                return JSONExtractRawImpl<JSONParser>::insertResultToLowCardinalityFixedStringColumn(dest, element, fixed_length);
-            else if (!element.isString())
-                return false;
-
-            auto str = element.getString();
-            if (str.size() > fixed_length)
-                return false;
-
-            // For the non low cardinality case of FixedString, the padding is done in the FixedString Column implementation.
-            // In order to avoid having to pass the data to a FixedString Column and read it back (which would slow down the execution)
-            // the data is padded here and written directly to the Low Cardinality Column
-            if (str.size() == fixed_length)
-            {
-                assert_cast<ColumnLowCardinality &>(dest).insertData(str.data(), str.size());
-            }
-            else
-            {
-                String padded_str(str);
-                padded_str.resize(fixed_length, '\0');
-
-                assert_cast<ColumnLowCardinality &>(dest).insertData(padded_str.data(), padded_str.size());
-            }
-            return true;
-        }
-
-    private:
-        const size_t fixed_length;
-    };
-
-    class UUIDNode : public Node
-    {
-    public:
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            if (!element.isString())
-                return false;
-
-            auto uuid = parseFromString<UUID>(element.getString());
-            if (dest.getDataType() == TypeIndex::LowCardinality)
-            {
-                ColumnLowCardinality & col_low = assert_cast<ColumnLowCardinality &>(dest);
-                col_low.insertData(reinterpret_cast<const char *>(&uuid), sizeof(uuid));
-            }
-            else
-            {
-                assert_cast<ColumnUUID &>(dest).insert(uuid);
-            }
-            return true;
-        }
-    };
-
-    template <typename DecimalType>
-    class DecimalNode : public Node
-    {
-    public:
-        explicit DecimalNode(DataTypePtr data_type_) : data_type(data_type_) {}
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            const auto * type = assert_cast<const DataTypeDecimal<DecimalType> *>(data_type.get());
-
-            DecimalType value{};
-
-            switch (element.type())
-            {
-                case ElementType::DOUBLE:
-                    value = convertToDecimal<DataTypeNumber<Float64>, DataTypeDecimal<DecimalType>>(
-                        element.getDouble(), type->getScale());
-                    break;
-                case ElementType::UINT64:
-                    value = convertToDecimal<DataTypeNumber<UInt64>, DataTypeDecimal<DecimalType>>(
-                        element.getUInt64(), type->getScale());
-                    break;
-                case ElementType::INT64:
-                    value = convertToDecimal<DataTypeNumber<Int64>, DataTypeDecimal<DecimalType>>(
-                        element.getInt64(), type->getScale());
-                    break;
-                case ElementType::STRING: {
-                    auto rb = ReadBufferFromMemory{element.getString()};
-                    if (!SerializationDecimal<DecimalType>::tryReadText(value, rb, DecimalUtils::max_precision<DecimalType>, type->getScale()))
-                        return false;
-                    break;
-                }
-                default:
-                    return false;
-            }
-
-            assert_cast<ColumnDecimal<DecimalType> &>(dest).insertValue(value);
-            return true;
-        }
-
-    private:
-        DataTypePtr data_type;
-    };
-
-    class StringNode : public Node
-    {
-    public:
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            return JSONExtractStringImpl<JSONParser>::insertResultToColumn(dest, element, {});
-        }
-    };
-
-    class FixedStringNode : public Node
-    {
-    public:
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            if (element.isNull())
-                return false;
-
-            if (!element.isString())
-                return JSONExtractRawImpl<JSONParser>::insertResultToFixedStringColumn(dest, element, {});
-
-            auto str = element.getString();
-            auto & col_str = assert_cast<ColumnFixedString &>(dest);
-            if (str.size() > col_str.getN())
-                return false;
-            col_str.insertData(str.data(), str.size());
-
-            return true;
-        }
-    };
-
-    template <typename Type>
-    class EnumNode : public Node
-    {
-    public:
-        explicit EnumNode(const std::vector<std::pair<String, Type>> & name_value_pairs_) : name_value_pairs(name_value_pairs_)
-        {
-            for (const auto & name_value_pair : name_value_pairs)
-            {
-                name_to_value_map.emplace(name_value_pair.first, name_value_pair.second);
-                only_values.emplace(name_value_pair.second);
-            }
-        }
-
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            auto & col_vec = assert_cast<ColumnVector<Type> &>(dest);
-
-            if (element.isInt64())
-            {
-                Type value;
-                if (!accurate::convertNumeric<Int64, Type>(element.getInt64(), value) || !only_values.contains(value))
-                    return false;
-                col_vec.insertValue(value);
-                return true;
-            }
-
-            if (element.isUInt64())
-            {
-                Type value;
-                if (!accurate::convertNumeric<UInt64, Type>(element.getUInt64(), value) || !only_values.contains(value))
-                    return false;
-                col_vec.insertValue(value);
-                return true;
-            }
-
-            if (element.isString())
-            {
-                auto value = name_to_value_map.find(element.getString());
-                if (value == name_to_value_map.end())
-                    return false;
-                col_vec.insertValue(value->second);
-                return true;
-            }
-
-            return false;
-        }
-
-    private:
-        std::vector<std::pair<String, Type>> name_value_pairs;
-        std::unordered_map<std::string_view, Type> name_to_value_map;
-        std::unordered_set<Type> only_values;
-    };
-
-    class NullableNode : public Node
-    {
-    public:
-        explicit NullableNode(std::unique_ptr<Node> nested_) : nested(std::move(nested_)) {}
-
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            ColumnNullable & col_null = assert_cast<ColumnNullable &>(dest);
-            if (!nested->insertResultToColumn(col_null.getNestedColumn(), element))
-                return false;
-            col_null.getNullMapColumn().insertValue(0);
-            return true;
-        }
-
-    private:
-        std::unique_ptr<Node> nested;
-    };
-
-    class ArrayNode : public Node
-    {
-    public:
-        explicit ArrayNode(std::unique_ptr<Node> nested_) : nested(std::move(nested_)) {}
-
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            if (!element.isArray())
-                return false;
-
-            auto array = element.getArray();
-
-            ColumnArray & col_arr = assert_cast<ColumnArray &>(dest);
-            auto & data = col_arr.getData();
-            size_t old_size = data.size();
-            bool were_valid_elements = false;
-
-            for (auto value : array)
-            {
-                if (nested->insertResultToColumn(data, value))
-                    were_valid_elements = true;
-                else
-                    data.insertDefault();
-            }
-
-            if (!were_valid_elements)
-            {
-                data.popBack(data.size() - old_size);
-                return false;
-            }
-
-            col_arr.getOffsets().push_back(data.size());
-            return true;
-        }
-
-    private:
-        std::unique_ptr<Node> nested;
-    };
-
-    class TupleNode : public Node
-    {
-    public:
-        TupleNode(std::vector<std::unique_ptr<Node>> nested_, const std::vector<String> & explicit_names_) : nested(std::move(nested_)), explicit_names(explicit_names_)
-        {
-            for (size_t i = 0; i != explicit_names.size(); ++i)
-                name_to_index_map.emplace(explicit_names[i], i);
-        }
-
-        bool insertResultToColumn(IColumn & dest, const Element & element) override
-        {
-            ColumnTuple & tuple = assert_cast<ColumnTuple &>(dest);
-            size_t old_size = dest.size();
-            bool were_valid_elements = false;
-
-            auto set_size = [&](size_t size)
-            {
-                for (size_t i = 0; i != tuple.tupleSize(); ++i)
-                {
-                    auto & col = tuple.getColumn(i);
-                    if (col.size() != size)
-                    {
-                        if (col.size() > size)
-                            col.popBack(col.size() - size);
-                        else
-                            while (col.size() < size)
-                                col.insertDefault();
-                    }
-                }
-            };
-
-            if (element.isArray())
-            {
-                auto array = element.getArray();
-                auto it = array.begin();
-
-                for (size_t index = 0; (index != nested.size()) && (it != array.end()); ++index)
-                {
-                    if (nested[index]->insertResultToColumn(tuple.getColumn(index), *it++))
-                        were_valid_elements = true;
-                    else
-                        tuple.getColumn(index).insertDefault();
-                }
-
-                set_size(old_size + static_cast<size_t>(were_valid_elements));
-                return were_valid_elements;
-            }
-
-            if (element.isObject())
-            {
-                auto object = element.getObject();
-                if (name_to_index_map.empty())
-                {
-                    auto it = object.begin();
-                    for (size_t index = 0; (index != nested.size()) && (it != object.end()); ++index)
-                    {
-                        if (nested[index]->insertResultToColumn(tuple.getColumn(index), (*it++).second))
-                            were_valid_elements = true;
-                        else
-                            tuple.getColumn(index).insertDefault();
-                    }
-                }
-                else
-                {
-                    for (const auto & [key, value] : object)
-                    {
-                        auto index = name_to_index_map.find(key);
-                        if (index != name_to_index_map.end())
-                        {
-                            if (nested[index->second]->insertResultToColumn(tuple.getColumn(index->second), value))
-                                were_valid_elements = true;
-                        }
-                    }
-                }
-
-                set_size(old_size + static_cast<size_t>(were_valid_elements));
-                return were_valid_elements;
-            }
-
-            return false;
-        }
-
-    private:
-        std::vector<std::unique_ptr<Node>> nested;
-        std::vector<String> explicit_names;
-        std::unordered_map<std::string_view, size_t> name_to_index_map;
-    };
-
-    static std::unique_ptr<Node> build(const char * function_name, const DataTypePtr & type)
-    {
-        switch (type->getTypeId())
-        {
-            case TypeIndex::UInt8: return std::make_unique<NumericNode<UInt8>>();
-            case TypeIndex::UInt16: return std::make_unique<NumericNode<UInt16>>();
-            case TypeIndex::UInt32: return std::make_unique<NumericNode<UInt32>>();
-            case TypeIndex::UInt64: return std::make_unique<NumericNode<UInt64>>();
-            case TypeIndex::UInt128: return std::make_unique<NumericNode<UInt128>>();
-            case TypeIndex::UInt256: return std::make_unique<NumericNode<UInt256>>();
-            case TypeIndex::Int8: return std::make_unique<NumericNode<Int8>>();
-            case TypeIndex::Int16: return std::make_unique<NumericNode<Int16>>();
-            case TypeIndex::Int32: return std::make_unique<NumericNode<Int32>>();
-            case TypeIndex::Int64: return std::make_unique<NumericNode<Int64>>();
-            case TypeIndex::Int128: return std::make_unique<NumericNode<Int128>>();
-            case TypeIndex::Int256: return std::make_unique<NumericNode<Int256>>();
-            case TypeIndex::Float32: return std::make_unique<NumericNode<Float32>>();
-            case TypeIndex::Float64: return std::make_unique<NumericNode<Float64>>();
-            case TypeIndex::String: return std::make_unique<StringNode>();
-            case TypeIndex::FixedString: return std::make_unique<FixedStringNode>();
-            case TypeIndex::UUID: return std::make_unique<UUIDNode>();
-            case TypeIndex::LowCardinality:
-            {
-                // The low cardinality case is treated in two different ways:
-                // For FixedString type, an especial class is implemented for inserting the data in the destination column,
-                // as the string length must be passed in order to check and pad the incoming data.
-                // For the rest of low cardinality types, the insertion is done in their corresponding class, adapting the data
-                // as needed for the insertData function of the ColumnLowCardinality.
-                auto dictionary_type = typeid_cast<const DataTypeLowCardinality *>(type.get())->getDictionaryType();
-                if ((*dictionary_type).getTypeId() == TypeIndex::FixedString)
-                {
-                    auto fixed_length = typeid_cast<const DataTypeFixedString *>(dictionary_type.get())->getN();
-                    return std::make_unique<LowCardinalityFixedStringNode>(fixed_length);
-                }
-                return build(function_name, dictionary_type);
-            }
-            case TypeIndex::Decimal256: return std::make_unique<DecimalNode<Decimal256>>(type);
-            case TypeIndex::Decimal128: return std::make_unique<DecimalNode<Decimal128>>(type);
-            case TypeIndex::Decimal64: return std::make_unique<DecimalNode<Decimal64>>(type);
-            case TypeIndex::Decimal32: return std::make_unique<DecimalNode<Decimal32>>(type);
-            case TypeIndex::Enum8:
-                return std::make_unique<EnumNode<Int8>>(static_cast<const DataTypeEnum8 &>(*type).getValues());
-            case TypeIndex::Enum16:
-                return std::make_unique<EnumNode<Int16>>(static_cast<const DataTypeEnum16 &>(*type).getValues());
-            case TypeIndex::Nullable:
-            {
-                return std::make_unique<NullableNode>(build(function_name, static_cast<const DataTypeNullable &>(*type).getNestedType()));
-            }
-            case TypeIndex::Array:
-            {
-                return std::make_unique<ArrayNode>(build(function_name, static_cast<const DataTypeArray &>(*type).getNestedType()));
-            }
-            case TypeIndex::Tuple:
-            {
-                const auto & tuple = static_cast<const DataTypeTuple &>(*type);
-                const auto & tuple_elements = tuple.getElements();
-                std::vector<std::unique_ptr<Node>> elements;
-                elements.reserve(tuple_elements.size());
-                for (const auto & tuple_element : tuple_elements)
-                    elements.emplace_back(build(function_name, tuple_element));
-                return std::make_unique<TupleNode>(std::move(elements), tuple.haveExplicitNames() ? tuple.getElementNames() : Strings{});
-            }
-            default:
-                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
-                                "Function {} doesn't support the return type schema: {}",
-                                String(function_name), type->getName());
-        }
-    }
-};
-
-
-template <typename JSONParser>
-class JSONExtractImpl
-{
-public:
-    using Element = typename JSONParser::Element;
-
-    static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments)
-    {
-        if (arguments.size() < 2)
-            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name));
-
-        const auto & col = arguments.back();
-        const auto * col_type_const = typeid_cast<const ColumnConst *>(col.column.get());
-        if (!col_type_const || !isString(col.type))
-            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
-                            "The last argument of function {} should "
-                            "be a constant string specifying the return data type, illegal value: {}",
-                            String(function_name), col.name);
-
-        return DataTypeFactory::instance().get(col_type_const->getValue<String>());
-    }
-
-    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; }
-
-    void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type)
-    {
-        extract_tree = JSONExtractTree<JSONParser>::build(function_name, result_type);
-    }
-
-    bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
-    {
-        return extract_tree->insertResultToColumn(dest, element);
-    }
-
-protected:
-    std::unique_ptr<typename JSONExtractTree<JSONParser>::Node> extract_tree;
-};
-
-
-template <typename JSONParser>
-class JSONExtractKeysAndValuesImpl
-{
-public:
-    using Element = typename JSONParser::Element;
-
-    static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments)
-    {
-        if (arguments.size() < 2)
-            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name));
-
-        const auto & col = arguments.back();
-        const auto * col_type_const = typeid_cast<const ColumnConst *>(col.column.get());
-        if (!col_type_const || !isString(col.type))
-            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
-                            "The last argument of function {} should "
-                            "be a constant string specifying the values' data type, illegal value: {}",
-                            String(function_name), col.name);
-
-        DataTypePtr key_type = std::make_unique<DataTypeString>();
-        DataTypePtr value_type = DataTypeFactory::instance().get(col_type_const->getValue<String>());
-        DataTypePtr tuple_type = std::make_unique<DataTypeTuple>(DataTypes{key_type, value_type});
-        return std::make_unique<DataTypeArray>(tuple_type);
-    }
-
-    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; }
-
-    void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type)
-    {
-        const auto tuple_type = typeid_cast<const DataTypeArray *>(result_type.get())->getNestedType();
-        const auto value_type = typeid_cast<const DataTypeTuple *>(tuple_type.get())->getElements()[1];
-        extract_tree = JSONExtractTree<JSONParser>::build(function_name, value_type);
-    }
-
-    bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
-    {
-        if (!element.isObject())
-            return false;
-
-        auto object = element.getObject();
-
-        auto & col_arr = assert_cast<ColumnArray &>(dest);
-        auto & col_tuple = assert_cast<ColumnTuple &>(col_arr.getData());
-        size_t old_size = col_tuple.size();
-        auto & col_key = assert_cast<ColumnString &>(col_tuple.getColumn(0));
-        auto & col_value = col_tuple.getColumn(1);
-
-        for (const auto & [key, value] : object)
-        {
-            if (extract_tree->insertResultToColumn(col_value, value))
-                col_key.insertData(key.data(), key.size());
-        }
-
-        if (col_tuple.size() == old_size)
-            return false;
-
-        col_arr.getOffsets().push_back(col_tuple.size());
-        return true;
-    }
-
-private:
-    std::unique_ptr<typename JSONExtractTree<JSONParser>::Node> extract_tree;
-};
-
-
-template <typename JSONParser>
-class JSONExtractRawImpl
-{
-public:
-    using Element = typename JSONParser::Element;
-
-    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
-    {
-        return std::make_shared<DataTypeString>();
-    }
-
-    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
-
-    static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
-    {
-        if (dest.getDataType() == TypeIndex::LowCardinality)
-        {
-            ColumnString::Chars chars;
-            WriteBufferFromVector<ColumnString::Chars> buf(chars, AppendModeTag());
-            traverse(element, buf);
-            buf.finalize();
-            assert_cast<ColumnLowCardinality &>(dest).insertData(reinterpret_cast<const char *>(chars.data()), chars.size());
-        }
-        else
-        {
-            ColumnString & col_str = assert_cast<ColumnString &>(dest);
-            auto & chars = col_str.getChars();
-            WriteBufferFromVector<ColumnString::Chars> buf(chars, AppendModeTag());
-            traverse(element, buf);
-            buf.finalize();
-            chars.push_back(0);
-            col_str.getOffsets().push_back(chars.size());
-        }
-        return true;
-    }
-
-    // We use insertResultToFixedStringColumn in case we are inserting raw data in a FixedString column
-    static bool insertResultToFixedStringColumn(IColumn & dest, const Element & element, std::string_view)
-    {
-        ColumnFixedString::Chars chars;
-        WriteBufferFromVector<ColumnFixedString::Chars> buf(chars, AppendModeTag());
-        traverse(element, buf);
-        buf.finalize();
-
-        auto & col_str = assert_cast<ColumnFixedString &>(dest);
-
-        if (chars.size() > col_str.getN())
-            return false;
-
-        chars.resize_fill(col_str.getN());
-        col_str.insertData(reinterpret_cast<const char *>(chars.data()), chars.size());
-
-
-        return true;
-    }
-
-    // We use insertResultToLowCardinalityFixedStringColumn in case we are inserting raw data in a Low Cardinality FixedString column
-    static bool insertResultToLowCardinalityFixedStringColumn(IColumn & dest, const Element & element, size_t fixed_length)
-    {
-        if (element.getObject().size() > fixed_length)
-            return false;
-
-        ColumnFixedString::Chars chars;
-        WriteBufferFromVector<ColumnFixedString::Chars> buf(chars, AppendModeTag());
-        traverse(element, buf);
-        buf.finalize();
-
-        if (chars.size() > fixed_length)
-            return false;
-        chars.resize_fill(fixed_length);
-        assert_cast<ColumnLowCardinality &>(dest).insertData(reinterpret_cast<const char *>(chars.data()), chars.size());
-
-        return true;
-    }
-
-private:
-    static void traverse(const Element & element, WriteBuffer & buf)
-    {
-        if (element.isInt64())
-        {
-            writeIntText(element.getInt64(), buf);
-            return;
-        }
-        if (element.isUInt64())
-        {
-            writeIntText(element.getUInt64(), buf);
-            return;
-        }
-        if (element.isDouble())
-        {
-            writeFloatText(element.getDouble(), buf);
-            return;
-        }
-        if (element.isBool())
-        {
-            if (element.getBool())
-                writeCString("true", buf);
-            else
-                writeCString("false", buf);
-            return;
-        }
-        if (element.isString())
-        {
-            writeJSONString(element.getString(), buf, formatSettings());
-            return;
-        }
-        if (element.isArray())
-        {
-            writeChar('[', buf);
-            bool need_comma = false;
-            for (auto value : element.getArray())
-            {
-                if (std::exchange(need_comma, true))
-                    writeChar(',', buf);
-                traverse(value, buf);
-            }
-            writeChar(']', buf);
-            return;
-        }
-        if (element.isObject())
-        {
-            writeChar('{', buf);
-            bool need_comma = false;
-            for (auto [key, value] : element.getObject())
-            {
-                if (std::exchange(need_comma, true))
-                    writeChar(',', buf);
-                writeJSONString(key, buf, formatSettings());
-                writeChar(':', buf);
-                traverse(value, buf);
-            }
-            writeChar('}', buf);
-            return;
-        }
-        if (element.isNull())
-        {
-            writeCString("null", buf);
-            return;
-        }
-    }
-
-    static const FormatSettings & formatSettings()
-    {
-        static const FormatSettings the_instance = []
-        {
-            FormatSettings settings;
-            settings.json.escape_forward_slashes = false;
-            return settings;
-        }();
-        return the_instance;
-    }
-};
-
-
-template <typename JSONParser>
-class JSONExtractArrayRawImpl
-{
-public:
-    using Element = typename JSONParser::Element;
-
-    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
-    {
-        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
-    }
-
-    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
-
-    static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
-    {
-        if (!element.isArray())
-            return false;
-
-        auto array = element.getArray();
-        ColumnArray & col_res = assert_cast<ColumnArray &>(dest);
-
-        for (auto value : array)
-            JSONExtractRawImpl<JSONParser>::insertResultToColumn(col_res.getData(), value, {});
-
-        col_res.getOffsets().push_back(col_res.getOffsets().back() + array.size());
-        return true;
-    }
-};
-
-
-template <typename JSONParser>
-class JSONExtractKeysAndValuesRawImpl
-{
-public:
-    using Element = typename JSONParser::Element;
-
-    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
-    {
-        DataTypePtr string_type = std::make_unique<DataTypeString>();
-        DataTypePtr tuple_type = std::make_unique<DataTypeTuple>(DataTypes{string_type, string_type});
-        return std::make_unique<DataTypeArray>(tuple_type);
-    }
-
-    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
-
-    bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
-    {
-        if (!element.isObject())
-            return false;
-
-        auto object = element.getObject();
-
-        auto & col_arr = assert_cast<ColumnArray &>(dest);
-        auto & col_tuple = assert_cast<ColumnTuple &>(col_arr.getData());
-        auto & col_key = assert_cast<ColumnString &>(col_tuple.getColumn(0));
-        auto & col_value = assert_cast<ColumnString &>(col_tuple.getColumn(1));
-
-        for (const auto & [key, value] : object)
-        {
-            col_key.insertData(key.data(), key.size());
-            JSONExtractRawImpl<JSONParser>::insertResultToColumn(col_value, value, {});
-        }
-
-        col_arr.getOffsets().push_back(col_arr.getOffsets().back() + object.size());
-        return true;
-    }
-};
-
-template <typename JSONParser>
-class JSONExtractKeysImpl
-{
-public:
-    using Element = typename JSONParser::Element;
-
-    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
-    {
-        return std::make_unique<DataTypeArray>(std::make_shared<DataTypeString>());
-    }
-
-    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
-
-    bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
-    {
-        if (!element.isObject())
-            return false;
-
-        auto object = element.getObject();
-
-        ColumnArray & col_res = assert_cast<ColumnArray &>(dest);
-        auto & col_key = assert_cast<ColumnString &>(col_res.getData());
-
-        for (const auto & [key, value] : object)
-        {
-            col_key.insertData(key.data(), key.size());
-        }
-
-        col_res.getOffsets().push_back(col_res.getOffsets().back() + object.size());
-        return true;
-    }
-};
-
 REGISTER_FUNCTION(JSON)
 {
     factory.registerFunction<JSONOverloadResolver<NameJSONHas, JSONHasImpl>>();
diff --git a/src/Functions/FunctionsJSON.h b/src/Functions/FunctionsJSON.h
new file mode 100644
index 00000000000..20f3e9f185d
--- /dev/null
+++ b/src/Functions/FunctionsJSON.h
@@ -0,0 +1,1623 @@
+#pragma once
+
+#include <type_traits>
+#include <boost/tti/has_member_function.hpp>
+
+#include <base/range.h>
+
+#include <Common/Exception.h>
+#include <Common/assert_cast.h>
+#include <Common/typeid_cast.h>
+
+#include <Core/AccurateComparison.h>
+#include <Core/Settings.h>
+
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnVector.h>
+#include <Columns/ColumnFixedString.h>
+#include <Columns/ColumnNullable.h>
+#include <Columns/ColumnArray.h>
+#include <Columns/ColumnTuple.h>
+#include <Columns/ColumnLowCardinality.h>
+#include <Columns/ColumnDecimal.h>
+
+#include <DataTypes/Serializations/SerializationDecimal.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/DataTypesDecimal.h>
+#include <DataTypes/DataTypeString.h>
+#include <DataTypes/DataTypeEnum.h>
+#include <DataTypes/DataTypeFactory.h>
+#include <DataTypes/DataTypeFixedString.h>
+#include <DataTypes/DataTypeNothing.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypeTuple.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypeLowCardinality.h>
+#include <DataTypes/DataTypeUUID.h>
+
+#include <Functions/IFunction.h>
+#include <Functions/FunctionHelpers.h>
+#include <Common/JSONParsers/DummyJSONParser.h>
+#include <Common/JSONParsers/SimdJSONParser.h>
+#include <Common/JSONParsers/RapidJSONParser.h>
+
+#include <Interpreters/Context.h>
+#include <IO/WriteHelpers.h>
+
+
+#include "config.h"
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int ILLEGAL_COLUMN;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+}
+
+template <typename T>
+concept HasIndexOperator = requires (T t)
+{
+    t[0];
+};
+
+/// Functions to parse JSONs and extract values from it.
+/// The first argument of all these functions gets a JSON,
+/// after that there are any number of arguments specifying path to a desired part from the JSON's root.
+/// For example,
+/// select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100
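An editorial aside: the HasIndexOperator concept above is what later lets moveToElementByIndex use positional access on a parser's Object type when (and only when) that type provides operator[]. A minimal standalone illustration of what such a concept checks, using standard-library stand-ins rather than anything from this patch:

    #include <map>
    #include <string>
    #include <vector>

    // Satisfied exactly when `t[0]` is a well-formed expression.
    template <typename T>
    concept HasIndexOperatorDemo = requires (T t) { t[0]; };

    struct NoIndexing {};

    static_assert(HasIndexOperatorDemo<std::vector<int>>);
    static_assert(HasIndexOperatorDemo<std::map<int, std::string>>);
    static_assert(!HasIndexOperatorDemo<NoIndexing>);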
+
+class FunctionJSONHelpers
+{
+public:
+    template <typename Name, template<typename> typename Impl, class JSONParser>
+    class Executor
+    {
+    public:
+        static ColumnPtr run(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count)
+        {
+            MutableColumnPtr to{result_type->createColumn()};
+            to->reserve(input_rows_count);
+
+            if (arguments.empty())
+                throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least one argument", String(Name::name));
+
+            const auto & first_column = arguments[0];
+            if (!isString(first_column.type))
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                                "The first argument of function {} should be a string containing JSON, illegal type: "
+                                "{}", String(Name::name), first_column.type->getName());
+
+            const ColumnPtr & arg_json = first_column.column;
+            const auto * col_json_const = typeid_cast<const ColumnConst *>(arg_json.get());
+            const auto * col_json_string
+                = typeid_cast<const ColumnString *>(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get());
+
+            if (!col_json_string)
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arg_json->getName());
+
+            const ColumnString::Chars & chars = col_json_string->getChars();
+            const ColumnString::Offsets & offsets = col_json_string->getOffsets();
+
+            size_t num_index_arguments = Impl<JSONParser>::getNumberOfIndexArguments(arguments);
+            std::vector<Move> moves = prepareMoves(Name::name, arguments, 1, num_index_arguments);
+
+            /// Preallocate memory in parser if necessary.
+            JSONParser parser;
+            if constexpr (has_member_function_reserve<void (JSONParser::*)(size_t)>::value)
+            {
+                size_t max_size = calculateMaxSize(offsets);
+                if (max_size)
+                    parser.reserve(max_size);
+            }
+
+            Impl<JSONParser> impl;
+
+            /// prepare() does Impl-specific preparation before handling each row.
+            if constexpr (has_member_function_prepare<void (Impl<JSONParser>::*)(const char *, const ColumnsWithTypeAndName &, const DataTypePtr &)>::value)
+                impl.prepare(Name::name, arguments, result_type);
+
+            using Element = typename JSONParser::Element;
+
+            Element document;
+            bool document_ok = false;
+            if (col_json_const)
+            {
+                std::string_view json{reinterpret_cast<const char *>(chars.data()), offsets[0] - 1};
+                document_ok = parser.parse(json, document);
+            }
+
+            for (const auto i : collections::range(0, input_rows_count))
+            {
+                if (!col_json_const)
+                {
+                    std::string_view json{reinterpret_cast<const char *>(&chars[offsets[i - 1]]), offsets[i] - offsets[i - 1] - 1};
+                    document_ok = parser.parse(json, document);
+                }
+
+                bool added_to_column = false;
+                if (document_ok)
+                {
+                    /// Perform moves.
+                    Element element;
+                    std::string_view last_key;
+                    bool moves_ok = performMoves<JSONParser>(arguments, i, document, moves, element, last_key);
+
+                    if (moves_ok)
+                        added_to_column = impl.insertResultToColumn(*to, element, last_key);
+                }
+
+                /// We add default value (=null or zero) if something goes wrong, we don't throw exceptions in these JSON functions.
+                if (!added_to_column)
+                    to->insertDefault();
+            }
+            return to;
+        }
+    };
+
+private:
+    BOOST_TTI_HAS_MEMBER_FUNCTION(reserve)
+    BOOST_TTI_HAS_MEMBER_FUNCTION(prepare)
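The string_view arithmetic in run() above relies on ClickHouse's ColumnString layout: all strings sit back-to-back in one chars buffer, each terminated by a zero byte, offsets[i] points one past row i's terminator, and offsets[-1] reads as 0 because the offsets array is padded. That is why each row's visible length is offsets[i] - offsets[i - 1] - 1. A self-contained sketch of that arithmetic, assuming this invariant:

    #include <cassert>
    #include <string_view>
    #include <vector>

    int main()
    {
        // Toy ColumnString: "ab" and "x", each followed by a NUL byte.
        std::vector<char> chars = {'a', 'b', '\0', 'x', '\0'};
        std::vector<size_t> offsets = {3, 5}; // one past each terminator

        auto row = [&](size_t i)
        {
            size_t begin = (i == 0) ? 0 : offsets[i - 1]; // offsets[-1] == 0 in ClickHouse
            return std::string_view{chars.data() + begin, offsets[i] - begin - 1};
        };

        assert(row(0) == "ab");
        assert(row(1) == "x");
    }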
+
+    /// Represents a move of a JSON iterator described by a single argument passed to a JSON function.
+    /// For example, the call JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1)
+    /// contains two moves: {MoveType::ConstKey, "b"} and {MoveType::ConstIndex, 1}.
+    /// Keys and indices can be nonconst, in this case they are calculated for each row.
+    enum class MoveType
+    {
+        Key,
+        Index,
+        ConstKey,
+        ConstIndex,
+    };
+
+    struct Move
+    {
+        explicit Move(MoveType type_, size_t index_ = 0) : type(type_), index(index_) {}
+        Move(MoveType type_, const String & key_) : type(type_), key(key_) {}
+        MoveType type;
+        size_t index = 0;
+        String key;
+    };
+
+    static std::vector<Move> prepareMoves(
+        const char * function_name,
+        const ColumnsWithTypeAndName & columns,
+        size_t first_index_argument,
+        size_t num_index_arguments)
+    {
+        std::vector<Move> moves;
+        moves.reserve(num_index_arguments);
+        for (const auto i : collections::range(first_index_argument, first_index_argument + num_index_arguments))
+        {
+            const auto & column = columns[i];
+            if (!isString(column.type) && !isNativeInteger(column.type))
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                                "The argument {} of function {} should be a string specifying key "
+                                "or an integer specifying index, illegal type: {}",
+                                std::to_string(i + 1), String(function_name), column.type->getName());
+
+            if (column.column && isColumnConst(*column.column))
+            {
+                const auto & column_const = assert_cast<const ColumnConst &>(*column.column);
+                if (isString(column.type))
+                    moves.emplace_back(MoveType::ConstKey, column_const.getValue<String>());
+                else
+                    moves.emplace_back(MoveType::ConstIndex, column_const.getInt(0));
+            }
+            else
+            {
+                if (isString(column.type))
+                    moves.emplace_back(MoveType::Key, "");
+                else
+                    moves.emplace_back(MoveType::Index, 0);
+            }
+        }
+        return moves;
+    }
+
+
+    /// Performs moves of types MoveType::Index and MoveType::ConstIndex.
+    template <typename JSONParser>
+    static bool performMoves(const ColumnsWithTypeAndName & arguments, size_t row,
+                             const typename JSONParser::Element & document, const std::vector<Move> & moves,
+                             typename JSONParser::Element & element, std::string_view & last_key)
+    {
+        typename JSONParser::Element res_element = document;
+        std::string_view key;
+
+        for (size_t j = 0; j != moves.size(); ++j)
+        {
+            switch (moves[j].type)
+            {
+                case MoveType::ConstIndex:
+                {
+                    if (!moveToElementByIndex<JSONParser>(res_element, static_cast<int>(moves[j].index), key))
+                        return false;
+                    break;
+                }
+                case MoveType::ConstKey:
+                {
+                    key = moves[j].key;
+                    if (!moveToElementByKey<JSONParser>(res_element, key))
+                        return false;
+                    break;
+                }
+                case MoveType::Index:
+                {
+                    Int64 index = (*arguments[j + 1].column)[row].get<Int64>();
+                    if (!moveToElementByIndex<JSONParser>(res_element, static_cast<int>(index), key))
+                        return false;
+                    break;
+                }
+                case MoveType::Key:
+                {
+                    key = (*arguments[j + 1].column).getDataAt(row).toView();
+                    if (!moveToElementByKey<JSONParser>(res_element, key))
+                        return false;
+                    break;
+                }
+            }
+        }
+
+        element = res_element;
+        last_key = key;
+        return true;
+    }
+
+    template <typename JSONParser>
+    static bool moveToElementByIndex(typename JSONParser::Element & element, int index, std::string_view & out_key)
+    {
+        if (element.isArray())
+        {
+            auto array = element.getArray();
+            if (index >= 0)
+                --index;
+            else
+                index += array.size();
+
+            if (static_cast<size_t>(index) >= array.size())
+                return false;
+            element = array[index];
+            out_key = {};
+            return true;
+        }
+
+        if constexpr (HasIndexOperator<typename JSONParser::Object>)
+        {
+            if (element.isObject())
+            {
+                auto object = element.getObject();
+                if (index >= 0)
+                    --index;
+                else
+                    index += object.size();
+
+                if (static_cast<size_t>(index) >= object.size())
+                    return false;
+                std::tie(out_key, element) = object[index];
+                return true;
+            }
+        }
+
+        return {};
+    }
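moveToElementByIndex above implements the indexing convention visible in SQL: positive indices are 1-based from the start, negative indices count from the end, and everything else falls out of range. The same adjustment restated as a standalone helper (illustrative only; the real code folds the bounds check into a single unsigned comparison):

    #include <cassert>
    #include <cstddef>

    bool adjustIndex(int index, size_t size, size_t & pos)
    {
        if (index >= 0)
            --index;                            // 1 addresses the first element
        else
            index += static_cast<int>(size);    // -1 addresses the last element

        if (index < 0 || static_cast<size_t>(index) >= size)
            return false;
        pos = static_cast<size_t>(index);
        return true;
    }

    int main()
    {
        size_t pos = 0;
        assert(adjustIndex(1, 3, pos) && pos == 0);
        assert(adjustIndex(-1, 3, pos) && pos == 2);
        assert(!adjustIndex(4, 3, pos));
    }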
+
+    /// Performs moves of types MoveType::Key and MoveType::ConstKey.
+    template <typename JSONParser>
+    static bool moveToElementByKey(typename JSONParser::Element & element, std::string_view key)
+    {
+        if (!element.isObject())
+            return false;
+        auto object = element.getObject();
+        return object.find(key, element);
+    }
+
+    static size_t calculateMaxSize(const ColumnString::Offsets & offsets)
+    {
+        size_t max_size = 0;
+        for (const auto i : collections::range(0, offsets.size()))
+        {
+            size_t size = offsets[i] - offsets[i - 1];
+            if (max_size < size)
+                max_size = size;
+        }
+        if (max_size)
+            --max_size;
+        return max_size;
+    }
+
+};
+
+
+template <typename Name, template<typename> typename Impl>
+class ExecutableFunctionJSON : public IExecutableFunction, WithContext
+{
+
+public:
+    explicit ExecutableFunctionJSON(const NullPresence & null_presence_, bool allow_simdjson_, const DataTypePtr & json_return_type_)
+        : null_presence(null_presence_), allow_simdjson(allow_simdjson_), json_return_type(json_return_type_)
+    {
+    }
+
+    String getName() const override { return Name::name; }
+    bool useDefaultImplementationForNulls() const override { return false; }
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
+    {
+        if (null_presence.has_null_constant)
+            return result_type->createColumnConstWithDefaultValue(input_rows_count);
+
+        ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? createBlockWithNestedColumns(arguments) : arguments;
+        ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count);
+        if (null_presence.has_nullable)
+            return wrapInNullable(temporary_result, arguments, result_type, input_rows_count);
+        return temporary_result;
+    }
+
+private:
+
+    ColumnPtr
+    chooseAndRunJSONParser(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const
+    {
+#if USE_SIMDJSON
+        if (allow_simdjson)
+            return FunctionJSONHelpers::Executor<Name, Impl, SimdJSONParser>::run(arguments, result_type, input_rows_count);
+#endif
+
+#if USE_RAPIDJSON
+        return FunctionJSONHelpers::Executor<Name, Impl, RapidJSONParser>::run(arguments, result_type, input_rows_count);
+#else
+        return FunctionJSONHelpers::Executor<Name, Impl, DummyJSONParser>::run(arguments, result_type, input_rows_count);
+#endif
+    }
+
+    NullPresence null_presence;
+    bool allow_simdjson;
+    DataTypePtr json_return_type;
+};
+
+
+template <typename Name, template<typename> typename Impl>
+class FunctionBaseFunctionJSON : public IFunctionBase
+{
+public:
+    explicit FunctionBaseFunctionJSON(
+        const NullPresence & null_presence_,
+        bool allow_simdjson_,
+        DataTypes argument_types_,
+        DataTypePtr return_type_,
+        DataTypePtr json_return_type_)
+        : null_presence(null_presence_)
+        , allow_simdjson(allow_simdjson_)
+        , argument_types(std::move(argument_types_))
+        , return_type(std::move(return_type_))
+        , json_return_type(std::move(json_return_type_))
+    {
+    }
+
+    String getName() const override { return Name::name; }
+
+    const DataTypes & getArgumentTypes() const override
+    {
+        return argument_types;
+    }
+
+    const DataTypePtr & getResultType() const override
+    {
+        return return_type;
+    }
+
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+
+    ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override
+    {
+        return std::make_unique<ExecutableFunctionJSON<Name, Impl>>(null_presence, allow_simdjson, json_return_type);
+    }
+
+private:
+    NullPresence null_presence;
+    bool allow_simdjson;
+    DataTypes argument_types;
+    DataTypePtr return_type;
+    DataTypePtr json_return_type;
+};
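chooseAndRunJSONParser above combines compile-time availability (USE_SIMDJSON / USE_RAPIDJSON) with the run-time allow_simdjson setting, falling back to DummyJSONParser when neither library is compiled in. A condensed sketch of that dispatch shape, with constexpr flags standing in for the preprocessor checks and invented stand-in parser types:

    #include <iostream>
    #include <string>

    constexpr bool simdjson_built = true;    // stands in for #if USE_SIMDJSON
    constexpr bool rapidjson_built = false;  // stands in for #if USE_RAPIDJSON

    struct SimdLike  { static constexpr const char * name = "simdjson"; };
    struct RapidLike { static constexpr const char * name = "rapidjson"; };
    struct DummyLike { static constexpr const char * name = "dummy"; };

    template <typename Parser>
    std::string runWith() { return Parser::name; }

    std::string choose(bool allow_simdjson)
    {
        if constexpr (simdjson_built)
            if (allow_simdjson)
                return runWith<SimdLike>();
        if constexpr (rapidjson_built)
            return runWith<RapidLike>();
        return runWith<DummyLike>();
    }

    int main() { std::cout << choose(true) << '\n'; }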
+
+
+/// We use IFunctionOverloadResolver instead of IFunction to handle non-default NULL processing.
+/// Both NULL and JSON NULL should generate NULL value. If any argument is NULL, return NULL.
+template <typename Name, template<typename> typename Impl>
+class JSONOverloadResolver : public IFunctionOverloadResolver, WithContext
+{
+public:
+    static constexpr auto name = Name::name;
+
+    String getName() const override { return name; }
+
+    static FunctionOverloadResolverPtr create(ContextPtr context_)
+    {
+        return std::make_unique<JSONOverloadResolver>(context_);
+    }
+
+    explicit JSONOverloadResolver(ContextPtr context_) : WithContext(context_) {}
+
+    bool isVariadic() const override { return true; }
+    size_t getNumberOfArguments() const override { return 0; }
+    bool useDefaultImplementationForNulls() const override { return false; }
+
+    FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override
+    {
+        bool has_nothing_argument = false;
+        for (const auto & arg : arguments)
+            has_nothing_argument |= isNothing(arg.type);
+
+        DataTypePtr json_return_type = Impl<DummyJSONParser>::getReturnType(Name::name, createBlockWithNestedColumns(arguments));
+        NullPresence null_presence = getNullPresense(arguments);
+        DataTypePtr return_type;
+        if (has_nothing_argument)
+            return_type = std::make_shared<DataTypeNothing>();
+        else if (null_presence.has_null_constant)
+            return_type = makeNullable(std::make_shared<DataTypeNothing>());
+        else if (null_presence.has_nullable)
+            return_type = makeNullable(json_return_type);
+        else
+            return_type = json_return_type;
+
+        /// Top-level LowCardinality columns are processed outside JSON parser.
+        json_return_type = removeLowCardinality(json_return_type);
+
+        DataTypes argument_types;
+        argument_types.reserve(arguments.size());
+        for (const auto & argument : arguments)
+            argument_types.emplace_back(argument.type);
+        return std::make_unique<FunctionBaseFunctionJSON<Name, Impl>>(
+            null_presence, getContext()->getSettingsRef().allow_simdjson, argument_types, return_type, json_return_type);
+    }
+};
+
+
+struct NameJSONHas { static constexpr auto name{"JSONHas"}; };
+struct NameIsValidJSON { static constexpr auto name{"isValidJSON"}; };
+struct NameJSONLength { static constexpr auto name{"JSONLength"}; };
+struct NameJSONKey { static constexpr auto name{"JSONKey"}; };
+struct NameJSONType { static constexpr auto name{"JSONType"}; };
+struct NameJSONExtractInt { static constexpr auto name{"JSONExtractInt"}; };
+struct NameJSONExtractUInt { static constexpr auto name{"JSONExtractUInt"}; };
+struct NameJSONExtractFloat { static constexpr auto name{"JSONExtractFloat"}; };
+struct NameJSONExtractBool { static constexpr auto name{"JSONExtractBool"}; };
+struct NameJSONExtractString { static constexpr auto name{"JSONExtractString"}; };
+struct NameJSONExtract { static constexpr auto name{"JSONExtract"}; };
+struct NameJSONExtractKeysAndValues { static constexpr auto name{"JSONExtractKeysAndValues"}; };
+struct NameJSONExtractRaw { static constexpr auto name{"JSONExtractRaw"}; };
+struct NameJSONExtractArrayRaw { static constexpr auto name{"JSONExtractArrayRaw"}; };
+struct NameJSONExtractKeysAndValuesRaw { static constexpr auto name{"JSONExtractKeysAndValuesRaw"}; };
+struct NameJSONExtractKeys { static constexpr auto name{"JSONExtractKeys"}; };
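The return-type shaping in JSONOverloadResolver::build() above reads as a small decision ladder: a Nothing argument yields Nothing, a NULL literal yields Nullable(Nothing), a Nullable argument wraps the Impl's JSON return type, and otherwise the Impl's type is used unchanged. Restated as a standalone toy, with type names as strings (purely illustrative):

    #include <cassert>
    #include <string>

    std::string resultType(bool has_nothing, bool has_null_constant, bool has_nullable, const std::string & json_type)
    {
        if (has_nothing)
            return "Nothing";
        if (has_null_constant)
            return "Nullable(Nothing)";
        if (has_nullable)
            return "Nullable(" + json_type + ")";
        return json_type;
    }

    int main()
    {
        assert(resultType(false, true, false, "UInt8") == "Nullable(Nothing)");
        assert(resultType(false, false, true, "String") == "Nullable(String)");
        assert(resultType(false, false, false, "Int64") == "Int64");
    }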
+
+
+template <typename JSONParser>
+class JSONHasImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) { return std::make_shared<DataTypeUInt8>(); }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view)
+    {
+        ColumnVector<UInt8> & col_vec = assert_cast<ColumnVector<UInt8> &>(dest);
+        col_vec.insertValue(1);
+        return true;
+    }
+};
+
+
+template <typename JSONParser>
+class IsValidJSONImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments)
+    {
+        if (arguments.size() != 1)
+        {
+            /// IsValidJSON() shouldn't get parameters other than JSON.
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs exactly one argument",
+                            String(function_name));
+        }
+        return std::make_shared<DataTypeUInt8>();
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName &) { return 0; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view)
+    {
+        /// This function is called only if JSON is valid.
+        /// If JSON isn't valid then `FunctionJSON::Executor::run()` adds default value (=zero) to `dest` without calling this function.
+        ColumnVector<UInt8> & col_vec = assert_cast<ColumnVector<UInt8> &>(dest);
+        col_vec.insertValue(1);
+        return true;
+    }
+};
+
+
+template <typename JSONParser>
+class JSONLengthImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+    {
+        return std::make_shared<DataTypeUInt64>();
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+    {
+        size_t size;
+        if (element.isArray())
+            size = element.getArray().size();
+        else if (element.isObject())
+            size = element.getObject().size();
+        else
+            return false;
+
+        ColumnVector<UInt64> & col_vec = assert_cast<ColumnVector<UInt64> &>(dest);
+        col_vec.insertValue(size);
+        return true;
+    }
+};
+
+
+template <typename JSONParser>
+class JSONKeyImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+    {
+        return std::make_shared<DataTypeString>();
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view last_key)
+    {
+        if (last_key.empty())
+            return false;
+        ColumnString & col_str = assert_cast<ColumnString &>(dest);
+        col_str.insertData(last_key.data(), last_key.size());
+        return true;
+    }
+};
+
+
+template <typename JSONParser>
+class JSONTypeImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+    {
+        static const std::vector<std::pair<String, Int8>> values = {
+            {"Array", '['},
+            {"Object", '{'},
+            {"String", '"'},
+            {"Int64", 'i'},
+            {"UInt64", 'u'},
+            {"Double", 'd'},
+            {"Bool", 'b'},
+            {"Null", 0}, /// the default value for the column.
+        };
+        return std::make_shared<DataTypeEnum<Int8>>(values);
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+    {
+        UInt8 type;
+        switch (element.type())
+        {
+            case ElementType::INT64:
+                type = 'i';
+                break;
+            case ElementType::UINT64:
+                type = 'u';
+                break;
+            case ElementType::DOUBLE:
+                type = 'd';
+                break;
+            case ElementType::STRING:
+                type = '"';
+                break;
+            case ElementType::ARRAY:
+                type = '[';
+                break;
+            case ElementType::OBJECT:
+                type = '{';
+                break;
+            case ElementType::BOOL:
+                type = 'b';
+                break;
+            case ElementType::NULL_VALUE:
+                type = 0;
+                break;
+        }
+
+        ColumnVector<Int8> & col_vec = assert_cast<ColumnVector<Int8> &>(dest);
+        col_vec.insertValue(type);
+        return true;
+    }
+};
+
+
+template <typename JSONParser, typename NumberType, bool convert_bool_to_integer = false>
+class JSONExtractNumericImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+    {
+        return std::make_shared<DataTypeNumber<NumberType>>();
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+    {
+        NumberType value;
+
+        switch (element.type())
+        {
+            case ElementType::DOUBLE:
+                if constexpr (std::is_floating_point_v<NumberType>)
+                {
+                    /// We permit inaccurate conversion of double to float.
+                    /// Example: double 0.1 from JSON is not representable in float.
+                    /// But it will be more convenient for user to perform conversion.
+                    value = static_cast<NumberType>(element.getDouble());
+                }
+                else if (!accurate::convertNumeric<Float64, NumberType>(element.getDouble(), value))
+                    return false;
+                break;
+            case ElementType::UINT64:
+                if (!accurate::convertNumeric<UInt64, NumberType>(element.getUInt64(), value))
+                    return false;
+                break;
+            case ElementType::INT64:
+                if (!accurate::convertNumeric<Int64, NumberType>(element.getInt64(), value))
+                    return false;
+                break;
+            case ElementType::BOOL:
+                if constexpr (is_integer<NumberType> && convert_bool_to_integer)
+                {
+                    value = static_cast<NumberType>(element.getBool());
+                    break;
+                }
+                return false;
+            case ElementType::STRING:
+            {
+                auto rb = ReadBufferFromMemory{element.getString()};
+                if constexpr (std::is_floating_point_v<NumberType>)
+                {
+                    if (!tryReadFloatText(value, rb) || !rb.eof())
+                        return false;
+                }
+                else
+                {
+                    if (tryReadIntText(value, rb) && rb.eof())
+                        break;
+
+                    /// Try to parse float and convert it to integer.
+                    Float64 tmp_float;
+                    rb.position() = rb.buffer().begin();
+                    if (!tryReadFloatText(tmp_float, rb) || !rb.eof())
+                        return false;
+
+                    if (!accurate::convertNumeric<Float64, NumberType>(tmp_float, value))
+                        return false;
+                }
+                break;
+            }
+            default:
+                return false;
+        }
+
+        if (dest.getDataType() == TypeIndex::LowCardinality)
+        {
+            ColumnLowCardinality & col_low = assert_cast<ColumnLowCardinality &>(dest);
+            col_low.insertData(reinterpret_cast<const char *>(&value), sizeof(value));
+        }
+        else
+        {
+            auto & col_vec = assert_cast<ColumnVector<NumberType> &>(dest);
+            col_vec.insertValue(value);
+        }
+        return true;
+    }
+};
+
+
+template <typename JSONParser>
+using JSONExtractInt64Impl = JSONExtractNumericImpl<JSONParser, Int64>;
+template <typename JSONParser>
+using JSONExtractUInt64Impl = JSONExtractNumericImpl<JSONParser, UInt64>;
+template <typename JSONParser>
+using JSONExtractFloat64Impl = JSONExtractNumericImpl<JSONParser, Float64>;
+
+
+template <typename JSONParser>
+class JSONExtractBoolImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+    {
+        return std::make_shared<DataTypeUInt8>();
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+    {
+        bool value;
+        switch (element.type())
+        {
+            case ElementType::BOOL:
+                value = element.getBool();
+                break;
+            case ElementType::INT64:
+                value = element.getInt64() != 0;
+                break;
+            case ElementType::UINT64:
+                value = element.getUInt64() != 0;
+                break;
+            default:
+                return false;
+        }
+
+        auto & col_vec = assert_cast<ColumnVector<UInt8> &>(dest);
+        col_vec.insertValue(static_cast<UInt8>(value));
+        return true;
+    }
+};
+
+template <typename JSONParser>
+class JSONExtractRawImpl;
+
+template <typename JSONParser>
+class JSONExtractStringImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+    {
+        return std::make_shared<DataTypeString>();
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+    static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+    {
+        if (element.isNull())
+            return false;
+
+        if (!element.isString())
+            return JSONExtractRawImpl<JSONParser>::insertResultToColumn(dest, element, {});
+
+        auto str = element.getString();
+
+        if (dest.getDataType() == TypeIndex::LowCardinality)
+        {
+            ColumnLowCardinality & col_low = assert_cast<ColumnLowCardinality &>(dest);
+            col_low.insertData(str.data(), str.size());
+        }
+        else
+        {
+            ColumnString & col_str = assert_cast<ColumnString &>(dest);
+            col_str.insertData(str.data(), str.size());
+        }
+        return true;
+    }
+};
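The STRING branch of JSONExtractNumericImpl above first tries integer text and then falls back to parsing a float and converting it, so a string like "100.0" can still fill an integer column. An approximation of that fallback in standard C++ (the real code uses tryReadIntText, tryReadFloatText and accurate::convertNumeric):

    #include <cassert>
    #include <charconv>
    #include <cstdlib>
    #include <string>
    #include <string_view>

    bool parseInt64Like(std::string_view s, long long & value)
    {
        auto [p, ec] = std::from_chars(s.data(), s.data() + s.size(), value);
        if (ec == std::errc() && p == s.data() + s.size())
            return true;                            // plain integer text

        std::string copy(s);                        // strtod needs a NUL terminator
        char * end = nullptr;
        double tmp = std::strtod(copy.c_str(), &end);
        if (end != copy.c_str() + copy.size())
            return false;

        value = static_cast<long long>(tmp);
        return static_cast<double>(value) == tmp;   // reject lossy conversions
    }

    int main()
    {
        long long v = 0;
        assert(parseInt64Like("100", v) && v == 100);
        assert(parseInt64Like("100.0", v) && v == 100);
        assert(!parseInt64Like("100.5", v));
        assert(!parseInt64Like("abc", v));
    }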
+
+/// Nodes of the extract tree. We need the extract tree to extract from JSON complex values containing array, tuples or nullables.
+template <typename JSONParser>
+struct JSONExtractTree
+{
+    using Element = typename JSONParser::Element;
+
+    class Node
+    {
+    public:
+        Node() = default;
+        virtual ~Node() = default;
+        virtual bool insertResultToColumn(IColumn &, const Element &) = 0;
+    };
+
+    template <typename NumberType>
+    class NumericNode : public Node
+    {
+    public:
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            return JSONExtractNumericImpl<JSONParser, NumberType, true>::insertResultToColumn(dest, element, {});
+        }
+    };
+
+    class LowCardinalityFixedStringNode : public Node
+    {
+    public:
+        explicit LowCardinalityFixedStringNode(const size_t fixed_length_) : fixed_length(fixed_length_) { }
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            // If element is an object we delegate the insertion to JSONExtractRawImpl
+            if (element.isObject())
+                return JSONExtractRawImpl<JSONParser>::insertResultToLowCardinalityFixedStringColumn(dest, element, fixed_length);
+            else if (!element.isString())
+                return false;
+
+            auto str = element.getString();
+            if (str.size() > fixed_length)
+                return false;
+
+            // For the non low cardinality case of FixedString, the padding is done in the FixedString Column implementation.
+            // In order to avoid having to pass the data to a FixedString Column and read it back (which would slow down the execution)
+            // the data is padded here and written directly to the Low Cardinality Column
+            if (str.size() == fixed_length)
+            {
+                assert_cast<ColumnLowCardinality &>(dest).insertData(str.data(), str.size());
+            }
+            else
+            {
+                String padded_str(str);
+                padded_str.resize(fixed_length, '\0');
+
+                assert_cast<ColumnLowCardinality &>(dest).insertData(padded_str.data(), padded_str.size());
+            }
+            return true;
+        }
+
+    private:
+        const size_t fixed_length;
+    };
+
+    class UUIDNode : public Node
+    {
+    public:
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            if (!element.isString())
+                return false;
+
+            auto uuid = parseFromString<UUID>(element.getString());
+            if (dest.getDataType() == TypeIndex::LowCardinality)
+            {
+                ColumnLowCardinality & col_low = assert_cast<ColumnLowCardinality &>(dest);
+                col_low.insertData(reinterpret_cast<const char *>(&uuid), sizeof(uuid));
+            }
+            else
+            {
+                assert_cast<ColumnUUID &>(dest).insert(uuid);
+            }
+            return true;
+        }
+    };
+
+    template <typename DecimalType>
+    class DecimalNode : public Node
+    {
+    public:
+        explicit DecimalNode(DataTypePtr data_type_) : data_type(data_type_) {}
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            const auto * type = assert_cast<const DataTypeDecimal<DecimalType> *>(data_type.get());
+
+            DecimalType value{};
+
+            switch (element.type())
+            {
+                case ElementType::DOUBLE:
+                    value = convertToDecimal<DataTypeNumber<Float64>, DataTypeDecimal<DecimalType>>(
+                        element.getDouble(), type->getScale());
+                    break;
+                case ElementType::UINT64:
+                    value = convertToDecimal<DataTypeNumber<UInt64>, DataTypeDecimal<DecimalType>>(
+                        element.getUInt64(), type->getScale());
+                    break;
+                case ElementType::INT64:
+                    value = convertToDecimal<DataTypeNumber<Int64>, DataTypeDecimal<DecimalType>>(
+                        element.getInt64(), type->getScale());
+                    break;
+                case ElementType::STRING: {
+                    auto rb = ReadBufferFromMemory{element.getString()};
+                    if (!SerializationDecimal<DecimalType>::tryReadText(value, rb, DecimalUtils::max_precision<DecimalType>, type->getScale()))
+                        return false;
+                    break;
+                }
+                default:
+                    return false;
+            }
+
+            assert_cast<ColumnDecimal<DecimalType> &>(dest).insertValue(value);
+            return true;
+        }
+
+    private:
+        DataTypePtr data_type;
+    };
+
+    class StringNode : public Node
+    {
+    public:
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            return JSONExtractStringImpl<JSONParser>::insertResultToColumn(dest, element, {});
+        }
+    };
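LowCardinalityFixedStringNode above pads short strings itself because it writes directly into the low-cardinality dictionary instead of routing through ColumnFixedString, which would otherwise do the padding. The padding rule in isolation (illustrative):

    #include <cassert>
    #include <optional>
    #include <string>
    #include <string_view>

    std::optional<std::string> padToFixedLength(std::string_view str, size_t fixed_length)
    {
        if (str.size() > fixed_length)
            return std::nullopt;            // too long: the value is rejected
        std::string padded(str);
        padded.resize(fixed_length, '\0');  // pad with zero bytes, as FixedString does
        return padded;
    }

    int main()
    {
        assert(!padToFixedLength("toolong", 3));
        auto padded = padToFixedLength("ab", 4);
        assert(padded && padded->size() == 4 && (*padded)[2] == '\0');
    }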
+
+    class FixedStringNode : public Node
+    {
+    public:
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            if (element.isNull())
+                return false;
+
+            if (!element.isString())
+                return JSONExtractRawImpl<JSONParser>::insertResultToFixedStringColumn(dest, element, {});
+
+            auto str = element.getString();
+            auto & col_str = assert_cast<ColumnFixedString &>(dest);
+            if (str.size() > col_str.getN())
+                return false;
+            col_str.insertData(str.data(), str.size());
+
+            return true;
+        }
+    };
+
+    template <typename Type>
+    class EnumNode : public Node
+    {
+    public:
+        explicit EnumNode(const std::vector<std::pair<String, Type>> & name_value_pairs_) : name_value_pairs(name_value_pairs_)
+        {
+            for (const auto & name_value_pair : name_value_pairs)
+            {
+                name_to_value_map.emplace(name_value_pair.first, name_value_pair.second);
+                only_values.emplace(name_value_pair.second);
+            }
+        }
+
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            auto & col_vec = assert_cast<ColumnVector<Type> &>(dest);
+
+            if (element.isInt64())
+            {
+                Type value;
+                if (!accurate::convertNumeric<Int64, Type>(element.getInt64(), value) || !only_values.contains(value))
+                    return false;
+                col_vec.insertValue(value);
+                return true;
+            }
+
+            if (element.isUInt64())
+            {
+                Type value;
+                if (!accurate::convertNumeric<UInt64, Type>(element.getUInt64(), value) || !only_values.contains(value))
+                    return false;
+                col_vec.insertValue(value);
+                return true;
+            }
+
+            if (element.isString())
+            {
+                auto value = name_to_value_map.find(element.getString());
+                if (value == name_to_value_map.end())
+                    return false;
+                col_vec.insertValue(value->second);
+                return true;
+            }
+
+            return false;
+        }
+
+    private:
+        std::vector<std::pair<String, Type>> name_value_pairs;
+        std::unordered_map<std::string_view, Type> name_to_value_map;
+        std::unordered_set<Type> only_values;
+    };
+
+    class NullableNode : public Node
+    {
+    public:
+        explicit NullableNode(std::unique_ptr<Node> nested_) : nested(std::move(nested_)) {}
+
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            ColumnNullable & col_null = assert_cast<ColumnNullable &>(dest);
+            if (!nested->insertResultToColumn(col_null.getNestedColumn(), element))
+                return false;
+            col_null.getNullMapColumn().insertValue(0);
+            return true;
+        }
+
+    private:
+        std::unique_ptr<Node> nested;
+    };
+
+    class ArrayNode : public Node
+    {
+    public:
+        explicit ArrayNode(std::unique_ptr<Node> nested_) : nested(std::move(nested_)) {}
+
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            if (!element.isArray())
+                return false;
+
+            auto array = element.getArray();
+
+            ColumnArray & col_arr = assert_cast<ColumnArray &>(dest);
+            auto & data = col_arr.getData();
+            size_t old_size = data.size();
+            bool were_valid_elements = false;
+
+            for (auto value : array)
+            {
+                if (nested->insertResultToColumn(data, value))
+                    were_valid_elements = true;
+                else
+                    data.insertDefault();
+            }
+
+            if (!were_valid_elements)
+            {
+                data.popBack(data.size() - old_size);
+                return false;
+            }
+
+            col_arr.getOffsets().push_back(data.size());
+            return true;
+        }
+
+    private:
+        std::unique_ptr<Node> nested;
+    };
+
+    class TupleNode : public Node
+    {
+    public:
+        TupleNode(std::vector<std::unique_ptr<Node>> nested_, const std::vector<String> & explicit_names_) : nested(std::move(nested_)), explicit_names(explicit_names_)
+        {
+            for (size_t i = 0; i != explicit_names.size(); ++i)
+                name_to_index_map.emplace(explicit_names[i], i);
+        }
+
+        bool insertResultToColumn(IColumn & dest, const Element & element) override
+        {
+            ColumnTuple & tuple = assert_cast<ColumnTuple &>(dest);
+            size_t old_size = dest.size();
+            bool were_valid_elements = false;
+
+            auto set_size = [&](size_t size)
+            {
+                for (size_t i = 0; i != tuple.tupleSize(); ++i)
+                {
+                    auto & col = tuple.getColumn(i);
+                    if (col.size() != size)
+                    {
+                        if (col.size() > size)
+                            col.popBack(col.size() - size);
+                        else
+                            while (col.size() < size)
+                                col.insertDefault();
+                    }
+                }
+            };
+
+            if (element.isArray())
+            {
+                auto array = element.getArray();
+                auto it = array.begin();
+
+                for (size_t index = 0; (index != nested.size()) && (it != array.end()); ++index)
+                {
+                    if (nested[index]->insertResultToColumn(tuple.getColumn(index), *it++))
+                        were_valid_elements = true;
+                    else
+                        tuple.getColumn(index).insertDefault();
+                }
+
+                set_size(old_size + static_cast<size_t>(were_valid_elements));
+                return were_valid_elements;
+            }
+
+            if (element.isObject())
+            {
+                auto object = element.getObject();
+                if (name_to_index_map.empty())
+                {
+                    auto it = object.begin();
+                    for (size_t index = 0; (index != nested.size()) && (it != object.end()); ++index)
+                    {
+                        if (nested[index]->insertResultToColumn(tuple.getColumn(index), (*it++).second))
+                            were_valid_elements = true;
+                        else
+                            tuple.getColumn(index).insertDefault();
+                    }
+                }
+                else
+                {
+                    for (const auto & [key, value] : object)
+                    {
+                        auto index = name_to_index_map.find(key);
+                        if (index != name_to_index_map.end())
+                        {
+                            if (nested[index->second]->insertResultToColumn(tuple.getColumn(index->second), value))
+                                were_valid_elements = true;
+                        }
+                    }
+                }
+
+                set_size(old_size + static_cast<size_t>(were_valid_elements));
+                return were_valid_elements;
+            }
+
+            return false;
+        }
+
+    private:
+        std::vector<std::unique_ptr<Node>> nested;
+        std::vector<String> explicit_names;
+        std::unordered_map<std::string_view, size_t> name_to_index_map;
+    };
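build() below maps the requested return type to a tree of the node classes defined above, recursing through Nullable, Array, Tuple and LowCardinality wrappers. As a sketch, a call such as JSONExtract(json, 'Tuple(String, Array(Nullable(Int64)))') would produce:

    // TupleNode
    // ├── StringNode
    // └── ArrayNode
    //     └── NullableNode
    //         └── NumericNode<Int64>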
+
+    static std::unique_ptr<Node> build(const char * function_name, const DataTypePtr & type)
+    {
+        switch (type->getTypeId())
+        {
+            case TypeIndex::UInt8: return std::make_unique<NumericNode<UInt8>>();
+            case TypeIndex::UInt16: return std::make_unique<NumericNode<UInt16>>();
+            case TypeIndex::UInt32: return std::make_unique<NumericNode<UInt32>>();
+            case TypeIndex::UInt64: return std::make_unique<NumericNode<UInt64>>();
+            case TypeIndex::UInt128: return std::make_unique<NumericNode<UInt128>>();
+            case TypeIndex::UInt256: return std::make_unique<NumericNode<UInt256>>();
+            case TypeIndex::Int8: return std::make_unique<NumericNode<Int8>>();
+            case TypeIndex::Int16: return std::make_unique<NumericNode<Int16>>();
+            case TypeIndex::Int32: return std::make_unique<NumericNode<Int32>>();
+            case TypeIndex::Int64: return std::make_unique<NumericNode<Int64>>();
+            case TypeIndex::Int128: return std::make_unique<NumericNode<Int128>>();
+            case TypeIndex::Int256: return std::make_unique<NumericNode<Int256>>();
+            case TypeIndex::Float32: return std::make_unique<NumericNode<Float32>>();
+            case TypeIndex::Float64: return std::make_unique<NumericNode<Float64>>();
+            case TypeIndex::String: return std::make_unique<StringNode>();
+            case TypeIndex::FixedString: return std::make_unique<FixedStringNode>();
+            case TypeIndex::UUID: return std::make_unique<UUIDNode>();
+            case TypeIndex::LowCardinality:
+            {
+                // The low cardinality case is treated in two different ways:
+                // For FixedString type, an especial class is implemented for inserting the data in the destination column,
+                // as the string length must be passed in order to check and pad the incoming data.
+                // For the rest of low cardinality types, the insertion is done in their corresponding class, adapting the data
+                // as needed for the insertData function of the ColumnLowCardinality.
+                auto dictionary_type = typeid_cast<const DataTypeLowCardinality *>(type.get())->getDictionaryType();
+                if ((*dictionary_type).getTypeId() == TypeIndex::FixedString)
+                {
+                    auto fixed_length = typeid_cast<const DataTypeFixedString *>(dictionary_type.get())->getN();
+                    return std::make_unique<LowCardinalityFixedStringNode>(fixed_length);
+                }
+                return build(function_name, dictionary_type);
+            }
+            case TypeIndex::Decimal256: return std::make_unique<DecimalNode<Decimal256>>(type);
+            case TypeIndex::Decimal128: return std::make_unique<DecimalNode<Decimal128>>(type);
+            case TypeIndex::Decimal64: return std::make_unique<DecimalNode<Decimal64>>(type);
+            case TypeIndex::Decimal32: return std::make_unique<DecimalNode<Decimal32>>(type);
+            case TypeIndex::Enum8:
+                return std::make_unique<EnumNode<Int8>>(static_cast<const DataTypeEnum8 &>(*type).getValues());
+            case TypeIndex::Enum16:
+                return std::make_unique<EnumNode<Int16>>(static_cast<const DataTypeEnum16 &>(*type).getValues());
+            case TypeIndex::Nullable:
+            {
+                return std::make_unique<NullableNode>(build(function_name, static_cast<const DataTypeNullable &>(*type).getNestedType()));
+            }
+            case TypeIndex::Array:
+            {
+                return std::make_unique<ArrayNode>(build(function_name, static_cast<const DataTypeArray &>(*type).getNestedType()));
+            }
+            case TypeIndex::Tuple:
+            {
+                const auto & tuple = static_cast<const DataTypeTuple &>(*type);
+                const auto & tuple_elements = tuple.getElements();
+                std::vector<std::unique_ptr<Node>> elements;
+                elements.reserve(tuple_elements.size());
+                for (const auto & tuple_element : tuple_elements)
+                    elements.emplace_back(build(function_name, tuple_element));
+                return std::make_unique<TupleNode>(std::move(elements), tuple.haveExplicitNames() ? tuple.getElementNames() : Strings{});
+            }
+            default:
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                                "Function {} doesn't support the return type schema: {}",
+                                String(function_name), type->getName());
+        }
+    }
+};
+
+
+template <typename JSONParser>
+class JSONExtractImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments)
+    {
+        if (arguments.size() < 2)
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name));
+
+        const auto & col = arguments.back();
+        const auto * col_type_const = typeid_cast<const ColumnConst *>(col.column.get());
+        if (!col_type_const || !isString(col.type))
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                            "The last argument of function {} should "
+                            "be a constant string specifying the return data type, illegal value: {}",
+                            String(function_name), col.name);
+
+        return DataTypeFactory::instance().get(col_type_const->getValue<String>());
+    }
+
+    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; }
+
+    void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type)
+    {
+        extract_tree = JSONExtractTree<JSONParser>::build(function_name, result_type);
+    }
+
+    bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+    {
+        return extract_tree->insertResultToColumn(dest, element);
+    }
+
+protected:
+    std::unique_ptr<typename JSONExtractTree<JSONParser>::Node> extract_tree;
+};
+
+
+template <typename JSONParser>
+class JSONExtractKeysAndValuesImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments)
+    {
+        if (arguments.size() < 2)
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name));
+
+        const auto & col = arguments.back();
+        const auto * col_type_const = typeid_cast<const ColumnConst *>(col.column.get());
+        if (!col_type_const || !isString(col.type))
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                            "The last argument of function {} should "
+                            "be a constant string specifying the values' data type, illegal value: {}",
+
+
+template
+class JSONExtractKeysAndValuesImpl
+{
+public:
+ using Element = typename JSONParser::Element;
+
+ static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments)
+ {
+ if (arguments.size() < 2)
+ throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name));
+
+ const auto & col = arguments.back();
+ const auto * col_type_const = typeid_cast(col.column.get());
+ if (!col_type_const || !isString(col.type))
+ throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+ "The last argument of function {} should "
+ "be a constant string specifying the values' data type, illegal value: {}",
+ String(function_name), col.name);
+
+ DataTypePtr key_type = std::make_unique();
+ DataTypePtr value_type = DataTypeFactory::instance().get(col_type_const->getValue());
+ DataTypePtr tuple_type = std::make_unique(DataTypes{key_type, value_type});
+ return std::make_unique(tuple_type);
+ }
+
+ static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; }
+
+ void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type)
+ {
+ const auto tuple_type = typeid_cast(result_type.get())->getNestedType();
+ const auto value_type = typeid_cast(tuple_type.get())->getElements()[1];
+ extract_tree = JSONExtractTree::build(function_name, value_type);
+ }
+
+ bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+ {
+ if (!element.isObject())
+ return false;
+
+ auto object = element.getObject();
+
+ auto & col_arr = assert_cast(dest);
+ auto & col_tuple = assert_cast(col_arr.getData());
+ size_t old_size = col_tuple.size();
+ auto & col_key = assert_cast(col_tuple.getColumn(0));
+ auto & col_value = col_tuple.getColumn(1);
+
+ for (const auto & [key, value] : object)
+ {
+ if (extract_tree->insertResultToColumn(col_value, value))
+ col_key.insertData(key.data(), key.size());
+ }
+
+ if (col_tuple.size() == old_size)
+ return false;
+
+ col_arr.getOffsets().push_back(col_tuple.size());
+ return true;
+ }
+
+private:
+ std::unique_ptr::Node> extract_tree;
+};
+
+
+template
+class JSONExtractRawImpl
+{
+public:
+ using Element = typename JSONParser::Element;
+
+ static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+ {
+ return std::make_shared();
+ }
+
+ static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+ static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+ {
+ if (dest.getDataType() == TypeIndex::LowCardinality)
+ {
+ ColumnString::Chars chars;
+ WriteBufferFromVector buf(chars, AppendModeTag());
+ traverse(element, buf);
+ buf.finalize();
+ assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size());
+ }
+ else
+ {
+ ColumnString & col_str = assert_cast(dest);
+ auto & chars = col_str.getChars();
+ WriteBufferFromVector buf(chars, AppendModeTag());
+ traverse(element, buf);
+ buf.finalize();
+ chars.push_back(0);
+ col_str.getOffsets().push_back(chars.size());
+ }
+ return true;
+ }
+
+ // We use insertResultToFixedStringColumn in case we are inserting raw data in a FixedString column
+ static bool insertResultToFixedStringColumn(IColumn & dest, const Element & element, std::string_view)
+ {
+ ColumnFixedString::Chars chars;
+ WriteBufferFromVector buf(chars, AppendModeTag());
+ traverse(element, buf);
+ buf.finalize();
+
+ auto & col_str = assert_cast(dest);
+
+ if (chars.size() > col_str.getN())
+ return false;
+
+ chars.resize_fill(col_str.getN());
+ col_str.insertData(reinterpret_cast(chars.data()), chars.size());
+
+
+ return true;
+ }
+
+ // We use insertResultToLowCardinalityFixedStringColumn in case we are inserting raw data in a Low Cardinality FixedString column
+ static bool insertResultToLowCardinalityFixedStringColumn(IColumn & dest, const Element & element, size_t fixed_length)
+ {
+ if (element.getObject().size() > fixed_length)
+ return false;
+
+ ColumnFixedString::Chars chars;
+ WriteBufferFromVector buf(chars, AppendModeTag());
+ traverse(element, buf);
+ buf.finalize();
+
+ if (chars.size() > fixed_length)
+ return false;
+ chars.resize_fill(fixed_length);
+ assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size());
+
+ return true;
+ }
+
+private:
+ static void traverse(const Element & element, WriteBuffer & buf)
+ {
+ if (element.isInt64())
+ {
+ writeIntText(element.getInt64(), buf);
+ return;
+ }
+ if (element.isUInt64())
+ {
+ writeIntText(element.getUInt64(), buf);
+ return;
+ }
+ if (element.isDouble())
+ {
+ writeFloatText(element.getDouble(), buf);
+ return;
+ }
+ if (element.isBool())
+ {
+ if (element.getBool())
+ writeCString("true", buf);
+ else
+ writeCString("false", buf);
+ return;
+ }
+ if (element.isString())
+ {
+ writeJSONString(element.getString(), buf, formatSettings());
+ return;
+ }
+ if (element.isArray())
+ {
+ writeChar('[', buf);
+ bool need_comma = false;
+ for (auto value : element.getArray())
+ {
+ if (std::exchange(need_comma, true))
+ writeChar(',', buf);
+ traverse(value, buf);
+ }
+ writeChar(']', buf);
+ return;
+ }
+ if (element.isObject())
+ {
+ writeChar('{', buf);
+ bool need_comma = false;
+ for (auto [key, value] : element.getObject())
+ {
+ if (std::exchange(need_comma, true))
+ writeChar(',', buf);
+ writeJSONString(key, buf, formatSettings());
+ writeChar(':', buf);
+ traverse(value, buf);
+ }
+ writeChar('}', buf);
+ return;
+ }
+ if (element.isNull())
+ {
+ writeCString("null", buf);
+ return;
+ }
+ }
+
+ static const FormatSettings & formatSettings()
+ {
+ static const FormatSettings the_instance = []
+ {
+ FormatSettings settings;
+ settings.json.escape_forward_slashes = false;
+ return settings;
+ }();
+ return the_instance;
+ }
+};
+
+
+template
+class JSONExtractArrayRawImpl
+{
+public:
+ using Element = typename JSONParser::Element;
+
+ static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+ {
+ return std::make_shared(std::make_shared());
+ }
+
+ static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+ static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+ {
+ if (!element.isArray())
+ return false;
+
+ auto array = element.getArray();
+ ColumnArray & col_res = assert_cast(dest);
+
+ for (auto value : array)
+ JSONExtractRawImpl::insertResultToColumn(col_res.getData(), value, {});
+
+ col_res.getOffsets().push_back(col_res.getOffsets().back() + array.size());
+ return true;
+ }
+};
+
+
+template
+class JSONExtractKeysAndValuesRawImpl
+{
+public:
+ using Element = typename JSONParser::Element;
+
+ static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+ {
+ DataTypePtr string_type = std::make_unique();
+ DataTypePtr tuple_type = std::make_unique(DataTypes{string_type, string_type});
+ return std::make_unique(tuple_type);
+ }
+
+ static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+ bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+ {
+ if (!element.isObject())
+ return false;
+
+ auto object = element.getObject();
+
+ auto & col_arr = assert_cast(dest);
+ auto & col_tuple = assert_cast(col_arr.getData());
+ auto & col_key = assert_cast(col_tuple.getColumn(0));
+ auto & col_value = assert_cast(col_tuple.getColumn(1));
+
+ for (const auto & [key, value] : object)
+ {
+ col_key.insertData(key.data(), key.size());
+ JSONExtractRawImpl::insertResultToColumn(col_value, value, {});
+ }
+
+ col_arr.getOffsets().push_back(col_arr.getOffsets().back() + object.size());
+ return true;
+ }
+};
+
+template
+class JSONExtractKeysImpl
+{
+public:
+ using Element = typename JSONParser::Element;
+
+ static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &)
+ {
+ return std::make_unique(std::make_shared());
+ }
+
+ static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+
+ bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
+ {
+ if (!element.isObject())
+ return false;
+
+ auto object = element.getObject();
+
+ ColumnArray & col_res = assert_cast(dest);
+ auto & col_key = assert_cast(col_res.getData());
+
+ for (const auto & [key, value] : object)
+ {
+ col_key.insertData(key.data(), key.size());
+ }
+
+ col_res.getOffsets().push_back(col_res.getOffsets().back() + object.size());
+ return true;
+ }
+};
+
+}
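The traverse() serializer above uses std::exchange(need_comma, true) to decide where commas go when re-emitting JSON text. The following stand-alone snippet (plain iostream instead of ClickHouse's WriteBuffer) shows why that idiom emits a comma before every element except the first:

#include <iostream>
#include <utility>
#include <vector>

int main()
{
    std::vector<int> values{1, 2, 3};
    bool need_comma = false;
    std::cout << '[';
    for (int v : values)
    {
        // std::exchange returns the old value, so this condition is false exactly once.
        if (std::exchange(need_comma, true))
            std::cout << ',';
        std::cout << v;
    }
    std::cout << "]\n"; // prints [1,2,3]
}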
diff --git a/src/Functions/ReplaceRegexpImpl.h b/src/Functions/ReplaceRegexpImpl.h
index 88d7a40d2dd..7e3af1e62d9 100644
--- a/src/Functions/ReplaceRegexpImpl.h
+++ b/src/Functions/ReplaceRegexpImpl.h
@@ -13,6 +13,7 @@ namespace DB
namespace ErrorCodes
{
+ extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
}
@@ -28,9 +29,11 @@ struct ReplaceRegexpTraits
/** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants.
* 'replacement' can contain substitutions, for example: '\2-\3-\1'
*/
-template
+struct ReplaceRegexpImpl
{
+ static constexpr auto name = Name::name;
+
struct Instruction
{
/// If not negative, perform substitution of n-th subpattern from the regexp match.
@@ -162,18 +165,21 @@ struct ReplaceRegexpImpl
++res_offset;
}
- static void vector(
- const ColumnString::Chars & data,
- const ColumnString::Offsets & offsets,
+ static void vectorConstantConstant(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
const String & needle,
const String & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
ColumnString::Offset res_offset = 0;
- res_data.reserve(data.size());
- size_t size = offsets.size();
- res_offsets.resize(size);
+ res_data.reserve(haystack_data.size());
+ size_t haystack_size = haystack_offsets.size();
+ res_offsets.resize(haystack_size);
re2_st::RE2::Options regexp_options;
/// Don't write error messages to stderr.
@@ -182,39 +188,89 @@
re2_st::RE2 searcher(needle, regexp_options);
if (!searcher.ok())
- throw Exception(
- ErrorCodes::BAD_ARGUMENTS,
- "The pattern argument is not a valid re2 pattern: {}",
- searcher.error());
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error());
int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures);
Instructions instructions = createInstructions(replacement, num_captures);
/// Cannot perform search for whole columns. Will process each string separately.
- for (size_t i = 0; i < size; ++i)
+ for (size_t i = 0; i < haystack_size; ++i)
{
- size_t from = i > 0 ? offsets[i - 1] : 0;
- const char * haystack_data = reinterpret_cast(data.data() + from);
- const size_t haystack_length = static_cast(offsets[i] - from - 1);
+ size_t from = i > 0 ? haystack_offsets[i - 1] : 0;
- processString(haystack_data, haystack_length, res_data, res_offset, searcher, num_captures, instructions);
+ const char * hs_data = reinterpret_cast(haystack_data.data() + from);
+ const size_t hs_length = static_cast(haystack_offsets[i] - from - 1);
+
+ processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions);
res_offsets[i] = res_offset;
}
}
- static void vectorFixed(
- const ColumnString::Chars & data,
- size_t n,
- const String & needle,
+ static void vectorVectorConstant(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
+ const ColumnString::Chars & needle_data,
+ const ColumnString::Offsets & needle_offsets,
const String & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
+ assert(haystack_offsets.size() == needle_offsets.size());
+
ColumnString::Offset res_offset = 0;
- size_t size = data.size() / n;
- res_data.reserve(data.size());
- res_offsets.resize(size);
+ res_data.reserve(haystack_data.size());
+ size_t haystack_size = haystack_offsets.size();
+ res_offsets.resize(haystack_size);
+
+ re2_st::RE2::Options regexp_options;
+ /// Don't write error messages to stderr.
+ regexp_options.set_log_errors(false);
+
+ /// Cannot perform search for whole columns. Will process each string separately.
+ for (size_t i = 0; i < haystack_size; ++i)
+ {
+ size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
+ const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from);
+ const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1);
+
+ size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
+ const char * ndl_data = reinterpret_cast(needle_data.data() + ndl_from);
+ const size_t ndl_length = static_cast(needle_offsets[i] - ndl_from - 1);
+ std::string_view needle(ndl_data, ndl_length);
+
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ re2_st::RE2 searcher(needle, regexp_options);
+ if (!searcher.ok())
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error());
+ int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures);
+ Instructions instructions = createInstructions(replacement, num_captures);
+
+ processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions);
+ res_offsets[i] = res_offset;
+ }
+ }
+
+ static void vectorConstantVector(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
+ const String & needle,
+ const ColumnString::Chars & replacement_data,
+ const ColumnString::Offsets & replacement_offsets,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets)
+ {
+ assert(haystack_offsets.size() == replacement_offsets.size());
+
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ ColumnString::Offset res_offset = 0;
+ res_data.reserve(haystack_data.size());
+ size_t haystack_size = haystack_offsets.size();
+ res_offsets.resize(haystack_size);
re2_st::RE2::Options regexp_options;
/// Don't write error messages to stderr.
@@ -223,22 +279,116 @@ struct ReplaceRegexpImpl
re2_st::RE2 searcher(needle, regexp_options);
if (!searcher.ok())
- throw Exception(
- ErrorCodes::BAD_ARGUMENTS,
- "The pattern argument is not a valid re2 pattern: {}",
- searcher.error());
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error());
+
+ int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures);
+
+ /// Cannot perform search for whole columns. Will process each string separately.
+ for (size_t i = 0; i < haystack_size; ++i)
+ {
+ size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
+ const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from);
+ const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1);
+
+ size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
+ const char * repl_data = reinterpret_cast(replacement_data.data() + repl_from);
+ const size_t repl_length = static_cast(replacement_offsets[i] - repl_from - 1);
+
+ Instructions instructions = createInstructions(std::string_view(repl_data, repl_length), num_captures);
+
+ processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions);
+ res_offsets[i] = res_offset;
+ }
+ }
+
+ static void vectorVectorVector(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
+ const ColumnString::Chars & needle_data,
+ const ColumnString::Offsets & needle_offsets,
+ const ColumnString::Chars & replacement_data,
+ const ColumnString::Offsets & replacement_offsets,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets)
+ {
+ assert(haystack_offsets.size() == needle_offsets.size());
+ assert(needle_offsets.size() == replacement_offsets.size());
+
+ ColumnString::Offset res_offset = 0;
+ res_data.reserve(haystack_data.size());
+ size_t haystack_size = haystack_offsets.size();
+ res_offsets.resize(haystack_size);
+
+ re2_st::RE2::Options regexp_options;
+ /// Don't write error messages to stderr.
+ regexp_options.set_log_errors(false);
+
+ /// Cannot perform search for whole columns. Will process each string separately.
+ for (size_t i = 0; i < haystack_size; ++i)
+ {
+ size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
+ const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from);
+ const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1);
+
+ size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
+ const char * ndl_data = reinterpret_cast(needle_data.data() + ndl_from);
+ const size_t ndl_length = static_cast(needle_offsets[i] - ndl_from - 1);
+ std::string_view needle(ndl_data, ndl_length);
+
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
+ const char * repl_data = reinterpret_cast(replacement_data.data() + repl_from);
+ const size_t repl_length = static_cast(replacement_offsets[i] - repl_from - 1);
+
+ re2_st::RE2 searcher(needle, regexp_options);
+ if (!searcher.ok())
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error());
+ int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures);
+ Instructions instructions = createInstructions(std::string_view(repl_data, repl_length), num_captures);
+
+ processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions);
+ res_offsets[i] = res_offset;
+ }
+ }
+
+ static void vectorFixedConstantConstant(
+ const ColumnString::Chars & haystack_data,
+ size_t n,
+ const String & needle,
+ const String & replacement,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets)
+ {
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ ColumnString::Offset res_offset = 0;
+ size_t haystack_size = haystack_data.size() / n;
+ res_data.reserve(haystack_data.size());
+ res_offsets.resize(haystack_size);
+
+ re2_st::RE2::Options regexp_options;
+ /// Don't write error messages to stderr.
+ regexp_options.set_log_errors(false);
+
+ re2_st::RE2 searcher(needle, regexp_options);
+
+ if (!searcher.ok())
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error());
int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures);
Instructions instructions = createInstructions(replacement, num_captures);
- for (size_t i = 0; i < size; ++i)
+ for (size_t i = 0; i < haystack_size; ++i)
{
size_t from = i * n;
- const char * haystack_data = reinterpret_cast(data.data() + from);
- const size_t haystack_length = n;
+ const char * hs_data = reinterpret_cast(haystack_data.data() + from);
+ const size_t hs_length = n;
- processString(haystack_data, haystack_length, res_data, res_offset, searcher, num_captures, instructions);
+ processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions);
res_offsets[i] = res_offset;
}
}
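The new non-constant-needle paths above must build a fresh re2_st::RE2 searcher and a fresh instruction list per row, while the constant paths hoist both out of the loop. A rough stand-alone illustration of that cost difference, using std::regex instead of the re2_st wrapper (which is not part of this patch):

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> haystacks{"a1b2", "c3d4"};

    // Constant pattern: compiled once, reused for every row (vectorConstantConstant's shape).
    std::regex digits("[0-9]");
    for (const auto & h : haystacks)
        std::cout << std::regex_replace(h, digits, "*") << '\n';

    // Per-row pattern: each row pays the regex-compilation cost (vectorVectorConstant's shape).
    std::vector<std::string> needles{"[ab]", "[cd]"};
    for (size_t i = 0; i < haystacks.size(); ++i)
    {
        std::regex per_row(needles[i]);
        std::cout << std::regex_replace(haystacks[i], per_row, "*") << '\n';
    }
}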
diff --git a/src/Functions/ReplaceStringImpl.h b/src/Functions/ReplaceStringImpl.h
index 1a9ec49c58c..cd2dc1d3636 100644
--- a/src/Functions/ReplaceStringImpl.h
+++ b/src/Functions/ReplaceStringImpl.h
@@ -8,6 +8,11 @@
namespace DB
{
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
struct ReplaceStringTraits
{
enum class Replace
@@ -16,27 +21,33 @@ struct ReplaceStringTraits
All
};
};
-/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants.
+
+/** Replace one or all occurrences of substring 'needle' with 'replacement'.
*/
-template
+struct ReplaceStringImpl
{
- static void vector(
- const ColumnString::Chars & data,
- const ColumnString::Offsets & offsets,
- const std::string & needle,
- const std::string & replacement,
+ static constexpr auto name = Name::name;
+
+ static void vectorConstantConstant(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
+ const String & needle,
+ const String & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
- const UInt8 * begin = data.data();
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ const UInt8 * const begin = haystack_data.data();
+ const UInt8 * const end = haystack_data.data() + haystack_data.size();
const UInt8 * pos = begin;
- const UInt8 * end = pos + data.size();
ColumnString::Offset res_offset = 0;
- res_data.reserve(data.size());
- size_t size = offsets.size();
- res_offsets.resize(size);
+ res_data.reserve(haystack_data.size());
+ const size_t haystack_size = haystack_offsets.size();
+ res_offsets.resize(haystack_size);
/// The current index in the array of strings.
size_t i = 0;
@@ -53,22 +64,22 @@ struct ReplaceStringImpl
memcpy(&res_data[res_offset], pos, match - pos);
/// Determine which index it belongs to.
- while (i < offsets.size() && begin + offsets[i] <= match)
+ while (i < haystack_offsets.size() && begin + haystack_offsets[i] <= match)
{
- res_offsets[i] = res_offset + ((begin + offsets[i]) - pos);
+ res_offsets[i] = res_offset + ((begin + haystack_offsets[i]) - pos);
++i;
}
res_offset += (match - pos);
/// If you have reached the end, it's time to stop
- if (i == offsets.size())
+ if (i == haystack_offsets.size())
break;
/// Is it true that this string no longer needs to perform transformations.
bool can_finish_current_string = false;
/// We check that the entry does not go through the boundaries of strings.
- if (match + needle.size() < begin + offsets[i])
+ if (match + needle.size() < begin + haystack_offsets[i])
{
res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
@@ -85,34 +96,268 @@ struct ReplaceStringImpl
if (can_finish_current_string)
{
- res_data.resize(res_data.size() + (begin + offsets[i] - pos));
- memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos));
- res_offset += (begin + offsets[i] - pos);
+ res_data.resize(res_data.size() + (begin + haystack_offsets[i] - pos));
+ memcpy(&res_data[res_offset], pos, (begin + haystack_offsets[i] - pos));
+ res_offset += (begin + haystack_offsets[i] - pos);
res_offsets[i] = res_offset;
- pos = begin + offsets[i];
+ pos = begin + haystack_offsets[i];
++i;
}
}
}
- /// Note: this function converts fixed-length strings to variable-length strings
- /// and each variable-length string should ends with zero byte.
- static void vectorFixed(
- const ColumnString::Chars & data,
- size_t n,
- const std::string & needle,
- const std::string & replacement,
+ template
+ requires (sizeof(CharT) == 1)
+ static void copyToOutput(
+ const CharT * what_start, size_t what_size,
+ ColumnString::Chars & output, ColumnString::Offset & output_offset)
+ {
+ output.resize(output.size() + what_size);
+ memcpy(&output[output_offset], what_start, what_size);
+ output_offset += what_size;
+ }
+
+ static void vectorVectorConstant(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
+ const ColumnString::Chars & needle_data,
+ const ColumnString::Offsets & needle_offsets,
+ const String & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
- const UInt8 * begin = data.data();
- const UInt8 * pos = begin;
- const UInt8 * end = pos + data.size();
+ chassert(haystack_offsets.size() == needle_offsets.size());
+
+ const size_t haystack_size = haystack_offsets.size();
+
+ res_data.reserve(haystack_data.size());
+ res_offsets.resize(haystack_size);
ColumnString::Offset res_offset = 0;
- size_t count = data.size() / n;
- res_data.reserve(data.size());
- res_offsets.resize(count);
+
+ size_t prev_haystack_offset = 0;
+ size_t prev_needle_offset = 0;
+
+ for (size_t i = 0; i < haystack_size; ++i)
+ {
+ const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
+ const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
+
+ const auto * const cur_needle_data = &needle_data[prev_needle_offset];
+ const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1;
+
+ if (cur_needle_length == 0)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ /// Using "slow" stdlib searcher instead of Volnitsky because there is a different pattern in each row
+ StdLibASCIIStringSearcher searcher(cur_needle_data, cur_needle_length);
+
+ const auto * last_match = static_cast(nullptr);
+ const auto * start_pos = cur_haystack_data;
+ const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
+
+ while (start_pos < cur_haystack_end)
+ {
+ if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
+ {
+ /// Copy prefix before match
+ copyToOutput(start_pos, match - start_pos, res_data, res_offset);
+
+ /// Insert replacement for match
+ copyToOutput(replacement.data(), replacement.size(), res_data, res_offset);
+
+ last_match = match;
+ start_pos = match + cur_needle_length;
+
+ if constexpr (replace == ReplaceStringTraits::Replace::First)
+ break;
+ }
+ else
+ break;
+ }
+
+ /// Copy suffix after last match
+ size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
+ : (cur_haystack_end - last_match - cur_needle_length + 1);
+ copyToOutput(start_pos, bytes, res_data, res_offset);
+
+ res_offsets[i] = res_offset;
+
+ prev_haystack_offset = haystack_offsets[i];
+ prev_needle_offset = needle_offsets[i];
+ }
+ }
+
+ static void vectorConstantVector(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
+ const String & needle,
+ const ColumnString::Chars & replacement_data,
+ const ColumnString::Offsets & replacement_offsets,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets)
+ {
+ chassert(haystack_offsets.size() == replacement_offsets.size());
+
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ const size_t haystack_size = haystack_offsets.size();
+
+ res_data.reserve(haystack_data.size());
+ res_offsets.resize(haystack_size);
+
+ ColumnString::Offset res_offset = 0;
+
+ size_t prev_haystack_offset = 0;
+ size_t prev_replacement_offset = 0;
+
+ for (size_t i = 0; i < haystack_size; ++i)
+ {
+ const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
+ const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
+
+ const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset];
+ const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1;
+
+ /// Using "slow" stdlib searcher instead of Volnitsky just to keep things simple
+ StdLibASCIIStringSearcher searcher(needle.data(), needle.size());
+
+ const auto * last_match = static_cast(nullptr);
+ const auto * start_pos = cur_haystack_data;
+ const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
+
+ while (start_pos < cur_haystack_end)
+ {
+ if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
+ {
+ /// Copy prefix before match
+ copyToOutput(start_pos, match - start_pos, res_data, res_offset);
+
+ /// Insert replacement for match
+ copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);
+
+ last_match = match;
+ start_pos = match + needle.size();
+
+ if constexpr (replace == ReplaceStringTraits::Replace::First)
+ break;
+ }
+ else
+ break;
+ }
+
+ /// Copy suffix after last match
+ size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
+ : (cur_haystack_end - last_match - needle.size() + 1);
+ copyToOutput(start_pos, bytes, res_data, res_offset);
+
+ res_offsets[i] = res_offset;
+
+ prev_haystack_offset = haystack_offsets[i];
+ prev_replacement_offset = replacement_offsets[i];
+ }
+ }
+
+ static void vectorVectorVector(
+ const ColumnString::Chars & haystack_data,
+ const ColumnString::Offsets & haystack_offsets,
+ const ColumnString::Chars & needle_data,
+ const ColumnString::Offsets & needle_offsets,
+ const ColumnString::Chars & replacement_data,
+ const ColumnString::Offsets & replacement_offsets,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets)
+ {
+ chassert(haystack_offsets.size() == needle_offsets.size());
+ chassert(needle_offsets.size() == replacement_offsets.size());
+
+ const size_t haystack_size = haystack_offsets.size();
+
+ res_data.reserve(haystack_data.size());
+ res_offsets.resize(haystack_size);
+
+ ColumnString::Offset res_offset = 0;
+
+ size_t prev_haystack_offset = 0;
+ size_t prev_needle_offset = 0;
+ size_t prev_replacement_offset = 0;
+
+ for (size_t i = 0; i < haystack_size; ++i)
+ {
+ const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
+ const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
+
+ const auto * const cur_needle_data = &needle_data[prev_needle_offset];
+ const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1;
+
+ const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset];
+ const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1;
+
+ if (cur_needle_length == 0)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ /// Using "slow" stdlib searcher instead of Volnitsky because there is a different pattern in each row
+ StdLibASCIIStringSearcher searcher(cur_needle_data, cur_needle_length);
+
+ const auto * last_match = static_cast(nullptr);
+ const auto * start_pos = cur_haystack_data;
+ const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
+
+ while (start_pos < cur_haystack_end)
+ {
+ if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
+ {
+ /// Copy prefix before match
+ copyToOutput(start_pos, match - start_pos, res_data, res_offset);
+
+ /// Insert replacement for match
+ copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);
+
+ last_match = match;
+ start_pos = match + cur_needle_length;
+
+ if constexpr (replace == ReplaceStringTraits::Replace::First)
+ break;
+ }
+ else
+ break;
+ }
+
+ /// Copy suffix after last match
+ size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
+ : (cur_haystack_end - last_match - cur_needle_length + 1);
+ copyToOutput(start_pos, bytes, res_data, res_offset);
+
+ res_offsets[i] = res_offset;
+
+ prev_haystack_offset = haystack_offsets[i];
+ prev_needle_offset = needle_offsets[i];
+ prev_replacement_offset = replacement_offsets[i];
+ }
+ }
+
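All three vector* variants above share the same inner loop: find a match, copy the prefix, emit the replacement, and finally copy the suffix after the last match; Replace::First simply breaks after one hit. A compact sketch of that loop, with std::string_view::find standing in for StdLibASCIIStringSearcher:

#include <iostream>
#include <string>
#include <string_view>

std::string replaceAllOrFirst(std::string_view haystack, std::string_view needle,
                              std::string_view replacement, bool first_only)
{
    std::string out;
    size_t pos = 0;
    while (true)
    {
        size_t match = haystack.find(needle, pos);
        if (match == std::string_view::npos)
            break;
        out.append(haystack.substr(pos, match - pos)); // copy prefix before match
        out.append(replacement);                       // insert replacement for match
        pos = match + needle.size();
        if (first_only)                                // Replace::First semantics
            break;
    }
    out.append(haystack.substr(pos));                  // copy suffix after last match
    return out;
}

int main()
{
    std::cout << replaceAllOrFirst("aXbXc", "X", "-", false) << '\n'; // a-b-c
    std::cout << replaceAllOrFirst("aXbXc", "X", "-", true) << '\n';  // a-bXc
}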
+ /// Note: this function converts fixed-length strings to variable-length strings
+ /// and each variable-length string should end with zero byte.
+ static void vectorFixedConstantConstant(
+ const ColumnString::Chars & haystack_data,
+ size_t n,
+ const String & needle,
+ const String & replacement,
+ ColumnString::Chars & res_data,
+ ColumnString::Offsets & res_offsets)
+ {
+ if (needle.empty())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
+
+ const UInt8 * const begin = haystack_data.data();
+ const UInt8 * const end = haystack_data.data() + haystack_data.size();
+ const UInt8 * pos = begin;
+
+ ColumnString::Offset res_offset = 0;
+ size_t haystack_size = haystack_data.size() / n;
+ res_data.reserve(haystack_data.size());
+ res_offsets.resize(haystack_size);
/// The current index in the string array.
size_t i = 0;
@@ -139,13 +384,13 @@
/// Copy skipped strings without any changes but
/// add zero byte to the end of each string.
- while (i < count && begin + n * (i + 1) <= match)
+ while (i < haystack_size && begin + n * (i + 1) <= match)
{
COPY_REST_OF_CURRENT_STRING();
}
/// If you have reached the end, it's time to stop
- if (i == count)
+ if (i == haystack_size)
break;
/// Copy unchanged part of current string.
diff --git a/src/Functions/array/arrayEnumerateExtended.h b/src/Functions/array/arrayEnumerateExtended.h
index 3f145c05b54..cf38afcfa5a 100644
--- a/src/Functions/array/arrayEnumerateExtended.h
+++ b/src/Functions/array/arrayEnumerateExtended.h
@@ -8,6 +8,7 @@
#include
#include
#include
+#include
#include
#include
diff --git a/src/Functions/array/arrayEnumerateRanked.h b/src/Functions/array/arrayEnumerateRanked.h
index 8a348c07421..0733f1e2d43 100644
--- a/src/Functions/array/arrayEnumerateRanked.h
+++ b/src/Functions/array/arrayEnumerateRanked.h
@@ -9,6 +9,7 @@
#include
#include
#include
+#include
#include
#include
diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp
index a15611579bb..4eefeaa9f86 100644
--- a/src/Functions/extractTextFromHTML.cpp
+++ b/src/Functions/extractTextFromHTML.cpp
@@ -70,16 +70,16 @@ namespace ErrorCodes
namespace
{
-inline bool startsWith(const char * s, const char * end, const char * prefix)
+inline bool startsWith(const char * s, const char * end, const std::string_view prefix)
{
- return s + strlen(prefix) < end && 0 == memcmp(s, prefix, strlen(prefix));
+ return s + prefix.length() < end && 0 == memcmp(s, prefix.data(), prefix.length());
}
-inline bool checkAndSkip(const char * __restrict & s, const char * end, const char * prefix)
+inline bool checkAndSkip(const char * __restrict & s, const char * end, const std::string_view prefix)
{
if (startsWith(s, end, prefix))
{
- s += strlen(prefix);
+ s += prefix.length();
return true;
}
return false;
@@ -138,7 +138,7 @@ bool processCDATA(const char * __restrict & src, const char * end, char * __rest
return true;
}
-bool processElementAndSkipContent(const char * __restrict & src, const char * end, const char * tag_name)
+bool processElementAndSkipContent(const char * __restrict & src, const char * end, const std::string_view tag_name)
{
const auto * old_src = src;
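The extractTextFromHTML change above swaps const char * prefixes for std::string_view, so the length travels with the argument instead of being recomputed with strlen() on every comparison. A small self-contained check of the patched startsWith() logic:

#include <cstring>
#include <iostream>
#include <string_view>

inline bool startsWith(const char * s, const char * end, std::string_view prefix)
{
    // Same bound check as the patched code: the prefix must fit strictly before `end`.
    return s + prefix.length() < end && 0 == std::memcmp(s, prefix.data(), prefix.length());
}

int main()
{
    const char * text = "<script>alert(1)</script>";
    const char * end = text + std::strlen(text);
    std::cout << startsWith(text, end, "<script>") << '\n'; // 1
    std::cout << startsWith(text, end, "<style>") << '\n';  // 0
}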
diff --git a/src/Functions/formatDateTime.cpp b/src/Functions/formatDateTime.cpp
index dd96a44c17b..aac7ed1ad4d 100644
--- a/src/Functions/formatDateTime.cpp
+++ b/src/Functions/formatDateTime.cpp
@@ -17,6 +17,7 @@
#include
+#include
#include
#include
#include
@@ -38,22 +39,19 @@ namespace ErrorCodes
namespace
{
+using Pos = const char *;
-struct FormatDateTimeTraits
+enum class SupportInteger
{
- enum class SupportInteger
- {
- Yes,
- No
- };
-
- enum class FormatSyntax
- {
- MySQL,
- Joda
- };
+ Yes,
+ No
};
+enum class FormatSyntax
+{
+ MySQL,
+ Joda
+};
template struct InstructionValueTypeMap {};
template <> struct InstructionValueTypeMap { using InstructionValueType = UInt32; };
@@ -85,11 +83,9 @@ constexpr std::string_view weekdaysFull[] = {"Sunday", "Monday", "Tuesday", "Wed
constexpr std::string_view weekdaysShort[] = {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
-constexpr std::string_view monthsFull[]
- = {"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"};
+constexpr std::string_view monthsFull[] = {"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"};
-constexpr std::string_view monthsShort[]
- = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
+constexpr std::string_view monthsShort[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
/** formatDateTime(time, 'format')
* Performs formatting of time, according to provided format.
@@ -115,13 +111,13 @@ constexpr std::string_view monthsShort[]
*
* Performance on Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz:
*
- * WITH formatDateTime(now() + number, '%H:%M:%S') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x);
+ * WITH formatDateTime(now() + number, '%H:%i:%S') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x);
* - 97 million rows per second per core;
*
* WITH formatDateTime(toDateTime('2018-01-01 00:00:00') + number, '%F %T') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x)
* - 71 million rows per second per core;
*
- * select count() from (select formatDateTime(t, '%m/%d/%Y %H:%M:%S') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000)));
+ * select count() from (select formatDateTime(t, '%m/%d/%Y %H:%i:%S') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000)));
* - 53 million rows per second per core;
*
* select count() from (select formatDateTime(t, 'Hello %Y World') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000)));
@@ -129,7 +125,7 @@ constexpr std::string_view monthsShort[]
*
* PS. We can make this function to return FixedString. Currently it returns String.
*/
-template
+class FunctionFormatDateTimeImpl : public IFunction
{
private:
@@ -152,26 +148,34 @@ private:
class Instruction
{
public:
- /// Using std::function will cause performance degradation in MySQL format by 0.45x.
- /// But std::function is required for Joda format to capture extra variables.
- /// This is the reason why we use raw function pointer in MySQL format and std::function
- /// in Joda format.
- using Func = std::conditional_t<
- format_syntax == FormatDateTimeTraits::FormatSyntax::MySQL,
- size_t (*)(char *, Time, UInt64, UInt32, const DateLUTImpl &),
- std::function>;
+ /// Joda format generally requires capturing extra variables (i.e. holding state) which is more convenient with
+ /// std::function and std::bind. Unfortunately, std::function causes a performance degradation by 0.45x compared to raw function
+ /// pointers. For MySQL format, we generally prefer raw function pointers. Because of the special case that not all formatters are
+ /// fixed-width formatters (see mysqlLiteral instruction), we still need to be able to store state. For that reason, we use member
+ /// function pointers instead of static function pointers.
+ using FuncMysql = size_t (Instruction