diff --git a/.clang-tidy b/.clang-tidy index 860e7b3189f..532b0f37b81 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -22,6 +22,8 @@ Checks: '*, -bugprone-implicit-widening-of-multiplication-result, -bugprone-narrowing-conversions, -bugprone-not-null-terminated-result, + -bugprone-unchecked-optional-access, + -bugprone-assignment-in-if-condition, -cert-dcl16-c, -cert-err58-cpp, @@ -103,6 +105,7 @@ Checks: '*, -misc-no-recursion, -misc-non-private-member-variables-in-classes, + -misc-const-correctness, -modernize-avoid-c-arrays, -modernize-concat-nested-namespaces, @@ -114,6 +117,7 @@ Checks: '*, -modernize-use-nodiscard, -modernize-use-override, -modernize-use-trailing-return-type, + -modernize-macro-to-enum, -performance-inefficient-string-concatenation, -performance-no-int-to-ptr, @@ -135,6 +139,7 @@ Checks: '*, -readability-suspicious-call-argument, -readability-uppercase-literal-suffix, -readability-use-anyofallof, + -readability-simplify-boolean-expr, -zirkon-*, ' diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000000..06e893fabb3 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,15 @@ +# This is a file that can be used by git-blame to ignore some revisions. +# (git 2.23+, released in August 2019) +# +# Can be configured as follows: +# +# $ git config blame.ignoreRevsFile .git-blame-ignore-revs +# +# For more information, see the git-blame(1) man page. + +# Changed tabs to spaces in code [#CLICKHOUSE-3] +137ad95929ee016cc6d3c03bccb5586941c163ff + +# dbms/ → src/ +# (though it is unlikely that you will see it in blame) +06446b4f08a142d6f1bc30664c47ded88ab51782 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ae905aa62ba..001f6d9e669 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -30,10 +30,11 @@ jobs: cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" cd "$REPO_COPY" # Download and push packages to artifactory - python3 ./tests/ci/push_to_artifactory.py --release "${{ github.ref }}" \ - --commit '${{ github.sha }}' --artifactory-url "${{ secrets.JFROG_ARTIFACTORY_URL }}" --all + python3 ./tests/ci/push_to_artifactory.py --release '${{ github.ref }}' \ + --commit '${{ github.sha }}' --artifactory-url '${{ secrets.JFROG_ARTIFACTORY_URL }}' --all # Download macos binaries to ${{runner.temp}}/download_binary - python3 ./tests/ci/download_binary.py binary_darwin binary_darwin_aarch64 + python3 ./tests/ci/download_binary.py --version '${{ github.ref }}' \ + --commit '${{ github.sha }}' binary_darwin binary_darwin_aarch64 mv '${{runner.temp}}/download_binary/'clickhouse-* '${{runner.temp}}/push_to_artifactory' - name: Upload packages to release assets uses: svenstaro/upload-release-action@v2 diff --git a/.github/workflows/tags_stable.yml b/.github/workflows/tags_stable.yml index 9711f7688cb..a9172a8a2e2 100644 --- a/.github/workflows/tags_stable.yml +++ b/.github/workflows/tags_stable.yml @@ -43,6 +43,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }} run: | ./utils/list-versions/list-versions.sh > ./utils/list-versions/version_date.tsv + ./utils/list-versions/update-docker-version.sh GID=$(id -g "${UID}") docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 \ --volume="${GITHUB_WORKSPACE}:/ClickHouse" clickhouse/style-test \ diff --git a/.gitignore b/.gitignore index e517dfd63c2..dd632eba85d 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,10 @@ cmake_install.cmake CTestTestfile.cmake *.a *.o +*.so +*.dll +*.lib +*.dylib cmake-build-* # Python cache diff --git 
a/base/base/ReplxxLineReader.cpp b/base/base/ReplxxLineReader.cpp index b7c18110503..75c48f690f8 100644 --- a/base/base/ReplxxLineReader.cpp +++ b/base/base/ReplxxLineReader.cpp @@ -220,6 +220,35 @@ ReplxxLineReader::ReplxxLineReader( rx.bind_key(Replxx::KEY::control('W'), [this](char32_t code) { return rx.invoke(Replxx::ACTION::KILL_TO_WHITESPACE_ON_LEFT, code); }); rx.bind_key(Replxx::KEY::meta('E'), [this](char32_t) { openEditor(); return Replxx::ACTION_RESULT::CONTINUE; }); + + /// readline insert-comment + auto insert_comment_action = [this](char32_t code) + { + replxx::Replxx::State state(rx.get_state()); + const char * line = state.text(); + const char * line_end = line + strlen(line); + + std::string commented_line; + if (std::find(line, line_end, '\n') != line_end) + { + /// If the query has multiple lines, a single multiline comment is used instead of + /// commenting each line separately, which makes it easier to uncomment (though + /// when invoking an editor it is simpler to uncomment multiple lines anyway). + /// + /// Note that using a multiline comment is OK even if the query itself contains + /// comments, since nested comments are supported. + commented_line = fmt::format("/* {} */", state.text()); + } + else + { + // In the simplest case, use a single-line comment. + commented_line = fmt::format("-- {}", state.text()); + } + rx.set_state(replxx::Replxx::State(commented_line.c_str(), commented_line.size())); + + return rx.invoke(Replxx::ACTION::COMMIT_LINE, code); + }; + rx.bind_key(Replxx::KEY::meta('#'), insert_comment_action); } ReplxxLineReader::~ReplxxLineReader() diff --git a/cmake/clang_tidy.cmake b/cmake/clang_tidy.cmake index fc25c68b11a..200282234ca 100644 --- a/cmake/clang_tidy.cmake +++ b/cmake/clang_tidy.cmake @@ -3,7 +3,7 @@ option (ENABLE_CLANG_TIDY "Use clang-tidy static analyzer" OFF) if (ENABLE_CLANG_TIDY) - find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") + find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") if (CLANG_TIDY_PATH) message(STATUS diff --git a/cmake/target.cmake b/cmake/target.cmake index 0fb5e8a20de..ae360758701 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -45,6 +45,7 @@ if (CMAKE_CROSSCOMPILING) endif () if (USE_MUSL) + # use of undeclared identifier 'PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP' set (ENABLE_SENTRY OFF CACHE INTERNAL "") set (ENABLE_ODBC OFF CACHE INTERNAL "") set (ENABLE_GRPC OFF CACHE INTERNAL "") diff --git a/contrib/NuRaft b/contrib/NuRaft index 1be805e7cb2..e15858f8ad0 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 1be805e7cb2494aa8170015493474379b0362dfc +Subproject commit e15858f8ad0ce8aba85cf74e3763874c76bf927c diff --git a/contrib/c-ares-cmake/CMakeLists.txt b/contrib/c-ares-cmake/CMakeLists.txt index 603c1f8b65c..4b1170f9dd1 100644 --- a/contrib/c-ares-cmake/CMakeLists.txt +++ b/contrib/c-ares-cmake/CMakeLists.txt @@ -1,35 +1,95 @@ -# Choose to build static or shared library for c-ares. 
-if (USE_STATIC_LIBRARIES) - set(CARES_STATIC ON CACHE BOOL "" FORCE) - set(CARES_SHARED OFF CACHE BOOL "" FORCE) -else () - set(CARES_STATIC OFF CACHE BOOL "" FORCE) - set(CARES_SHARED ON CACHE BOOL "" FORCE) -endif () +set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/c-ares") -# Disable looking for libnsl on a platforms that has gethostbyname in glibc -# -# c-ares searching for gethostbyname in the libnsl library, however in the -# version that shipped with gRPC it doing it wrong [1], since it uses -# CHECK_LIBRARY_EXISTS(), which will return TRUE even if the function exists in -# another dependent library. The upstream already contains correct macro [2], -# but it is not included in gRPC (even upstream gRPC, not the one that is -# shipped with clickhousee). -# -# [1]: https://github.com/c-ares/c-ares/blob/e982924acee7f7313b4baa4ee5ec000c5e373c30/CMakeLists.txt#L125 -# [2]: https://github.com/c-ares/c-ares/blob/44fbc813685a1fa8aa3f27fcd7544faf612d376a/CMakeLists.txt#L146 -# -# And because if you by some reason have libnsl [3] installed, clickhouse will -# reject to start w/o it. While this is completelly different library. -# -# [3]: https://packages.debian.org/bullseye/libnsl2 -if (NOT CMAKE_SYSTEM_NAME STREQUAL "SunOS") - set(HAVE_LIBNSL OFF CACHE BOOL "" FORCE) +# Generated from contrib/c-ares/src/lib/Makefile.inc +SET(SRCS + "${LIBRARY_DIR}/src/lib/ares__addrinfo2hostent.c" + "${LIBRARY_DIR}/src/lib/ares__addrinfo_localhost.c" + "${LIBRARY_DIR}/src/lib/ares__close_sockets.c" + "${LIBRARY_DIR}/src/lib/ares__get_hostent.c" + "${LIBRARY_DIR}/src/lib/ares__parse_into_addrinfo.c" + "${LIBRARY_DIR}/src/lib/ares__readaddrinfo.c" + "${LIBRARY_DIR}/src/lib/ares__sortaddrinfo.c" + "${LIBRARY_DIR}/src/lib/ares__read_line.c" + "${LIBRARY_DIR}/src/lib/ares__timeval.c" + "${LIBRARY_DIR}/src/lib/ares_android.c" + "${LIBRARY_DIR}/src/lib/ares_cancel.c" + "${LIBRARY_DIR}/src/lib/ares_data.c" + "${LIBRARY_DIR}/src/lib/ares_destroy.c" + "${LIBRARY_DIR}/src/lib/ares_expand_name.c" + "${LIBRARY_DIR}/src/lib/ares_expand_string.c" + "${LIBRARY_DIR}/src/lib/ares_fds.c" + "${LIBRARY_DIR}/src/lib/ares_free_hostent.c" + "${LIBRARY_DIR}/src/lib/ares_free_string.c" + "${LIBRARY_DIR}/src/lib/ares_freeaddrinfo.c" + "${LIBRARY_DIR}/src/lib/ares_getaddrinfo.c" + "${LIBRARY_DIR}/src/lib/ares_getenv.c" + "${LIBRARY_DIR}/src/lib/ares_gethostbyaddr.c" + "${LIBRARY_DIR}/src/lib/ares_gethostbyname.c" + "${LIBRARY_DIR}/src/lib/ares_getnameinfo.c" + "${LIBRARY_DIR}/src/lib/ares_getsock.c" + "${LIBRARY_DIR}/src/lib/ares_init.c" + "${LIBRARY_DIR}/src/lib/ares_library_init.c" + "${LIBRARY_DIR}/src/lib/ares_llist.c" + "${LIBRARY_DIR}/src/lib/ares_mkquery.c" + "${LIBRARY_DIR}/src/lib/ares_create_query.c" + "${LIBRARY_DIR}/src/lib/ares_nowarn.c" + "${LIBRARY_DIR}/src/lib/ares_options.c" + "${LIBRARY_DIR}/src/lib/ares_parse_a_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_aaaa_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_caa_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_mx_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_naptr_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_ns_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_ptr_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_soa_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_srv_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_txt_reply.c" + "${LIBRARY_DIR}/src/lib/ares_parse_uri_reply.c" + "${LIBRARY_DIR}/src/lib/ares_platform.c" + "${LIBRARY_DIR}/src/lib/ares_process.c" + "${LIBRARY_DIR}/src/lib/ares_query.c" + "${LIBRARY_DIR}/src/lib/ares_search.c" + "${LIBRARY_DIR}/src/lib/ares_send.c" + 
"${LIBRARY_DIR}/src/lib/ares_strcasecmp.c" + "${LIBRARY_DIR}/src/lib/ares_strdup.c" + "${LIBRARY_DIR}/src/lib/ares_strerror.c" + "${LIBRARY_DIR}/src/lib/ares_strsplit.c" + "${LIBRARY_DIR}/src/lib/ares_timeout.c" + "${LIBRARY_DIR}/src/lib/ares_version.c" + "${LIBRARY_DIR}/src/lib/ares_writev.c" + "${LIBRARY_DIR}/src/lib/bitncmp.c" + "${LIBRARY_DIR}/src/lib/inet_net_pton.c" + "${LIBRARY_DIR}/src/lib/inet_ntop.c" + "${LIBRARY_DIR}/src/lib/windows_port.c" +) + +if (USE_STATIC_LIBRARIES) + add_library(_c-ares STATIC ${SRCS}) + target_compile_definitions(_c-ares PUBLIC CARES_STATICLIB) +else() + add_library(_c-ares SHARED ${SRCS}) + target_compile_definitions(_c-ares PUBLIC CARES_BUILDING_LIBRARY) endif() -# Force use of c-ares inet_net_pton instead of libresolv one -set(HAVE_INET_NET_PTON OFF CACHE BOOL "" FORCE) +target_compile_definitions(_c-ares PRIVATE HAVE_CONFIG_H=1) -add_subdirectory("../c-ares/" "../c-ares/") +target_include_directories(_c-ares SYSTEM PUBLIC + "${LIBRARY_DIR}/src/lib" + "${LIBRARY_DIR}/include" +) -add_library(ch_contrib::c-ares ALIAS c-ares) \ No newline at end of file +# Platform-specific include directories. The original build system does a lot of checks to eventually generate two header files with defines: +# ares_build.h and ares_config.h. To update, run the original CMake build in c-ares for each platform and copy the headers into the +# platform-specific folder. +# For the platform-specific compile definitions, see c-ares top-level CMakeLists.txt. +if (OS_LINUX) + target_include_directories(_c-ares SYSTEM PUBLIC "${ClickHouse_SOURCE_DIR}/contrib/c-ares-cmake/linux") + target_compile_definitions(_c-ares PRIVATE -D_GNU_SOURCE -D_POSIX_C_SOURCE=199309L -D_XOPEN_SOURCE=600) +elseif (OS_DARWIN) + target_include_directories(_c-ares SYSTEM PUBLIC "${ClickHouse_SOURCE_DIR}/contrib/c-ares-cmake/darwin") + target_compile_definitions(_c-ares PRIVATE -D_DARWIN_C_SOURCE) +elseif (OS_FREEBSD) + target_include_directories(_c-ares SYSTEM PUBLIC "${ClickHouse_SOURCE_DIR}/contrib/c-ares-cmake/freebsd") +endif() + +add_library(ch_contrib::c-ares ALIAS _c-ares) diff --git a/contrib/c-ares-cmake/darwin/ares_build.h b/contrib/c-ares-cmake/darwin/ares_build.h new file mode 100644 index 00000000000..bf7402e7997 --- /dev/null +++ b/contrib/c-ares-cmake/darwin/ares_build.h @@ -0,0 +1,43 @@ +#ifndef __CARES_BUILD_H +#define __CARES_BUILD_H + +#define CARES_TYPEOF_ARES_SOCKLEN_T socklen_t +#define CARES_TYPEOF_ARES_SSIZE_T ssize_t + +/* Prefix names with CARES_ to make sure they don't conflict with other config.h + * files. 
We need to include some dependent headers that may be system specific + * for C-Ares */ +#define CARES_HAVE_SYS_TYPES_H +#define CARES_HAVE_SYS_SOCKET_H +/* #undef CARES_HAVE_WINDOWS_H */ +/* #undef CARES_HAVE_WS2TCPIP_H */ +/* #undef CARES_HAVE_WINSOCK2_H */ +/* #undef CARES_HAVE_WINDOWS_H */ +#define CARES_HAVE_ARPA_NAMESER_H +#define CARES_HAVE_ARPA_NAMESER_COMPAT_H + +#ifdef CARES_HAVE_SYS_TYPES_H +# include +#endif + +#ifdef CARES_HAVE_SYS_SOCKET_H +# include +#endif + +#ifdef CARES_HAVE_WINSOCK2_H +# include +#endif + +#ifdef CARES_HAVE_WS2TCPIP_H +# include +#endif + +#ifdef CARES_HAVE_WINDOWS_H +# include +#endif + + +typedef CARES_TYPEOF_ARES_SOCKLEN_T ares_socklen_t; +typedef CARES_TYPEOF_ARES_SSIZE_T ares_ssize_t; + +#endif /* __CARES_BUILD_H */ diff --git a/contrib/c-ares-cmake/darwin/ares_config.h b/contrib/c-ares-cmake/darwin/ares_config.h new file mode 100644 index 00000000000..64af3836f3f --- /dev/null +++ b/contrib/c-ares-cmake/darwin/ares_config.h @@ -0,0 +1,432 @@ +/* Generated from ares_config.h.cmake */ + +/* Define if building universal (internal helper macro) */ +#undef AC_APPLE_UNIVERSAL_BUILD + +/* define this if ares is built for a big endian system */ +#undef ARES_BIG_ENDIAN + +/* when building as static part of libcurl */ +#undef BUILDING_LIBCURL + +/* Defined for build that exposes internal static functions for testing. */ +#undef CARES_EXPOSE_STATICS + +/* Defined for build with symbol hiding. */ +#undef CARES_SYMBOL_HIDING + +/* Definition to make a library symbol externally visible. */ +#undef CARES_SYMBOL_SCOPE_EXTERN + +/* Use resolver library to configure cares */ +/* #undef CARES_USE_LIBRESOLV */ + +/* if a /etc/inet dir is being used */ +#undef ETC_INET + +/* Define to the type of arg 2 for gethostname. */ +#define GETHOSTNAME_TYPE_ARG2 size_t + +/* Define to the type qualifier of arg 1 for getnameinfo. */ +#define GETNAMEINFO_QUAL_ARG1 + +/* Define to the type of arg 1 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG1 struct sockaddr * + +/* Define to the type of arg 2 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG2 socklen_t + +/* Define to the type of args 4 and 6 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG46 socklen_t + +/* Define to the type of arg 7 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG7 int + +/* Specifies the number of arguments to getservbyport_r */ +#define GETSERVBYPORT_R_ARGS + +/* Specifies the number of arguments to getservbyname_r */ +#define GETSERVBYNAME_R_ARGS + +/* Define to 1 if you have AF_INET6. */ +#define HAVE_AF_INET6 + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_COMPAT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ASSERT_H + +/* Define to 1 if you have the `bitncmp' function. */ +/* #undef HAVE_BITNCMP */ + +/* Define to 1 if bool is an available type. */ +#define HAVE_BOOL_T + +/* Define to 1 if you have the clock_gettime function and monotonic timer. */ +#define HAVE_CLOCK_GETTIME_MONOTONIC + +/* Define to 1 if you have the closesocket function. */ +/* #undef HAVE_CLOSESOCKET */ + +/* Define to 1 if you have the CloseSocket camel case function. */ +/* #undef HAVE_CLOSESOCKET_CAMEL */ + +/* Define to 1 if you have the connect function. */ +#define HAVE_CONNECT + +/* define if the compiler supports basic C++11 syntax */ +/* #undef HAVE_CXX11 */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_DLFCN_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H + +/* Define to 1 if you have the fcntl function. */ +#define HAVE_FCNTL + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H + +/* Define to 1 if you have a working fcntl O_NONBLOCK function. */ +#define HAVE_FCNTL_O_NONBLOCK + +/* Define to 1 if you have the freeaddrinfo function. */ +#define HAVE_FREEADDRINFO + +/* Define to 1 if you have a working getaddrinfo function. */ +#define HAVE_GETADDRINFO + +/* Define to 1 if the getaddrinfo function is threadsafe. */ +#define HAVE_GETADDRINFO_THREADSAFE + +/* Define to 1 if you have the getenv function. */ +#define HAVE_GETENV + +/* Define to 1 if you have the gethostbyaddr function. */ +#define HAVE_GETHOSTBYADDR + +/* Define to 1 if you have the gethostbyname function. */ +#define HAVE_GETHOSTBYNAME + +/* Define to 1 if you have the gethostname function. */ +#define HAVE_GETHOSTNAME + +/* Define to 1 if you have the getnameinfo function. */ +#define HAVE_GETNAMEINFO + +/* Define to 1 if you have the getservbyport_r function. */ +/* #undef HAVE_GETSERVBYPORT_R */ + +/* Define to 1 if you have the getservbyname_r function. */ +/* #undef HAVE_GETSERVBYNAME_R */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY + +/* Define to 1 if you have the `if_indextoname' function. */ +#define HAVE_IF_INDEXTONAME + +/* Define to 1 if you have a IPv6 capable working inet_net_pton function. */ +/* #undef HAVE_INET_NET_PTON */ + +/* Define to 1 if you have a IPv6 capable working inet_ntop function. */ +#define HAVE_INET_NTOP + +/* Define to 1 if you have a IPv6 capable working inet_pton function. */ +#define HAVE_INET_PTON + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H + +/* Define to 1 if you have the ioctl function. */ +#define HAVE_IOCTL + +/* Define to 1 if you have the ioctlsocket function. */ +/* #undef HAVE_IOCTLSOCKET */ + +/* Define to 1 if you have the IoctlSocket camel case function. */ +/* #undef HAVE_IOCTLSOCKET_CAMEL */ + +/* Define to 1 if you have a working IoctlSocket camel case FIONBIO function. + */ +/* #undef HAVE_IOCTLSOCKET_CAMEL_FIONBIO */ + +/* Define to 1 if you have a working ioctlsocket FIONBIO function. */ +/* #undef HAVE_IOCTLSOCKET_FIONBIO */ + +/* Define to 1 if you have a working ioctl FIONBIO function. */ +#define HAVE_IOCTL_FIONBIO + +/* Define to 1 if you have a working ioctl SIOCGIFADDR function. */ +#define HAVE_IOCTL_SIOCGIFADDR + +/* Define to 1 if you have the `resolve' library (-lresolve). */ +/* #undef HAVE_LIBRESOLV */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H + +/* if your compiler supports LL */ +#define HAVE_LL + +/* Define to 1 if the compiler supports the 'long long' data type. */ +#define HAVE_LONGLONG + +/* Define to 1 if you have the malloc.h header file. */ +/* #undef HAVE_MALLOC_H */ + +/* Define to 1 if you have the memory.h header file. */ +#define HAVE_MEMORY_H + +/* Define to 1 if you have the MSG_NOSIGNAL flag. */ +/* #undef HAVE_MSG_NOSIGNAL */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_IN_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NET_IF_H + +/* Define to 1 if you have PF_INET6. */ +#define HAVE_PF_INET6 + +/* Define to 1 if you have the recv function. 
*/ +#define HAVE_RECV + +/* Define to 1 if you have the recvfrom function. */ +#define HAVE_RECVFROM + +/* Define to 1 if you have the send function. */ +#define HAVE_SEND + +/* Define to 1 if you have the setsockopt function. */ +#define HAVE_SETSOCKOPT + +/* Define to 1 if you have a working setsockopt SO_NONBLOCK function. */ +/* #undef HAVE_SETSOCKOPT_SO_NONBLOCK */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H + +/* Define to 1 if sig_atomic_t is an available typedef. */ +#define HAVE_SIG_ATOMIC_T + +/* Define to 1 if sig_atomic_t is already defined as volatile. */ +/* #undef HAVE_SIG_ATOMIC_T_VOLATILE */ + +/* Define to 1 if your struct sockaddr_in6 has sin6_scope_id. */ +#define HAVE_SOCKADDR_IN6_SIN6_SCOPE_ID + +/* Define to 1 if you have the socket function. */ +#define HAVE_SOCKET + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SOCKET_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDBOOL_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H + +/* Define to 1 if you have the strcasecmp function. */ +#define HAVE_STRCASECMP + +/* Define to 1 if you have the strcmpi function. */ +/* #undef HAVE_STRCMPI */ + +/* Define to 1 if you have the strdup function. */ +#define HAVE_STRDUP + +/* Define to 1 if you have the stricmp function. */ +/* #undef HAVE_STRICMP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H + +/* Define to 1 if you have the strncasecmp function. */ +#define HAVE_STRNCASECMP + +/* Define to 1 if you have the strncmpi function. */ +/* #undef HAVE_STRNCMPI */ + +/* Define to 1 if you have the strnicmp function. */ +/* #undef HAVE_STRNICMP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_STROPTS_H */ + +/* Define to 1 if you have struct addrinfo. */ +#define HAVE_STRUCT_ADDRINFO + +/* Define to 1 if you have struct in6_addr. */ +#define HAVE_STRUCT_IN6_ADDR + +/* Define to 1 if you have struct sockaddr_in6. */ +#define HAVE_STRUCT_SOCKADDR_IN6 + +/* if struct sockaddr_storage is defined */ +#define HAVE_STRUCT_SOCKADDR_STORAGE + +/* Define to 1 if you have the timeval struct. */ +#define HAVE_STRUCT_TIMEVAL + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H + +/* Define to 1 if you have the header file. */ +#define HAVE_TIME_H + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H + +/* Define to 1 if you have the windows.h header file. */ +/* #undef HAVE_WINDOWS_H */ + +/* Define to 1 if you have the winsock2.h header file. */ +/* #undef HAVE_WINSOCK2_H */ + +/* Define to 1 if you have the winsock.h header file. */ +/* #undef HAVE_WINSOCK_H */ + +/* Define to 1 if you have the writev function. */ +#define HAVE_WRITEV + +/* Define to 1 if you have the ws2tcpip.h header file. 
*/ +/* #undef HAVE_WS2TCPIP_H */ + +/* Define to 1 if you have the __system_property_get function */ +#define HAVE___SYSTEM_PROPERTY_GET + +/* Define to 1 if you need the malloc.h header file even with stdlib.h */ +/* #undef NEED_MALLOC_H */ + +/* Define to 1 if you need the memory.h header file even with stdlib.h */ +/* #undef NEED_MEMORY_H */ + +/* a suitable file/device to read random data from */ +#define CARES_RANDOM_FILE "/dev/urandom" + +/* Define to the type qualifier pointed by arg 5 for recvfrom. */ +#define RECVFROM_QUAL_ARG5 + +/* Define to the type of arg 1 for recvfrom. */ +#define RECVFROM_TYPE_ARG1 int + +/* Define to the type pointed by arg 2 for recvfrom. */ +#define RECVFROM_TYPE_ARG2 void * + +/* Define to 1 if the type pointed by arg 2 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG2_IS_VOID 0 + +/* Define to the type of arg 3 for recvfrom. */ +#define RECVFROM_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for recvfrom. */ +#define RECVFROM_TYPE_ARG4 int + +/* Define to the type pointed by arg 5 for recvfrom. */ +#define RECVFROM_TYPE_ARG5 struct sockaddr * + +/* Define to 1 if the type pointed by arg 5 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG5_IS_VOID 0 + +/* Define to the type pointed by arg 6 for recvfrom. */ +#define RECVFROM_TYPE_ARG6 socklen_t * + +/* Define to 1 if the type pointed by arg 6 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG6_IS_VOID 0 + +/* Define to the function return type for recvfrom. */ +#define RECVFROM_TYPE_RETV ssize_t + +/* Define to the type of arg 1 for recv. */ +#define RECV_TYPE_ARG1 int + +/* Define to the type of arg 2 for recv. */ +#define RECV_TYPE_ARG2 void * + +/* Define to the type of arg 3 for recv. */ +#define RECV_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for recv. */ +#define RECV_TYPE_ARG4 int + +/* Define to the function return type for recv. */ +#define RECV_TYPE_RETV ssize_t + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE + +/* Define to the type qualifier of arg 2 for send. */ +#define SEND_QUAL_ARG2 + +/* Define to the type of arg 1 for send. */ +#define SEND_TYPE_ARG1 int + +/* Define to the type of arg 2 for send. */ +#define SEND_TYPE_ARG2 void * + +/* Define to the type of arg 3 for send. */ +#define SEND_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for send. */ +#define SEND_TYPE_ARG4 int + +/* Define to the function return type for send. */ +#define SEND_TYPE_RETV ssize_t + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME + +/* Define to disable non-blocking sockets. */ +#undef USE_BLOCKING_SOCKETS + +/* Define to avoid automatic inclusion of winsock.h */ +#undef WIN32_LEAN_AND_MEAN + +/* Type to use in place of in_addr_t when system does not provide it. */ +#undef in_addr_t + diff --git a/contrib/c-ares-cmake/freebsd/ares_build.h b/contrib/c-ares-cmake/freebsd/ares_build.h new file mode 100644 index 00000000000..bf7402e7997 --- /dev/null +++ b/contrib/c-ares-cmake/freebsd/ares_build.h @@ -0,0 +1,43 @@ +#ifndef __CARES_BUILD_H +#define __CARES_BUILD_H + +#define CARES_TYPEOF_ARES_SOCKLEN_T socklen_t +#define CARES_TYPEOF_ARES_SSIZE_T ssize_t + +/* Prefix names with CARES_ to make sure they don't conflict with other config.h + * files. 
We need to include some dependent headers that may be system specific + * for C-Ares */ +#define CARES_HAVE_SYS_TYPES_H +#define CARES_HAVE_SYS_SOCKET_H +/* #undef CARES_HAVE_WINDOWS_H */ +/* #undef CARES_HAVE_WS2TCPIP_H */ +/* #undef CARES_HAVE_WINSOCK2_H */ +/* #undef CARES_HAVE_WINDOWS_H */ +#define CARES_HAVE_ARPA_NAMESER_H +#define CARES_HAVE_ARPA_NAMESER_COMPAT_H + +#ifdef CARES_HAVE_SYS_TYPES_H +# include +#endif + +#ifdef CARES_HAVE_SYS_SOCKET_H +# include +#endif + +#ifdef CARES_HAVE_WINSOCK2_H +# include +#endif + +#ifdef CARES_HAVE_WS2TCPIP_H +# include +#endif + +#ifdef CARES_HAVE_WINDOWS_H +# include +#endif + + +typedef CARES_TYPEOF_ARES_SOCKLEN_T ares_socklen_t; +typedef CARES_TYPEOF_ARES_SSIZE_T ares_ssize_t; + +#endif /* __CARES_BUILD_H */ diff --git a/contrib/c-ares-cmake/freebsd/ares_config.h b/contrib/c-ares-cmake/freebsd/ares_config.h new file mode 100644 index 00000000000..a7836e0e802 --- /dev/null +++ b/contrib/c-ares-cmake/freebsd/ares_config.h @@ -0,0 +1,432 @@ +/* Generated from ares_config.h.cmake */ + +/* Define if building universal (internal helper macro) */ +#undef AC_APPLE_UNIVERSAL_BUILD + +/* define this if ares is built for a big endian system */ +#undef ARES_BIG_ENDIAN + +/* when building as static part of libcurl */ +#undef BUILDING_LIBCURL + +/* Defined for build that exposes internal static functions for testing. */ +#undef CARES_EXPOSE_STATICS + +/* Defined for build with symbol hiding. */ +#undef CARES_SYMBOL_HIDING + +/* Definition to make a library symbol externally visible. */ +#undef CARES_SYMBOL_SCOPE_EXTERN + +/* Use resolver library to configure cares */ +/* #undef CARES_USE_LIBRESOLV */ + +/* if a /etc/inet dir is being used */ +#undef ETC_INET + +/* Define to the type of arg 2 for gethostname. */ +#define GETHOSTNAME_TYPE_ARG2 size_t + +/* Define to the type qualifier of arg 1 for getnameinfo. */ +#define GETNAMEINFO_QUAL_ARG1 + +/* Define to the type of arg 1 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG1 struct sockaddr * + +/* Define to the type of arg 2 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG2 socklen_t + +/* Define to the type of args 4 and 6 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG46 socklen_t + +/* Define to the type of arg 7 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG7 int + +/* Specifies the number of arguments to getservbyport_r */ +#define GETSERVBYPORT_R_ARGS 6 + +/* Specifies the number of arguments to getservbyname_r */ +#define GETSERVBYNAME_R_ARGS 6 + +/* Define to 1 if you have AF_INET6. */ +#define HAVE_AF_INET6 + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_COMPAT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ASSERT_H + +/* Define to 1 if you have the `bitncmp' function. */ +/* #undef HAVE_BITNCMP */ + +/* Define to 1 if bool is an available type. */ +#define HAVE_BOOL_T + +/* Define to 1 if you have the clock_gettime function and monotonic timer. */ +#define HAVE_CLOCK_GETTIME_MONOTONIC + +/* Define to 1 if you have the closesocket function. */ +/* #undef HAVE_CLOSESOCKET */ + +/* Define to 1 if you have the CloseSocket camel case function. */ +/* #undef HAVE_CLOSESOCKET_CAMEL */ + +/* Define to 1 if you have the connect function. */ +#define HAVE_CONNECT + +/* define if the compiler supports basic C++11 syntax */ +/* #undef HAVE_CXX11 */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_DLFCN_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H + +/* Define to 1 if you have the fcntl function. */ +#define HAVE_FCNTL + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H + +/* Define to 1 if you have a working fcntl O_NONBLOCK function. */ +#define HAVE_FCNTL_O_NONBLOCK + +/* Define to 1 if you have the freeaddrinfo function. */ +#define HAVE_FREEADDRINFO + +/* Define to 1 if you have a working getaddrinfo function. */ +#define HAVE_GETADDRINFO + +/* Define to 1 if the getaddrinfo function is threadsafe. */ +#define HAVE_GETADDRINFO_THREADSAFE + +/* Define to 1 if you have the getenv function. */ +#define HAVE_GETENV + +/* Define to 1 if you have the gethostbyaddr function. */ +#define HAVE_GETHOSTBYADDR + +/* Define to 1 if you have the gethostbyname function. */ +#define HAVE_GETHOSTBYNAME + +/* Define to 1 if you have the gethostname function. */ +#define HAVE_GETHOSTNAME + +/* Define to 1 if you have the getnameinfo function. */ +#define HAVE_GETNAMEINFO + +/* Define to 1 if you have the getservbyport_r function. */ +#define HAVE_GETSERVBYPORT_R + +/* Define to 1 if you have the getservbyname_r function. */ +#define HAVE_GETSERVBYNAME_R + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY + +/* Define to 1 if you have the `if_indextoname' function. */ +#define HAVE_IF_INDEXTONAME + +/* Define to 1 if you have a IPv6 capable working inet_net_pton function. */ +/* #undef HAVE_INET_NET_PTON */ + +/* Define to 1 if you have a IPv6 capable working inet_ntop function. */ +#define HAVE_INET_NTOP + +/* Define to 1 if you have a IPv6 capable working inet_pton function. */ +#define HAVE_INET_PTON + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H + +/* Define to 1 if you have the ioctl function. */ +#define HAVE_IOCTL + +/* Define to 1 if you have the ioctlsocket function. */ +/* #undef HAVE_IOCTLSOCKET */ + +/* Define to 1 if you have the IoctlSocket camel case function. */ +/* #undef HAVE_IOCTLSOCKET_CAMEL */ + +/* Define to 1 if you have a working IoctlSocket camel case FIONBIO function. + */ +/* #undef HAVE_IOCTLSOCKET_CAMEL_FIONBIO */ + +/* Define to 1 if you have a working ioctlsocket FIONBIO function. */ +/* #undef HAVE_IOCTLSOCKET_FIONBIO */ + +/* Define to 1 if you have a working ioctl FIONBIO function. */ +#define HAVE_IOCTL_FIONBIO + +/* Define to 1 if you have a working ioctl SIOCGIFADDR function. */ +#define HAVE_IOCTL_SIOCGIFADDR + +/* Define to 1 if you have the `resolve' library (-lresolve). */ +/* #undef HAVE_LIBRESOLV */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H + +/* if your compiler supports LL */ +#define HAVE_LL + +/* Define to 1 if the compiler supports the 'long long' data type. */ +#define HAVE_LONGLONG + +/* Define to 1 if you have the malloc.h header file. */ +/* #undef HAVE_MALLOC_H */ + +/* Define to 1 if you have the memory.h header file. */ +#define HAVE_MEMORY_H + +/* Define to 1 if you have the MSG_NOSIGNAL flag. */ +#define HAVE_MSG_NOSIGNAL + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_IN_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NET_IF_H + +/* Define to 1 if you have PF_INET6. */ +#define HAVE_PF_INET6 + +/* Define to 1 if you have the recv function. 
*/ +#define HAVE_RECV + +/* Define to 1 if you have the recvfrom function. */ +#define HAVE_RECVFROM + +/* Define to 1 if you have the send function. */ +#define HAVE_SEND + +/* Define to 1 if you have the setsockopt function. */ +#define HAVE_SETSOCKOPT + +/* Define to 1 if you have a working setsockopt SO_NONBLOCK function. */ +/* #undef HAVE_SETSOCKOPT_SO_NONBLOCK */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H + +/* Define to 1 if sig_atomic_t is an available typedef. */ +#define HAVE_SIG_ATOMIC_T + +/* Define to 1 if sig_atomic_t is already defined as volatile. */ +/* #undef HAVE_SIG_ATOMIC_T_VOLATILE */ + +/* Define to 1 if your struct sockaddr_in6 has sin6_scope_id. */ +#define HAVE_SOCKADDR_IN6_SIN6_SCOPE_ID + +/* Define to 1 if you have the socket function. */ +#define HAVE_SOCKET + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SOCKET_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDBOOL_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H + +/* Define to 1 if you have the strcasecmp function. */ +#define HAVE_STRCASECMP + +/* Define to 1 if you have the strcmpi function. */ +/* #undef HAVE_STRCMPI */ + +/* Define to 1 if you have the strdup function. */ +#define HAVE_STRDUP + +/* Define to 1 if you have the stricmp function. */ +/* #undef HAVE_STRICMP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H + +/* Define to 1 if you have the strncasecmp function. */ +#define HAVE_STRNCASECMP + +/* Define to 1 if you have the strncmpi function. */ +/* #undef HAVE_STRNCMPI */ + +/* Define to 1 if you have the strnicmp function. */ +/* #undef HAVE_STRNICMP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_STROPTS_H */ + +/* Define to 1 if you have struct addrinfo. */ +#define HAVE_STRUCT_ADDRINFO + +/* Define to 1 if you have struct in6_addr. */ +#define HAVE_STRUCT_IN6_ADDR + +/* Define to 1 if you have struct sockaddr_in6. */ +#define HAVE_STRUCT_SOCKADDR_IN6 + +/* if struct sockaddr_storage is defined */ +#define HAVE_STRUCT_SOCKADDR_STORAGE + +/* Define to 1 if you have the timeval struct. */ +#define HAVE_STRUCT_TIMEVAL + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H + +/* Define to 1 if you have the header file. */ +#define HAVE_TIME_H + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H + +/* Define to 1 if you have the windows.h header file. */ +/* #undef HAVE_WINDOWS_H */ + +/* Define to 1 if you have the winsock2.h header file. */ +/* #undef HAVE_WINSOCK2_H */ + +/* Define to 1 if you have the winsock.h header file. */ +/* #undef HAVE_WINSOCK_H */ + +/* Define to 1 if you have the writev function. */ +#define HAVE_WRITEV + +/* Define to 1 if you have the ws2tcpip.h header file. 
*/ +/* #undef HAVE_WS2TCPIP_H */ + +/* Define to 1 if you have the __system_property_get function */ +#define HAVE___SYSTEM_PROPERTY_GET + +/* Define to 1 if you need the malloc.h header file even with stdlib.h */ +/* #undef NEED_MALLOC_H */ + +/* Define to 1 if you need the memory.h header file even with stdlib.h */ +/* #undef NEED_MEMORY_H */ + +/* a suitable file/device to read random data from */ +#define CARES_RANDOM_FILE "/dev/urandom" + +/* Define to the type qualifier pointed by arg 5 for recvfrom. */ +#define RECVFROM_QUAL_ARG5 + +/* Define to the type of arg 1 for recvfrom. */ +#define RECVFROM_TYPE_ARG1 int + +/* Define to the type pointed by arg 2 for recvfrom. */ +#define RECVFROM_TYPE_ARG2 void * + +/* Define to 1 if the type pointed by arg 2 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG2_IS_VOID 0 + +/* Define to the type of arg 3 for recvfrom. */ +#define RECVFROM_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for recvfrom. */ +#define RECVFROM_TYPE_ARG4 int + +/* Define to the type pointed by arg 5 for recvfrom. */ +#define RECVFROM_TYPE_ARG5 struct sockaddr * + +/* Define to 1 if the type pointed by arg 5 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG5_IS_VOID 0 + +/* Define to the type pointed by arg 6 for recvfrom. */ +#define RECVFROM_TYPE_ARG6 socklen_t * + +/* Define to 1 if the type pointed by arg 6 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG6_IS_VOID 0 + +/* Define to the function return type for recvfrom. */ +#define RECVFROM_TYPE_RETV ssize_t + +/* Define to the type of arg 1 for recv. */ +#define RECV_TYPE_ARG1 int + +/* Define to the type of arg 2 for recv. */ +#define RECV_TYPE_ARG2 void * + +/* Define to the type of arg 3 for recv. */ +#define RECV_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for recv. */ +#define RECV_TYPE_ARG4 int + +/* Define to the function return type for recv. */ +#define RECV_TYPE_RETV ssize_t + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE + +/* Define to the type qualifier of arg 2 for send. */ +#define SEND_QUAL_ARG2 + +/* Define to the type of arg 1 for send. */ +#define SEND_TYPE_ARG1 int + +/* Define to the type of arg 2 for send. */ +#define SEND_TYPE_ARG2 void * + +/* Define to the type of arg 3 for send. */ +#define SEND_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for send. */ +#define SEND_TYPE_ARG4 int + +/* Define to the function return type for send. */ +#define SEND_TYPE_RETV ssize_t + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME + +/* Define to disable non-blocking sockets. */ +#undef USE_BLOCKING_SOCKETS + +/* Define to avoid automatic inclusion of winsock.h */ +#undef WIN32_LEAN_AND_MEAN + +/* Type to use in place of in_addr_t when system does not provide it. */ +#undef in_addr_t + diff --git a/contrib/c-ares-cmake/linux/ares_build.h b/contrib/c-ares-cmake/linux/ares_build.h new file mode 100644 index 00000000000..bf7402e7997 --- /dev/null +++ b/contrib/c-ares-cmake/linux/ares_build.h @@ -0,0 +1,43 @@ +#ifndef __CARES_BUILD_H +#define __CARES_BUILD_H + +#define CARES_TYPEOF_ARES_SOCKLEN_T socklen_t +#define CARES_TYPEOF_ARES_SSIZE_T ssize_t + +/* Prefix names with CARES_ to make sure they don't conflict with other config.h + * files. 
We need to include some dependent headers that may be system specific + * for C-Ares */ +#define CARES_HAVE_SYS_TYPES_H +#define CARES_HAVE_SYS_SOCKET_H +/* #undef CARES_HAVE_WINDOWS_H */ +/* #undef CARES_HAVE_WS2TCPIP_H */ +/* #undef CARES_HAVE_WINSOCK2_H */ +/* #undef CARES_HAVE_WINDOWS_H */ +#define CARES_HAVE_ARPA_NAMESER_H +#define CARES_HAVE_ARPA_NAMESER_COMPAT_H + +#ifdef CARES_HAVE_SYS_TYPES_H +# include +#endif + +#ifdef CARES_HAVE_SYS_SOCKET_H +# include +#endif + +#ifdef CARES_HAVE_WINSOCK2_H +# include +#endif + +#ifdef CARES_HAVE_WS2TCPIP_H +# include +#endif + +#ifdef CARES_HAVE_WINDOWS_H +# include +#endif + + +typedef CARES_TYPEOF_ARES_SOCKLEN_T ares_socklen_t; +typedef CARES_TYPEOF_ARES_SSIZE_T ares_ssize_t; + +#endif /* __CARES_BUILD_H */ diff --git a/contrib/c-ares-cmake/linux/ares_config.h b/contrib/c-ares-cmake/linux/ares_config.h new file mode 100644 index 00000000000..e0ebf86e842 --- /dev/null +++ b/contrib/c-ares-cmake/linux/ares_config.h @@ -0,0 +1,432 @@ +/* Generated from ares_config.h.cmake */ + +/* Define if building universal (internal helper macro) */ +#undef AC_APPLE_UNIVERSAL_BUILD + +/* define this if ares is built for a big endian system */ +#undef ARES_BIG_ENDIAN + +/* when building as static part of libcurl */ +#undef BUILDING_LIBCURL + +/* Defined for build that exposes internal static functions for testing. */ +#undef CARES_EXPOSE_STATICS + +/* Defined for build with symbol hiding. */ +#undef CARES_SYMBOL_HIDING + +/* Definition to make a library symbol externally visible. */ +#undef CARES_SYMBOL_SCOPE_EXTERN + +/* Use resolver library to configure cares */ +/* #undef CARES_USE_LIBRESOLV */ + +/* if a /etc/inet dir is being used */ +#undef ETC_INET + +/* Define to the type of arg 2 for gethostname. */ +#define GETHOSTNAME_TYPE_ARG2 size_t + +/* Define to the type qualifier of arg 1 for getnameinfo. */ +#define GETNAMEINFO_QUAL_ARG1 + +/* Define to the type of arg 1 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG1 struct sockaddr * + +/* Define to the type of arg 2 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG2 socklen_t + +/* Define to the type of args 4 and 6 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG46 socklen_t + +/* Define to the type of arg 7 for getnameinfo. */ +#define GETNAMEINFO_TYPE_ARG7 int + +/* Specifies the number of arguments to getservbyport_r */ +#define GETSERVBYPORT_R_ARGS 6 + +/* Specifies the number of arguments to getservbyname_r */ +#define GETSERVBYNAME_R_ARGS 6 + +/* Define to 1 if you have AF_INET6. */ +#define HAVE_AF_INET6 + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_COMPAT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ASSERT_H + +/* Define to 1 if you have the `bitncmp' function. */ +/* #undef HAVE_BITNCMP */ + +/* Define to 1 if bool is an available type. */ +#define HAVE_BOOL_T + +/* Define to 1 if you have the clock_gettime function and monotonic timer. */ +#define HAVE_CLOCK_GETTIME_MONOTONIC + +/* Define to 1 if you have the closesocket function. */ +/* #undef HAVE_CLOSESOCKET */ + +/* Define to 1 if you have the CloseSocket camel case function. */ +/* #undef HAVE_CLOSESOCKET_CAMEL */ + +/* Define to 1 if you have the connect function. */ +#define HAVE_CONNECT + +/* define if the compiler supports basic C++11 syntax */ +/* #undef HAVE_CXX11 */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_DLFCN_H + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H + +/* Define to 1 if you have the fcntl function. */ +#define HAVE_FCNTL + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H + +/* Define to 1 if you have a working fcntl O_NONBLOCK function. */ +#define HAVE_FCNTL_O_NONBLOCK + +/* Define to 1 if you have the freeaddrinfo function. */ +#define HAVE_FREEADDRINFO + +/* Define to 1 if you have a working getaddrinfo function. */ +#define HAVE_GETADDRINFO + +/* Define to 1 if the getaddrinfo function is threadsafe. */ +/* #undef HAVE_GETADDRINFO_THREADSAFE */ + +/* Define to 1 if you have the getenv function. */ +#define HAVE_GETENV + +/* Define to 1 if you have the gethostbyaddr function. */ +#define HAVE_GETHOSTBYADDR + +/* Define to 1 if you have the gethostbyname function. */ +#define HAVE_GETHOSTBYNAME + +/* Define to 1 if you have the gethostname function. */ +#define HAVE_GETHOSTNAME + +/* Define to 1 if you have the getnameinfo function. */ +#define HAVE_GETNAMEINFO + +/* Define to 1 if you have the getservbyport_r function. */ +#define HAVE_GETSERVBYPORT_R + +/* Define to 1 if you have the getservbyname_r function. */ +#define HAVE_GETSERVBYNAME_R + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY + +/* Define to 1 if you have the `if_indextoname' function. */ +#define HAVE_IF_INDEXTONAME + +/* Define to 1 if you have a IPv6 capable working inet_net_pton function. */ +/* #undef HAVE_INET_NET_PTON */ + +/* Define to 1 if you have a IPv6 capable working inet_ntop function. */ +#define HAVE_INET_NTOP + +/* Define to 1 if you have a IPv6 capable working inet_pton function. */ +#define HAVE_INET_PTON + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H + +/* Define to 1 if you have the ioctl function. */ +#define HAVE_IOCTL + +/* Define to 1 if you have the ioctlsocket function. */ +/* #undef HAVE_IOCTLSOCKET */ + +/* Define to 1 if you have the IoctlSocket camel case function. */ +/* #undef HAVE_IOCTLSOCKET_CAMEL */ + +/* Define to 1 if you have a working IoctlSocket camel case FIONBIO function. + */ +/* #undef HAVE_IOCTLSOCKET_CAMEL_FIONBIO */ + +/* Define to 1 if you have a working ioctlsocket FIONBIO function. */ +/* #undef HAVE_IOCTLSOCKET_FIONBIO */ + +/* Define to 1 if you have a working ioctl FIONBIO function. */ +#define HAVE_IOCTL_FIONBIO + +/* Define to 1 if you have a working ioctl SIOCGIFADDR function. */ +#define HAVE_IOCTL_SIOCGIFADDR + +/* Define to 1 if you have the `resolve' library (-lresolve). */ +/* #undef HAVE_LIBRESOLV */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H + +/* if your compiler supports LL */ +#define HAVE_LL + +/* Define to 1 if the compiler supports the 'long long' data type. */ +#define HAVE_LONGLONG + +/* Define to 1 if you have the malloc.h header file. */ +#define HAVE_MALLOC_H + +/* Define to 1 if you have the memory.h header file. */ +#define HAVE_MEMORY_H + +/* Define to 1 if you have the MSG_NOSIGNAL flag. */ +#define HAVE_MSG_NOSIGNAL + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_IN_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H + +/* Define to 1 if you have the header file. */ +#define HAVE_NET_IF_H + +/* Define to 1 if you have PF_INET6. */ +#define HAVE_PF_INET6 + +/* Define to 1 if you have the recv function. 
*/ +#define HAVE_RECV + +/* Define to 1 if you have the recvfrom function. */ +#define HAVE_RECVFROM + +/* Define to 1 if you have the send function. */ +#define HAVE_SEND + +/* Define to 1 if you have the setsockopt function. */ +#define HAVE_SETSOCKOPT + +/* Define to 1 if you have a working setsockopt SO_NONBLOCK function. */ +/* #undef HAVE_SETSOCKOPT_SO_NONBLOCK */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H + +/* Define to 1 if sig_atomic_t is an available typedef. */ +#define HAVE_SIG_ATOMIC_T + +/* Define to 1 if sig_atomic_t is already defined as volatile. */ +/* #undef HAVE_SIG_ATOMIC_T_VOLATILE */ + +/* Define to 1 if your struct sockaddr_in6 has sin6_scope_id. */ +#define HAVE_SOCKADDR_IN6_SIN6_SCOPE_ID + +/* Define to 1 if you have the socket function. */ +#define HAVE_SOCKET + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SOCKET_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDBOOL_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H + +/* Define to 1 if you have the strcasecmp function. */ +#define HAVE_STRCASECMP + +/* Define to 1 if you have the strcmpi function. */ +/* #undef HAVE_STRCMPI */ + +/* Define to 1 if you have the strdup function. */ +#define HAVE_STRDUP + +/* Define to 1 if you have the stricmp function. */ +/* #undef HAVE_STRICMP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H + +/* Define to 1 if you have the strncasecmp function. */ +#define HAVE_STRNCASECMP + +/* Define to 1 if you have the strncmpi function. */ +/* #undef HAVE_STRNCMPI */ + +/* Define to 1 if you have the strnicmp function. */ +/* #undef HAVE_STRNICMP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STROPTS_H + +/* Define to 1 if you have struct addrinfo. */ +#define HAVE_STRUCT_ADDRINFO + +/* Define to 1 if you have struct in6_addr. */ +#define HAVE_STRUCT_IN6_ADDR + +/* Define to 1 if you have struct sockaddr_in6. */ +#define HAVE_STRUCT_SOCKADDR_IN6 + +/* if struct sockaddr_storage is defined */ +#define HAVE_STRUCT_SOCKADDR_STORAGE + +/* Define to 1 if you have the timeval struct. */ +#define HAVE_STRUCT_TIMEVAL + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H + +/* Define to 1 if you have the header file. */ +#define HAVE_TIME_H + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H + +/* Define to 1 if you have the windows.h header file. */ +/* #undef HAVE_WINDOWS_H */ + +/* Define to 1 if you have the winsock2.h header file. */ +/* #undef HAVE_WINSOCK2_H */ + +/* Define to 1 if you have the winsock.h header file. */ +/* #undef HAVE_WINSOCK_H */ + +/* Define to 1 if you have the writev function. */ +#define HAVE_WRITEV + +/* Define to 1 if you have the ws2tcpip.h header file. 
*/ +/* #undef HAVE_WS2TCPIP_H */ + +/* Define to 1 if you have the __system_property_get function */ +#define HAVE___SYSTEM_PROPERTY_GET + +/* Define to 1 if you need the malloc.h header file even with stdlib.h */ +/* #undef NEED_MALLOC_H */ + +/* Define to 1 if you need the memory.h header file even with stdlib.h */ +/* #undef NEED_MEMORY_H */ + +/* a suitable file/device to read random data from */ +#define CARES_RANDOM_FILE "/dev/urandom" + +/* Define to the type qualifier pointed by arg 5 for recvfrom. */ +#define RECVFROM_QUAL_ARG5 + +/* Define to the type of arg 1 for recvfrom. */ +#define RECVFROM_TYPE_ARG1 int + +/* Define to the type pointed by arg 2 for recvfrom. */ +#define RECVFROM_TYPE_ARG2 void * + +/* Define to 1 if the type pointed by arg 2 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG2_IS_VOID 0 + +/* Define to the type of arg 3 for recvfrom. */ +#define RECVFROM_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for recvfrom. */ +#define RECVFROM_TYPE_ARG4 int + +/* Define to the type pointed by arg 5 for recvfrom. */ +#define RECVFROM_TYPE_ARG5 struct sockaddr * + +/* Define to 1 if the type pointed by arg 5 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG5_IS_VOID 0 + +/* Define to the type pointed by arg 6 for recvfrom. */ +#define RECVFROM_TYPE_ARG6 socklen_t * + +/* Define to 1 if the type pointed by arg 6 for recvfrom is void. */ +#define RECVFROM_TYPE_ARG6_IS_VOID 0 + +/* Define to the function return type for recvfrom. */ +#define RECVFROM_TYPE_RETV ssize_t + +/* Define to the type of arg 1 for recv. */ +#define RECV_TYPE_ARG1 int + +/* Define to the type of arg 2 for recv. */ +#define RECV_TYPE_ARG2 void * + +/* Define to the type of arg 3 for recv. */ +#define RECV_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for recv. */ +#define RECV_TYPE_ARG4 int + +/* Define to the function return type for recv. */ +#define RECV_TYPE_RETV ssize_t + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE + +/* Define to the type qualifier of arg 2 for send. */ +#define SEND_QUAL_ARG2 + +/* Define to the type of arg 1 for send. */ +#define SEND_TYPE_ARG1 int + +/* Define to the type of arg 2 for send. */ +#define SEND_TYPE_ARG2 void * + +/* Define to the type of arg 3 for send. */ +#define SEND_TYPE_ARG3 size_t + +/* Define to the type of arg 4 for send. */ +#define SEND_TYPE_ARG4 int + +/* Define to the function return type for send. */ +#define SEND_TYPE_RETV ssize_t + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME + +/* Define to disable non-blocking sockets. */ +#undef USE_BLOCKING_SOCKETS + +/* Define to avoid automatic inclusion of winsock.h */ +#undef WIN32_LEAN_AND_MEAN + +/* Type to use in place of in_addr_t when system does not provide it. */ +#undef in_addr_t + diff --git a/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in index ff97d297d8f..e08a2bed2ec 100644 --- a/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in @@ -415,7 +415,7 @@ /* * Defined if strerror_r returns char * if _GNU_SOURCE is defined. */ -#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ /* Performs additional safety checks when defined. 
*/ /* #undef JEMALLOC_OPT_SAFETY_CHECKS */ diff --git a/contrib/krb5-cmake/autoconf_linux.h b/contrib/krb5-cmake/autoconf_linux.h index 7b71d962d9a..54951f866a5 100644 --- a/contrib/krb5-cmake/autoconf_linux.h +++ b/contrib/krb5-cmake/autoconf_linux.h @@ -440,7 +440,9 @@ #define HAVE_STRERROR 1 /* Define to 1 if you have the `strerror_r' function. */ +#ifndef USE_MUSL #define HAVE_STRERROR_R 1 +#endif /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 diff --git a/contrib/libcpuid b/contrib/libcpuid index 8db3b8d2d32..503083acb77 160000 --- a/contrib/libcpuid +++ b/contrib/libcpuid @@ -1 +1 @@ -Subproject commit 8db3b8d2d32d22437f063ce692a1b9bb15e42d18 +Subproject commit 503083acb77edf9fbce22a05826307dff2ce96e6 diff --git a/contrib/libpq-cmake/CMakeLists.txt b/contrib/libpq-cmake/CMakeLists.txt index 280c0381393..91326422b43 100644 --- a/contrib/libpq-cmake/CMakeLists.txt +++ b/contrib/libpq-cmake/CMakeLists.txt @@ -63,6 +63,13 @@ target_include_directories (_libpq SYSTEM PUBLIC ${LIBPQ_SOURCE_DIR}) target_include_directories (_libpq SYSTEM PUBLIC "${LIBPQ_SOURCE_DIR}/include") target_include_directories (_libpq SYSTEM PRIVATE "${LIBPQ_SOURCE_DIR}/configs") +# NOTE: this is a dirty hack; instead, pg_config.h should be shipped +# for different OS'es (as is done for jemalloc), not one generic config for all OS'es as +# now. +if (OS_DARWIN OR OS_FREEBSD OR USE_MUSL) + target_compile_definitions(_libpq PRIVATE -DSTRERROR_R_INT=1) +endif() + target_link_libraries (_libpq PRIVATE OpenSSL::SSL) add_library(ch_contrib::libpq ALIAS _libpq) diff --git a/contrib/librdkafka b/contrib/librdkafka index ff32b4e9eea..6f3b483426a 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit ff32b4e9eeafd0b276f010ee969179e4e9e6d0b2 +Subproject commit 6f3b483426a8c8ec950e27e446bec175cf8b553f diff --git a/contrib/llvm b/contrib/llvm index 20607e61728..0db5bf5bd24 160000 --- a/contrib/llvm +++ b/contrib/llvm @@ -1 +1 @@ -Subproject commit 20607e61728e97c969e536644c3c0c1bb1a50672 +Subproject commit 0db5bf5bd2452cd8f1283a1fcdc04845af705bfc diff --git a/contrib/sentry-native b/contrib/sentry-native index f431047ac8d..ae10fb8c224 160000 --- a/contrib/sentry-native +++ b/contrib/sentry-native @@ -1 +1 @@ -Subproject commit f431047ac8da13179c488018dddf1c0d0771a997 +Subproject commit ae10fb8c224c3f41571446e1ed7fd57b9e5e366b diff --git a/contrib/vectorscan b/contrib/vectorscan index 73695e419c2..f6250ae3e5a 160000 --- a/contrib/vectorscan +++ b/contrib/vectorscan @@ -1 +1 @@ -Subproject commit 73695e419c27af7fe2a099c7aa57931cc02aea5d +Subproject commit f6250ae3e5a3085000239313ad0689cc1e00cdc2 diff --git a/contrib/vectorscan-cmake/CMakeLists.txt b/contrib/vectorscan-cmake/CMakeLists.txt index 828f2a17df2..d6c626c1612 100644 --- a/contrib/vectorscan-cmake/CMakeLists.txt +++ b/contrib/vectorscan-cmake/CMakeLists.txt @@ -304,7 +304,7 @@ target_include_directories (_vectorscan SYSTEM PUBLIC "${LIBRARY_DIR}/src") # Please regenerate these files if you update vectorscan. 
if (ARCH_AMD64) - target_include_directories (_vectorscan PRIVATE x86_64) + target_include_directories (_vectorscan PRIVATE amd64) endif () if (ARCH_AARCH64) diff --git a/contrib/vectorscan-cmake/x86_64/config.h b/contrib/vectorscan-cmake/amd64/config.h similarity index 100% rename from contrib/vectorscan-cmake/x86_64/config.h rename to contrib/vectorscan-cmake/amd64/config.h diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index b9b0c5c2c6c..c4244504923 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -67,24 +67,5 @@ ENV GOCACHE=/workdir/ RUN mkdir /workdir && chmod 777 /workdir WORKDIR /workdir -# NOTE: thread sanitizer is broken in clang-14, we have to build it with clang-15 -# https://github.com/ClickHouse/ClickHouse/pull/39450 -# https://github.com/google/sanitizers/issues/1540 -# https://github.com/google/sanitizers/issues/1552 - -RUN export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ - && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-15 main" >> \ - /etc/apt/sources.list.d/clang.list \ - && apt-get update \ - && apt-get install \ - clang-15 \ - llvm-15 \ - clang-tidy-15 \ - --yes --no-install-recommends \ - && apt-get clean - -# for external_symbolizer_path -RUN ln -s /usr/bin/llvm-symbolizer-15 /usr/bin/llvm-symbolizer - COPY build.sh / CMD ["bash", "-c", "/build.sh 2>&1"] diff --git a/docker/packager/packager b/docker/packager/packager index 591262959b4..9da787e9006 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -339,17 +339,16 @@ if __name__ == "__main__": parser.add_argument( "--compiler", choices=( - "clang-15", # For TSAN builds, see #39450 - "clang-14", - "clang-14-darwin", - "clang-14-darwin-aarch64", - "clang-14-aarch64", - "clang-14-ppc64le", - "clang-14-amd64sse2", - "clang-14-freebsd", + "clang-15", + "clang-15-darwin", + "clang-15-darwin-aarch64", + "clang-15-aarch64", + "clang-15-ppc64le", + "clang-15-amd64sse2", + "clang-15-freebsd", "gcc-11", ), - default="clang-14", + default="clang-15", help="a compiler to use", ) parser.add_argument( diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index b01dba1e22f..1a672f30a74 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="20.9.3.45" +ARG VERSION="22.8.5.29" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. 
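Note that the hunk above only changes the *default* of the `VERSION` build argument in `docker/server/Dockerfile.alpine`; as with any Dockerfile `ARG`, a different package version can still be supplied at build time. A minimal sketch, assuming the build is run from the repository root (the image tag is illustrative and not part of this change):

``` bash
# Override the VERSION default declared via ARG in Dockerfile.alpine.
# Only standard `docker build` flags are used; the tag name is made up.
docker build \
  -f docker/server/Dockerfile.alpine \
  --build-arg VERSION=22.8.5.29 \
  -t clickhouse-server:22.8.5.29-alpine \
  .
```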
diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index f4102a6ccaf..db76a9fab1d 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -21,7 +21,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION=22.6.1.* +ARG VERSION="22.8.5.29" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docker/test/base/Dockerfile b/docker/test/base/Dockerfile index 43cfca1fdfc..4e42fce1a1d 100644 --- a/docker/test/base/Dockerfile +++ b/docker/test/base/Dockerfile @@ -16,11 +16,10 @@ RUN apt-get update \ # and MEMORY_LIMIT_EXCEEDED exceptions in Functional tests (total memory limit in Functional tests is ~55.24 GiB). # TSAN will flush shadow memory when reaching this limit. # It may cause false-negatives, but it's better than OOM. -RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7 memory_limit_mb=46080'" >> /etc/environment; \ - echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \ - echo "MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1'" >> /etc/environment; \ - echo "LSAN_OPTIONS='suppressions=/usr/share/clickhouse-test/config/lsan_suppressions.txt'" >> /etc/environment; \ - ln -s /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-symbolizer /usr/bin/llvm-symbolizer; +RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7 memory_limit_mb=46080'" >> /etc/environment +RUN echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment +RUN echo "MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1'" >> /etc/environment +RUN echo "LSAN_OPTIONS='suppressions=/usr/share/clickhouse-test/config/lsan_suppressions.txt'" >> /etc/environment # Sanitizer options for current shell (not current, but the one that will be spawned on "docker run") # (but w/o verbosity for TSAN, otherwise test.reference will not match) ENV TSAN_OPTIONS='halt_on_error=1 history_size=7 memory_limit_mb=46080' diff --git a/docker/test/codebrowser/Dockerfile b/docker/test/codebrowser/Dockerfile index c7aed618f6a..ceed93c3ac7 100644 --- a/docker/test/codebrowser/Dockerfile +++ b/docker/test/codebrowser/Dockerfile @@ -8,16 +8,41 @@ FROM clickhouse/binary-builder:$FROM_TAG ARG apt_archive="http://archive.ubuntu.com" RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list -RUN apt-get update && apt-get --yes --allow-unauthenticated install clang-14 libllvm14 libclang-14-dev libmlir-14-dev +RUN apt-get update && apt-get --yes --allow-unauthenticated install libclang-${LLVM_VERSION}-dev libmlir-${LLVM_VERSION}-dev + +# libclang-15-dev does not contain proper symlink: +# +# This is what cmake will search for: +# +# # readlink -f /usr/lib/llvm-15/lib/libclang-15.so.1 +# /usr/lib/x86_64-linux-gnu/libclang-15.so.1 +# +# This is what exists: +# +# # ls -l /usr/lib/x86_64-linux-gnu/libclang-15* +# lrwxrwxrwx 1 root root 16 Sep 5 13:31 /usr/lib/x86_64-linux-gnu/libclang-15.so -> libclang-15.so.1 +# lrwxrwxrwx 1 root root 21 Sep 5 13:31 /usr/lib/x86_64-linux-gnu/libclang-15.so.15 -> libclang-15.so.15.0.0 +# -rw-r--r-- 1 root root 31835760 Sep 5 13:31 /usr/lib/x86_64-linux-gnu/libclang-15.so.15.0.0 +# +ARG TARGETARCH +RUN arch=${TARGETARCH:-amd64} \ + && case $arch in \ + amd64) rarch=x86_64 ;; \ + arm64) rarch=aarch64 ;; \ + *) exit 1 ;; \ + esac \ + && ln -rsf /usr/lib/$rarch-linux-gnu/libclang-15.so.15 
/usr/lib/$rarch-linux-gnu/libclang-15.so.1 # repo versions doesn't work correctly with C++17 # also we push reports to s3, so we add index.html to subfolder urls # https://github.com/ClickHouse-Extras/woboq_codebrowser/commit/37e15eaf377b920acb0b48dbe82471be9203f76b # TODO: remove branch in a few weeks after merge, e.g. in May or June 2022 -RUN git clone https://github.com/ClickHouse-Extras/woboq_codebrowser --branch llvm-14 \ +# +# FIXME: update location of a repo +RUN git clone https://github.com/azat/woboq_codebrowser --branch llvm-15 \ && cd woboq_codebrowser \ - && cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang\+\+-14 -DCMAKE_C_COMPILER=clang-14 \ - && make -j \ + && cmake . -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang\+\+-${LLVM_VERSION} -DCMAKE_C_COMPILER=clang-${LLVM_VERSION} \ + && ninja \ && cd .. \ && rm -rf woboq_codebrowser @@ -32,7 +57,7 @@ ENV SHA=nosha ENV DATA="https://s3.amazonaws.com/clickhouse-test-reports/codebrowser/data" CMD mkdir -p $BUILD_DIRECTORY && cd $BUILD_DIRECTORY && \ - cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-14 -DCMAKE_C_COMPILER=/usr/bin/clang-14 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DENABLE_EMBEDDED_COMPILER=0 -DENABLE_S3=0 && \ + cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-${LLVM_VERSION} -DCMAKE_C_COMPILER=/usr/bin/clang-${LLVM_VERSION} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DENABLE_EMBEDDED_COMPILER=0 -DENABLE_S3=0 && \ mkdir -p $HTML_RESULT_DIRECTORY && \ $CODEGEN -b $BUILD_DIRECTORY -a -o $HTML_RESULT_DIRECTORY -p ClickHouse:$SOURCE_DIRECTORY:$SHA -d $DATA | ts '%Y-%m-%d %H:%M:%S' && \ cp -r $STATIC_DATA $HTML_RESULT_DIRECTORY/ &&\ diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 93e38260395..bab87865b42 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -19,7 +19,7 @@ stage=${stage:-} script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" echo "$script_dir" repo_dir=ch -BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-14_debug_none_unsplitted_disable_False_binary"} +BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-15_debug_none_unsplitted_disable_False_binary"} BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"} function clone diff --git a/docker/test/keeper-jepsen/run.sh b/docker/test/keeper-jepsen/run.sh index c43e6b2c54d..adf99c029a9 100644 --- a/docker/test/keeper-jepsen/run.sh +++ b/docker/test/keeper-jepsen/run.sh @@ -2,7 +2,7 @@ set -euo pipefail -CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-14_relwithdebuginfo_none_unsplitted_disable_False_binary/clickhouse"} +CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-15_relwithdebuginfo_none_unsplitted_disable_False_binary/clickhouse"} CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""} diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index d3d7084f37f..b0b5ebdb2e2 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -61,7 +61,7 @@ function configure cp -rv right/config left ||: # Start a temporary server to rename the tables - while pkill clickhouse-serv; do echo . 
; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo all killed set -m # Spawn temporary in its own process groups @@ -88,7 +88,7 @@ function configure clickhouse-client --port $LEFT_SERVER_PORT --query "create database test" ||: clickhouse-client --port $LEFT_SERVER_PORT --query "rename table datasets.hits_v1 to test.hits" ||: - while pkill clickhouse-serv; do echo . ; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo all killed # Make copies of the original db for both servers. Use hardlinks instead @@ -106,7 +106,7 @@ function configure function restart { - while pkill clickhouse-serv; do echo . ; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo all killed # Change the jemalloc settings here. @@ -1400,7 +1400,7 @@ case "$stage" in while env kill -- -$watchdog_pid ; do sleep 1; done # Stop the servers to free memory for the subsequent query analysis. - while pkill clickhouse-serv; do echo . ; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo Servers stopped. ;& "analyze_queries") diff --git a/docker/test/util/Dockerfile b/docker/test/util/Dockerfile index b891b71492c..57880bfc1d6 100644 --- a/docker/test/util/Dockerfile +++ b/docker/test/util/Dockerfile @@ -5,7 +5,7 @@ FROM ubuntu:20.04 ARG apt_archive="http://archive.ubuntu.com" RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=14 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=15 RUN apt-get update \ && apt-get install \ @@ -56,6 +56,8 @@ RUN apt-get update \ # This symlink required by gcc to find lld compiler RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld +# for external_symbolizer_path +RUN ln -s /usr/bin/llvm-symbolizer-${LLVM_VERSION} /usr/bin/llvm-symbolizer ARG CCACHE_VERSION=4.6.1 RUN mkdir /tmp/ccache \ diff --git a/docs/changelogs/v22.8.5.29-lts.md b/docs/changelogs/v22.8.5.29-lts.md new file mode 100644 index 00000000000..0ce13b7c36e --- /dev/null +++ b/docs/changelogs/v22.8.5.29-lts.md @@ -0,0 +1,34 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.8.5.29-lts (74ffb843807) FIXME as compared to v22.8.4.7-lts (baad27bcd2f) + +#### New Feature +* Backported in [#40870](https://github.com/ClickHouse/ClickHouse/issues/40870): Add setting to disable limit on kafka_num_consumers. Closes [#40331](https://github.com/ClickHouse/ClickHouse/issues/40331). [#40670](https://github.com/ClickHouse/ClickHouse/pull/40670) ([Kruglov Pavel](https://github.com/Avogar)). + +#### Improvement +* Backported in [#40817](https://github.com/ClickHouse/ClickHouse/issues/40817): The setting `show_addresses_in_stack_traces` was accidentally disabled in default `config.xml`. It's removed from the config now, so the setting is enabled by default. [#40749](https://github.com/ClickHouse/ClickHouse/pull/40749) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#40944](https://github.com/ClickHouse/ClickHouse/issues/40944): Fix issue with passing MySQL timeouts for MySQL database engine and MySQL table function. Closes [#34168](https://github.com/ClickHouse/ClickHouse/issues/34168)?notification_referrer_id=NT_kwDOAzsV57MzMDMxNjAzNTY5OjU0MjAzODc5. [#40751](https://github.com/ClickHouse/ClickHouse/pull/40751) ([Kseniia Sumarokova](https://github.com/kssenii)). 
+ +#### Build/Testing/Packaging Improvement +* Backported in [#41157](https://github.com/ClickHouse/ClickHouse/issues/41157): Add macOS binaries to GH release assets, it fixes [#37718](https://github.com/ClickHouse/ClickHouse/issues/37718). [#41088](https://github.com/ClickHouse/ClickHouse/pull/41088) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#40866](https://github.com/ClickHouse/ClickHouse/issues/40866): - Fix crash while parsing values of type `Object` that contains arrays of variadic dimension. [#40483](https://github.com/ClickHouse/ClickHouse/pull/40483) ([Duc Canh Le](https://github.com/canhld94)). +* Backported in [#40805](https://github.com/ClickHouse/ClickHouse/issues/40805): During insertion of a new query to the `ProcessList` allocations happen. If we reach the memory limit during these allocations we can not use `OvercommitTracker`, because `ProcessList::mutex` is already acquired. Fixes [#40611](https://github.com/ClickHouse/ClickHouse/issues/40611). [#40677](https://github.com/ClickHouse/ClickHouse/pull/40677) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#40777](https://github.com/ClickHouse/ClickHouse/issues/40777): Fix memory leak while pushing to MVs w/o query context (from Kafka/...). [#40732](https://github.com/ClickHouse/ClickHouse/pull/40732) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#41135](https://github.com/ClickHouse/ClickHouse/issues/41135): Fix access rights for `DESCRIBE TABLE url()` and some other `DESCRIBE TABLE ()`. [#40975](https://github.com/ClickHouse/ClickHouse/pull/40975) ([Vitaly Baranov](https://github.com/vitlibar)). +* Backported in [#41242](https://github.com/ClickHouse/ClickHouse/issues/41242): Fixed "possible deadlock avoided" error on automatic conversion of database engine from Ordinary to Atomic. [#41146](https://github.com/ClickHouse/ClickHouse/pull/41146) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#41234](https://github.com/ClickHouse/ClickHouse/issues/41234): Fix background clean up of broken detached parts. [#41190](https://github.com/ClickHouse/ClickHouse/pull/41190) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* use ROBOT_CLICKHOUSE_COMMIT_TOKEN for create-pull-request [#40067](https://github.com/ClickHouse/ClickHouse/pull/40067) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* use input token instead of env var [#40421](https://github.com/ClickHouse/ClickHouse/pull/40421) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* CaresPTRResolver small safety improvement [#40890](https://github.com/ClickHouse/ClickHouse/pull/40890) ([Arthur Passos](https://github.com/arthurpassos)). + diff --git a/docs/en/interfaces/third-party/integrations.md b/docs/en/interfaces/third-party/integrations.md index de496546cb4..aede128e9a4 100644 --- a/docs/en/interfaces/third-party/integrations.md +++ b/docs/en/interfaces/third-party/integrations.md @@ -103,6 +103,7 @@ ClickHouse, Inc. 
does **not** maintain the tools and libraries listed below and - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) + - [Linq To DB](https://github.com/linq2db/linq2db) - Elixir - [Ecto](https://github.com/elixir-ecto/ecto) - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 546e3d7b7a6..663469ef4ae 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -134,6 +134,13 @@ Example of configuration for versions later or equal to 22.8: 10000000 + + +
+ cache +
+
+ ``` @@ -151,6 +158,13 @@ Example of configuration for versions earlier than 22.8: 10000000 + + +
+ s3 +
+
+ ``` @@ -166,7 +180,7 @@ Cache **configuration settings**: - `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. -- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `true`. +- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `false`. This setting was added in version 22.8. If you used filesystem cache before this version, then it will not work on versions starting from 22.8 if this setting is set to `true`. If you want to use this setting, clear old cache created before version 22.8 before upgrading. - `max_file_segment_size` - a maximum size of a single cache file. Default: `104857600` (100 Mb). diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/troubleshooting.md index 5a61359a2c0..93bd56087a2 100644 --- a/docs/en/operations/troubleshooting.md +++ b/docs/en/operations/troubleshooting.md @@ -2,10 +2,9 @@ slug: /en/operations/troubleshooting sidebar_position: 46 sidebar_label: Troubleshooting +title: Troubleshooting --- -# Troubleshooting - - [Installation](#troubleshooting-installation-errors) - [Connecting to the server](#troubleshooting-accepts-no-connections) - [Query processing](#troubleshooting-does-not-process-queries) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index ced96078ce1..8688f3eb3a0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1227,6 +1227,8 @@ Result: Function converts Unix timestamp to a calendar date and a time of a day. When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. +Alias: `fromUnixTimestamp`. + **Example:** Query: diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index d86eb6b45ae..877179a66a6 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -1823,6 +1823,36 @@ Result: Evaluate external model. Accepts a model name and model arguments. Returns Float64. +## catboostEvaluate(path_to_model, feature_1, feature_2, …, feature_n) + +Evaluate external catboost model. [CatBoost](https://catboost.ai) is an open-source gradient boosting library developed by Yandex for machine learing. +Accepts a path to a catboost model and model arguments (features). Returns Float64. + +``` sql +SELECT feat1, ..., feat_n, catboostEvaluate('/path/to/model.bin', feat_1, ..., feat_n) AS prediction +FROM data_table +``` + +**Prerequisites** + +1. Build the catboost evaluation library + +Before evaluating catboost models, the `libcatboostmodel.` library must be made available. See [CatBoost documentation](https://catboost.ai/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper.html) how to compile it. + +Next, specify the path to `libcatboostmodel.` in the clickhouse configuration: + +``` xml + +... + /path/to/libcatboostmodel.so +... + +``` + +2. 
Train a catboost model using libcatboost + +See [Training and applying models](https://catboost.ai/docs/features/training.html#training) for how to train catboost models from a training data set. + ## throwIf(x\[, message\[, error_code\]\]) Throw an exception if the argument is non zero. diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 487dfc87f9a..0dc6cc0d09a 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -32,6 +32,12 @@ SET allow_experimental_lightweight_delete = true; An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. :::warning -Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on OLTP system. Ligthweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. +Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on an OLTP system. Ligthweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. ::: +:::note +`DELETE FROM` requires the `ALTER DELETE` privilege: +```sql +grant ALTER DELETE ON db.table to username; +``` +::: diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 9b7527caaa9..feeefd5502a 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -6,45 +6,6 @@ sidebar_label: SYSTEM # SYSTEM Statements -The list of available `SYSTEM` statements: - -- [RELOAD EMBEDDED DICTIONARIES](#query_language-system-reload-emdedded-dictionaries) -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [RELOAD MODELS](#query_language-system-reload-models) -- [RELOAD MODEL](#query_language-system-reload-model) -- [RELOAD FUNCTIONS](#query_language-system-reload-functions) -- [RELOAD FUNCTION](#query_language-system-reload-functions) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [DROP UNCOMPRESSED CACHE](#query_language-system-drop-uncompressed-cache) -- [DROP COMPILED EXPRESSION CACHE](#query_language-system-drop-compiled-expression-cache) -- [DROP REPLICA](#query_language-system-drop-replica) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) -- [STOP TTL MERGES](#query_language-stop-ttl-merges) -- [START TTL MERGES](#query_language-start-ttl-merges) -- [STOP MOVES](#query_language-stop-moves) -- [START MOVES](#query_language-start-moves) -- [SYSTEM 
UNFREEZE](#query_language-system-unfreeze) -- [STOP FETCHES](#query_language-system-stop-fetches) -- [START FETCHES](#query_language-system-start-fetches) -- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [START REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [STOP REPLICATION QUEUES](#query_language-system-stop-replication-queues) -- [START REPLICATION QUEUES](#query_language-system-start-replication-queues) -- [SYNC REPLICA](#query_language-system-sync-replica) -- [RESTART REPLICA](#query_language-system-restart-replica) -- [RESTORE REPLICA](#query_language-system-restore-replica) -- [RESTART REPLICAS](#query_language-system-restart-replicas) - ## RELOAD EMBEDDED DICTIONARIES Reload all [Internal dictionaries](../../sql-reference/dictionaries/internal-dicts.md). @@ -69,7 +30,12 @@ SELECT name, status FROM system.dictionaries; ## RELOAD MODELS -Reloads all [CatBoost](../../guides/developer/apply-catboost-model.md) models if the configuration was updated without restarting the server. +:::note +This statement and `SYSTEM RELOAD MODEL` merely unload catboost models from the clickhouse-library-bridge. The function `catboostEvaluate()` +loads a model upon first access if it is not loaded yet. +::: + +Unloads all CatBoost models. **Syntax** @@ -79,12 +45,12 @@ SYSTEM RELOAD MODELS [ON CLUSTER cluster_name] ## RELOAD MODEL -Completely reloads a CatBoost model `model_name` if the configuration was updated without restarting the server. +Unloads a CatBoost model at `model_path`. **Syntax** ```sql -SYSTEM RELOAD MODEL [ON CLUSTER cluster_name] +SYSTEM RELOAD MODEL [ON CLUSTER cluster_name] ``` ## RELOAD FUNCTIONS diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index a110bfbd15c..f40107aaaca 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -13,7 +13,7 @@ Creates a table from a file. This table function is similar to [url](../../sql-r **Syntax** ``` sql -file(path, format, structure) +file(path [,format] [,structure]) ``` **Parameters** diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 2df7d6e46b3..545037665bb 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -11,7 +11,7 @@ Provides table-like interface to select/insert files in [Amazon S3](https://aws. 
**Syntax** ``` sql -s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) +s3(path [,aws_access_key_id, aws_secret_access_key] [,format] [,structure] [,compression]) ``` **Arguments** diff --git a/docs/en/sql-reference/table-functions/s3Cluster.md b/docs/en/sql-reference/table-functions/s3Cluster.md index 9d006af9572..b81fc51fd18 100644 --- a/docs/en/sql-reference/table-functions/s3Cluster.md +++ b/docs/en/sql-reference/table-functions/s3Cluster.md @@ -10,7 +10,7 @@ Allows processing files from [Amazon S3](https://aws.amazon.com/s3/) in parallel **Syntax** ``` sql -s3Cluster(cluster_name, source, [access_key_id, secret_access_key,] format, structure) +s3Cluster(cluster_name, source, [,access_key_id, secret_access_key] [,format] [,structure]) ``` **Arguments** diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index f1ed7b4dfe4..014dc3ae853 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -13,7 +13,7 @@ sidebar_label: url **Syntax** ``` sql -url(URL, format, structure) +url(URL [,format] [,structure]) ``` **Parameters** diff --git a/docs/redirects.txt b/docs/redirects.txt index 949b9d48ca8..cea138f7237 100644 --- a/docs/redirects.txt +++ b/docs/redirects.txt @@ -155,7 +155,6 @@ getting_started/index.md getting-started/index.md getting_started/install.md getting-started/install.md getting_started/playground.md getting-started/playground.md getting_started/tutorial.md getting-started/tutorial.md -guides/apply_catboost_model.md guides/apply-catboost-model.md images/column_oriented.gif images/column-oriented.gif images/row_oriented.gif images/row-oriented.gif interfaces/http_interface.md interfaces/http.md diff --git a/docs/ru/guides/apply-catboost-model.md b/docs/ru/guides/apply-catboost-model.md deleted file mode 100644 index 68d7042df2d..00000000000 --- a/docs/ru/guides/apply-catboost-model.md +++ /dev/null @@ -1,241 +0,0 @@ ---- -slug: /ru/guides/apply-catboost-model -sidebar_position: 41 -sidebar_label: "Применение модели CatBoost в ClickHouse" ---- - -# Применение модели CatBoost в ClickHouse {#applying-catboost-model-in-clickhouse} - -[CatBoost](https://catboost.ai) — открытая программная библиотека разработанная компанией [Яндекс](https://yandex.ru/company/) для машинного обучения, которая использует схему градиентного бустинга. - -С помощью этой инструкции вы научитесь применять предобученные модели в ClickHouse: в результате вы запустите вывод модели из SQL. - -Чтобы применить модель CatBoost в ClickHouse: - -1. [Создайте таблицу](#create-table). -2. [Вставьте данные в таблицу](#insert-data-to-table). -3. [Интегрируйте CatBoost в ClickHouse](#integrate-catboost-into-clickhouse) (Опциональный шаг). -4. [Запустите вывод модели из SQL](#run-model-inference). - -Подробнее об обучении моделей в CatBoost, см. [Обучение и применение моделей](https://catboost.ai/docs/features/training.html#training). - -Вы можете перегрузить модели CatBoost, если их конфигурация была обновлена, без перезагрузки сервера. Для этого используйте системные запросы [RELOAD MODEL](../sql-reference/statements/system.md#query_language-system-reload-model) и [RELOAD MODELS](../sql-reference/statements/system.md#query_language-system-reload-models). - -## Перед началом работы {#prerequisites} - -Если у вас еще нет [Docker](https://docs.docker.com/install/), установите его. 
- - :::note "Примечание" - [Docker](https://www.docker.com) – это программная платформа для создания контейнеров, которые изолируют установку CatBoost и ClickHouse от остальной части системы. - ::: -Перед применением модели CatBoost: - -**1.** Скачайте [Docker-образ](https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) из реестра: - -``` bash -$ docker pull yandex/tutorial-catboost-clickhouse -``` - -Данный Docker-образ содержит все необходимое для запуска CatBoost и ClickHouse: код, среду выполнения, библиотеки, переменные окружения и файлы конфигурации. - -**2.** Проверьте, что Docker-образ успешно скачался: - -``` bash -$ docker image ls -REPOSITORY TAG IMAGE ID CREATED SIZE -yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB -``` - -**3.** Запустите Docker-контейнер основанный на данном образе: - -``` bash -$ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse -``` - -## 1. Создайте таблицу {#create-table} - -Чтобы создать таблицу для обучающей выборки: - -**1.** Запустите клиент ClickHouse: - -``` bash -$ clickhouse client -``` - - :::note "Примечание" - Сервер ClickHouse уже запущен внутри Docker-контейнера. - ::: -**2.** Создайте таблицу в ClickHouse с помощью следующей команды: - -``` sql -:) CREATE TABLE amazon_train -( - date Date MATERIALIZED today(), - ACTION UInt8, - RESOURCE UInt32, - MGR_ID UInt32, - ROLE_ROLLUP_1 UInt32, - ROLE_ROLLUP_2 UInt32, - ROLE_DEPTNAME UInt32, - ROLE_TITLE UInt32, - ROLE_FAMILY_DESC UInt32, - ROLE_FAMILY UInt32, - ROLE_CODE UInt32 -) -ENGINE = MergeTree ORDER BY date -``` - -**3.** Выйдите из клиента ClickHouse: - -``` sql -:) exit -``` - -## 2. Вставьте данные в таблицу {#insert-data-to-table} - -Чтобы вставить данные: - -**1.** Выполните следующую команду: - -``` bash -$ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv -``` - -**2.** Запустите клиент ClickHouse: - -``` bash -$ clickhouse client -``` - -**3.** Проверьте, что данные успешно загрузились: - -``` sql -:) SELECT count() FROM amazon_train - -SELECT count() -FROM amazon_train - -+-count()-+ -| 65538 | -+---------+ -``` - -## 3. Интегрируйте CatBoost в ClickHouse {#integrate-catboost-into-clickhouse} - - :::note "Примечание" - **Опциональный шаг.** Docker-образ содержит все необходимое для запуска CatBoost и ClickHouse. - ::: -Чтобы интегрировать CatBoost в ClickHouse: - -**1.** Создайте библиотеку для оценки модели. - -Наиболее быстрый способ оценить модель CatBoost — это скомпилировать библиотеку `libcatboostmodel.`. Подробнее о том, как скомпилировать библиотеку, читайте в [документации CatBoost](https://catboost.ai/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper.html). - -**2.** Создайте в любом месте новую директорию с произвольным названием, например `data` и поместите в нее созданную библиотеку. Docker-образ уже содержит библиотеку `data/libcatboostmodel.so`. - -**3.** Создайте в любом месте новую директорию для конфигурации модели с произвольным названием, например `models`. - -**4.** Создайте файл конфигурации модели с произвольным названием, например `models/amazon_model.xml`. 
- -**5.** Опишите конфигурацию модели: - -``` xml - - - - catboost - - amazon - - /home/catboost/tutorial/catboost_model.bin - - 0 - - -``` - -**6.** Добавьте в конфигурацию ClickHouse путь к CatBoost и конфигурации модели: - -``` xml - -/home/catboost/data/libcatboostmodel.so -/home/catboost/models/*_model.xml -``` - :::note "Примечание" - Вы можете позднее изменить путь к конфигурации модели CatBoost без перезагрузки сервера. - ::: -## 4. Запустите вывод модели из SQL {#run-model-inference} - -Для тестирования модели запустите клиент ClickHouse `$ clickhouse client`. - -Проверьте, что модель работает: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) > 0 AS prediction, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - - :::note "Примечание" - Функция [modelEvaluate](../sql-reference/functions/other-functions.md#function-modelevaluate) возвращает кортежи (tuple) с исходными прогнозами по классам для моделей с несколькими классами. - ::: -Спрогнозируйте вероятность: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. / (1 + exp(-prediction)) AS probability, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - - :::note "Примечание" - Подробнее про функцию [exp()](../sql-reference/functions/math-functions.md). - ::: -Посчитайте логистическую функцию потерь (LogLoss) на всей выборке: - -``` sql -:) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss -FROM -( - SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. / (1. + exp(-prediction)) AS prob, - ACTION AS tg - FROM amazon_train -) -``` - - :::note "Примечание" - Подробнее про функции [avg()](../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg), [log()](../sql-reference/functions/math-functions.md). - ::: \ No newline at end of file diff --git a/docs/ru/guides/index.md b/docs/ru/guides/index.md index 0b5938dfc09..882f71b5700 100644 --- a/docs/ru/guides/index.md +++ b/docs/ru/guides/index.md @@ -7,5 +7,3 @@ sidebar_label: "Руководства" # Руководства {#rukovodstva} Подробные пошаговые инструкции, которые помогут вам решать различные задачи с помощью ClickHouse. 
- -- [Применение модели CatBoost в ClickHouse](apply-catboost-model.md) diff --git a/docs/ru/sql-reference/statements/system.md b/docs/ru/sql-reference/statements/system.md index c1dc03a63d1..a7dec7abe27 100644 --- a/docs/ru/sql-reference/statements/system.md +++ b/docs/ru/sql-reference/statements/system.md @@ -6,43 +6,6 @@ sidebar_label: SYSTEM # Запросы SYSTEM {#query-language-system} -- [RELOAD EMBEDDED DICTIONARIES](#query_language-system-reload-emdedded-dictionaries) -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [RELOAD MODELS](#query_language-system-reload-models) -- [RELOAD MODEL](#query_language-system-reload-model) -- [RELOAD FUNCTIONS](#query_language-system-reload-functions) -- [RELOAD FUNCTION](#query_language-system-reload-functions) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [DROP UNCOMPRESSED CACHE](#query_language-system-drop-uncompressed-cache) -- [DROP COMPILED EXPRESSION CACHE](#query_language-system-drop-compiled-expression-cache) -- [DROP REPLICA](#query_language-system-drop-replica) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) -- [STOP TTL MERGES](#query_language-stop-ttl-merges) -- [START TTL MERGES](#query_language-start-ttl-merges) -- [STOP MOVES](#query_language-stop-moves) -- [START MOVES](#query_language-start-moves) -- [SYSTEM UNFREEZE](#query_language-system-unfreeze) -- [STOP FETCHES](#query_language-system-stop-fetches) -- [START FETCHES](#query_language-system-start-fetches) -- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [START REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [STOP REPLICATION QUEUES](#query_language-system-stop-replication-queues) -- [START REPLICATION QUEUES](#query_language-system-start-replication-queues) -- [SYNC REPLICA](#query_language-system-sync-replica) -- [RESTART REPLICA](#query_language-system-restart-replica) -- [RESTORE REPLICA](#query_language-system-restore-replica) -- [RESTART REPLICAS](#query_language-system-restart-replicas) - ## RELOAD EMBEDDED DICTIONARIES] {#query_language-system-reload-emdedded-dictionaries} Перегружает все [Встроенные словари](../dictionaries/internal-dicts.md). По умолчанию встроенные словари выключены. @@ -66,7 +29,12 @@ SELECT name, status FROM system.dictionaries; ## RELOAD MODELS {#query_language-system-reload-models} -Перегружает все модели [CatBoost](../../guides/apply-catboost-model.md#applying-catboost-model-in-clickhouse), если их конфигурация была обновлена, без перезагрузки сервера. +:::note +Это утверждение и `SYSTEM RELOAD MODEL` просто выгружают модели catboost из clickhouse-library-bridge. Функция `catboostEvaluate()` +загружает модель при первом обращении, если она еще не загружена. +::: + +Разгрузите все модели CatBoost. 
**Синтаксис** @@ -76,12 +44,12 @@ SYSTEM RELOAD MODELS ## RELOAD MODEL {#query_language-system-reload-model} -Полностью перегружает модель [CatBoost](../../guides/apply-catboost-model.md#applying-catboost-model-in-clickhouse) `model_name`, если ее конфигурация была обновлена, без перезагрузки сервера. +Выгружает модель CatBoost по адресу `модель_путь`. **Синтаксис** ```sql -SYSTEM RELOAD MODEL +SYSTEM RELOAD MODEL ``` ## RELOAD FUNCTIONS {#query_language-system-reload-functions} diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index 1f262c9403a..df35a1c4ac0 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -13,7 +13,7 @@ sidebar_label: file **Синтаксис** ``` sql -file(path, format, structure) +file(path [,format] [,structure]) ``` **Параметры** diff --git a/docs/ru/sql-reference/table-functions/s3.md b/docs/ru/sql-reference/table-functions/s3.md index ae0419a4b84..14c8204fd1d 100644 --- a/docs/ru/sql-reference/table-functions/s3.md +++ b/docs/ru/sql-reference/table-functions/s3.md @@ -11,7 +11,7 @@ sidebar_label: s3 **Синтаксис** ``` sql -s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) +s3(path [,aws_access_key_id, aws_secret_access_key] [,format] [,structure] [,compression]) ``` **Aргументы** diff --git a/docs/ru/sql-reference/table-functions/s3Cluster.md b/docs/ru/sql-reference/table-functions/s3Cluster.md index e6b317253c0..1c12913fabe 100644 --- a/docs/ru/sql-reference/table-functions/s3Cluster.md +++ b/docs/ru/sql-reference/table-functions/s3Cluster.md @@ -11,7 +11,7 @@ sidebar_label: s3Cluster **Синтаксис** ``` sql -s3Cluster(cluster_name, source, [access_key_id, secret_access_key,] format, structure) +s3Cluster(cluster_name, source, [,access_key_id, secret_access_key] [,format] [,structure]) ``` **Аргументы** diff --git a/docs/ru/sql-reference/table-functions/url.md b/docs/ru/sql-reference/table-functions/url.md index d4fb11b0de7..e5d9faeec00 100644 --- a/docs/ru/sql-reference/table-functions/url.md +++ b/docs/ru/sql-reference/table-functions/url.md @@ -13,7 +13,7 @@ sidebar_label: url **Синтаксис** ``` sql -url(URL, format, structure) +url(URL [,format] [,structure]) ``` **Параметры** diff --git a/docs/zh/guides/apply-catboost-model.md b/docs/zh/guides/apply-catboost-model.md deleted file mode 100644 index 861e5372875..00000000000 --- a/docs/zh/guides/apply-catboost-model.md +++ /dev/null @@ -1,244 +0,0 @@ ---- -slug: /zh/guides/apply-catboost-model -sidebar_position: 41 -sidebar_label: "\u5E94\u7528CatBoost\u6A21\u578B" ---- - -# 在ClickHouse中应用Catboost模型 {#applying-catboost-model-in-clickhouse} - -[CatBoost](https://catboost.ai) 是一个由[Yandex](https://yandex.com/company/)开发的开源免费机器学习库。 - - -通过本篇文档,您将学会如何用SQL语句调用已经存放在Clickhouse中的预训练模型来预测数据。 - - -为了在ClickHouse中应用CatBoost模型,需要进行如下步骤: - -1. [创建数据表](#create-table). -2. [将数据插入到表中](#insert-data-to-table). -3. [将CatBoost集成到ClickHouse中](#integrate-catboost-into-clickhouse) (可跳过)。 -4. [从SQL运行模型推断](#run-model-inference). - -有关训练CatBoost模型的详细信息,请参阅 [训练和模型应用](https://catboost.ai/docs/features/training.html#training). - -您可以通过[RELOAD MODEL](https://clickhouse.com/docs/en/sql-reference/statements/system/#query_language-system-reload-model)与[RELOAD MODELS](https://clickhouse.com/docs/en/sql-reference/statements/system/#query_language-system-reload-models)语句来重载CatBoost模型。 - -## 先决条件 {#prerequisites} - -请先安装 [Docker](https://docs.docker.com/install/)。 - -!!! 
note "注" - [Docker](https://www.docker.com) 是一个软件平台,用户可以用Docker来创建独立于已有系统并集成了CatBoost和ClickHouse的容器。 - -在应用CatBoost模型之前: - -**1.** 从容器仓库拉取示例docker镜像 (https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) : - -``` bash -$ docker pull yandex/tutorial-catboost-clickhouse -``` - -此示例Docker镜像包含运行CatBoost和ClickHouse所需的所有内容:代码、运行时、库、环境变量和配置文件。 - -**2.** 确保已成功拉取Docker镜像: - -``` bash -$ docker image ls -REPOSITORY TAG IMAGE ID CREATED SIZE -yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB -``` - -**3.** 基于此镜像启动一个Docker容器: - -``` bash -$ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse -``` - -## 1. 创建数据表 {#create-table} - -为训练样本创建ClickHouse表: - -**1.** 在交互模式下启动ClickHouse控制台客户端: - -``` bash -$ clickhouse client -``` - -!!! note "注" - ClickHouse服务器已经在Docker容器内运行。 - -**2.** 使用以下命令创建表: - -``` sql -:) CREATE TABLE amazon_train -( - date Date MATERIALIZED today(), - ACTION UInt8, - RESOURCE UInt32, - MGR_ID UInt32, - ROLE_ROLLUP_1 UInt32, - ROLE_ROLLUP_2 UInt32, - ROLE_DEPTNAME UInt32, - ROLE_TITLE UInt32, - ROLE_FAMILY_DESC UInt32, - ROLE_FAMILY UInt32, - ROLE_CODE UInt32 -) -ENGINE = MergeTree ORDER BY date -``` - -**3.** 从ClickHouse控制台客户端退出: - -``` sql -:) exit -``` - -## 2. 将数据插入到表中 {#insert-data-to-table} - -插入数据: - -**1.** 运行以下命令: - -``` bash -$ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv -``` - -**2.** 在交互模式下启动ClickHouse控制台客户端: - -``` bash -$ clickhouse client -``` - -**3.** 确保数据已上传: - -``` sql -:) SELECT count() FROM amazon_train - -SELECT count() -FROM amazon_train - -+-count()-+ -| 65538 | -+-------+ -``` - -## 3. 将CatBoost集成到ClickHouse中 {#integrate-catboost-into-clickhouse} - -!!! note "注" - **可跳过。** 示例Docker映像已经包含了运行CatBoost和ClickHouse所需的所有内容。 - -为了将CatBoost集成进ClickHouse,需要进行如下步骤: - -**1.** 构建评估库。 - -评估CatBoost模型的最快方法是编译 `libcatboostmodel.` 库文件. - -有关如何构建库文件的详细信息,请参阅 [CatBoost文件](https://catboost.ai/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper.html). - -**2.** 创建一个新目录(位置与名称可随意指定), 如 `data` 并将创建的库文件放入其中。 示例Docker镜像已经包含了库 `data/libcatboostmodel.so`. - -**3.** 创建一个新目录来放配置模型, 如 `models`. - -**4.** 创建一个模型配置文件,如 `models/amazon_model.xml`. - -**5.** 修改模型配置: - -``` xml - - - - catboost - - amazon - - /home/catboost/tutorial/catboost_model.bin - - 0 - - -``` - -**6.** 将CatBoost库文件的路径和模型配置添加到ClickHouse配置: - -``` xml - -/home/catboost/data/libcatboostmodel.so -/home/catboost/models/*_model.xml -``` - -## 4. 使用SQL调用预测模型 {#run-model-inference} - -为了测试模型是否正常,可以使用ClickHouse客户端 `$ clickhouse client`. - -让我们确保模型能正常工作: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) > 0 AS prediction, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - -!!! note "注" - 函数 [modelEvaluate](../sql-reference/functions/other-functions.md#function-modelevaluate) 会对多类别模型返回一个元组,其中包含每一类别的原始预测值。 - -执行预测: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. / (1 + exp(-prediction)) AS probability, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - -!!! 
note "注" - 查看函数说明 [exp()](../sql-reference/functions/math-functions.md) 。 - -让我们计算样本的LogLoss: - -``` sql -:) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss -FROM -( - SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. / (1. + exp(-prediction)) AS prob, - ACTION AS tg - FROM amazon_train -) -``` - -!!! note "注" - 查看函数说明 [avg()](../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg) 和 [log()](../sql-reference/functions/math-functions.md) 。 - -[原始文章](https://clickhouse.com/docs/en/guides/apply_catboost_model/) diff --git a/docs/zh/guides/index.md b/docs/zh/guides/index.md index 5e535ea5736..00c4ae4def1 100644 --- a/docs/zh/guides/index.md +++ b/docs/zh/guides/index.md @@ -9,6 +9,5 @@ sidebar_label: ClickHouse指南 列出了如何使用 Clickhouse 解决各种任务的详细说明: - [关于简单集群设置的教程](../getting-started/tutorial.md) -- [在ClickHouse中应用CatBoost模型](apply-catboost-model.md) [原始文章](https://clickhouse.com/docs/en/guides/) diff --git a/docs/zh/sql-reference/statements/system.md b/docs/zh/sql-reference/statements/system.md index d833887a9c6..3df00cf8854 100644 --- a/docs/zh/sql-reference/statements/system.md +++ b/docs/zh/sql-reference/statements/system.md @@ -6,38 +6,6 @@ sidebar_label: SYSTEM # SYSTEM Queries {#query-language-system} -- [RELOAD EMBEDDED DICTIONARIES](#query_language-system-reload-emdedded-dictionaries) -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [DROP UNCOMPRESSED CACHE](#query_language-system-drop-uncompressed-cache) -- [DROP COMPILED EXPRESSION CACHE](#query_language-system-drop-compiled-expression-cache) -- [DROP REPLICA](#query_language-system-drop-replica) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) -- [STOP TTL MERGES](#query_language-stop-ttl-merges) -- [START TTL MERGES](#query_language-start-ttl-merges) -- [STOP MOVES](#query_language-stop-moves) -- [START MOVES](#query_language-start-moves) -- [SYSTEM UNFREEZE](#query_language-system-unfreeze) -- [STOP FETCHES](#query_language-system-stop-fetches) -- [START FETCHES](#query_language-system-start-fetches) -- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [START REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [STOP REPLICATION QUEUES](#query_language-system-stop-replication-queues) -- [START REPLICATION QUEUES](#query_language-system-start-replication-queues) -- [SYNC REPLICA](#query_language-system-sync-replica) -- [RESTART REPLICA](#query_language-system-restart-replica) -- [RESTART REPLICAS](#query_language-system-restart-replicas) - ## RELOAD EMBEDDED DICTIONARIES\] {#query_language-system-reload-emdedded-dictionaries} 重新加载所有[内置字典](../../sql-reference/dictionaries/internal-dicts.md)。默认情况下内置字典是禁用的。 diff --git a/packages/clickhouse-server.init 
b/packages/clickhouse-server.init index 1695f6286b8..13aeffe13a7 100755 --- a/packages/clickhouse-server.init +++ b/packages/clickhouse-server.init @@ -47,9 +47,10 @@ CLICKHOUSE_PIDFILE="$CLICKHOUSE_PIDDIR/$PROGRAM.pid" # Some systems lack "flock" command -v flock >/dev/null && FLOCK=flock -# Override defaults from optional config file +# Override defaults from optional config file and export them automatically +set -a test -f /etc/default/clickhouse && . /etc/default/clickhouse - +set +a die() { diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 3c0c0781de6..9cf7cb2b624 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -54,7 +54,7 @@ else () endif () if (NOT USE_MUSL) - option (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE "HTTP-server working like a proxy to Library dictionary source" ${ENABLE_CLICKHOUSE_ALL}) + option (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE "HTTP-server working like a proxy to external dynamically loaded libraries" ${ENABLE_CLICKHOUSE_ALL}) endif () # https://presentations.clickhouse.com/matemarketing_2020/ diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index 297e2d24c07..9b1bae947d2 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -446,8 +446,8 @@ int mainEntryClickHouseInstall(int argc, char ** argv) fs::path ulimits_file = ulimits_dir / fmt::format("{}.conf", user); fmt::print("Will set ulimits for {} user in {}.\n", user, ulimits_file.string()); std::string ulimits_content = fmt::format( - "{0}\tsoft\tnofile\t262144\n" - "{0}\thard\tnofile\t262144\n", user); + "{0}\tsoft\tnofile\t1048576\n" + "{0}\thard\tnofile\t1048576\n", user); fs::create_directories(ulimits_dir); diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt index 40cabacded4..c5ef924b180 100644 --- a/programs/library-bridge/CMakeLists.txt +++ b/programs/library-bridge/CMakeLists.txt @@ -1,6 +1,8 @@ include(${ClickHouse_SOURCE_DIR}/cmake/split_debug_symbols.cmake) set (CLICKHOUSE_LIBRARY_BRIDGE_SOURCES + CatBoostLibraryHandler.cpp + CatBoostLibraryHandlerFactory.cpp ExternalDictionaryLibraryAPI.cpp ExternalDictionaryLibraryHandler.cpp ExternalDictionaryLibraryHandlerFactory.cpp diff --git a/programs/library-bridge/CatBoostLibraryAPI.h b/programs/library-bridge/CatBoostLibraryAPI.h new file mode 100644 index 00000000000..9eaa0d17ff7 --- /dev/null +++ b/programs/library-bridge/CatBoostLibraryAPI.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include + +// Function pointer typedefs and names of libcatboost.so functions used by ClickHouse +struct CatBoostLibraryAPI +{ + using ModelCalcerHandle = void; + + using ModelCalcerCreateFunc = ModelCalcerHandle * (*)(); + static constexpr const char * ModelCalcerCreateName = "ModelCalcerCreate"; + + using ModelCalcerDeleteFunc = void (*)(ModelCalcerHandle *); + static constexpr const char * ModelCalcerDeleteName = "ModelCalcerDelete"; + + using GetErrorStringFunc = const char * (*)(); + static constexpr const char * GetErrorStringName = "GetErrorString"; + + using LoadFullModelFromFileFunc = bool (*)(ModelCalcerHandle *, const char *); + static constexpr const char * LoadFullModelFromFileName = "LoadFullModelFromFile"; + + using CalcModelPredictionFlatFunc = bool (*)(ModelCalcerHandle *, size_t, const float **, size_t, double *, size_t); + static constexpr const char * CalcModelPredictionFlatName = "CalcModelPredictionFlat"; + + using CalcModelPredictionFunc = bool (*)(ModelCalcerHandle *, size_t, const float **, size_t, const char ***, size_t, double *, 
size_t); + static constexpr const char * CalcModelPredictionName = "CalcModelPrediction"; + + using CalcModelPredictionWithHashedCatFeaturesFunc = bool (*)(ModelCalcerHandle *, size_t, const float **, size_t, const int **, size_t, double *, size_t); + static constexpr const char * CalcModelPredictionWithHashedCatFeaturesName = "CalcModelPredictionWithHashedCatFeatures"; + + using GetStringCatFeatureHashFunc = int (*)(const char *, size_t); + static constexpr const char * GetStringCatFeatureHashName = "GetStringCatFeatureHash"; + + using GetIntegerCatFeatureHashFunc = int (*)(uint64_t); + static constexpr const char * GetIntegerCatFeatureHashName = "GetIntegerCatFeatureHash"; + + using GetFloatFeaturesCountFunc = size_t (*)(ModelCalcerHandle *); + static constexpr const char * GetFloatFeaturesCountName = "GetFloatFeaturesCount"; + + using GetCatFeaturesCountFunc = size_t (*)(ModelCalcerHandle *); + static constexpr const char * GetCatFeaturesCountName = "GetCatFeaturesCount"; + + using GetTreeCountFunc = size_t (*)(ModelCalcerHandle *); + static constexpr const char * GetTreeCountName = "GetTreeCount"; + + using GetDimensionsCountFunc = size_t (*)(ModelCalcerHandle *); + static constexpr const char * GetDimensionsCountName = "GetDimensionsCount"; +}; diff --git a/programs/library-bridge/CatBoostLibraryHandler.cpp b/programs/library-bridge/CatBoostLibraryHandler.cpp new file mode 100644 index 00000000000..2c3ed583463 --- /dev/null +++ b/programs/library-bridge/CatBoostLibraryHandler.cpp @@ -0,0 +1,389 @@ +#include "CatBoostLibraryHandler.h" + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int CANNOT_APPLY_CATBOOST_MODEL; + extern const int CANNOT_LOAD_CATBOOST_MODEL; + extern const int LOGICAL_ERROR; +} + +CatBoostLibraryHandler::APIHolder::APIHolder(SharedLibrary & lib) +{ + ModelCalcerCreate = lib.get(CatBoostLibraryAPI::ModelCalcerCreateName); + ModelCalcerDelete = lib.get(CatBoostLibraryAPI::ModelCalcerDeleteName); + GetErrorString = lib.get(CatBoostLibraryAPI::GetErrorStringName); + LoadFullModelFromFile = lib.get(CatBoostLibraryAPI::LoadFullModelFromFileName); + CalcModelPredictionFlat = lib.get(CatBoostLibraryAPI::CalcModelPredictionFlatName); + CalcModelPrediction = lib.get(CatBoostLibraryAPI::CalcModelPredictionName); + CalcModelPredictionWithHashedCatFeatures = lib.get(CatBoostLibraryAPI::CalcModelPredictionWithHashedCatFeaturesName); + GetStringCatFeatureHash = lib.get(CatBoostLibraryAPI::GetStringCatFeatureHashName); + GetIntegerCatFeatureHash = lib.get(CatBoostLibraryAPI::GetIntegerCatFeatureHashName); + GetFloatFeaturesCount = lib.get(CatBoostLibraryAPI::GetFloatFeaturesCountName); + GetCatFeaturesCount = lib.get(CatBoostLibraryAPI::GetCatFeaturesCountName); + GetTreeCount = lib.tryGet(CatBoostLibraryAPI::GetTreeCountName); + GetDimensionsCount = lib.tryGet(CatBoostLibraryAPI::GetDimensionsCountName); +} + +CatBoostLibraryHandler::CatBoostLibraryHandler( + const std::string & library_path, + const std::string & model_path) + : loading_start_time(std::chrono::system_clock::now()) + , library(std::make_shared(library_path)) + , api(*library) +{ + model_calcer_handle = api.ModelCalcerCreate(); + + if (!api.LoadFullModelFromFile(model_calcer_handle, model_path.c_str())) + { + throw Exception(ErrorCodes::CANNOT_LOAD_CATBOOST_MODEL, + "Cannot load CatBoost model: {}", api.GetErrorString()); + } + + float_features_count = api.GetFloatFeaturesCount(model_calcer_handle); + cat_features_count = 
api.GetCatFeaturesCount(model_calcer_handle); + + tree_count = 1; + if (api.GetDimensionsCount) + tree_count = api.GetDimensionsCount(model_calcer_handle); + + loading_duration = std::chrono::duration_cast(std::chrono::system_clock::now() - loading_start_time); +} + +CatBoostLibraryHandler::~CatBoostLibraryHandler() +{ + api.ModelCalcerDelete(model_calcer_handle); +} + +std::chrono::system_clock::time_point CatBoostLibraryHandler::getLoadingStartTime() const +{ + return loading_start_time; +} + +std::chrono::milliseconds CatBoostLibraryHandler::getLoadingDuration() const +{ + return loading_duration; +} + +namespace +{ + +/// Buffer should be allocated with features_count * column->size() elements. +/// Place column elements in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] +template +void placeColumnAsNumber(const IColumn * column, T * buffer, size_t features_count) +{ + size_t size = column->size(); + FieldVisitorConvertToNumber visitor; + for (size_t i = 0; i < size; ++i) + { + /// TODO: Replace with column visitor. + Field field; + column->get(i, field); + *buffer = applyVisitor(visitor, field); + buffer += features_count; + } +} + +/// Buffer should be allocated with features_count * column->size() elements. +/// Place string pointers in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] +void placeStringColumn(const ColumnString & column, const char ** buffer, size_t features_count) +{ + size_t size = column.size(); + for (size_t i = 0; i < size; ++i) + { + *buffer = const_cast(column.getDataAtWithTerminatingZero(i).data); + buffer += features_count; + } +} + +/// Buffer should be allocated with features_count * column->size() elements. +/// Place string pointers in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] +/// Returns PODArray which holds data (because ColumnFixedString doesn't store terminating zero). +PODArray placeFixedStringColumn(const ColumnFixedString & column, const char ** buffer, size_t features_count) +{ + size_t size = column.size(); + size_t str_size = column.getN(); + PODArray data(size * (str_size + 1)); + char * data_ptr = data.data(); + + for (size_t i = 0; i < size; ++i) + { + auto ref = column.getDataAt(i); + memcpy(data_ptr, ref.data, ref.size); + data_ptr[ref.size] = 0; + *buffer = data_ptr; + data_ptr += ref.size + 1; + buffer += features_count; + } + + return data; +} + +/// Place columns into buffer, returns column which holds placed data. Buffer should contains column->size() values. +template +ColumnPtr placeNumericColumns(const ColumnRawPtrs & columns, size_t offset, size_t size, const T** buffer) +{ + if (size == 0) + return nullptr; + + size_t column_size = columns[offset]->size(); + auto data_column = ColumnVector::create(size * column_size); + T * data = data_column->getData().data(); + for (size_t i = 0; i < size; ++i) + { + const auto * column = columns[offset + i]; + if (column->isNumeric()) + placeColumnAsNumber(column, data + i, size); + } + + for (size_t i = 0; i < column_size; ++i) + { + *buffer = data; + ++buffer; + data += size; + } + + return data_column; +} + +/// Place columns into buffer, returns data which was used for fixed string columns. +/// Buffer should contains column->size() values, each value contains size strings. 
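+/// Note: ColumnString values are referenced in place (their data is already zero-terminated),
+/// while ColumnFixedString values are copied into the returned arrays so a terminating zero can be appended.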
+std::vector> placeStringColumns(const ColumnRawPtrs & columns, size_t offset, size_t size, const char ** buffer) +{ + if (size == 0) + return {}; + + std::vector> data; + for (size_t i = 0; i < size; ++i) + { + const auto * column = columns[offset + i]; + if (const auto * column_string = typeid_cast(column)) + placeStringColumn(*column_string, buffer + i, size); + else if (const auto * column_fixed_string = typeid_cast(column)) + data.push_back(placeFixedStringColumn(*column_fixed_string, buffer + i, size)); + else + throw Exception("Cannot place string column.", ErrorCodes::LOGICAL_ERROR); + } + + return data; +} + +/// buffer[column_size * cat_features_count] -> char * => cat_features[column_size][cat_features_count] -> char * +void fillCatFeaturesBuffer( + const char *** cat_features, const char ** buffer, + size_t column_size, size_t cat_features_count) +{ + for (size_t i = 0; i < column_size; ++i) + { + *cat_features = buffer; + ++cat_features; + buffer += cat_features_count; + } +} + +/// Calc hash for string cat feature at ps positions. +template +void calcStringHashes(const Column * column, size_t ps, const int ** buffer, const CatBoostLibraryHandler::APIHolder & api) +{ + size_t column_size = column->size(); + for (size_t j = 0; j < column_size; ++j) + { + auto ref = column->getDataAt(j); + const_cast(*buffer)[ps] = api.GetStringCatFeatureHash(ref.data, ref.size); + ++buffer; + } +} + +/// Calc hash for int cat feature at ps position. Buffer at positions ps should contains unhashed values. +void calcIntHashes(size_t column_size, size_t ps, const int ** buffer, const CatBoostLibraryHandler::APIHolder & api) +{ + for (size_t j = 0; j < column_size; ++j) + { + const_cast(*buffer)[ps] = api.GetIntegerCatFeatureHash((*buffer)[ps]); + ++buffer; + } +} + +/// buffer contains column->size() rows and size columns. +/// For int cat features calc hash inplace. +/// For string cat features calc hash from column rows. +void calcHashes(const ColumnRawPtrs & columns, size_t offset, size_t size, const int ** buffer, const CatBoostLibraryHandler::APIHolder & api) +{ + if (size == 0) + return; + size_t column_size = columns[offset]->size(); + + std::vector> data; + for (size_t i = 0; i < size; ++i) + { + const auto * column = columns[offset + i]; + if (const auto * column_string = typeid_cast(column)) + calcStringHashes(column_string, i, buffer, api); + else if (const auto * column_fixed_string = typeid_cast(column)) + calcStringHashes(column_fixed_string, i, buffer, api); + else + calcIntHashes(column_size, i, buffer, api); + } +} + +} + +/// Convert values to row-oriented format and call evaluation function from CatBoost wrapper api. +/// * CalcModelPredictionFlat if no cat features +/// * CalcModelPrediction if all cat features are strings +/// * CalcModelPredictionWithHashedCatFeatures if has int cat features. +ColumnFloat64::MutablePtr CatBoostLibraryHandler::evalImpl( + const ColumnRawPtrs & columns, + bool cat_features_are_strings) const +{ + std::string error_msg = "Error occurred while applying CatBoost model: "; + size_t column_size = columns.front()->size(); + + auto result = ColumnFloat64::create(column_size * tree_count); + auto * result_buf = result->getData().data(); + + if (!column_size) + return result; + + /// Prepare float features. + PODArray float_features(column_size); + auto * float_features_buf = float_features.data(); + /// Store all float data into single column. float_features is a list of pointers to it. 
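/// For example: with 2 rows and 3 float features, the backing column built by placeNumericColumns()
/// holds [r0f0, r0f1, r0f2, r1f0, r1f1, r1f2] and float_features_buf[row] points at the start of row `row`
/// (float_features_buf = [&data[0], &data[3]]), i.e. the row-major, one-pointer-per-document layout
/// that the CalcModelPrediction* calls below consume.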
+ auto float_features_col = placeNumericColumns(columns, 0, float_features_count, float_features_buf); + + if (cat_features_count == 0) + { + if (!api.CalcModelPredictionFlat(model_calcer_handle, column_size, + float_features_buf, float_features_count, + result_buf, column_size * tree_count)) + { + + throw Exception(error_msg + api.GetErrorString(), ErrorCodes::CANNOT_APPLY_CATBOOST_MODEL); + } + return result; + } + + /// Prepare cat features. + if (cat_features_are_strings) + { + /// cat_features_holder stores pointers to ColumnString data or fixed_strings_data. + PODArray cat_features_holder(cat_features_count * column_size); + PODArray cat_features(column_size); + auto * cat_features_buf = cat_features.data(); + + fillCatFeaturesBuffer(cat_features_buf, cat_features_holder.data(), column_size, cat_features_count); + /// Fixed strings are stored without termination zero, so have to copy data into fixed_strings_data. + auto fixed_strings_data = placeStringColumns(columns, float_features_count, + cat_features_count, cat_features_holder.data()); + + if (!api.CalcModelPrediction(model_calcer_handle, column_size, + float_features_buf, float_features_count, + cat_features_buf, cat_features_count, + result_buf, column_size * tree_count)) + { + throw Exception(error_msg + api.GetErrorString(), ErrorCodes::CANNOT_APPLY_CATBOOST_MODEL); + } + } + else + { + PODArray cat_features(column_size); + auto * cat_features_buf = cat_features.data(); + auto cat_features_col = placeNumericColumns(columns, float_features_count, + cat_features_count, cat_features_buf); + calcHashes(columns, float_features_count, cat_features_count, cat_features_buf, api); + if (!api.CalcModelPredictionWithHashedCatFeatures( + model_calcer_handle, column_size, + float_features_buf, float_features_count, + cat_features_buf, cat_features_count, + result_buf, column_size * tree_count)) + { + throw Exception(error_msg + api.GetErrorString(), ErrorCodes::CANNOT_APPLY_CATBOOST_MODEL); + } + } + + return result; +} + +size_t CatBoostLibraryHandler::getTreeCount() const +{ + std::lock_guard lock(mutex); + return tree_count; +} + +ColumnPtr CatBoostLibraryHandler::evaluate(const ColumnRawPtrs & columns) const +{ + std::lock_guard lock(mutex); + + if (columns.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Got empty columns list for CatBoost model."); + + if (columns.size() != float_features_count + cat_features_count) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Number of columns is different with number of features: columns size {} float features size {} + cat features size {}", + columns.size(), + float_features_count, + cat_features_count); + + for (size_t i = 0; i < float_features_count; ++i) + { + if (!columns[i]->isNumeric()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Column {} should be numeric to make float feature.", i); + } + } + + bool cat_features_are_strings = true; + for (size_t i = float_features_count; i < float_features_count + cat_features_count; ++i) + { + const auto * column = columns[i]; + if (column->isNumeric()) + { + cat_features_are_strings = false; + } + else if (!(typeid_cast(column) + || typeid_cast(column))) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Column {} should be numeric or string.", i); + } + } + + auto result = evalImpl(columns, cat_features_are_strings); + + if (tree_count == 1) + return result; + + size_t column_size = columns.front()->size(); + auto * result_buf = result->getData().data(); + + /// Multiple trees case. Copy data to several columns. 
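/// For example: with tree_count = 2 and column_size = 3, evalImpl() returns the flat buffer
/// [r0t0, r0t1, r1t0, r1t1, r2t0, r2t1] (one row of per-tree values after another); the loop below
/// redistributes it into two Float64 columns [r0t0, r1t0, r2t0] and [r0t1, r1t1, r2t1] and wraps them
/// into a ColumnTuple.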
+ MutableColumns mutable_columns(tree_count); + std::vector column_ptrs(tree_count); + for (size_t i = 0; i < tree_count; ++i) + { + auto col = ColumnFloat64::create(column_size); + column_ptrs[i] = col->getData().data(); + mutable_columns[i] = std::move(col); + } + + Float64 * data = result_buf; + for (size_t row = 0; row < column_size; ++row) + { + for (size_t i = 0; i < tree_count; ++i) + { + *column_ptrs[i] = *data; + ++column_ptrs[i]; + ++data; + } + } + + return ColumnTuple::create(std::move(mutable_columns)); +} + +} diff --git a/programs/library-bridge/CatBoostLibraryHandler.h b/programs/library-bridge/CatBoostLibraryHandler.h new file mode 100644 index 00000000000..e0ff1d70250 --- /dev/null +++ b/programs/library-bridge/CatBoostLibraryHandler.h @@ -0,0 +1,78 @@ +#pragma once + +#include "CatBoostLibraryAPI.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +/// Abstracts access to the CatBoost shared library. +class CatBoostLibraryHandler +{ +public: + /// Holds pointers to CatBoost library functions + struct APIHolder + { + explicit APIHolder(SharedLibrary & lib); + + // NOLINTBEGIN(readability-identifier-naming) + CatBoostLibraryAPI::ModelCalcerCreateFunc ModelCalcerCreate; + CatBoostLibraryAPI::ModelCalcerDeleteFunc ModelCalcerDelete; + CatBoostLibraryAPI::GetErrorStringFunc GetErrorString; + CatBoostLibraryAPI::LoadFullModelFromFileFunc LoadFullModelFromFile; + CatBoostLibraryAPI::CalcModelPredictionFlatFunc CalcModelPredictionFlat; + CatBoostLibraryAPI::CalcModelPredictionFunc CalcModelPrediction; + CatBoostLibraryAPI::CalcModelPredictionWithHashedCatFeaturesFunc CalcModelPredictionWithHashedCatFeatures; + CatBoostLibraryAPI::GetStringCatFeatureHashFunc GetStringCatFeatureHash; + CatBoostLibraryAPI::GetIntegerCatFeatureHashFunc GetIntegerCatFeatureHash; + CatBoostLibraryAPI::GetFloatFeaturesCountFunc GetFloatFeaturesCount; + CatBoostLibraryAPI::GetCatFeaturesCountFunc GetCatFeaturesCount; + CatBoostLibraryAPI::GetTreeCountFunc GetTreeCount; + CatBoostLibraryAPI::GetDimensionsCountFunc GetDimensionsCount; + // NOLINTEND(readability-identifier-naming) + }; + + CatBoostLibraryHandler( + const String & library_path, + const String & model_path); + + ~CatBoostLibraryHandler(); + + std::chrono::system_clock::time_point getLoadingStartTime() const; + std::chrono::milliseconds getLoadingDuration() const; + + size_t getTreeCount() const; + + ColumnPtr evaluate(const ColumnRawPtrs & columns) const; + +private: + std::chrono::system_clock::time_point loading_start_time; + std::chrono::milliseconds loading_duration; + + const SharedLibraryPtr library; + const APIHolder api; + + mutable std::mutex mutex; + + CatBoostLibraryAPI::ModelCalcerHandle * model_calcer_handle TSA_GUARDED_BY(mutex) TSA_PT_GUARDED_BY(mutex); + + size_t float_features_count TSA_GUARDED_BY(mutex); + size_t cat_features_count TSA_GUARDED_BY(mutex); + size_t tree_count TSA_GUARDED_BY(mutex); + + ColumnFloat64::MutablePtr evalImpl(const ColumnRawPtrs & columns, bool cat_features_are_strings) const TSA_REQUIRES(mutex); +}; + +using CatBoostLibraryHandlerPtr = std::shared_ptr; + +} diff --git a/programs/library-bridge/CatBoostLibraryHandlerFactory.cpp b/programs/library-bridge/CatBoostLibraryHandlerFactory.cpp new file mode 100644 index 00000000000..6ee078f6c5c --- /dev/null +++ b/programs/library-bridge/CatBoostLibraryHandlerFactory.cpp @@ -0,0 +1,80 @@ +#include "CatBoostLibraryHandlerFactory.h" + +#include + + +namespace DB +{ + 
+CatBoostLibraryHandlerFactory & CatBoostLibraryHandlerFactory::instance() +{ + static CatBoostLibraryHandlerFactory instance; + return instance; +} + +CatBoostLibraryHandlerFactory::CatBoostLibraryHandlerFactory() + : log(&Poco::Logger::get("CatBoostLibraryHandlerFactory")) +{ +} + +CatBoostLibraryHandlerPtr CatBoostLibraryHandlerFactory::tryGetModel(const String & model_path, const String & library_path, bool create_if_not_found) +{ + std::lock_guard lock(mutex); + + auto handler = library_handlers.find(model_path); + bool found = (handler != library_handlers.end()); + + if (found) + return handler->second; + else + { + if (create_if_not_found) + { + auto new_handler = std::make_shared(library_path, model_path); + library_handlers.emplace(model_path, new_handler); + LOG_DEBUG(log, "Loaded catboost library handler for model path '{}'", model_path); + return new_handler; + } + return nullptr; + } +} + +void CatBoostLibraryHandlerFactory::removeModel(const String & model_path) +{ + std::lock_guard lock(mutex); + + bool deleted = library_handlers.erase(model_path); + if (!deleted) + { + LOG_DEBUG(log, "Cannot unload catboost library handler for model path '{}'", model_path); + return; + } + LOG_DEBUG(log, "Unloaded catboost library handler for model path '{}'", model_path); +} + +void CatBoostLibraryHandlerFactory::removeAllModels() +{ + std::lock_guard lock(mutex); + library_handlers.clear(); + LOG_DEBUG(log, "Unloaded all catboost library handlers"); +} + +ExternalModelInfos CatBoostLibraryHandlerFactory::getModelInfos() +{ + std::lock_guard lock(mutex); + + ExternalModelInfos result; + + for (const auto & handler : library_handlers) + result.push_back({ + .model_path = handler.first, + .model_type = "catboost", + .loading_start_time = handler.second->getLoadingStartTime(), + .loading_duration = handler.second->getLoadingDuration() + }); + + return result; + +} + +} diff --git a/programs/library-bridge/CatBoostLibraryHandlerFactory.h b/programs/library-bridge/CatBoostLibraryHandlerFactory.h new file mode 100644 index 00000000000..6ba3fe84ec9 --- /dev/null +++ b/programs/library-bridge/CatBoostLibraryHandlerFactory.h @@ -0,0 +1,37 @@ +#pragma once + +#include "CatBoostLibraryHandler.h" + +#include +#include + +#include +#include +#include + + +namespace DB +{ + +class CatBoostLibraryHandlerFactory final : private boost::noncopyable +{ +public: + static CatBoostLibraryHandlerFactory & instance(); + + CatBoostLibraryHandlerFactory(); + + CatBoostLibraryHandlerPtr tryGetModel(const String & model_path, const String & library_path, bool create_if_not_found); + + void removeModel(const String & model_path); + void removeAllModels(); + + ExternalModelInfos getModelInfos(); + +private: + /// map: model path --> catboost library handler + std::unordered_map library_handlers TSA_GUARDED_BY(mutex); + std::mutex mutex; + Poco::Logger * log; +}; + +} diff --git a/programs/library-bridge/ExternalDictionaryLibraryHandler.h b/programs/library-bridge/ExternalDictionaryLibraryHandler.h index 7713e9a6830..d6d4f926f0f 100644 --- a/programs/library-bridge/ExternalDictionaryLibraryHandler.h +++ b/programs/library-bridge/ExternalDictionaryLibraryHandler.h @@ -50,6 +50,6 @@ private: void * lib_data; }; -using SharedLibraryHandlerPtr = std::shared_ptr; +using ExternalDictionaryLibraryHandlerPtr = std::shared_ptr; } diff --git a/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.cpp b/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.cpp index ffa5ff6f493..6acd9af20ed 100644 --- 
a/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.cpp +++ b/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.cpp @@ -1,37 +1,40 @@ #include "ExternalDictionaryLibraryHandlerFactory.h" +#include namespace DB { -SharedLibraryHandlerPtr ExternalDictionaryLibraryHandlerFactory::get(const std::string & dictionary_id) +ExternalDictionaryLibraryHandlerPtr ExternalDictionaryLibraryHandlerFactory::get(const String & dictionary_id) { std::lock_guard lock(mutex); - auto library_handler = library_handlers.find(dictionary_id); - - if (library_handler != library_handlers.end()) - return library_handler->second; + if (auto handler = library_handlers.find(dictionary_id); handler != library_handlers.end()) + return handler->second; return nullptr; } void ExternalDictionaryLibraryHandlerFactory::create( - const std::string & dictionary_id, - const std::string & library_path, - const std::vector & library_settings, + const String & dictionary_id, + const String & library_path, + const std::vector & library_settings, const Block & sample_block, - const std::vector & attributes_names) + const std::vector & attributes_names) { std::lock_guard lock(mutex); - if (!library_handlers.contains(dictionary_id)) - library_handlers.emplace(std::make_pair(dictionary_id, std::make_shared(library_path, library_settings, sample_block, attributes_names))); - else + + if (library_handlers.contains(dictionary_id)) + { LOG_WARNING(&Poco::Logger::get("ExternalDictionaryLibraryHandlerFactory"), "Library handler with dictionary id {} already exists", dictionary_id); + return; + } + + library_handlers.emplace(std::make_pair(dictionary_id, std::make_shared(library_path, library_settings, sample_block, attributes_names))); } -bool ExternalDictionaryLibraryHandlerFactory::clone(const std::string & from_dictionary_id, const std::string & to_dictionary_id) +bool ExternalDictionaryLibraryHandlerFactory::clone(const String & from_dictionary_id, const String & to_dictionary_id) { std::lock_guard lock(mutex); auto from_library_handler = library_handlers.find(from_dictionary_id); @@ -45,7 +48,7 @@ bool ExternalDictionaryLibraryHandlerFactory::clone(const std::string & from_dic } -bool ExternalDictionaryLibraryHandlerFactory::remove(const std::string & dictionary_id) +bool ExternalDictionaryLibraryHandlerFactory::remove(const String & dictionary_id) { std::lock_guard lock(mutex); /// extDict_libDelete is called in destructor. 
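Both CatBoostLibraryHandlerFactory above and ExternalDictionaryLibraryHandlerFactory in this hunk follow the same create-or-get idiom: a mutex-guarded map keyed by an id, with handlers constructed lazily on first use and dropped on removal. A minimal, self-contained sketch of that idiom (the HandlerCache name and its interface are illustrative, not part of this change):

#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>

template <typename Handler>
class HandlerCache
{
public:
    /// Returns the handler for `key`, constructing it on first request.
    template <typename... Args>
    std::shared_ptr<Handler> getOrCreate(const std::string & key, Args &&... args)
    {
        std::lock_guard lock(mutex);
        if (auto it = handlers.find(key); it != handlers.end())
            return it->second;
        auto handler = std::make_shared<Handler>(std::forward<Args>(args)...);
        handlers.emplace(key, handler);
        return handler;
    }

    /// Returns true if a handler was actually removed.
    bool remove(const std::string & key)
    {
        std::lock_guard lock(mutex);
        return handlers.erase(key) > 0;
    }

private:
    std::unordered_map<std::string, std::shared_ptr<Handler>> handlers;
    std::mutex mutex;
};

The shared_ptr values keep a removed handler alive until its last user releases it, which is also why the factories in this diff hand out shared pointers rather than raw ones.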
diff --git a/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.h b/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.h index d821270c474..3dfafd82a0f 100644 --- a/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.h +++ b/programs/library-bridge/ExternalDictionaryLibraryHandlerFactory.h @@ -17,22 +17,22 @@ class ExternalDictionaryLibraryHandlerFactory final : private boost::noncopyable public: static ExternalDictionaryLibraryHandlerFactory & instance(); - SharedLibraryHandlerPtr get(const std::string & dictionary_id); + ExternalDictionaryLibraryHandlerPtr get(const String & dictionary_id); void create( - const std::string & dictionary_id, - const std::string & library_path, - const std::vector & library_settings, + const String & dictionary_id, + const String & library_path, + const std::vector & library_settings, const Block & sample_block, - const std::vector & attributes_names); + const std::vector & attributes_names); - bool clone(const std::string & from_dictionary_id, const std::string & to_dictionary_id); + bool clone(const String & from_dictionary_id, const String & to_dictionary_id); - bool remove(const std::string & dictionary_id); + bool remove(const String & dictionary_id); private: /// map: dict_id -> sharedLibraryHandler - std::unordered_map library_handlers TSA_GUARDED_BY(mutex); + std::unordered_map library_handlers TSA_GUARDED_BY(mutex); std::mutex mutex; }; diff --git a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp index f8f6a23e1be..4af1f8355e8 100644 --- a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp +++ b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp @@ -27,12 +27,16 @@ std::unique_ptr LibraryBridgeHandlerFactory::createRequestHa { if (uri.getPath() == "/extdict_ping") return std::make_unique(keep_alive_timeout, getContext()); + else if (uri.getPath() == "/catboost_ping") + return std::make_unique(keep_alive_timeout, getContext()); } if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { if (uri.getPath() == "/extdict_request") return std::make_unique(keep_alive_timeout, getContext()); + else if (uri.getPath() == "/catboost_request") + return std::make_unique(keep_alive_timeout, getContext()); } return nullptr; diff --git a/programs/library-bridge/LibraryBridgeHandlers.cpp b/programs/library-bridge/LibraryBridgeHandlers.cpp index a28148bd1f7..ab81472be88 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.cpp +++ b/programs/library-bridge/LibraryBridgeHandlers.cpp @@ -1,24 +1,32 @@ #include "LibraryBridgeHandlers.h" + +#include "CatBoostLibraryHandler.h" +#include "CatBoostLibraryHandlerFactory.h" +#include "ExternalDictionaryLibraryHandler.h" #include "ExternalDictionaryLibraryHandlerFactory.h" #include -#include -#include +#include #include #include +#include +#include #include #include -#include #include -#include -#include -#include #include #include +#include +#include #include #include +#include #include -#include +#include +#include +#include +#include +#include namespace DB @@ -31,7 +39,7 @@ namespace ErrorCodes namespace { - void processError(HTTPServerResponse & response, const std::string & message) + void processError(HTTPServerResponse & response, const String & message) { response.setStatusAndReason(HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); @@ -41,7 +49,7 @@ namespace LOG_WARNING(&Poco::Logger::get("LibraryBridge"), fmt::runtime(message)); } - std::shared_ptr parseColumns(std::string && column_string) + 
std::shared_ptr parseColumns(String && column_string) { auto sample_block = std::make_shared(); auto names_and_types = NamesAndTypesList::parse(column_string); @@ -59,10 +67,10 @@ namespace return ids; } - std::vector parseNamesFromBinary(const std::string & names_string) + std::vector parseNamesFromBinary(const String & names_string) { ReadBufferFromString buf(names_string); - std::vector names; + std::vector names; readVectorBinary(names, buf); return names; } @@ -79,13 +87,15 @@ static void writeData(Block data, OutputFormatPtr format) executor.execute(); } + ExternalDictionaryLibraryBridgeRequestHandler::ExternalDictionaryLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_) : WithContext(context_) - , log(&Poco::Logger::get("ExternalDictionaryLibraryBridgeRequestHandler")) , keep_alive_timeout(keep_alive_timeout_) + , log(&Poco::Logger::get("ExternalDictionaryLibraryBridgeRequestHandler")) { } + void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { LOG_TRACE(log, "Request URI: {}", request.getURI()); @@ -97,7 +107,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ version = 0; /// assumed version for too old servers which do not send a version else { - String version_str = params.get("version"); + const String & version_str = params.get("version"); if (!tryParse(version, version_str)) { processError(response, "Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); @@ -124,8 +134,8 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ return; } - std::string method = params.get("method"); - std::string dictionary_id = params.get("dictionary_id"); + const String & method = params.get("method"); + const String & dictionary_id = params.get("dictionary_id"); LOG_TRACE(log, "Library method: '{}', dictionary id: {}", method, dictionary_id); WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); @@ -141,7 +151,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ return; } - std::string from_dictionary_id = params.get("from_dictionary_id"); + const String & from_dictionary_id = params.get("from_dictionary_id"); bool cloned = false; cloned = ExternalDictionaryLibraryHandlerFactory::instance().clone(from_dictionary_id, dictionary_id); @@ -166,7 +176,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ return; } - std::string library_path = params.get("library_path"); + const String & library_path = params.get("library_path"); if (!params.has("library_settings")) { @@ -174,10 +184,10 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ return; } - const auto & settings_string = params.get("library_settings"); + const String & settings_string = params.get("library_settings"); LOG_DEBUG(log, "Parsing library settings from binary string"); - std::vector library_settings = parseNamesFromBinary(settings_string); + std::vector library_settings = parseNamesFromBinary(settings_string); /// Needed for library dictionary if (!params.has("attributes_names")) @@ -186,10 +196,10 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ return; } - const auto & attributes_string = params.get("attributes_names"); + const String & attributes_string = params.get("attributes_names"); 
LOG_DEBUG(log, "Parsing attributes names from binary string"); - std::vector attributes_names = parseNamesFromBinary(attributes_string); + std::vector attributes_names = parseNamesFromBinary(attributes_string); /// Needed to parse block from binary string format if (!params.has("sample_block")) @@ -197,7 +207,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ processError(response, "No 'sample_block' in request URL"); return; } - std::string sample_block_string = params.get("sample_block"); + String sample_block_string = params.get("sample_block"); std::shared_ptr sample_block; try @@ -297,7 +307,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ return; } - std::string requested_block_string = params.get("requested_block_sample"); + String requested_block_string = params.get("requested_block_sample"); std::shared_ptr requested_sample_block; try @@ -332,7 +342,8 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ } else { - LOG_WARNING(log, "Unknown library method: '{}'", method); + processError(response, "Unknown library method '" + method + "'"); + LOG_ERROR(log, "Unknown library method: '{}'", method); } } catch (...) @@ -362,6 +373,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ } } + ExternalDictionaryLibraryBridgeExistsHandler::ExternalDictionaryLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) : WithContext(context_) , keep_alive_timeout(keep_alive_timeout_) @@ -369,6 +381,7 @@ ExternalDictionaryLibraryBridgeExistsHandler::ExternalDictionaryLibraryBridgeExi { } + void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { try @@ -382,7 +395,7 @@ void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerReque return; } - std::string dictionary_id = params.get("dictionary_id"); + const String & dictionary_id = params.get("dictionary_id"); auto library_handler = ExternalDictionaryLibraryHandlerFactory::instance().get(dictionary_id); @@ -399,4 +412,230 @@ void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerReque } +CatBoostLibraryBridgeRequestHandler::CatBoostLibraryBridgeRequestHandler( + size_t keep_alive_timeout_, ContextPtr context_) + : WithContext(context_) + , keep_alive_timeout(keep_alive_timeout_) + , log(&Poco::Logger::get("CatBoostLibraryBridgeRequestHandler")) +{ +} + + +void CatBoostLibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) +{ + LOG_TRACE(log, "Request URI: {}", request.getURI()); + HTMLForm params(getContext()->getSettingsRef(), request); + + size_t version; + + if (!params.has("version")) + version = 0; /// assumed version for too old servers which do not send a version + else + { + const String & version_str = params.get("version"); + if (!tryParse(version, version_str)) + { + processError(response, "Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); + return; + } + } + + if (version != LIBRARY_BRIDGE_PROTOCOL_VERSION) + { + /// backwards compatibility is considered unnecessary for now, just let the user know that the server and the bridge must be upgraded together + processError(response, "Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. 
'" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + if (!params.has("method")) + { + processError(response, "No 'method' in request URL"); + return; + } + + const String & method = params.get("method"); + + LOG_TRACE(log, "Library method: '{}'", method); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + + try + { + if (method == "catboost_list") + { + ExternalModelInfos model_infos = CatBoostLibraryHandlerFactory::instance().getModelInfos(); + + writeIntBinary(static_cast(model_infos.size()), out); + + for (const auto & info : model_infos) + { + writeStringBinary(info.model_path, out); + writeStringBinary(info.model_type, out); + + UInt64 t = std::chrono::system_clock::to_time_t(info.loading_start_time); + writeIntBinary(t, out); + + t = info.loading_duration.count(); + writeIntBinary(t, out); + + } + } + else if (method == "catboost_removeModel") + { + auto & read_buf = request.getStream(); + params.read(read_buf); + + if (!params.has("model_path")) + { + processError(response, "No 'model_path' in request URL"); + return; + } + + const String & model_path = params.get("model_path"); + + CatBoostLibraryHandlerFactory::instance().removeModel(model_path); + + String res = "1"; + writeStringBinary(res, out); + } + else if (method == "catboost_removeAllModels") + { + CatBoostLibraryHandlerFactory::instance().removeAllModels(); + + String res = "1"; + writeStringBinary(res, out); + } + else if (method == "catboost_GetTreeCount") + { + auto & read_buf = request.getStream(); + params.read(read_buf); + + if (!params.has("library_path")) + { + processError(response, "No 'library_path' in request URL"); + return; + } + + const String & library_path = params.get("library_path"); + + if (!params.has("model_path")) + { + processError(response, "No 'model_path' in request URL"); + return; + } + + const String & model_path = params.get("model_path"); + + auto catboost_handler = CatBoostLibraryHandlerFactory::instance().tryGetModel(model_path, library_path, /*create_if_not_found*/ true); + size_t tree_count = catboost_handler->getTreeCount(); + writeIntBinary(tree_count, out); + } + else if (method == "catboost_libEvaluate") + { + auto & read_buf = request.getStream(); + params.read(read_buf); + + if (!params.has("model_path")) + { + processError(response, "No 'model_path' in request URL"); + return; + } + + const String & model_path = params.get("model_path"); + + if (!params.has("data")) + { + processError(response, "No 'data' in request URL"); + return; + } + + const String & data = params.get("data"); + + ReadBufferFromString string_read_buf(data); + NativeReader deserializer(string_read_buf, /*server_revision*/ 0); + Block block_read = deserializer.read(); + + Columns col_ptrs = block_read.getColumns(); + ColumnRawPtrs col_raw_ptrs; + for (const auto & p : col_ptrs) + col_raw_ptrs.push_back(&*p); + + auto catboost_handler = CatBoostLibraryHandlerFactory::instance().tryGetModel(model_path, "DummyLibraryPath", /*create_if_not_found*/ false); + + if (!catboost_handler) + { + processError(response, "CatBoost library is not loaded for model '" + model_path + "'. 
Please try again."); + return; + } + + ColumnPtr res_col = catboost_handler->evaluate(col_raw_ptrs); + + DataTypePtr res_col_type = std::make_shared(); + String res_col_name = "res_col"; + + ColumnsWithTypeAndName res_cols_with_type_and_name = {{res_col, res_col_type, res_col_name}}; + + Block block_write(res_cols_with_type_and_name); + NativeWriter serializer{out, /*client_revision*/ 0, block_write}; + serializer.write(block_write); + } + else + { + processError(response, "Unknown library method '" + method + "'"); + LOG_ERROR(log, "Unknown library method: '{}'", method); + } + } + catch (...) + { + auto message = getCurrentExceptionMessage(true); + LOG_ERROR(log, "Failed to process request. Error: {}", message); + + response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR, message); // can't call processError because sending of the response has already started + try + { + writeStringBinary(message, out); + out.finalize(); + } + catch (...) + { + tryLogCurrentException(log); + } + } + + try + { + out.finalize(); + } + catch (...) + { + tryLogCurrentException(log); + } +} + + +CatBoostLibraryBridgeExistsHandler::CatBoostLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) + : WithContext(context_) + , keep_alive_timeout(keep_alive_timeout_) + , log(&Poco::Logger::get("CatBoostLibraryBridgeExistsHandler")) +{ +} + + +void CatBoostLibraryBridgeExistsHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) +{ + try + { + LOG_TRACE(log, "Request URI: {}", request.getURI()); + HTMLForm params(getContext()->getSettingsRef(), request); + + String res = "1"; + + setResponseDefaultHeaders(response, keep_alive_timeout); + LOG_TRACE(log, "Sending ping response: {}", res); + response.sendBuffer(res.data(), res.size()); + } + catch (...) + { + tryLogCurrentException("PingHandler"); + } +} + } diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index b20f40616ce..16815e84723 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ -1,9 +1,8 @@ #pragma once +#include #include #include -#include -#include "ExternalDictionaryLibraryHandler.h" namespace DB @@ -26,11 +25,12 @@ public: private: static constexpr inline auto FORMAT = "RowBinary"; + const size_t keep_alive_timeout; Poco::Logger * log; - size_t keep_alive_timeout; }; +// Handler for checking if the external dictionary library is loaded (used for handshake) class ExternalDictionaryLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: @@ -43,4 +43,47 @@ private: Poco::Logger * log; }; + +/// Handler for requests to catboost library. The call protocol is as follows: +/// (1) Send a "catboost_GetTreeCount" request from the server to the bridge. It contains a library path (e.g. /home/user/libcatboost.so) and +/// a model path (e.g. /home/user/model.bin). This loads the catboost library handler associated with the model path, then executes +/// GetTreeCount() on the library handler and sends the result back to the server. +/// (2) Send "catboost_libEvaluate" from the server to the bridge. It contains a model path and the features to run the inference on. Step +/// (2) is called multiple times (once per chunk) by the server.
+/// +/// We would ideally like to have steps (1) and (2) in one atomic handler but can't because the evaluation on the server side is divided +/// into two dependent phases: FunctionCatBoostEvaluate::getReturnTypeImpl() and ::executeImpl(). So the model may in principle be unloaded +/// from the library-bridge between steps (1) and (2). Step (2) checks if that is the case and fails gracefully. This is okay because that +/// situation considered exceptional and rare. +/// +/// An update of a model is performed by unloading it. The first call to "catboost_GetTreeCount" brings it into memory again. +/// +/// Further handlers are provided for unloading a specific model, for unloading all models or for retrieving information about the loaded +/// models for display in a system view. +class CatBoostLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext +{ +public: + CatBoostLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_); + + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; + +private: + const size_t keep_alive_timeout; + Poco::Logger * log; +}; + + +// Handler for pinging the library-bridge for catboost access (used for handshake) +class CatBoostLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext +{ +public: + CatBoostLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_); + + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; + +private: + const size_t keep_alive_timeout; + Poco::Logger * log; +}; + } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 414766ee42a..5c09ba5b52e 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -551,8 +550,9 @@ static void sanityChecks(Server & server) try { const char * filename = "/sys/devices/system/clocksource/clocksource0/current_clocksource"; - if (readString(filename).find("tsc") == std::string::npos) - server.context()->addWarningMessage("Linux is not using a fast TSC clock source. Performance can be degraded. Check " + String(filename)); + String clocksource = readString(filename); + if (clocksource.find("tsc") == std::string::npos && clocksource.find("kvm-clock") == std::string::npos) + server.context()->addWarningMessage("Linux is not using a fast clock source. Performance can be degraded. Check " + String(filename)); } catch (...) { @@ -1157,7 +1157,6 @@ int Server::main(const std::vector & /*args*/) global_context->setExternalAuthenticatorsConfig(*config); global_context->loadOrReloadDictionaries(*config); - global_context->loadOrReloadModels(*config); global_context->loadOrReloadUserDefinedExecutableFunctions(*config); global_context->setRemoteHostFilter(*config); @@ -1738,17 +1737,6 @@ int Server::main(const std::vector & /*args*/) throw; } - /// try to load models immediately, throw on error and die - try - { - global_context->loadOrReloadModels(config()); - } - catch (...) 
- { - tryLogCurrentException(log, "Caught exception while loading dictionaries."); - throw; - } - /// try to load user defined executable functions, throw on error and die try { diff --git a/src/AggregateFunctions/AggregateFunctionIf.cpp b/src/AggregateFunctions/AggregateFunctionIf.cpp index fa5e6b85a1e..0cf92585b77 100644 --- a/src/AggregateFunctions/AggregateFunctionIf.cpp +++ b/src/AggregateFunctions/AggregateFunctionIf.cpp @@ -278,6 +278,71 @@ public: } } + void addBatchSinglePlace( + size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn ** columns, Arena * arena, ssize_t) const final + { + std::unique_ptr final_null_flags = std::make_unique(row_end); + const size_t filter_column_num = number_of_arguments - 1; + + if (is_nullable[filter_column_num]) + { + const ColumnNullable * nullable_column = assert_cast(columns[filter_column_num]); + const IColumn & filter_column = nullable_column->getNestedColumn(); + const UInt8 * filter_null_map = nullable_column->getNullMapColumn().getData().data(); + const UInt8 * filter_values = assert_cast(filter_column).getData().data(); + + for (size_t i = row_begin; i < row_end; i++) + { + final_null_flags[i] = (null_is_skipped && filter_null_map[i]) || !filter_values[i]; + } + } + else + { + const IColumn * filter_column = columns[filter_column_num]; + const UInt8 * filter_values = assert_cast(filter_column)->getData().data(); + for (size_t i = row_begin; i < row_end; i++) + final_null_flags[i] = !filter_values[i]; + } + + const IColumn * nested_columns[number_of_arguments]; + for (size_t arg = 0; arg < number_of_arguments; arg++) + { + if (is_nullable[arg]) + { + const ColumnNullable & nullable_col = assert_cast(*columns[arg]); + if (null_is_skipped && (arg != filter_column_num)) + { + const ColumnUInt8 & nullmap_column = nullable_col.getNullMapColumn(); + const UInt8 * col_null_map = nullmap_column.getData().data(); + for (size_t r = row_begin; r < row_end; r++) + { + final_null_flags[r] |= col_null_map[r]; + } + } + nested_columns[arg] = &nullable_col.getNestedColumn(); + } + else + nested_columns[arg] = columns[arg]; + } + + bool at_least_one = false; + for (size_t i = row_begin; i < row_end; i++) + { + if (!final_null_flags[i]) + { + at_least_one = true; + break; + } + } + + if (at_least_one) + { + this->setFlag(place); + this->nested_function->addBatchSinglePlaceNotNull( + row_begin, row_end, this->nestedPlace(place), nested_columns, final_null_flags.get(), arena, -1); + } + } + #if USE_EMBEDDED_COMPILER void compileAdd(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr, const DataTypes & arguments_types, const std::vector & argument_values) const override diff --git a/src/AggregateFunctions/AggregateFunctionNull.h b/src/AggregateFunctions/AggregateFunctionNull.h index ca284680800..1e2c9326142 100644 --- a/src/AggregateFunctions/AggregateFunctionNull.h +++ b/src/AggregateFunctions/AggregateFunctionNull.h @@ -414,6 +414,109 @@ public: this->nested_function->add(this->nestedPlace(place), nested_columns, row_num, arena); } + void addBatchSinglePlace( + size_t row_begin, + size_t row_end, + AggregateDataPtr __restrict place, + const IColumn ** columns, + Arena * arena, + ssize_t if_argument_pos) const final + { + /// We are going to merge all the flags into a single one to be able to call the nested batching functions + std::vector nullable_filters; + const IColumn * nested_columns[number_of_arguments]; + + std::unique_ptr final_flags = nullptr; + const UInt8 * final_flags_ptr = nullptr; + + if (if_argument_pos 
>= 0) + { + final_flags = std::make_unique(row_end); + final_flags_ptr = final_flags.get(); + + size_t included_elements = 0; + const auto & flags = assert_cast(*columns[if_argument_pos]).getData(); + for (size_t i = row_begin; i < row_end; i++) + { + final_flags[i] = !flags.data()[i]; + included_elements += !!flags.data()[i]; + } + + if (included_elements == 0) + return; + if (included_elements != (row_end - row_begin)) + { + nullable_filters.push_back(final_flags_ptr); + } + } + + for (size_t i = 0; i < number_of_arguments; ++i) + { + if (is_nullable[i]) + { + const ColumnNullable & nullable_col = assert_cast(*columns[i]); + nested_columns[i] = &nullable_col.getNestedColumn(); + if constexpr (null_is_skipped) + { + const ColumnUInt8 & nullmap_column = nullable_col.getNullMapColumn(); + nullable_filters.push_back(nullmap_column.getData().data()); + } + } + else + { + nested_columns[i] = columns[i]; + } + } + + bool found_one = false; + + chassert(nullable_filters.size() > 0); /// We work under the assumption that we reach this because one argument was NULL + if (nullable_filters.size() == 1) + { + /// We can avoid making copies of the only filter but we still need to check that there is data to be added + final_flags_ptr = nullable_filters[0]; + for (size_t i = row_begin; i < row_end; i++) + { + if (!final_flags_ptr[i]) + { + found_one = true; + break; + } + } + } + else + { + if (!final_flags) + { + final_flags = std::make_unique(row_end); + final_flags_ptr = final_flags.get(); + } + + const size_t filter_start = nullable_filters[0] == final_flags_ptr ? 1 : 0; + for (size_t filter = filter_start; filter < nullable_filters.size(); filter++) + { + for (size_t i = row_begin; i < row_end; i++) + final_flags[i] |= nullable_filters[filter][i]; + } + + for (size_t i = row_begin; i < row_end; i++) + { + if (!final_flags_ptr[i]) + { + found_one = true; + break; + } + } + } + + if (!found_one) + return; // Nothing to do and nothing to mark + + this->setFlag(place); + this->nested_function->addBatchSinglePlaceNotNull( + row_begin, row_end, this->nestedPlace(place), nested_columns, final_flags_ptr, arena, -1); + } + #if USE_EMBEDDED_COMPILER diff --git a/src/AggregateFunctions/UniquesHashSet.h b/src/AggregateFunctions/UniquesHashSet.h index 8648f6e2500..54503e356c2 100644 --- a/src/AggregateFunctions/UniquesHashSet.h +++ b/src/AggregateFunctions/UniquesHashSet.h @@ -424,14 +424,30 @@ public: alloc(new_size_degree); - for (size_t i = 0; i < m_size; ++i) + if (m_size <= 1) { - HashValue x = 0; - DB::readIntBinary(x, rb); - if (x == 0) - has_zero = true; - else - reinsertImpl(x); + for (size_t i = 0; i < m_size; ++i) + { + HashValue x = 0; + DB::readIntBinary(x, rb); + if (x == 0) + has_zero = true; + else + reinsertImpl(x); + } + } + else + { + auto hs = std::make_unique(m_size); + rb.readStrict(reinterpret_cast(hs.get()), m_size * sizeof(HashValue)); + + for (size_t i = 0; i < m_size; ++i) + { + if (hs[i] == 0) + has_zero = true; + else + reinsertImpl(hs[i]); + } } } @@ -458,11 +474,24 @@ public: resize(new_size_degree); } - for (size_t i = 0; i < rhs_size; ++i) + if (rhs_size <= 1) { - HashValue x = 0; - DB::readIntBinary(x, rb); - insertHash(x); + for (size_t i = 0; i < rhs_size; ++i) + { + HashValue x = 0; + DB::readIntBinary(x, rb); + insertHash(x); + } + } + else + { + auto hs = std::make_unique(rhs_size); + rb.readStrict(reinterpret_cast(hs.get()), rhs_size * sizeof(HashValue)); + + for (size_t i = 0; i < rhs_size; ++i) + { + insertHash(hs[i]); + } } } diff --git
a/src/BridgeHelper/CatBoostLibraryBridgeHelper.cpp b/src/BridgeHelper/CatBoostLibraryBridgeHelper.cpp new file mode 100644 index 00000000000..b0ef9b91a28 --- /dev/null +++ b/src/BridgeHelper/CatBoostLibraryBridgeHelper.cpp @@ -0,0 +1,194 @@ +#include "CatBoostLibraryBridgeHelper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +CatBoostLibraryBridgeHelper::CatBoostLibraryBridgeHelper( + ContextPtr context_, + std::optional model_path_, + std::optional library_path_) + : LibraryBridgeHelper(context_->getGlobalContext()) + , model_path(model_path_) + , library_path(library_path_) +{ +} + +Poco::URI CatBoostLibraryBridgeHelper::getPingURI() const +{ + auto uri = createBaseURI(); + uri.setPath(PING_HANDLER); + return uri; +} + +Poco::URI CatBoostLibraryBridgeHelper::getMainURI() const +{ + auto uri = createBaseURI(); + uri.setPath(MAIN_HANDLER); + return uri; +} + + +Poco::URI CatBoostLibraryBridgeHelper::createRequestURI(const String & method) const +{ + auto uri = getMainURI(); + uri.addQueryParameter("version", std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION)); + uri.addQueryParameter("method", method); + return uri; +} + +bool CatBoostLibraryBridgeHelper::bridgeHandShake() +{ + String result; + try + { + ReadWriteBufferFromHTTP buf(getPingURI(), Poco::Net::HTTPRequest::HTTP_GET, {}, http_timeouts, credentials); + readString(result, buf); + } + catch (...) + { + tryLogCurrentException(log); + return false; + } + + if (result != "1") + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected message from library bridge: {}. Check that bridge and server have the same version.", result); + + return true; +} + +ExternalModelInfos CatBoostLibraryBridgeHelper::listModels() +{ + startBridgeSync(); + + ReadWriteBufferFromHTTP buf( + createRequestURI(CATBOOST_LIST_METHOD), + Poco::Net::HTTPRequest::HTTP_POST, + [](std::ostream &) {}, + http_timeouts, credentials); + + ExternalModelInfos result; + + UInt64 num_rows; + readIntBinary(num_rows, buf); + + for (UInt64 i = 0; i < num_rows; ++i) + { + ExternalModelInfo info; + + readStringBinary(info.model_path, buf); + readStringBinary(info.model_type, buf); + + UInt64 t; + readIntBinary(t, buf); + info.loading_start_time = std::chrono::system_clock::from_time_t(t); + + readIntBinary(t, buf); + info.loading_duration = std::chrono::milliseconds(t); + + result.push_back(info); + } + + return result; +} + +void CatBoostLibraryBridgeHelper::removeModel() +{ + startBridgeSync(); + + assert(model_path); + + ReadWriteBufferFromHTTP buf( + createRequestURI(CATBOOST_REMOVEMODEL_METHOD), + Poco::Net::HTTPRequest::HTTP_POST, + [this](std::ostream & os) + { + os << "model_path=" << escapeForFileName(*model_path); + }, + http_timeouts, credentials); + + String result; + readStringBinary(result, buf); + assert(result == "1"); +} + +void CatBoostLibraryBridgeHelper::removeAllModels() +{ + startBridgeSync(); + + ReadWriteBufferFromHTTP buf( + createRequestURI(CATBOOST_REMOVEALLMODELS_METHOD), + Poco::Net::HTTPRequest::HTTP_POST, + [](std::ostream &){}, + http_timeouts, credentials); + + String result; + readStringBinary(result, buf); + assert(result == "1"); +} + +size_t CatBoostLibraryBridgeHelper::getTreeCount() +{ + startBridgeSync(); + + assert(model_path && library_path); + + ReadWriteBufferFromHTTP buf( + createRequestURI(CATBOOST_GETTREECOUNT_METHOD), + Poco::Net::HTTPRequest::HTTP_POST, + [this](std::ostream & os) + { + os << 
"library_path=" << escapeForFileName(*library_path) << "&"; + os << "model_path=" << escapeForFileName(*model_path); + }, + http_timeouts, credentials); + + size_t result; + readIntBinary(result, buf); + return result; +} + +ColumnPtr CatBoostLibraryBridgeHelper::evaluate(const ColumnsWithTypeAndName & columns) +{ + startBridgeSync(); + + WriteBufferFromOwnString string_write_buf; + Block block(columns); + NativeWriter serializer(string_write_buf, /*client_revision*/ 0, block); + serializer.write(block); + + assert(model_path); + + ReadWriteBufferFromHTTP buf( + createRequestURI(CATBOOST_LIB_EVALUATE_METHOD), + Poco::Net::HTTPRequest::HTTP_POST, + [this, serialized = string_write_buf.str()](std::ostream & os) + { + os << "model_path=" << escapeForFileName(*model_path) << "&"; + os << "data=" << escapeForFileName(serialized); + }, + http_timeouts, credentials); + + NativeReader deserializer(buf, /*server_revision*/ 0); + Block block_read = deserializer.read(); + + return block_read.getColumns()[0]; +} + +} diff --git a/src/BridgeHelper/CatBoostLibraryBridgeHelper.h b/src/BridgeHelper/CatBoostLibraryBridgeHelper.h new file mode 100644 index 00000000000..91c94143147 --- /dev/null +++ b/src/BridgeHelper/CatBoostLibraryBridgeHelper.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class CatBoostLibraryBridgeHelper : public LibraryBridgeHelper +{ +public: + static constexpr inline auto PING_HANDLER = "/catboost_ping"; + static constexpr inline auto MAIN_HANDLER = "/catboost_request"; + + explicit CatBoostLibraryBridgeHelper( + ContextPtr context_, + std::optional model_path_ = std::nullopt, + std::optional library_path_ = std::nullopt); + + ExternalModelInfos listModels(); + + void removeModel(); /// requires model_path + void removeAllModels(); + + size_t getTreeCount(); /// requires model_path and library_path + ColumnPtr evaluate(const ColumnsWithTypeAndName & columns); /// requires model_path + +protected: + Poco::URI getPingURI() const override; + + Poco::URI getMainURI() const override; + + bool bridgeHandShake() override; + +private: + static constexpr inline auto CATBOOST_LIST_METHOD = "catboost_list"; + static constexpr inline auto CATBOOST_REMOVEMODEL_METHOD = "catboost_removeModel"; + static constexpr inline auto CATBOOST_REMOVEALLMODELS_METHOD = "catboost_removeAllModels"; + static constexpr inline auto CATBOOST_GETTREECOUNT_METHOD = "catboost_GetTreeCount"; + static constexpr inline auto CATBOOST_LIB_EVALUATE_METHOD = "catboost_libEvaluate"; + + Poco::URI createRequestURI(const String & method) const; + + const std::optional model_path; + const std::optional library_path; +}; + +} diff --git a/src/BridgeHelper/IBridgeHelper.h b/src/BridgeHelper/IBridgeHelper.h index 5068e84f885..a3348c81b68 100644 --- a/src/BridgeHelper/IBridgeHelper.h +++ b/src/BridgeHelper/IBridgeHelper.h @@ -12,8 +12,8 @@ namespace DB { -/// Common base class for XDBC and Library bridge helpers. -/// Contains helper methods to check/start bridge sync. +/// Base class for server-side bridge helpers, e.g. xdbc-bridge and library-bridge. 
+/// Contains helper methods to check/start bridge sync class IBridgeHelper: protected WithContext { diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 465d4358e91..7d05cbb0681 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1080,6 +1080,20 @@ bool ClientBase::receiveSampleBlock(Block & out, ColumnsDescription & columns_de } +void ClientBase::setInsertionTable(const ASTInsertQuery & insert_query) +{ + if (!global_context->hasInsertionTable() && insert_query.table) + { + String table = insert_query.table->as().shortName(); + if (!table.empty()) + { + String database = insert_query.database ? insert_query.database->as().shortName() : ""; + global_context->setInsertionTable(StorageID(database, table)); + } + } +} + + void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr parsed_query) { auto query = query_to_execute; @@ -1129,6 +1143,8 @@ void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr pars { /// If structure was received (thus, server has not thrown an exception), /// send our data with that structure. + setInsertionTable(parsed_insert_query); + sendData(sample, columns_description, parsed_query); receiveEndOfQuery(); } diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 219d35d87cd..278056130fd 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -113,6 +113,8 @@ protected: std::vector & external_tables_arguments, std::vector & hosts_and_ports_arguments) = 0; + void setInsertionTable(const ASTInsertQuery & insert_query); + private: void receiveResult(ASTPtr parsed_query); diff --git a/src/Common/ArrayCache.h b/src/Common/ArrayCache.h index f01ff94e38b..79aeddb09df 100644 --- a/src/Common/ArrayCache.h +++ b/src/Common/ArrayCache.h @@ -722,5 +722,3 @@ public: return res; } }; - -template constexpr size_t ArrayCache::min_chunk_size; diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 3645ac5594f..931f06fdb51 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -176,10 +176,10 @@ static void tryLogCurrentExceptionImpl(Poco::Logger * logger, const std::string void tryLogCurrentException(const char * log_name, const std::string & start_of_message) { - /// Under high memory pressure, any new allocation will definitelly lead - /// to MEMORY_LIMIT_EXCEEDED exception. + /// Under high memory pressure, new allocations throw a + /// MEMORY_LIMIT_EXCEEDED exception. /// - /// And in this case the exception will not be logged, so let's block the + /// In this case the exception will not be logged, so let's block the /// MemoryTracker until the exception will be logged. LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global); @@ -189,8 +189,8 @@ void tryLogCurrentException(const char * log_name, const std::string & start_of_ void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_message) { - /// Under high memory pressure, any new allocation will definitelly lead - /// to MEMORY_LIMIT_EXCEEDED exception. + /// Under high memory pressure, new allocations throw a + /// MEMORY_LIMIT_EXCEEDED exception. /// /// And in this case the exception will not be logged, so let's block the /// MemoryTracker until the exception will be logged. 
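The comment above relies on LockMemoryExceptionInThread acting as an RAII guard that keeps the memory tracker from throwing while an exception is being logged. A simplified sketch of that pattern (the class name, the thread-local counter and the allocation-tracker hook shown here are illustrative assumptions, not ClickHouse's actual implementation):

#include <cstddef>

class MemoryExceptionBlocker
{
public:
    MemoryExceptionBlocker() { ++blocked_depth; }   // entering a "must not throw" section
    ~MemoryExceptionBlocker() { --blocked_depth; }  // leaving it

    static bool blocked() { return blocked_depth > 0; }

private:
    static thread_local size_t blocked_depth;
};

thread_local size_t MemoryExceptionBlocker::blocked_depth = 0;

// A memory tracker would consult the flag before throwing, e.g.:
//
//     if (over_limit && !MemoryExceptionBlocker::blocked())
//         throwMemoryLimitExceeded();
//
// so that the logging code wrapped in a blocker can allocate without being
// interrupted by MEMORY_LIMIT_EXCEEDED.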
diff --git a/src/Common/ExternalModelInfo.h b/src/Common/ExternalModelInfo.h new file mode 100644 index 00000000000..378e4984af6 --- /dev/null +++ b/src/Common/ExternalModelInfo.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Details about external machine learning model, used by clickhouse-server and clickhouse-library-bridge +struct ExternalModelInfo +{ + String model_path; + String model_type; + std::chrono::system_clock::time_point loading_start_time; /// serialized as std::time_t + std::chrono::milliseconds loading_duration; /// serialized as UInt64 +}; + +using ExternalModelInfos = std::vector; + +} diff --git a/src/Common/MemoryStatisticsOS.cpp b/src/Common/MemoryStatisticsOS.cpp index 22f8446121f..f2d2ab5fea9 100644 --- a/src/Common/MemoryStatisticsOS.cpp +++ b/src/Common/MemoryStatisticsOS.cpp @@ -135,7 +135,7 @@ MemoryStatisticsOS::Data MemoryStatisticsOS::get() const struct kinfo_proc kp; size_t len = sizeof(struct kinfo_proc); - if (-1 == ::sysctl(mib, 4, &kp, &len, NULL, 0)) + if (-1 == ::sysctl(mib, 4, &kp, &len, nullptr, 0)) throwFromErrno("Cannot sysctl(kern.proc.pid." + std::to_string(self) + ")", ErrorCodes::SYSTEM_ERROR); if (sizeof(struct kinfo_proc) != len) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 7a1f94926d5..d5c2188ad01 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -130,16 +130,15 @@ void SpanHolder::finish() noexcept try { auto log = current_thread_trace_context.span_log.lock(); - if (!log) + + /// The log might be disabled, check it before use + if (log) { - // The log might be disabled. - return; + this->finish_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + log->add(OpenTelemetrySpanLogElement(*this)); } - - this->finish_time_us - = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - - log->add(OpenTelemetrySpanLogElement(*this)); } catch (...) { diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index fc5377b3783..76ada9e0d75 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -264,6 +264,18 @@ protected: } }; +/// Schedule jobs/tasks on global thread pool without implicit passing tracing context on current thread to underlying worker as parent tracing context. +/// +/// If you implement your own job/task scheduling upon global thread pool or schedules a long time running job in a infinite loop way, +/// you need to use class, or you need to use ThreadFromGlobalPool below. +/// +/// See the comments of ThreadPool below to know how it works. +using ThreadFromGlobalPoolNoTracingContextPropagation = ThreadFromGlobalPoolImpl; + +/// An alias of thread that execute jobs/tasks on global thread pool by implicit passing tracing context on current thread to underlying worker as parent tracing context. +/// If jobs/tasks are directly scheduled by using APIs of this class, you need to use this class or you need to use class above. +using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl; + /// Recommended thread pool for the case when multiple thread pools are created and destroyed. /// /// The template parameter of ThreadFromGlobalPool is set to false to disable tracing context propagation to underlying worker. @@ -274,9 +286,6 @@ protected: /// which means the tracing context initialized at underlying worker level won't be delete for a very long time. 
/// This would cause wrong context for further jobs scheduled in ThreadPool. /// -/// To make sure the tracing context are correctly propagated, we explicitly disable context propagation(including initialization and de-initialization) at underlying worker level. +/// To make sure the tracing context is correctly propagated, we explicitly disable context propagation(including initialization and de-initialization) at underlying worker level. /// -using ThreadPool = ThreadPoolImpl>; - -/// An alias for user code to execute a job in the global thread pool -using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl; +using ThreadPool = ThreadPoolImpl; diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 3e03ee0d6f4..f634bcbb281 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -189,6 +189,9 @@ KeeperConfigurationAndSettings::loadFromConfig(const Poco::Util::AbstractConfigu ret->coordination_settings->loadFromConfig("keeper_server.coordination_settings", config); + if (ret->coordination_settings->quorum_reads) + LOG_WARNING(&Poco::Logger::get("KeeperConfigurationAndSettings"), "Setting 'quorum_reads' is deprecated. Please use 'read_mode'"); + return ret; } diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 5247f5d7ec8..d6b0977b4fa 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -26,10 +26,12 @@ struct Settings; M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \ M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoid too often leader elections)", 0) \ M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Upper bound of election timer (avoid too often leader elections)", 0) \ + M(Milliseconds, leadership_expiry, 0, "How often will leader node check if it still has majority. Set it lower or equal to election_timeout_lower_bound_ms to have linearizable reads.", 0) \ M(UInt64, reserved_log_items, 100000, "How many log items to store (don't remove during compaction)", 0) \ M(UInt64, snapshot_distance, 100000, "How many log items we have to collect to write new snapshot", 0) \ M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \ M(Milliseconds, shutdown_timeout, 5000, "How much time we will wait until RAFT shutdown", 0) \ + M(Milliseconds, session_shutdown_timeout, 10000, "How much time we will wait until sessions are closed during shutdown", 0) \ M(Milliseconds, startup_timeout, 180000, "How much time we will wait until RAFT to start.", 0) \ M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \ @@ -37,11 +39,12 @@ struct Settings; M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \ M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \ M(UInt64, max_requests_batch_size, 100, "Max size of batch in requests count before it will be sent to RAFT", 0) \ - M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \ + M(Bool, quorum_reads, false, "Deprecated - use read_mode. 
Execute read requests as writes through whole RAFT consesus with similar speed", 0) \ M(Bool, force_sync, true, "Call fsync on each change in RAFT changelog", 0) \ M(Bool, compress_logs, true, "Write compressed coordination logs in ZSTD format", 0) \ M(Bool, compress_snapshots_with_zstd_format, true, "Write compressed snapshots in ZSTD format (instead of custom LZ4)", 0) \ - M(UInt64, configuration_change_tries_count, 20, "How many times we will try to apply configuration change (add/remove server) to the cluster", 0) + M(UInt64, configuration_change_tries_count, 20, "How many times we will try to apply configuration change (add/remove server) to the cluster", 0) \ + M(String, read_mode, "nonlinear", "How should reads be processed. Valid values: 'nonlinear', 'fastlinear', 'quorum'. 'nonlinear' is the fastest option because there are no consistency requirements", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 5b376a03b02..3445ef5ea23 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -6,6 +7,8 @@ #include #include #include +#include +#include #include #include @@ -30,22 +33,83 @@ namespace ErrorCodes KeeperDispatcher::KeeperDispatcher() : responses_queue(std::numeric_limits::max()) + , read_requests_queue(std::numeric_limits::max()) + , finalize_requests_queue(std::numeric_limits::max()) , configuration_and_settings(std::make_shared()) , log(&Poco::Logger::get("KeeperDispatcher")) { } +/// ZooKeepers has 2 requirements: +/// - writes need to be linearizable +/// - all requests from single session need to be processed in the order of their arrival +/// +/// Because of that, we cannot process read and write requests from SAME session at the same time. +/// To be able to process read and write requests in parallel we need to make sure that only 1 type +/// of request is being processed from a single session. +/// Multiple types from different sessions can be processed at the same time. +/// +/// We do some in-session housekeeping to make sure that the multithreaded request processing is correct. +/// When a request is received from a client, we check if there are requests being processed from that same +/// session, and if yes, of what type. If the types are the same, and there are no requests of different +/// type inbetetween, we can instanly add it to active request queue. Otherwise, we need to wait until +/// all requests of the other type are processed. +/// +/// There are multiple threads used for processing the request, each of them communicating with a queue. +/// Assumption: only one type of request is being processed from a same session at any point in time (read or write). +/// +/// requestThread -> requests currently being processed +/// readRequestThread -> thread for processing read requests +/// finalizeRequestThread -> thread for finalizing requests: +/// - in-session housekeeping, add requests to the active request queue if there are any +/// +/// If reads are linearizable without quorum, a request can possibly wait for a certain log to be committed. +/// In that case we add it to the waiting queue for that log. +/// When that log is committed, the committing thread will send that read request to readRequestThread so it can be processed. 
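As a rough illustration of the waiting rule described in the comment above (a sketch only, using a simplified stand-in instead of the real KeeperServer::NodeInfo): a replica may answer a linearizable read from local state only once it has caught up to the leader's last committed log; otherwise the read is parked until that log is applied.

#include <cstdint>

/// Simplified stand-in for KeeperServer::NodeInfo.
struct NodeInfo
{
    uint64_t term = 0;
    uint64_t last_committed_index = 0;
};

/// True when the read can be answered from local state immediately,
/// false when it must wait until the leader's last committed log is applied here.
bool canServeReadLocally(const NodeInfo & local, const NodeInfo & leader)
{
    /// We are behind if the leader is on a newer term or has committed further than us.
    const bool behind = local.term < leader.term
        || local.last_committed_index < leader.last_committed_index;
    return !behind;
}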
+/// void KeeperDispatcher::requestThread() { setThreadName("KeeperReqT"); /// Result of requests batch from previous iteration - RaftAppendResult prev_result = nullptr; - /// Requests from previous iteration. We store them to be able - /// to send errors to the client. - KeeperStorage::RequestsForSessions prev_batch; + RaftResult prev_result = nullptr; + const auto previous_quorum_done = [&] { return !prev_result || prev_result->has_result() || prev_result->get_result_code() != nuraft::cmd_result_code::OK; }; + const auto needs_quorum = [](const auto & coordination_settings, const auto & request) + { + return coordination_settings->quorum_reads || coordination_settings->read_mode.toString() == "quorum" || !request.request->isReadRequest(); + }; + + KeeperStorage::RequestsForSessions quorum_requests; + KeeperStorage::RequestsForSessions read_requests; + + auto process_quorum_requests = [&, this]() mutable + { + /// Forcefully process all previous pending requests + if (prev_result) + forceWaitAndProcessResult(prev_result); + + prev_result = server->putRequestBatch(quorum_requests); + + if (prev_result) + { + prev_result->when_ready([&, requests_for_sessions = std::move(quorum_requests)](nuraft::cmd_result> & result, nuraft::ptr &) mutable + { + if (!result.get_accepted() || result.get_result_code() == nuraft::cmd_result_code::TIMEOUT) + addErrorResponses(requests_for_sessions, Coordination::Error::ZOPERATIONTIMEOUT); + else if (result.get_result_code() != nuraft::cmd_result_code::OK) + addErrorResponses(requests_for_sessions, Coordination::Error::ZCONNECTIONLOSS); + }); + } + + quorum_requests.clear(); + }; + + /// ZooKeeper requires that the requests inside a single session are processed in a strict order + /// (we cannot process later requests before all the previous once are processed) + /// By making sure that at this point we can either have just read requests or just write requests + /// from a single session, we can process them independently while (!shutdown_called) { KeeperStorage::RequestForSession request; @@ -54,94 +118,67 @@ void KeeperDispatcher::requestThread() uint64_t max_wait = coordination_settings->operation_timeout_ms.totalMilliseconds(); uint64_t max_batch_size = coordination_settings->max_requests_batch_size; - /// The code below do a very simple thing: batch all write (quorum) requests into vector until - /// previous write batch is not finished or max_batch size achieved. The main complexity goes from - /// the ability to process read requests without quorum (from local state). So when we are collecting - /// requests into a batch we must check that the new request is not read request. Otherwise we have to - /// process all already accumulated write requests, wait them synchronously and only after that process - /// read request. So reads are some kind of "separator" for writes. try { - if (requests_queue->tryPop(request, max_wait)) + if (active_requests_queue->tryPop(request, max_wait)) { CurrentMetrics::sub(CurrentMetrics::KeeperOutstandingRequets); if (shutdown_called) break; - KeeperStorage::RequestsForSessions current_batch; + if (needs_quorum(coordination_settings, request)) + quorum_requests.emplace_back(request); + else + read_requests.emplace_back(request); - bool has_read_request = false; - - /// If new request is not read request or we must to process it through quorum. - /// Otherwise we will process it locally. 
- if (coordination_settings->quorum_reads || !request.request->isReadRequest()) + /// Waiting until previous append will be successful, or batch is big enough + /// has_result == false && get_result_code == OK means that our request still not processed. + /// Sometimes NuRaft set errorcode without setting result, so we check both here. + while (true) { - current_batch.emplace_back(request); + if (quorum_requests.size() > max_batch_size) + break; - /// Waiting until previous append will be successful, or batch is big enough - /// has_result == false && get_result_code == OK means that our request still not processed. - /// Sometimes NuRaft set errorcode without setting result, so we check both here. - while (prev_result && (!prev_result->has_result() && prev_result->get_result_code() == nuraft::cmd_result_code::OK) && current_batch.size() <= max_batch_size) + if (read_requests.size() > max_batch_size) { - /// Trying to get batch requests as fast as possible - if (requests_queue->tryPop(request, 1)) - { - CurrentMetrics::sub(CurrentMetrics::KeeperOutstandingRequets); - /// Don't append read request into batch, we have to process them separately - if (!coordination_settings->quorum_reads && request.request->isReadRequest()) - { - has_read_request = true; - break; - } - else - { + processReadRequests(coordination_settings, read_requests); - current_batch.emplace_back(request); - } - } - - if (shutdown_called) + if (previous_quorum_done()) break; } + + /// Trying to get batch requests as fast as possible + if (active_requests_queue->tryPop(request, 1)) + { + CurrentMetrics::sub(CurrentMetrics::KeeperOutstandingRequets); + if (needs_quorum(coordination_settings, request)) + quorum_requests.emplace_back(request); + else + read_requests.emplace_back(request); + } + else + { + /// batch of read requests can send at most one request + /// so we don't care if the previous batch hasn't received response + if (!read_requests.empty()) + processReadRequests(coordination_settings, read_requests); + + /// if we still didn't process previous batch we can + /// increase are current batch even more + if (previous_quorum_done()) + break; + } + + if (shutdown_called) + break; } - else - has_read_request = true; if (shutdown_called) break; - /// Forcefully process all previous pending requests - if (prev_result) - forceWaitAndProcessResult(prev_result, prev_batch); + if (!quorum_requests.empty()) + process_quorum_requests(); - /// Process collected write requests batch - if (!current_batch.empty()) - { - auto result = server->putRequestBatch(current_batch); - - if (result) - { - if (has_read_request) /// If we will execute read request next, than we have to process result now - forceWaitAndProcessResult(result, current_batch); - } - else - { - addErrorResponses(current_batch, Coordination::Error::ZCONNECTIONLOSS); - current_batch.clear(); - } - - prev_batch = std::move(current_batch); - prev_result = result; - } - - /// Read request always goes after write batch (last request) - if (has_read_request) - { - if (server->isLeaderAlive()) - server->putLocalReadRequest(request); - else - addErrorResponses({request}, Coordination::Error::ZCONNECTIONLOSS); - } } } catch (...) 
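The routing decision in the rewritten requestThread() above comes down to a single predicate. A hedged sketch of that `needs_quorum` check, with simplified stand-in types rather than the actual CoordinationSettings API:

#include <string>

/// Simplified view of the relevant coordination settings.
struct SettingsView
{
    bool quorum_reads = false;           /// deprecated switch, kept for compatibility
    std::string read_mode = "nonlinear"; /// "nonlinear", "fastlinear" or "quorum"
};

/// Mirrors the needs_quorum lambda above: every write goes through Raft,
/// and reads do too when the deprecated flag or read_mode == "quorum" requires it.
bool needsQuorum(const SettingsView & settings, bool is_read_request)
{
    return settings.quorum_reads || settings.read_mode == "quorum" || !is_read_request;
}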
@@ -151,6 +188,72 @@ void KeeperDispatcher::requestThread() } } +void KeeperDispatcher::processReadRequests(const CoordinationSettingsPtr & coordination_settings, KeeperStorage::RequestsForSessions & read_requests) +{ + if (coordination_settings->read_mode.toString() == "fastlinear") + { + // we just want to know what's the current latest committed log on Leader node + auto leader_info_result = server->getLeaderInfo(); + if (leader_info_result) + { + leader_info_result->when_ready([&, requests_for_sessions = std::move(read_requests)](nuraft::cmd_result> & result, nuraft::ptr & exception) mutable + { + if (!result.get_accepted() || result.get_result_code() == nuraft::cmd_result_code::TIMEOUT) + { + addErrorResponses(requests_for_sessions, Coordination::Error::ZOPERATIONTIMEOUT); + return; + } + + if (result.get_result_code() != nuraft::cmd_result_code::OK) + { + addErrorResponses(requests_for_sessions, Coordination::Error::ZCONNECTIONLOSS); + return; + } + + if (exception) + { + LOG_INFO(log, "Got exception while waiting for read results {}", exception->what()); + addErrorResponses(requests_for_sessions, Coordination::Error::ZCONNECTIONLOSS); + return; + } + + auto & leader_info_ctx = result.get(); + + if (!leader_info_ctx) + { + addErrorResponses(requests_for_sessions, Coordination::Error::ZCONNECTIONLOSS); + return; + } + + KeeperServer::NodeInfo leader_info; + leader_info.term = leader_info_ctx->get_ulong(); + leader_info.last_committed_index = leader_info_ctx->get_ulong(); + std::lock_guard lock(leader_waiter_mutex); + auto node_info = server->getNodeInfo(); + + /// we're behind, we need to wait + if (node_info.term < leader_info.term || node_info.last_committed_index < leader_info.last_committed_index) + { + auto & leader_waiter = leader_waiters[leader_info]; + leader_waiter.insert(leader_waiter.end(), requests_for_sessions.begin(), requests_for_sessions.end()); + LOG_TRACE(log, "waiting for term {}, idx {}", leader_info.term, leader_info.last_committed_index); + } + /// process it in background thread + else if (!read_requests_queue.push(std::move(requests_for_sessions))) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Cannot push read requests to queue"); + }); + } + } + else + { + assert(coordination_settings->read_mode.toString() == "nonlinear"); + if (!read_requests_queue.push(std::move(read_requests))) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Cannot push read requests to queue"); + } + + read_requests.clear(); +} + void KeeperDispatcher::responseThread() { setThreadName("KeeperRspT"); @@ -200,6 +303,65 @@ void KeeperDispatcher::snapshotThread() } } +/// Background thread for processing read requests +void KeeperDispatcher::readRequestThread() +{ + setThreadName("KeeperReadT"); + while (!shutdown_called) + { + KeeperStorage::RequestsForSessions requests; + if (!read_requests_queue.pop(requests)) + break; + + if (shutdown_called) + break; + + try + { + for (const auto & request_info : requests) + { + if (server->isLeaderAlive()) + server->putLocalReadRequest(request_info); + else + addErrorResponses({request_info}, Coordination::Error::ZCONNECTIONLOSS); + } + + if (!finalize_requests_queue.push(std::move(requests))) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Cannot push read requests to queue"); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } +} + +/// We finalize requests every time we commit a single log with request +/// or process a batch of read requests. +/// Because it can get heavy, we do it in background thread. 
+void KeeperDispatcher::finalizeRequestsThread() +{ + setThreadName("KeeperFinalT"); + while (!shutdown_called) + { + KeeperStorage::RequestsForSessions requests; + if (!finalize_requests_queue.pop(requests)) + break; + + if (shutdown_called) + break; + + try + { + finalizeRequests(requests); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } +} + void KeeperDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response) { std::lock_guard lock(session_to_response_callback_mutex); @@ -255,6 +417,30 @@ bool KeeperDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & requ request_info.time = duration_cast(system_clock::now().time_since_epoch()).count(); request_info.session_id = session_id; + { + std::lock_guard lock{unprocessed_request_mutex}; + auto unprocessed_requests_it = unprocessed_requests_for_session.find(session_id); + if (unprocessed_requests_it == unprocessed_requests_for_session.end()) + { + auto & unprocessed_requests = unprocessed_requests_for_session[session_id]; + unprocessed_requests.unprocessed_num = 1; + unprocessed_requests.is_read = request->isReadRequest(); + } + else + { + auto & unprocessed_requests = unprocessed_requests_it->second; + + /// queue is not empty, or the request types don't match, put it in the waiting queue + if (!unprocessed_requests.request_queue.empty() || unprocessed_requests.is_read != request->isReadRequest()) + { + unprocessed_requests.request_queue.push_back(std::move(request_info)); + return true; + } + + ++unprocessed_requests.unprocessed_num; + } + } + std::lock_guard lock(push_request_mutex); if (shutdown_called) @@ -263,10 +449,10 @@ bool KeeperDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & requ /// Put close requests without timeouts if (request->getOpNum() == Coordination::OpNum::Close) { - if (!requests_queue->push(std::move(request_info))) + if (!active_requests_queue->push(std::move(request_info))) throw Exception("Cannot push request to queue", ErrorCodes::SYSTEM_ERROR); } - else if (!requests_queue->tryPush(std::move(request_info), configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds())) + else if (!active_requests_queue->tryPush(std::move(request_info), configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds())) { throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); } @@ -279,13 +465,23 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf LOG_DEBUG(log, "Initializing storage dispatcher"); configuration_and_settings = KeeperConfigurationAndSettings::loadFromConfig(config, standalone_keeper); - requests_queue = std::make_unique(configuration_and_settings->coordination_settings->max_requests_batch_size); + active_requests_queue = std::make_unique(configuration_and_settings->coordination_settings->max_requests_batch_size); request_thread = ThreadFromGlobalPool([this] { requestThread(); }); responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); + read_request_thread = ThreadFromGlobalPool([this] { readRequestThread(); }); + finalize_requests_thread = ThreadFromGlobalPool([this] { finalizeRequestsThread(); }); - server = std::make_unique(configuration_and_settings, config, responses_queue, snapshots_queue); + server = std::make_unique( + configuration_and_settings, + config, + responses_queue, + snapshots_queue, + 
[this](const KeeperStorage::RequestForSession & request_for_session, uint64_t log_term, uint64_t log_idx) + { onRequestCommit(request_for_session, log_term, log_idx); }, + [this](uint64_t term, uint64_t last_idx) + { onApplySnapshot(term, last_idx); }); try { @@ -333,9 +529,9 @@ void KeeperDispatcher::shutdown() if (session_cleaner_thread.joinable()) session_cleaner_thread.join(); - if (requests_queue) + if (active_requests_queue) { - requests_queue->finish(); + active_requests_queue->finish(); if (request_thread.joinable()) request_thread.join(); @@ -349,18 +545,23 @@ void KeeperDispatcher::shutdown() if (snapshot_thread.joinable()) snapshot_thread.join(); + read_requests_queue.finish(); + if (read_request_thread.joinable()) + read_request_thread.join(); + + finalize_requests_queue.finish(); + if (finalize_requests_thread.joinable()) + finalize_requests_thread.join(); + update_configuration_queue.finish(); if (update_configuration_thread.joinable()) update_configuration_thread.join(); } - if (server) - server->shutdown(); - KeeperStorage::RequestForSession request_for_session; /// Set session expired for all pending requests - while (requests_queue && requests_queue->tryPop(request_for_session)) + while (active_requests_queue && active_requests_queue->tryPop(request_for_session)) { CurrentMetrics::sub(CurrentMetrics::KeeperOutstandingRequets); auto response = request_for_session.request->makeResponse(); @@ -368,10 +569,58 @@ void KeeperDispatcher::shutdown() setResponse(request_for_session.session_id, response); } - /// Clear all registered sessions - std::lock_guard lock(session_to_response_callback_mutex); - session_to_response_callback.clear(); + KeeperStorage::RequestsForSessions close_requests; + { + /// Clear all registered sessions + std::lock_guard lock(session_to_response_callback_mutex); + + if (hasLeader()) + { + close_requests.reserve(session_to_response_callback.size()); + // send to leader CLOSE requests for active sessions + for (const auto & [session, response] : session_to_response_callback) + { + auto request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); + request->xid = Coordination::CLOSE_XID; + using namespace std::chrono; + KeeperStorage::RequestForSession request_info + { + .session_id = session, + .time = duration_cast(system_clock::now().time_since_epoch()).count(), + .request = std::move(request), + }; + + close_requests.push_back(std::move(request_info)); + } + } + + session_to_response_callback.clear(); + } + + // if there is no leader, there is no reason to do CLOSE because it's a write request + if (hasLeader() && !close_requests.empty()) + { + LOG_INFO(log, "Trying to close {} session(s)", close_requests.size()); + const auto raft_result = server->putRequestBatch(close_requests); + auto sessions_closing_done_promise = std::make_shared>(); + auto sessions_closing_done = sessions_closing_done_promise->get_future(); + raft_result->when_ready([sessions_closing_done_promise = std::move(sessions_closing_done_promise)]( + nuraft::cmd_result> & /*result*/, + nuraft::ptr & /*exception*/) { sessions_closing_done_promise->set_value(); }); + + auto session_shutdown_timeout = configuration_and_settings->coordination_settings->session_shutdown_timeout.totalMilliseconds(); + if (sessions_closing_done.wait_for(std::chrono::milliseconds(session_shutdown_timeout)) != std::future_status::ready) + LOG_WARNING( + log, + "Failed to close sessions in {}ms. 
If they are not closed, they will be closed after session timeout.", + session_shutdown_timeout); + } + + if (server) + server->shutdown(); + CurrentMetrics::set(CurrentMetrics::KeeperAliveConnections, 0); + } catch (...) { @@ -418,16 +667,18 @@ void KeeperDispatcher::sessionCleanerTask() LOG_INFO(log, "Found dead session {}, will try to close it", dead_session); /// Close session == send close request to raft server - Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); + auto request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); request->xid = Coordination::CLOSE_XID; - KeeperStorage::RequestForSession request_info; - request_info.request = request; using namespace std::chrono; - request_info.time = duration_cast(system_clock::now().time_since_epoch()).count(); - request_info.session_id = dead_session; + KeeperStorage::RequestForSession request_info + { + .session_id = dead_session, + .time = duration_cast(system_clock::now().time_since_epoch()).count(), + .request = std::move(request), + }; { std::lock_guard lock(push_request_mutex); - if (!requests_queue->push(std::move(request_info))) + if (!active_requests_queue->push(std::move(request_info))) LOG_INFO(log, "Cannot push close request to queue while cleaning outdated sessions"); CurrentMetrics::add(CurrentMetrics::KeeperOutstandingRequets); } @@ -477,19 +728,12 @@ void KeeperDispatcher::addErrorResponses(const KeeperStorage::RequestsForSession } } -void KeeperDispatcher::forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions) +void KeeperDispatcher::forceWaitAndProcessResult(RaftResult & result) { if (!result->has_result()) result->get(); - /// If we get some errors, than send them to clients - if (!result->get_accepted() || result->get_result_code() == nuraft::cmd_result_code::TIMEOUT) - addErrorResponses(requests_for_sessions, Coordination::Error::ZOPERATIONTIMEOUT); - else if (result->get_result_code() != nuraft::cmd_result_code::OK) - addErrorResponses(requests_for_sessions, Coordination::Error::ZCONNECTIONLOSS); - result = nullptr; - requests_for_sessions.clear(); } int64_t KeeperDispatcher::getSessionID(int64_t session_timeout_ms) @@ -537,7 +781,7 @@ int64_t KeeperDispatcher::getSessionID(int64_t session_timeout_ms) /// Push new session request to queue { std::lock_guard lock(push_request_mutex); - if (!requests_queue->tryPush(std::move(request_info), session_timeout_ms)) + if (!active_requests_queue->tryPush(std::move(request_info), session_timeout_ms)) throw Exception("Cannot push session id request to queue within session timeout", ErrorCodes::TIMEOUT_EXCEEDED); CurrentMetrics::add(CurrentMetrics::KeeperOutstandingRequets); } @@ -610,6 +854,122 @@ void KeeperDispatcher::updateConfigurationThread() } } +// Used to update the state for a session based on the requests +// - update the number of current unprocessed requests for the session +// - if the number of unprocessed requests is 0, we can start adding next type of requests +// from unprocessed requests queue to the active queue +void KeeperDispatcher::finalizeRequests(const KeeperStorage::RequestsForSessions & requests_for_sessions) +{ + std::unordered_map counts_for_session; + + for (const auto & request_for_session : requests_for_sessions) + { + ++counts_for_session[request_for_session.session_id]; + } + + std::lock_guard lock{unprocessed_request_mutex}; + for (const auto [session_id, count] : 
counts_for_session) + { + auto unprocessed_requests_it = unprocessed_requests_for_session.find(session_id); + if (unprocessed_requests_it == unprocessed_requests_for_session.end()) + continue; + + auto & unprocessed_requests = unprocessed_requests_it->second; + unprocessed_requests.unprocessed_num -= count; + + if (unprocessed_requests.unprocessed_num == 0) + { + if (!unprocessed_requests.request_queue.empty()) + { + auto & unprocessed_requests_queue = unprocessed_requests.request_queue; + unprocessed_requests.is_read = !unprocessed_requests.is_read; + // start adding next type of requests + while (!unprocessed_requests_queue.empty() && unprocessed_requests_queue.front().request->isReadRequest() == unprocessed_requests.is_read) + { + auto & front_request = unprocessed_requests_queue.front(); + + /// Put close requests without timeouts + if (front_request.request->getOpNum() == Coordination::OpNum::Close) + { + if (!active_requests_queue->push(std::move(front_request))) + throw Exception("Cannot push request to queue", ErrorCodes::SYSTEM_ERROR); + } + else if (!active_requests_queue->tryPush(std::move(front_request), configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds())) + { + throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); + } + + ++unprocessed_requests.unprocessed_num; + unprocessed_requests_queue.pop_front(); + } + } + else + { + unprocessed_requests_for_session.erase(unprocessed_requests_it); + } + } + } +} + +// Finalize request +// Process read requests that were waiting for this commit +void KeeperDispatcher::onRequestCommit(const KeeperStorage::RequestForSession & request_for_session, uint64_t log_term, uint64_t log_idx) +{ + if (!finalize_requests_queue.push({request_for_session})) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Cannot push read requests to queue"); + + KeeperStorage::RequestsForSessions requests; + { + std::lock_guard lock(leader_waiter_mutex); + auto request_queue_it = leader_waiters.find(KeeperServer::NodeInfo{.term = log_term, .last_committed_index = log_idx}); + if (request_queue_it != leader_waiters.end()) + { + requests = std::move(request_queue_it->second); + leader_waiters.erase(request_queue_it); + } + } + + if (requests.empty()) + return; + + if (!read_requests_queue.push(std::move(requests))) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Cannot push read requests to queue"); +} + +/// Process all read request that are waiting for lower or currently last processed log index +void KeeperDispatcher::onApplySnapshot(uint64_t term, uint64_t last_idx) +{ + KeeperServer::NodeInfo current_node_info{term, last_idx}; + KeeperStorage::RequestsForSessions requests; + { + std::lock_guard lock(leader_waiter_mutex); + for (auto leader_waiter_it = leader_waiters.begin(); leader_waiter_it != leader_waiters.end();) + { + auto waiting_node_info = leader_waiter_it->first; + if (waiting_node_info.term <= current_node_info.term + && waiting_node_info.last_committed_index <= current_node_info.last_committed_index) + { + for (auto & request : leader_waiter_it->second) + { + requests.push_back(std::move(request)); + } + + leader_waiter_it = leader_waiters.erase(leader_waiter_it); + } + else + { + ++leader_waiter_it; + } + } + } + + if (requests.empty()) + return; + + if (!read_requests_queue.push(std::move(requests))) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Cannot push read requests to queue"); +} + bool KeeperDispatcher::isServerActive() const { return checkInit() && hasLeader() && 
!server->isRecovering(); @@ -674,7 +1034,7 @@ Keeper4LWInfo KeeperDispatcher::getKeeper4LWInfo() const Keeper4LWInfo result = server->getPartiallyFilled4LWInfo(); { std::lock_guard lock(push_request_mutex); - result.outstanding_requests_count = requests_queue->size(); + result.outstanding_requests_count = active_requests_queue->size(); } { std::lock_guard lock(session_to_response_callback_mutex); diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index 5e2701299f4..6421db87793 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -32,9 +32,12 @@ private: using UpdateConfigurationQueue = ConcurrentBoundedQueue; /// Size depends on coordination settings - std::unique_ptr requests_queue; + /// Request currently being processed + std::unique_ptr active_requests_queue; ResponsesQueue responses_queue; SnapshotsQueue snapshots_queue{1}; + ConcurrentBoundedQueue read_requests_queue; + ConcurrentBoundedQueue finalize_requests_queue; /// More than 1k updates is definitely misconfiguration. UpdateConfigurationQueue update_configuration_queue{1000}; @@ -64,6 +67,8 @@ private: ThreadFromGlobalPool snapshot_thread; /// Apply or wait for configuration changes ThreadFromGlobalPool update_configuration_thread; + ThreadFromGlobalPool read_request_thread; + ThreadFromGlobalPool finalize_requests_thread; /// RAFT wrapper. std::unique_ptr server; @@ -77,6 +82,34 @@ private: /// Counter for new session_id requests. std::atomic internal_session_id_counter{0}; + /// A read request needs to have at least the log it was the last committed log on the leader + /// at the time the request was being made. + /// If the node is stale, we need to wait to commit that log before doing local read requests to achieve + /// linearizability. + std::unordered_map leader_waiters; + std::mutex leader_waiter_mutex; + + /// We can be actively processing one type of requests (either read or write) from a single session. + /// If we receive a request of a type that is not currently being processed, we put it in the waiting queue. + /// Also, we want to process them in ariving order, so if we have a different type in the queue, we cannot process that request + /// but wait for all the previous requests to finish. + /// E.g. READ -> WRITE -> READ, the last READ will go to the waiting queue even though we are currently processing the first READ + /// because we have WRITE request before it that needs to be processed. + struct UnprocessedRequests + { + /// how many requests are currently in the active request queue + size_t unprocessed_num{0}; + /// is_read currently being processed + bool is_read{false}; + std::list request_queue; + }; + + // Called every time a batch of requests are processed. + void finalizeRequests(const KeeperStorage::RequestsForSessions & requests_for_sessions); + + std::unordered_map unprocessed_requests_for_session; + std::mutex unprocessed_request_mutex; + /// Thread put requests to raft void requestThread(); /// Thread put responses for subscribed sessions @@ -88,6 +121,12 @@ private: /// Thread apply or wait configuration changes from leader void updateConfigurationThread(); + void readRequestThread(); + + void finalizeRequestsThread(); + + void processReadRequests(const CoordinationSettingsPtr & coordination_settings, KeeperStorage::RequestsForSessions & read_requests); + void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response); /// Add error responses for requests to responses queue. 
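The per-session bookkeeping described above (the UnprocessedRequests struct) enforces that a session never has reads and writes in flight at the same time, and that parked requests are released in arrival order. A simplified sketch of that admission rule, under the assumption of plain std containers instead of the dispatcher's internal types:

#include <cstddef>
#include <deque>

/// A request is represented only by whether it is a read; the real code keeps the full request.
struct SessionState
{
    size_t in_flight = 0;        /// requests of the active type currently being processed
    bool active_is_read = false; /// request type currently allowed to run
    std::deque<bool> waiting;    /// requests parked until the active type drains
};

/// Returns true when the request may enter the active queue now,
/// false when it has to wait until all in-flight requests of the other type finish.
bool tryAdmit(SessionState & session, bool is_read)
{
    if (session.in_flight == 0 && session.waiting.empty())
    {
        /// Nothing in flight for this session: this request defines the active type.
        session.active_is_read = is_read;
        ++session.in_flight;
        return true;
    }
    if (session.waiting.empty() && session.active_is_read == is_read)
    {
        /// Same type as what is already running and nothing parked before it.
        ++session.in_flight;
        return true;
    }
    /// Different type, or earlier requests are already parked: preserve arrival order.
    session.waiting.push_back(is_read);
    return false;
}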
@@ -96,7 +135,7 @@ private: /// Forcefully wait for result and sets errors if something when wrong. /// Clears both arguments - void forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions); + static void forceWaitAndProcessResult(RaftResult & result); public: /// Just allocate some objects, real initialization is done by `intialize method` @@ -116,6 +155,12 @@ public: return server && server->checkInit(); } + /// Called when a single log with request is committed. + void onRequestCommit(const KeeperStorage::RequestForSession & request_for_session, uint64_t log_term, uint64_t log_idx); + + /// Called when a snapshot is applied + void onApplySnapshot(uint64_t term, uint64_t last_idx); + /// Is server accepting requests, i.e. connected to the cluster /// and achieved quorum bool isServerActive() const; diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 8186ddd0c00..b708c5a51ba 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -105,7 +105,9 @@ KeeperServer::KeeperServer( const KeeperConfigurationAndSettingsPtr & configuration_and_settings_, const Poco::Util::AbstractConfiguration & config, ResponsesQueue & responses_queue_, - SnapshotsQueue & snapshots_queue_) + SnapshotsQueue & snapshots_queue_, + KeeperStateMachine::CommitCallback commit_callback, + KeeperStateMachine::ApplySnapshotCallback apply_snapshot_callback) : server_id(configuration_and_settings_->server_id) , coordination_settings(configuration_and_settings_->coordination_settings) , log(&Poco::Logger::get("KeeperServer")) @@ -113,7 +115,7 @@ KeeperServer::KeeperServer( , keeper_context{std::make_shared()} , create_snapshot_on_exit(config.getBool("keeper_server.create_snapshot_on_exit", true)) { - if (coordination_settings->quorum_reads) + if (coordination_settings->quorum_reads || coordination_settings->read_mode.toString() == "quorum") LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); keeper_context->digest_enabled = config.getBool("keeper_server.digest_enabled", false); @@ -125,7 +127,9 @@ KeeperServer::KeeperServer( configuration_and_settings_->snapshot_storage_path, coordination_settings, keeper_context, - checkAndGetSuperdigest(configuration_and_settings_->super_digest)); + checkAndGetSuperdigest(configuration_and_settings_->super_digest), + std::move(commit_callback), + std::move(apply_snapshot_callback)); state_manager = nuraft::cs_new( server_id, @@ -176,6 +180,13 @@ struct KeeperServer::KeeperRaftServer : public nuraft::raft_server reconfigure(new_config); } + RaftResult getLeaderInfo() + { + nuraft::ptr req + = nuraft::cs_new(0ull, nuraft::msg_type::leader_status_request, 0, 0, 0ull, 0ull, 0ull); + return send_msg_to_leader(req); + } + void commit_in_bg() override { // For NuRaft, if any commit fails (uncaught exception) the whole server aborts as a safety @@ -269,6 +280,20 @@ void KeeperServer::launchRaftServer(const Poco::Util::AbstractConfiguration & co coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(), "election_timeout_lower_bound_ms", log); params.election_timeout_upper_bound_ = getValueOrMaxInt32AndLogWarning( coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(), "election_timeout_upper_bound_ms", log); + + params.leadership_expiry_ = getValueOrMaxInt32AndLogWarning(coordination_settings->leadership_expiry.totalMilliseconds(), "leadership_expiry", log); + + if (coordination_settings->read_mode.toString() == 
"fastlinear") + { + if (params.leadership_expiry_ == 0) + params.leadership_expiry_ = params.election_timeout_lower_bound_; + else if (params.leadership_expiry_ > params.election_timeout_lower_bound_) + { + LOG_WARNING(log, "To use fast linearizable reads, leadership_expiry should be set to a value that is less or equal to the election_timeout_upper_bound_ms. " + "Based on current settings, there are no guarantees for linearizability of reads."); + } + } + params.reserved_log_items_ = getValueOrMaxInt32AndLogWarning(coordination_settings->reserved_log_items, "reserved_log_items", log); params.snapshot_distance_ = getValueOrMaxInt32AndLogWarning(coordination_settings->snapshot_distance, "snapshot_distance", log); @@ -487,7 +512,7 @@ void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession & state_machine->processReadRequest(request_for_session); } -RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions) +RaftResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions) { std::vector> entries; for (const auto & request_for_session : requests_for_sessions) @@ -713,6 +738,20 @@ std::vector KeeperServer::getDeadSessions() return state_machine->getDeadSessions(); } +RaftResult KeeperServer::getLeaderInfo() +{ + std::lock_guard lock{server_write_mutex}; + if (is_recovering) + return nullptr; + + return raft_instance->getLeaderInfo(); +} + +KeeperServer::NodeInfo KeeperServer::getNodeInfo() +{ + return { .term = raft_instance->get_term(), .last_committed_index = state_machine->last_commit_index() }; +} + ConfigUpdateActions KeeperServer::getConfigurationDiff(const Poco::Util::AbstractConfiguration & config) { auto diff = state_manager->getConfigurationDiff(config); diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 6873ef2a01e..02ab643044a 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -14,7 +14,7 @@ namespace DB { -using RaftAppendResult = nuraft::ptr>>; +using RaftResult = nuraft::ptr>>; class KeeperServer { @@ -71,7 +71,9 @@ public: const KeeperConfigurationAndSettingsPtr & settings_, const Poco::Util::AbstractConfiguration & config_, ResponsesQueue & responses_queue_, - SnapshotsQueue & snapshots_queue_); + SnapshotsQueue & snapshots_queue_, + KeeperStateMachine::CommitCallback commit_callback, + KeeperStateMachine::ApplySnapshotCallback apply_snapshot_callback); /// Load state machine from the latest snapshot and load log storage. Start NuRaft with required settings. void startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6 = true); @@ -84,7 +86,7 @@ public: /// Put batch of requests into Raft and get result of put. Responses will be set separately into /// responses_queue. 
- RaftAppendResult putRequestBatch(const KeeperStorage::RequestsForSessions & requests); + RaftResult putRequestBatch(const KeeperStorage::RequestsForSessions & requests); /// Return set of the non-active sessions std::vector getDeadSessions(); @@ -119,6 +121,17 @@ public: int getServerID() const { return server_id; } + struct NodeInfo + { + uint64_t term; + uint64_t last_committed_index; + + bool operator==(const NodeInfo &) const = default; + }; + + RaftResult getLeaderInfo(); + NodeInfo getNodeInfo(); + /// Get configuration diff between current configuration in RAFT and in XML file ConfigUpdateActions getConfigurationDiff(const Poco::Util::AbstractConfiguration & config); @@ -126,10 +139,23 @@ public: /// Synchronously check for update results with retries. void applyConfigurationUpdate(const ConfigUpdateAction & task); - /// Wait configuration update for action. Used by followers. /// Return true if update was successfully received. bool waitConfigurationUpdate(const ConfigUpdateAction & task); }; } +namespace std +{ + template <> + struct hash + { + size_t operator()(const DB::KeeperServer::NodeInfo & info) const + { + SipHash hash_state; + hash_state.update(info.term); + hash_state.update(info.last_committed_index); + return hash_state.get64(); + } + }; +} diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index c5a66ce29ca..477d8104796 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -44,7 +44,9 @@ KeeperStateMachine::KeeperStateMachine( const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_, const KeeperContextPtr & keeper_context_, - const std::string & superdigest_) + const std::string & superdigest_, + CommitCallback commit_callback_, + ApplySnapshotCallback apply_snapshot_callback_) : coordination_settings(coordination_settings_) , snapshot_manager( snapshots_path_, @@ -58,6 +60,8 @@ KeeperStateMachine::KeeperStateMachine( , last_committed_idx(0) , log(&Poco::Logger::get("KeeperStateMachine")) , superdigest(superdigest_) + , commit_callback(std::move(commit_callback_)) + , apply_snapshot_callback(std::move(apply_snapshot_callback_)) , keeper_context(keeper_context_) { } @@ -223,11 +227,11 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req return true; } -nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data) +nuraft::ptr KeeperStateMachine::commit_ext(const ext_op_params & params) { - auto request_for_session = parseRequest(data); + auto request_for_session = parseRequest(*params.data); if (!request_for_session.zxid) - request_for_session.zxid = log_idx; + request_for_session.zxid = params.log_idx; /// Special processing of session_id request if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID) @@ -272,8 +276,9 @@ nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, n assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true); } + last_committed_idx = params.log_idx; + commit_callback(request_for_session, params.log_term, params.log_idx); ProfileEvents::increment(ProfileEvents::KeeperCommits); - last_committed_idx = log_idx; return nullptr; } @@ -306,6 +311,7 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) ProfileEvents::increment(ProfileEvents::KeeperSnapshotApplys); last_committed_idx = s.get_last_log_idx(); + apply_snapshot_callback(s.get_last_log_term(), s.get_last_log_idx()); 
return true; } @@ -320,6 +326,10 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data) { auto request_for_session = parseRequest(data); + + if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID) + return; + // If we received a log from an older node, use the log_idx as the zxid // log_idx will always be larger or equal to the zxid so we can safely do this // (log_idx is increased for all logs, while zxid is only increased for requests) diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h index fbd4fdc5ac2..f44dfd503b0 100644 --- a/src/Coordination/KeeperStateMachine.h +++ b/src/Coordination/KeeperStateMachine.h @@ -20,13 +20,18 @@ using SnapshotsQueue = ConcurrentBoundedQueue; class KeeperStateMachine : public nuraft::state_machine { public: + using CommitCallback = std::function; + using ApplySnapshotCallback = std::function; + KeeperStateMachine( ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_, const KeeperContextPtr & keeper_context_, - const std::string & superdigest_ = ""); + const std::string & superdigest_ = "", + CommitCallback commit_callback_ = [](const KeeperStorage::RequestForSession &, uint64_t, uint64_t){}, + ApplySnapshotCallback apply_snapshot_callback_ = [](uint64_t, uint64_t){}); /// Read state from the latest snapshot void init(); @@ -37,7 +42,7 @@ public: nuraft::ptr pre_commit(uint64_t log_idx, nuraft::buffer & data) override; - nuraft::ptr commit(const uint64_t log_idx, nuraft::buffer & data) override; /// NOLINT + nuraft::ptr commit_ext(const ext_op_params & params) override; /// NOLINT /// Save new cluster config to our snapshot (copy of the config stored in StateManager) void commit_config(const uint64_t log_idx, nuraft::ptr & new_conf) override; /// NOLINT @@ -145,6 +150,11 @@ private: /// Special part of ACL system -- superdigest specified in server config. 
const std::string superdigest; + /// call when a request is committed + const CommitCallback commit_callback; + /// call when snapshot is applied + const ApplySnapshotCallback apply_snapshot_callback; + KeeperContextPtr keeper_context; }; diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index fa6bfca7c7a..b98fd0e56e8 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1330,8 +1330,9 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint changelog.append(entry); changelog.end_of_append_batch(0, 0); - state_machine->pre_commit(i, changelog.entry_at(i)->get_buf()); - state_machine->commit(i, changelog.entry_at(i)->get_buf()); + auto entry_buf = changelog.entry_at(i)->get_buf_ptr(); + state_machine->pre_commit(i, *entry_buf); + state_machine->commit_ext(nuraft::state_machine::ext_op_params{i, entry_buf}); bool snapshot_created = false; if (i % settings->snapshot_distance == 0) { @@ -1375,8 +1376,9 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint for (size_t i = restore_machine->last_commit_index() + 1; i < restore_changelog.next_slot(); ++i) { - restore_machine->pre_commit(i, changelog.entry_at(i)->get_buf()); - restore_machine->commit(i, changelog.entry_at(i)->get_buf()); + auto entry = changelog.entry_at(i)->get_buf_ptr(); + restore_machine->pre_commit(i, *entry); + restore_machine->commit_ext(nuraft::state_machine::ext_op_params{i, entry}); } auto & source_storage = state_machine->getStorage(); @@ -1477,18 +1479,18 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove) std::shared_ptr request_c = std::make_shared(); request_c->path = "/hello"; request_c->is_ephemeral = true; - auto entry_c = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), request_c); - state_machine->pre_commit(1, entry_c->get_buf()); - state_machine->commit(1, entry_c->get_buf()); + auto entry_c = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), request_c)->get_buf_ptr(); + state_machine->pre_commit(1, *entry_c); + state_machine->commit_ext(nuraft::state_machine::ext_op_params{1, entry_c}); const auto & storage = state_machine->getStorage(); EXPECT_EQ(storage.ephemerals.size(), 1); std::shared_ptr request_d = std::make_shared(); request_d->path = "/hello"; /// Delete from other session - auto entry_d = getLogEntryFromZKRequest(0, 2, state_machine->getNextZxid(), request_d); - state_machine->pre_commit(2, entry_d->get_buf()); - state_machine->commit(2, entry_d->get_buf()); + auto entry_d = getLogEntryFromZKRequest(0, 2, state_machine->getNextZxid(), request_d)->get_buf_ptr(); + state_machine->pre_commit(2, *entry_d); + state_machine->commit_ext(nuraft::state_machine::ext_op_params{2, entry_d}); EXPECT_EQ(storage.ephemerals.size(), 0); } diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp index b7a33c4930d..29cd3c1c540 100644 --- a/src/Core/BackgroundSchedulePool.cpp +++ b/src/Core/BackgroundSchedulePool.cpp @@ -149,9 +149,9 @@ BackgroundSchedulePool::BackgroundSchedulePool(size_t size_, CurrentMetrics::Met threads.resize(size_); for (auto & thread : threads) - thread = ThreadFromGlobalPool([this] { threadFunction(); }); + thread = ThreadFromGlobalPoolNoTracingContextPropagation([this] { threadFunction(); }); - delayed_thread = ThreadFromGlobalPool([this] { delayExecutionThreadFunction(); }); + delayed_thread = ThreadFromGlobalPoolNoTracingContextPropagation([this] { 
delayExecutionThreadFunction(); }); } @@ -168,7 +168,7 @@ void BackgroundSchedulePool::increaseThreadsCount(size_t new_threads_count) threads.resize(new_threads_count); for (size_t i = old_threads_count; i < new_threads_count; ++i) - threads[i] = ThreadFromGlobalPool([this] { threadFunction(); }); + threads[i] = ThreadFromGlobalPoolNoTracingContextPropagation([this] { threadFunction(); }); } diff --git a/src/Core/BackgroundSchedulePool.h b/src/Core/BackgroundSchedulePool.h index 36cbad145c9..1001d98e643 100644 --- a/src/Core/BackgroundSchedulePool.h +++ b/src/Core/BackgroundSchedulePool.h @@ -57,7 +57,9 @@ public: ~BackgroundSchedulePool(); private: - using Threads = std::vector; + /// BackgroundSchedulePool schedules a task on its own task queue, there's no need to construct/restore tracing context on this level. + /// This is also how ThreadPool class treats the tracing context. See ThreadPool for more information. + using Threads = std::vector; void threadFunction(); void delayExecutionThreadFunction(); @@ -83,7 +85,7 @@ private: std::condition_variable delayed_tasks_cond_var; std::mutex delayed_tasks_mutex; /// Thread waiting for next delayed task. - ThreadFromGlobalPool delayed_thread; + ThreadFromGlobalPoolNoTracingContextPropagation delayed_thread; /// Tasks ordered by scheduled time. DelayedTasks delayed_tasks; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 47c86295a34..86fccf45a8d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -481,7 +481,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \ M(Bool, optimize_multiif_to_if, true, "Replace 'multiIf' with only one condition to 'if'.", 0) \ M(Bool, optimize_if_transform_strings_to_enum, false, "Replaces string-type arguments in If and Transform to enum. Disabled by default cause it could make inconsistent change in distributed query that would lead to its fail.", 0) \ - M(Bool, optimize_monotonous_functions_in_order_by, true, "Replace monotonous function with its argument in ORDER BY", 0) \ + M(Bool, optimize_monotonous_functions_in_order_by, false, "Replace monotonous function with its argument in ORDER BY", 0) \ M(Bool, optimize_functions_to_subcolumns, false, "Transform functions to subcolumns, if possible, to reduce amount of read data. E.g. 'length(arr)' -> 'arr.size0', 'col IS NULL' -> 'col.null' ", 0) \ M(Bool, optimize_using_constraints, false, "Use constraints for query optimization", 0) \ M(Bool, optimize_substitute_columns, false, "Use constraints for column substitution", 0) \ @@ -527,7 +527,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, describe_extend_object_types, false, "Deduce concrete type of columns of type Object in DESCRIBE query", 0) \ M(Bool, describe_include_subcolumns, false, "If true, subcolumns of all table columns will be included into result of DESCRIBE query", 0) \ \ - M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ + M(Bool, optimize_rewrite_sum_if_to_count_if, false, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. 
Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ \ M(Bool, collect_hash_table_stats_during_aggregation, true, "Enable collecting hash table statistics to optimize memory allocation", 0) \ @@ -618,6 +618,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, allow_deprecated_database_ordinary, false, "Allow to create databases with deprecated Ordinary engine", 0) \ M(Bool, allow_deprecated_syntax_for_merge_tree, false, "Allow to create *MergeTree tables with deprecated engine definition syntax", 0) \ \ + M(Bool, force_grouping_standard_compatibility, true, "Make GROUPING function to return 1 when argument is not used as an aggregation key", 0) \ + \ M(Bool, schema_inference_use_cache_for_file, true, "Use cache in schema inference while using file table function", 0) \ M(Bool, schema_inference_use_cache_for_s3, true, "Use cache in schema inference while using s3 table function", 0) \ M(Bool, schema_inference_use_cache_for_hdfs, true, "Use cache in schema inference while using hdfs table function", 0) \ @@ -775,6 +777,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ M(Float, input_format_allow_errors_ratio, 0, "Maximum relative amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ + M(String, input_format_record_errors_file_path, "", "Path of the file used to record errors while reading text formats (CSV, TSV).", 0) \ + M(String, errors_output_format, "CSV", "Method to write Errors to text output.", 0) \ \ M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index be2def2c01a..b78b812da86 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -78,6 +78,7 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"22.9", {{"force_grouping_standard_compatibility", false, true, "Make GROUPING function output the same as in SQL standard and other DBMS"}}}, {"22.7", {{"cross_to_inner_join_rewrite", 1, 2, "Force rewrite comma join to inner"}, {"enable_positional_arguments", false, true, "Enable positional arguments feature by default"}, {"format_csv_allow_single_quotes", true, false, "Most tools don't treat single quote in CSV specially, don't do it by default too"}}}, diff --git a/src/Core/examples/coro.cpp b/src/Core/examples/coro.cpp index 370820a228d..fbccc261e9d 100644 --- a/src/Core/examples/coro.cpp +++ b/src/Core/examples/coro.cpp @@ -14,7 +14,7 @@ namespace std // NOLINT(cert-dcl58-cpp) { - using namespace experimental::coroutines_v1; + using namespace experimental::coroutines_v1; // NOLINT(cert-dcl58-cpp) } #if __has_warning("-Wdeprecated-experimental-coroutine") diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 3cf557ec5bf..c14b9b579ea 100644 
--- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -159,6 +160,16 @@ void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_st } } +void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block) +{ + if (!storage_snapshot->object_columns.empty()) + { + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + auto storage_columns = storage_snapshot->getColumns(options); + convertObjectsToTuples(block, storage_columns); + } +} + static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts) { if (prefix.size() > parts.size()) diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 2dde0ed3e65..c60d5bec208 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -11,6 +11,9 @@ namespace DB { +struct StorageSnapshot; +using StorageSnapshotPtr = std::shared_ptr; + /// Returns number of dimensions in Array type. 0 if type is not array. size_t getNumberOfDimensions(const IDataType & type); @@ -38,6 +41,7 @@ DataTypePtr getDataTypeByColumn(const IColumn & column); /// Converts Object types and columns to Tuples in @columns_list and @block /// and checks that types are consistent with types in @extended_storage_columns. void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_storage_columns); +void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); /// Checks that each path is not the prefix of any other path. void checkObjectHasNoAmbiguosPaths(const PathsInData & paths); @@ -164,27 +168,24 @@ ColumnsDescription getObjectColumns( const ColumnsDescription & storage_columns, EntryColumnsGetter && entry_columns_getter) { - ColumnsDescription res; - - if (begin == end) - { - for (const auto & column : storage_columns) - { - if (isObject(column.type)) - { - auto tuple_type = std::make_shared( - DataTypes{std::make_shared()}, - Names{ColumnObject::COLUMN_NAME_DUMMY}); - - res.add({column.name, std::move(tuple_type)}); - } - } - - return res; - } - std::unordered_map types_in_entries; + /// Add dummy column for all Object columns + /// to not lose any column if it's missing + /// in all entries. If it exists in any entry + /// dummy column will be removed. 
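The `getObjectColumns` rewrite that follows replaces the empty-range special case with one pass: every `Object` column is seeded with a dummy candidate type, the entries then add their own candidates, and each column gets the least common type of its candidates. A toy sketch of that collect-then-reduce shape, with plain strings and a deliberately naive `leastCommonType` standing in for the real type merge:

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

/// Toy reduction standing in for getLeastCommonTypeForObject(): prefer any real
/// candidate over the dummy (the real code merges the paths of all candidates).
std::string leastCommonType(const std::vector<std::string> & candidates)
{
    std::string result = candidates.front();   /// the dummy seeded first
    for (const auto & type : candidates)
        if (type != "Tuple(_dummy UInt8)")
            result = type;                     /// toy rule: last real candidate wins
    return result;
}

int main()
{
    const std::vector<std::string> object_columns = {"data", "payload"};

    /// Concrete types deduced per entry (e.g. per data part) for each Object column.
    const std::vector<std::map<std::string, std::string>> entries =
    {
        {{"data", "Tuple(key String)"}},               /// "payload" missing here
        {{"data", "Tuple(key String, value Int64)"}},  /// and here as well
    };

    std::map<std::string, std::vector<std::string>> types_in_entries;

    /// Seed every Object column with a dummy candidate so a column that is
    /// missing from all entries still appears in the result.
    for (const auto & name : object_columns)
        types_in_entries[name].push_back("Tuple(_dummy UInt8)");

    for (const auto & entry : entries)
        for (const auto & [name, type] : entry)
            types_in_entries[name].push_back(type);

    for (const auto & [name, candidates] : types_in_entries)
        std::cout << name << " -> " << leastCommonType(candidates) << '\n';
}
```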
+ for (const auto & column : storage_columns) + { + if (isObject(column.type)) + { + auto tuple_type = std::make_shared( + DataTypes{std::make_shared()}, + Names{ColumnObject::COLUMN_NAME_DUMMY}); + + types_in_entries[column.name].push_back(std::move(tuple_type)); + } + } + for (auto it = begin; it != end; ++it) { const auto & entry_columns = entry_columns_getter(*it); @@ -196,6 +197,7 @@ ColumnsDescription getObjectColumns( } } + ColumnsDescription res; for (const auto & [name, types] : types_in_entries) res.add({name, getLeastCommonTypeForObject(types)}); diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index fa4a79415ec..da7f8c871cb 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -143,9 +143,11 @@ void CachedOnDiskReadBufferFromFile::initialize(size_t offset, size_t size) } CachedOnDiskReadBufferFromFile::ImplementationBufferPtr -CachedOnDiskReadBufferFromFile::getCacheReadBuffer(size_t offset) const +CachedOnDiskReadBufferFromFile::getCacheReadBuffer(const FileSegment & file_segment) const { - auto path = cache->getPathInLocalCache(cache_key, offset, is_persistent); + /// Use is_persistent flag from in-memory state of the filesegment, + /// because it is consistent with what is written on disk. + auto path = file_segment.getPathInLocalCache(); ReadSettings local_read_settings{settings}; /// Do not allow to use asynchronous version of LocalFSReadMethod. @@ -206,7 +208,7 @@ CachedOnDiskReadBufferFromFile::getRemoteFSReadBuffer(FileSegment & file_segment return remote_file_reader; auto remote_fs_segment_reader = file_segment.extractRemoteFileReader(); - if (remote_fs_segment_reader && file_offset_of_buffer_end == remote_file_reader->getFileOffsetOfBufferEnd()) + if (remote_fs_segment_reader && file_offset_of_buffer_end == remote_fs_segment_reader->getFileOffsetOfBufferEnd()) remote_file_reader = remote_fs_segment_reader; else remote_file_reader = implementation_buffer_creator(); @@ -237,8 +239,6 @@ bool CachedOnDiskReadBufferFromFile::canStartFromCache(size_t current_offset, co CachedOnDiskReadBufferFromFile::ImplementationBufferPtr CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & file_segment) { - auto range = file_segment->range(); - auto download_state = file_segment->state(); LOG_TEST(log, "getReadBufferForFileSegment: {}", file_segment->getInfoForLog()); @@ -247,7 +247,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil if (download_state == FileSegment::State::DOWNLOADED) { read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } else { @@ -280,7 +280,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil /// file_offset_of_buffer_end read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } download_state = file_segment->wait(); @@ -289,7 +289,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil case FileSegment::State::DOWNLOADED: { read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } case FileSegment::State::EMPTY: case FileSegment::State::PARTIALLY_DOWNLOADED: @@ -305,7 +305,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil /// file_offset_of_buffer_end read_type = ReadType::CACHED; - return 
getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } auto downloader_id = file_segment->getOrSetDownloader(); @@ -323,7 +323,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil read_type = ReadType::CACHED; file_segment->resetDownloader(); - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } if (file_segment->getCurrentWriteOffset() < file_offset_of_buffer_end) @@ -339,7 +339,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil LOG_TEST(log, "Predownload. File segment info: {}", file_segment->getInfoForLog()); chassert(file_offset_of_buffer_end > file_segment->getCurrentWriteOffset()); bytes_to_predownload = file_offset_of_buffer_end - file_segment->getCurrentWriteOffset(); - chassert(bytes_to_predownload < range.size()); + chassert(bytes_to_predownload < file_segment->range().size()); } read_type = ReadType::REMOTE_FS_READ_AND_PUT_IN_CACHE; @@ -354,7 +354,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil if (canStartFromCache(file_offset_of_buffer_end, *file_segment)) { read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } else { diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h index b86e53ec160..535d01f3a8c 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h @@ -68,7 +68,7 @@ private: ImplementationBufferPtr getReadBufferForFileSegment(FileSegmentPtr & file_segment); - ImplementationBufferPtr getCacheReadBuffer(size_t offset) const; + ImplementationBufferPtr getCacheReadBuffer(const FileSegment & file_segment) const; std::optional getLastNonDownloadedOffset() const; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index f18debe8a8b..56cc20098ba 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -13,7 +13,6 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_FORMAT; - extern const int LOGICAL_ERROR; } void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) @@ -131,9 +130,6 @@ DiskObjectStorageMetadata::DiskObjectStorageMetadata( void DiskObjectStorageMetadata::addObject(const String & path, size_t size) { - if (!object_storage_root_path.empty() && path.starts_with(object_storage_root_path)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected relative path"); - total_size += size; storage_objects.emplace_back(path, size); } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 45304ac2fac..998b521cc56 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -31,7 +31,6 @@ #include #include - namespace DB { @@ -91,19 +90,7 @@ void logIfError(const Aws::Utils::Outcome & response, std::functi std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path */) { - /// Path to store the new S3 object. - - /// Total length is 32 a-z characters for enough randomness. - /// First 3 characters are used as a prefix for - /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/ - - constexpr size_t key_name_total_size = 32; - constexpr size_t key_name_prefix_size = 3; - - /// Path to store new S3 object. 
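For the `generateBlobNameForPath` change just below, the prefixed key scheme is dropped and a flat 32-character random key is kept (the removed comment above notes that 32 a-z characters give enough randomness). A minimal standalone sketch of producing such a key; `getRandomASCIIString` itself is ClickHouse's helper, so this is only an illustration of the idea:

```cpp
#include <iostream>
#include <random>
#include <string>

/// Generate `length` random lowercase ASCII characters, similar in spirit to
/// the getRandomASCIIString() call used for S3 blob names.
std::string randomLowercaseString(size_t length)
{
    static thread_local std::mt19937_64 generator{std::random_device{}()};
    std::uniform_int_distribution<int> distribution('a', 'z');

    std::string result(length, '\0');
    for (auto & c : result)
        c = static_cast<char>(distribution(generator));
    return result;
}

int main()
{
    std::cout << randomLowercaseString(32) << '\n';
}
```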
- return fmt::format("{}/{}", - getRandomASCIIString(key_name_prefix_size), - getRandomASCIIString(key_name_total_size - key_name_prefix_size)); + return getRandomASCIIString(32); } Aws::S3::Model::HeadObjectOutcome S3ObjectStorage::requestObjectHeadData(const std::string & bucket_from, const std::string & key) const diff --git a/src/Disks/ObjectStorages/StoredObject.h b/src/Disks/ObjectStorages/StoredObject.h index d9faa766540..acb8a5fd127 100644 --- a/src/Disks/ObjectStorages/StoredObject.h +++ b/src/Disks/ObjectStorages/StoredObject.h @@ -3,7 +3,6 @@ #include #include - namespace DB { diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 780b6bb6201..3de4a0de391 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -243,11 +243,20 @@ InputFormatPtr FormatFactory::getInput( ParallelParsingInputFormat::Params params{ buf, sample, parser_creator, file_segmentation_engine, name, settings.max_threads, settings.min_chunk_bytes_for_parallel_parsing, context->getApplicationType() == Context::ApplicationType::SERVER}; - return std::make_shared(params); + auto format = std::make_shared(params); + if (!settings.input_format_record_errors_file_path.toString().empty()) + { + format->setErrorsLogger(std::make_shared(context)); + } + return format; } auto format = getInputFormat(name, buf, sample, context, max_block_size, format_settings); + if (!settings.input_format_record_errors_file_path.toString().empty()) + { + format->setErrorsLogger(std::make_shared(context)); + } return format; } @@ -521,6 +530,7 @@ String FormatFactory::getFormatFromFileDescriptor(int fd) return getFormatFromFileName(file_path, false); return ""; #else + (void)fd; return ""; #endif } diff --git a/src/Functions/modelEvaluate.cpp b/src/Functions/catboostEvaluate.cpp similarity index 55% rename from src/Functions/modelEvaluate.cpp rename to src/Functions/catboostEvaluate.cpp index 3ee2ae3fae4..1ac7815239e 100644 --- a/src/Functions/modelEvaluate.cpp +++ b/src/Functions/catboostEvaluate.cpp @@ -1,18 +1,18 @@ #include #include -#include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include #include -#include +#include #include +#include +#include +#include #include +#include #include @@ -21,66 +21,80 @@ namespace DB namespace ErrorCodes { + extern const int FILE_DOESNT_EXIST; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; extern const int ILLEGAL_COLUMN; } -class ExternalModelsLoader; - - -/// Evaluate external model. -/// First argument - model name, the others - model arguments. -/// * for CatBoost model - float features first, then categorical -/// Result - Float64. -class FunctionModelEvaluate final : public IFunction +/// Evaluate CatBoost model. +/// - Arguments: float features first, then categorical features. +/// - Result: Float64. 
+class FunctionCatBoostEvaluate final : public IFunction, WithContext { +private: + mutable std::unique_ptr bridge_helper; + public: - static constexpr auto name = "modelEvaluate"; + static constexpr auto name = "catboostEvaluate"; - static FunctionPtr create(ContextPtr context) - { - return std::make_shared(context->getExternalModelsLoader()); - } - - explicit FunctionModelEvaluate(const ExternalModelsLoader & models_loader_) - : models_loader(models_loader_) {} + static FunctionPtr create(ContextPtr context_) { return std::make_shared(context_); } + explicit FunctionCatBoostEvaluate(ContextPtr context_) : WithContext(context_) {} String getName() const override { return name; } - bool isVariadic() const override { return true; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - bool isDeterministic() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - size_t getNumberOfArguments() const override { return 0; } + void initBridge(const ColumnConst * name_col) const + { + String library_path = getContext()->getConfigRef().getString("catboost_lib_path"); + if (!std::filesystem::exists(library_path)) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can't load library {}: file doesn't exist", library_path); + + String model_path = name_col->getValue(); + if (!std::filesystem::exists(model_path)) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can't load model {}: file doesn't exist", model_path); + + bridge_helper = std::make_unique(getContext(), model_path, library_path); + } + + DataTypePtr getReturnTypeFromLibraryBridge() const + { + size_t tree_count = bridge_helper->getTreeCount(); + auto type = std::make_shared(); + if (tree_count == 1) + return type; + + DataTypes types(tree_count, type); + + return std::make_shared(types); + } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { if (arguments.size() < 2) - throw Exception("Function " + getName() + " expects at least 2 arguments", - ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION); + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects at least 2 arguments", getName()); if (!isString(arguments[0].type)) - throw Exception("Illegal type " + arguments[0].type->getName() + " of first argument of function " + getName() - + ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of first argument of function {}, expected a string.", arguments[0].type->getName(), getName()); const auto * name_col = checkAndGetColumnConst(arguments[0].column.get()); if (!name_col) - throw Exception("First argument of function " + getName() + " must be a constant string", - ErrorCodes::ILLEGAL_COLUMN); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument of function {} must be a constant string", getName()); + + initBridge(name_col); + + auto type = getReturnTypeFromLibraryBridge(); bool has_nullable = false; for (size_t i = 1; i < arguments.size(); ++i) has_nullable = has_nullable || arguments[i].type->isNullable(); - auto model = models_loader.getModel(name_col->getValue()); - auto type = model->getReturnType(); - if (has_nullable) { if (const auto * tuple = typeid_cast(type.get())) @@ -98,31 +112,25 @@ public: return type; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { const auto * name_col = 
checkAndGetColumnConst(arguments[0].column.get()); if (!name_col) - throw Exception("First argument of function " + getName() + " must be a constant string", - ErrorCodes::ILLEGAL_COLUMN); - - auto model = models_loader.getModel(name_col->getValue()); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument of function {} must be a constant string", getName()); ColumnRawPtrs column_ptrs; Columns materialized_columns; ColumnPtr null_map; - column_ptrs.reserve(arguments.size()); - for (auto arg : collections::range(1, arguments.size())) + ColumnsWithTypeAndName feature_arguments(arguments.begin() + 1, arguments.end()); + for (auto & arg : feature_arguments) { - const auto & column = arguments[arg].column; - column_ptrs.push_back(column.get()); - if (auto full_column = column->convertToFullColumnIfConst()) + if (auto full_column = arg.column->convertToFullColumnIfConst()) { materialized_columns.push_back(full_column); - column_ptrs.back() = full_column.get(); + arg.column = full_column; } - if (const auto * col_nullable = checkAndGetColumn(*column_ptrs.back())) + if (const auto * col_nullable = checkAndGetColumn(&*arg.column)) { if (!null_map) null_map = col_nullable->getNullMapColumnPtr(); @@ -140,11 +148,12 @@ public: null_map = std::move(mut_null_map); } - column_ptrs.back() = &col_nullable->getNestedColumn(); + arg.column = col_nullable->getNestedColumn().getPtr(); + arg.type = static_cast(*arg.type).getNestedType(); } } - auto res = model->evaluate(column_ptrs); + auto res = bridge_helper->evaluate(feature_arguments); if (null_map) { @@ -162,15 +171,12 @@ public: return res; } - -private: - const ExternalModelsLoader & models_loader; }; -REGISTER_FUNCTION(ExternalModels) +REGISTER_FUNCTION(CatBoostEvaluate) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/grouping.h b/src/Functions/grouping.h index a49e946b2cb..b9ef6ffc107 100644 --- a/src/Functions/grouping.h +++ b/src/Functions/grouping.h @@ -1,9 +1,9 @@ #pragma once +#include #include #include #include -#include #include #include #include @@ -19,10 +19,17 @@ protected: static constexpr UInt64 ONE = 1; const ColumnNumbers arguments_indexes; + // Initial implementation of GROUPING function returned 1 if the argument is used as an aggregation key. + // This differs from the behavior described in the standard and other DBMS. + const bool force_compatibility; + + static constexpr UInt64 COMPATIBLE_MODE[] = {1, 0}; + static constexpr UInt64 INCOMPATIBLE_MODE[] = {0, 1}; public: - FunctionGroupingBase(ColumnNumbers arguments_indexes_) + FunctionGroupingBase(ColumnNumbers arguments_indexes_, bool force_compatibility_) : arguments_indexes(std::move(arguments_indexes_)) + , force_compatibility(force_compatibility_) {} bool isVariadic() const override { return true; } @@ -48,13 +55,15 @@ public: auto result = ColumnUInt64::create(); auto & result_data = result->getData(); result_data.reserve(input_rows_count); + + const auto * result_table = likely(force_compatibility) ? COMPATIBLE_MODE : INCOMPATIBLE_MODE; for (size_t i = 0; i < input_rows_count; ++i) { UInt64 set_index = grouping_set_column->getElement(i); UInt64 value = 0; for (auto index : arguments_indexes) - value = (value << 1) + (checker(set_index, index) ? 1 : 0); + value = (value << 1) + result_table[checker(set_index, index) ? 
1 : 0]; result_data.push_back(value); } @@ -65,14 +74,16 @@ public: class FunctionGroupingOrdinary : public FunctionGroupingBase { public: - explicit FunctionGroupingOrdinary(ColumnNumbers arguments_indexes_) - : FunctionGroupingBase(std::move(arguments_indexes_)) + FunctionGroupingOrdinary(ColumnNumbers arguments_indexes_, bool force_compatibility_) + : FunctionGroupingBase(std::move(arguments_indexes_), force_compatibility_) {} String getName() const override { return "groupingOrdinary"; } ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override { + if (likely(force_compatibility)) + return ColumnUInt64::create(input_rows_count, 0); UInt64 value = (ONE << arguments_indexes.size()) - 1; return ColumnUInt64::create(input_rows_count, value); } @@ -83,8 +94,8 @@ class FunctionGroupingForRollup : public FunctionGroupingBase const UInt64 aggregation_keys_number; public: - FunctionGroupingForRollup(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_) - : FunctionGroupingBase(std::move(arguments_indexes_)) + FunctionGroupingForRollup(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_, bool force_compatibility_) + : FunctionGroupingBase(std::move(arguments_indexes_), force_compatibility_) , aggregation_keys_number(aggregation_keys_number_) {} @@ -113,8 +124,8 @@ class FunctionGroupingForCube : public FunctionGroupingBase public: - FunctionGroupingForCube(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_) - : FunctionGroupingBase(arguments_indexes_) + FunctionGroupingForCube(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_, bool force_compatibility_) + : FunctionGroupingBase(arguments_indexes_, force_compatibility_) , aggregation_keys_number(aggregation_keys_number_) {} @@ -142,8 +153,8 @@ class FunctionGroupingForGroupingSets : public FunctionGroupingBase { ColumnNumbersSetList grouping_sets; public: - FunctionGroupingForGroupingSets(ColumnNumbers arguments_indexes_, ColumnNumbersList const & grouping_sets_) - : FunctionGroupingBase(std::move(arguments_indexes_)) + FunctionGroupingForGroupingSets(ColumnNumbers arguments_indexes_, ColumnNumbersList const & grouping_sets_, bool force_compatibility_) + : FunctionGroupingBase(std::move(arguments_indexes_), force_compatibility_) { for (auto const & set : grouping_sets_) grouping_sets.emplace_back(set.begin(), set.end()); diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index ffb7bff8afb..cb4b6ca5f3e 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -233,7 +233,7 @@ void ReadBufferFromFileDescriptor::rewind() /// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout. -bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds) +bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds) const { fd_set fds; FD_ZERO(&fds); diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index 6b68b8b6dfd..6edda460bac 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -66,7 +66,7 @@ public: private: /// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout. 
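As the comment above says, `ReadBufferFromFileDescriptor::poll()` relies on `select()` to wait until the descriptor has readable data or the timeout expires. A rough standalone sketch of that mechanism, with error handling (for example `EINTR` retries) left out for brevity:

```cpp
#include <sys/select.h>
#include <sys/time.h>
#include <unistd.h>
#include <cstdio>

/// Wait until `fd` has data to read or `timeout_microseconds` elapse,
/// the same select()-based readiness check the poll() method performs.
static bool pollReadable(int fd, size_t timeout_microseconds)
{
    fd_set fds;
    FD_ZERO(&fds);
    FD_SET(fd, &fds);

    timeval timeout;
    timeout.tv_sec = static_cast<time_t>(timeout_microseconds / 1000000);
    timeout.tv_usec = static_cast<suseconds_t>(timeout_microseconds % 1000000);

    int res = select(fd + 1, &fds, nullptr, nullptr, &timeout);
    return res > 0 && FD_ISSET(fd, &fds);
}

int main()
{
    /// Poll stdin for up to half a second.
    std::printf("stdin readable: %d\n", pollReadable(STDIN_FILENO, 500000));
}
```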
- bool poll(size_t timeout_microseconds); + bool poll(size_t timeout_microseconds) const; }; diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index f10510fadae..8e135d325e6 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -880,20 +880,20 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & { case GroupByKind::GROUPING_SETS: { - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), keys_info.grouping_set_keys)), { "__grouping_set" }, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), keys_info.grouping_set_keys, data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), { "__grouping_set" }, column_name); break; } case GroupByKind::ROLLUP: - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number)), { "__grouping_set" }, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number, data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), { "__grouping_set" }, column_name); break; case GroupByKind::CUBE: { - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number)), { "__grouping_set" }, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number, data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), { "__grouping_set" }, column_name); break; } case GroupByKind::ORDINARY: { - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes))), {}, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), {}, column_name); break; } default: diff --git a/src/Interpreters/CatBoostModel.cpp b/src/Interpreters/CatBoostModel.cpp deleted file mode 100644 index d5803ed9e36..00000000000 --- a/src/Interpreters/CatBoostModel.cpp +++ /dev/null @@ -1,525 +0,0 @@ -#include "CatBoostModel.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ -extern const int LOGICAL_ERROR; -extern const int BAD_ARGUMENTS; -extern const int CANNOT_LOAD_CATBOOST_MODEL; -extern const int CANNOT_APPLY_CATBOOST_MODEL; -} - -/// CatBoost wrapper interface functions. 
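Returning to the GROUPING changes above: the lookup tables in `grouping.h` only flip what each emitted bit means, assuming the checker marks arguments that belong to the current grouping set. A toy reproduction of the bit computation under both modes:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

/// One bit is emitted per GROUPING argument, most significant first.
///   standard behaviour (force_grouping_standard_compatibility = true):
///       1 = argument is NOT used as an aggregation key
///   legacy ClickHouse behaviour: 1 = argument IS used as an aggregation key
static uint64_t grouping(const std::vector<bool> & argument_is_key, bool standard_compatibility)
{
    static constexpr uint64_t COMPATIBLE_MODE[] = {1, 0};    /// indexed by "is key"
    static constexpr uint64_t INCOMPATIBLE_MODE[] = {0, 1};
    const uint64_t * result_table = standard_compatibility ? COMPATIBLE_MODE : INCOMPATIBLE_MODE;

    uint64_t value = 0;
    for (bool is_key : argument_is_key)
        value = (value << 1) + result_table[is_key ? 1 : 0];
    return value;
}

int main()
{
    /// GROUPING(a, b) for a grouping set that aggregates by `a` only.
    const std::vector<bool> is_key = {true, false};
    std::printf("standard: %llu\n", static_cast<unsigned long long>(grouping(is_key, true)));   /// 0b01 = 1
    std::printf("legacy:   %llu\n", static_cast<unsigned long long>(grouping(is_key, false)));  /// 0b10 = 2
}
```

With the new default, GROUPING therefore reports 1 for arguments that are aggregated away, matching the SQL standard and other DBMS.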
-class CatBoostWrapperAPI -{ -public: - using ModelCalcerHandle = void; - - ModelCalcerHandle * (* ModelCalcerCreate)(); // NOLINT - - void (* ModelCalcerDelete)(ModelCalcerHandle * calcer); // NOLINT - - const char * (* GetErrorString)(); // NOLINT - - bool (* LoadFullModelFromFile)(ModelCalcerHandle * calcer, const char * filename); // NOLINT - - bool (* CalcModelPredictionFlat)(ModelCalcerHandle * calcer, size_t docCount, // NOLINT - const float ** floatFeatures, size_t floatFeaturesSize, - double * result, size_t resultSize); - - bool (* CalcModelPrediction)(ModelCalcerHandle * calcer, size_t docCount, // NOLINT - const float ** floatFeatures, size_t floatFeaturesSize, - const char *** catFeatures, size_t catFeaturesSize, - double * result, size_t resultSize); - - bool (* CalcModelPredictionWithHashedCatFeatures)(ModelCalcerHandle * calcer, size_t docCount, // NOLINT - const float ** floatFeatures, size_t floatFeaturesSize, - const int ** catFeatures, size_t catFeaturesSize, - double * result, size_t resultSize); - - int (* GetStringCatFeatureHash)(const char * data, size_t size); // NOLINT - int (* GetIntegerCatFeatureHash)(uint64_t val); // NOLINT - - size_t (* GetFloatFeaturesCount)(ModelCalcerHandle* calcer); // NOLINT - size_t (* GetCatFeaturesCount)(ModelCalcerHandle* calcer); // NOLINT - size_t (* GetTreeCount)(ModelCalcerHandle* modelHandle); // NOLINT - size_t (* GetDimensionsCount)(ModelCalcerHandle* modelHandle); // NOLINT - - bool (* CheckModelMetadataHasKey)(ModelCalcerHandle* modelHandle, const char* keyPtr, size_t keySize); // NOLINT - size_t (*GetModelInfoValueSize)(ModelCalcerHandle* modelHandle, const char* keyPtr, size_t keySize); // NOLINT - const char* (*GetModelInfoValue)(ModelCalcerHandle* modelHandle, const char* keyPtr, size_t keySize); // NOLINT -}; - - -class CatBoostModelHolder -{ -private: - CatBoostWrapperAPI::ModelCalcerHandle * handle; - const CatBoostWrapperAPI * api; -public: - explicit CatBoostModelHolder(const CatBoostWrapperAPI * api_) : api(api_) { handle = api->ModelCalcerCreate(); } - ~CatBoostModelHolder() { api->ModelCalcerDelete(handle); } - - CatBoostWrapperAPI::ModelCalcerHandle * get() { return handle; } -}; - - -/// Holds CatBoost wrapper library and provides wrapper interface. 
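The removed `CatBoostLibHolder` below resolved the libcatboostmodel C API by symbol name through ClickHouse's `SharedLibrary` wrapper. A rough sketch of the same technique with plain `dlopen`/`dlsym`; the symbol names come from the removed wrapper, everything else here is illustrative:

```cpp
#include <dlfcn.h>
#include <stdexcept>
#include <string>

/// Subset of the CatBoost C API resolved as raw function pointers.
struct ModelCalcerApi
{
    void * (*ModelCalcerCreate)() = nullptr;
    void (*ModelCalcerDelete)(void *) = nullptr;
    bool (*LoadFullModelFromFile)(void *, const char *) = nullptr;
};

template <typename F>
void loadSymbol(void * lib, F & func, const char * name)
{
    func = reinterpret_cast<F>(dlsym(lib, name));
    if (!func)
        throw std::runtime_error(std::string("Cannot resolve symbol ") + name);
}

ModelCalcerApi loadCatBoostApi(const std::string & lib_path)
{
    void * lib = dlopen(lib_path.c_str(), RTLD_LAZY);
    if (!lib)
        throw std::runtime_error("Cannot load library " + lib_path);

    ModelCalcerApi api;
    loadSymbol(lib, api.ModelCalcerCreate, "ModelCalcerCreate");
    loadSymbol(lib, api.ModelCalcerDelete, "ModelCalcerDelete");
    loadSymbol(lib, api.LoadFullModelFromFile, "LoadFullModelFromFile");
    return api;
}

int main(int argc, char ** argv)
{
    if (argc < 2)
        return 1;
    ModelCalcerApi api = loadCatBoostApi(argv[1]);   /// e.g. path to libcatboostmodel.so
    void * handle = api.ModelCalcerCreate();
    api.ModelCalcerDelete(handle);
}
```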
-class CatBoostLibHolder -{ -public: - explicit CatBoostLibHolder(std::string lib_path_) : lib_path(std::move(lib_path_)), lib(lib_path) { initAPI(); } - - const CatBoostWrapperAPI & getAPI() const { return api; } - const std::string & getCurrentPath() const { return lib_path; } - -private: - CatBoostWrapperAPI api; - std::string lib_path; - SharedLibrary lib; - - void initAPI() - { - load(api.ModelCalcerCreate, "ModelCalcerCreate"); - load(api.ModelCalcerDelete, "ModelCalcerDelete"); - load(api.GetErrorString, "GetErrorString"); - load(api.LoadFullModelFromFile, "LoadFullModelFromFile"); - load(api.CalcModelPredictionFlat, "CalcModelPredictionFlat"); - load(api.CalcModelPrediction, "CalcModelPrediction"); - load(api.CalcModelPredictionWithHashedCatFeatures, "CalcModelPredictionWithHashedCatFeatures"); - load(api.GetStringCatFeatureHash, "GetStringCatFeatureHash"); - load(api.GetIntegerCatFeatureHash, "GetIntegerCatFeatureHash"); - load(api.GetFloatFeaturesCount, "GetFloatFeaturesCount"); - load(api.GetCatFeaturesCount, "GetCatFeaturesCount"); - tryLoad(api.CheckModelMetadataHasKey, "CheckModelMetadataHasKey"); - tryLoad(api.GetModelInfoValueSize, "GetModelInfoValueSize"); - tryLoad(api.GetModelInfoValue, "GetModelInfoValue"); - tryLoad(api.GetTreeCount, "GetTreeCount"); - tryLoad(api.GetDimensionsCount, "GetDimensionsCount"); - } - - template - void load(T& func, const std::string & name) { func = lib.get(name); } - - template - void tryLoad(T& func, const std::string & name) { func = lib.tryGet(name); } -}; - -std::shared_ptr getCatBoostWrapperHolder(const std::string & lib_path) -{ - static std::shared_ptr ptr; - static std::mutex mutex; - - std::lock_guard lock(mutex); - - if (!ptr || ptr->getCurrentPath() != lib_path) - ptr = std::make_shared(lib_path); - - return ptr; -} - -class CatBoostModelImpl -{ -public: - CatBoostModelImpl(const CatBoostWrapperAPI * api_, const std::string & model_path) : api(api_) - { - handle = std::make_unique(api); - if (!handle) - { - throw Exception(ErrorCodes::CANNOT_LOAD_CATBOOST_MODEL, - "Cannot create CatBoost model: {}", - api->GetErrorString()); - } - if (!api->LoadFullModelFromFile(handle->get(), model_path.c_str())) - { - throw Exception(ErrorCodes::CANNOT_LOAD_CATBOOST_MODEL, - "Cannot load CatBoost model: {}", - api->GetErrorString()); - } - - float_features_count = api->GetFloatFeaturesCount(handle->get()); - cat_features_count = api->GetCatFeaturesCount(handle->get()); - tree_count = 1; - if (api->GetDimensionsCount) - tree_count = api->GetDimensionsCount(handle->get()); - } - - ColumnPtr evaluate(const ColumnRawPtrs & columns) const - { - if (columns.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Got empty columns list for CatBoost model."); - - if (columns.size() != float_features_count + cat_features_count) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Number of columns is different with number of features: columns size {} float features size {} + cat features size {}", - columns.size(), - float_features_count, - cat_features_count); - - for (size_t i = 0; i < float_features_count; ++i) - { - if (!columns[i]->isNumeric()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Column {} should be numeric to make float feature.", i); - } - } - - bool cat_features_are_strings = true; - for (size_t i = float_features_count; i < float_features_count + cat_features_count; ++i) - { - const auto * column = columns[i]; - if (column->isNumeric()) - { - cat_features_are_strings = false; - } - else if (!(typeid_cast(column) - || 
typeid_cast(column))) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Column {} should be numeric or string.", i); - } - } - - auto result = evalImpl(columns, cat_features_are_strings); - - if (tree_count == 1) - return result; - - size_t column_size = columns.front()->size(); - auto * result_buf = result->getData().data(); - - /// Multiple trees case. Copy data to several columns. - MutableColumns mutable_columns(tree_count); - std::vector column_ptrs(tree_count); - for (size_t i = 0; i < tree_count; ++i) - { - auto col = ColumnFloat64::create(column_size); - column_ptrs[i] = col->getData().data(); - mutable_columns[i] = std::move(col); - } - - Float64 * data = result_buf; - for (size_t row = 0; row < column_size; ++row) - { - for (size_t i = 0; i < tree_count; ++i) - { - *column_ptrs[i] = *data; - ++column_ptrs[i]; - ++data; - } - } - - return ColumnTuple::create(std::move(mutable_columns)); - } - - size_t getFloatFeaturesCount() const { return float_features_count; } - size_t getCatFeaturesCount() const { return cat_features_count; } - size_t getTreeCount() const { return tree_count; } - -private: - std::unique_ptr handle; - const CatBoostWrapperAPI * api; - size_t float_features_count; - size_t cat_features_count; - size_t tree_count; - - /// Buffer should be allocated with features_count * column->size() elements. - /// Place column elements in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] - template - void placeColumnAsNumber(const IColumn * column, T * buffer, size_t features_count) const - { - size_t size = column->size(); - FieldVisitorConvertToNumber visitor; - for (size_t i = 0; i < size; ++i) - { - /// TODO: Replace with column visitor. - Field field; - column->get(i, field); - *buffer = applyVisitor(visitor, field); - buffer += features_count; - } - } - - /// Buffer should be allocated with features_count * column->size() elements. - /// Place string pointers in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] - static void placeStringColumn(const ColumnString & column, const char ** buffer, size_t features_count) - { - size_t size = column.size(); - for (size_t i = 0; i < size; ++i) - { - *buffer = const_cast(column.getDataAtWithTerminatingZero(i).data); - buffer += features_count; - } - } - - /// Buffer should be allocated with features_count * column->size() elements. - /// Place string pointers in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] - /// Returns PODArray which holds data (because ColumnFixedString doesn't store terminating zero). - static PODArray placeFixedStringColumn( - const ColumnFixedString & column, const char ** buffer, size_t features_count) - { - size_t size = column.size(); - size_t str_size = column.getN(); - PODArray data(size * (str_size + 1)); - char * data_ptr = data.data(); - - for (size_t i = 0; i < size; ++i) - { - auto ref = column.getDataAt(i); - memcpy(data_ptr, ref.data, ref.size); - data_ptr[ref.size] = 0; - *buffer = data_ptr; - data_ptr += ref.size + 1; - buffer += features_count; - } - - return data; - } - - /// Place columns into buffer, returns column which holds placed data. Buffer should contains column->size() values. 
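The buffer layout described in the removed comments, placing each column with a stride of `features_count` so that every row becomes contiguous and can be handed to the C API as a pointer, is easy to picture with a small standalone sketch:

```cpp
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<std::vector<float>> columns = {   /// column-major input
        {1.0f, 2.0f, 3.0f},    /// feature 0
        {10.0f, 20.0f, 30.0f}  /// feature 1
    };
    const size_t features_count = columns.size();
    const size_t rows = columns.front().size();

    /// Write each column with stride features_count: row r occupies
    /// data[r * features_count .. r * features_count + features_count).
    std::vector<float> data(features_count * rows);
    for (size_t i = 0; i < features_count; ++i)
        for (size_t r = 0; r < rows; ++r)
            data[r * features_count + i] = columns[i][r];

    /// Per-row pointers, as expected by the prediction functions.
    std::vector<const float *> row_pointers(rows);
    for (size_t r = 0; r < rows; ++r)
        row_pointers[r] = data.data() + r * features_count;

    for (size_t r = 0; r < rows; ++r)
        std::printf("row %zu: %.1f %.1f\n", r, row_pointers[r][0], row_pointers[r][1]);
}
```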
- template - ColumnPtr placeNumericColumns(const ColumnRawPtrs & columns, - size_t offset, size_t size, const T** buffer) const - { - if (size == 0) - return nullptr; - size_t column_size = columns[offset]->size(); - auto data_column = ColumnVector::create(size * column_size); - T * data = data_column->getData().data(); - for (size_t i = 0; i < size; ++i) - { - const auto * column = columns[offset + i]; - if (column->isNumeric()) - placeColumnAsNumber(column, data + i, size); - } - - for (size_t i = 0; i < column_size; ++i) - { - *buffer = data; - ++buffer; - data += size; - } - - return data_column; - } - - /// Place columns into buffer, returns data which was used for fixed string columns. - /// Buffer should contains column->size() values, each value contains size strings. - static std::vector> placeStringColumns( - const ColumnRawPtrs & columns, size_t offset, size_t size, const char ** buffer) - { - if (size == 0) - return {}; - - std::vector> data; - for (size_t i = 0; i < size; ++i) - { - const auto * column = columns[offset + i]; - if (const auto * column_string = typeid_cast(column)) - placeStringColumn(*column_string, buffer + i, size); - else if (const auto * column_fixed_string = typeid_cast(column)) - data.push_back(placeFixedStringColumn(*column_fixed_string, buffer + i, size)); - else - throw Exception("Cannot place string column.", ErrorCodes::LOGICAL_ERROR); - } - - return data; - } - - /// Calc hash for string cat feature at ps positions. - template - void calcStringHashes(const Column * column, size_t ps, const int ** buffer) const - { - size_t column_size = column->size(); - for (size_t j = 0; j < column_size; ++j) - { - auto ref = column->getDataAt(j); - const_cast(*buffer)[ps] = api->GetStringCatFeatureHash(ref.data, ref.size); - ++buffer; - } - } - - /// Calc hash for int cat feature at ps position. Buffer at positions ps should contains unhashed values. - void calcIntHashes(size_t column_size, size_t ps, const int ** buffer) const - { - for (size_t j = 0; j < column_size; ++j) - { - const_cast(*buffer)[ps] = api->GetIntegerCatFeatureHash((*buffer)[ps]); - ++buffer; - } - } - - /// buffer contains column->size() rows and size columns. - /// For int cat features calc hash inplace. - /// For string cat features calc hash from column rows. - void calcHashes(const ColumnRawPtrs & columns, size_t offset, size_t size, const int ** buffer) const - { - if (size == 0) - return; - size_t column_size = columns[offset]->size(); - - std::vector> data; - for (size_t i = 0; i < size; ++i) - { - const auto * column = columns[offset + i]; - if (const auto * column_string = typeid_cast(column)) - calcStringHashes(column_string, i, buffer); - else if (const auto * column_fixed_string = typeid_cast(column)) - calcStringHashes(column_fixed_string, i, buffer); - else - calcIntHashes(column_size, i, buffer); - } - } - - /// buffer[column_size * cat_features_count] -> char * => cat_features[column_size][cat_features_count] -> char * - void fillCatFeaturesBuffer(const char *** cat_features, const char ** buffer, - size_t column_size) const - { - for (size_t i = 0; i < column_size; ++i) - { - *cat_features = buffer; - ++cat_features; - buffer += cat_features_count; - } - } - - /// Convert values to row-oriented format and call evaluation function from CatBoost wrapper api. - /// * CalcModelPredictionFlat if no cat features - /// * CalcModelPrediction if all cat features are strings - /// * CalcModelPredictionWithHashedCatFeatures if has int cat features. 
- ColumnFloat64::MutablePtr evalImpl( - const ColumnRawPtrs & columns, - bool cat_features_are_strings) const - { - std::string error_msg = "Error occurred while applying CatBoost model: "; - size_t column_size = columns.front()->size(); - - auto result = ColumnFloat64::create(column_size * tree_count); - auto * result_buf = result->getData().data(); - - if (!column_size) - return result; - - /// Prepare float features. - PODArray float_features(column_size); - auto * float_features_buf = float_features.data(); - /// Store all float data into single column. float_features is a list of pointers to it. - auto float_features_col = placeNumericColumns(columns, 0, float_features_count, float_features_buf); - - if (cat_features_count == 0) - { - if (!api->CalcModelPredictionFlat(handle->get(), column_size, - float_features_buf, float_features_count, - result_buf, column_size * tree_count)) - { - - throw Exception(error_msg + api->GetErrorString(), ErrorCodes::CANNOT_APPLY_CATBOOST_MODEL); - } - return result; - } - - /// Prepare cat features. - if (cat_features_are_strings) - { - /// cat_features_holder stores pointers to ColumnString data or fixed_strings_data. - PODArray cat_features_holder(cat_features_count * column_size); - PODArray cat_features(column_size); - auto * cat_features_buf = cat_features.data(); - - fillCatFeaturesBuffer(cat_features_buf, cat_features_holder.data(), column_size); - /// Fixed strings are stored without termination zero, so have to copy data into fixed_strings_data. - auto fixed_strings_data = placeStringColumns(columns, float_features_count, - cat_features_count, cat_features_holder.data()); - - if (!api->CalcModelPrediction(handle->get(), column_size, - float_features_buf, float_features_count, - cat_features_buf, cat_features_count, - result_buf, column_size * tree_count)) - { - throw Exception(error_msg + api->GetErrorString(), ErrorCodes::CANNOT_APPLY_CATBOOST_MODEL); - } - } - else - { - PODArray cat_features(column_size); - auto * cat_features_buf = cat_features.data(); - auto cat_features_col = placeNumericColumns(columns, float_features_count, - cat_features_count, cat_features_buf); - calcHashes(columns, float_features_count, cat_features_count, cat_features_buf); - if (!api->CalcModelPredictionWithHashedCatFeatures( - handle->get(), column_size, - float_features_buf, float_features_count, - cat_features_buf, cat_features_count, - result_buf, column_size * tree_count)) - { - throw Exception(error_msg + api->GetErrorString(), ErrorCodes::CANNOT_APPLY_CATBOOST_MODEL); - } - } - - return result; - } -}; - -CatBoostModel::CatBoostModel(std::string name_, std::string model_path_, std::string lib_path_, - const ExternalLoadableLifetime & lifetime_) - : name(std::move(name_)), model_path(std::move(model_path_)), lib_path(std::move(lib_path_)), lifetime(lifetime_) -{ - api_provider = getCatBoostWrapperHolder(lib_path); - api = &api_provider->getAPI(); - model = std::make_unique(api, model_path); -} - -CatBoostModel::~CatBoostModel() = default; - -size_t CatBoostModel::getFloatFeaturesCount() const -{ - return model->getFloatFeaturesCount(); -} - -size_t CatBoostModel::getCatFeaturesCount() const -{ - return model->getCatFeaturesCount(); -} - -size_t CatBoostModel::getTreeCount() const -{ - return model->getTreeCount(); -} - -DataTypePtr CatBoostModel::getReturnType() const -{ - size_t tree_count = getTreeCount(); - auto type = std::make_shared(); - if (tree_count == 1) - return type; - - DataTypes types(tree_count, type); - - return std::make_shared(types); -} - 
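The removed `evalImpl`/`getReturnType` pair returned a single `Float64` column when the model has one dimension and a `Tuple` of `Float64` columns otherwise, splitting the flat row-major output of the C API per dimension. A small sketch of that split (the values are made up):

```cpp
#include <cstdio>
#include <vector>

int main()
{
    const size_t tree_count = 3;   /// e.g. a 3-class model
    const size_t column_size = 2;  /// two input rows

    /// Flat output as produced by the C API: r0d0 r0d1 r0d2 r1d0 r1d1 r1d2.
    const std::vector<double> flat = {0.1, 0.2, 0.7, 0.5, 0.3, 0.2};

    /// Split into one column per dimension; these would become the Tuple elements.
    std::vector<std::vector<double>> per_dimension(tree_count, std::vector<double>(column_size));
    for (size_t row = 0; row < column_size; ++row)
        for (size_t i = 0; i < tree_count; ++i)
            per_dimension[i][row] = flat[row * tree_count + i];

    for (size_t i = 0; i < tree_count; ++i)
        std::printf("dimension %zu: %.1f %.1f\n", i, per_dimension[i][0], per_dimension[i][1]);
}
```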
-ColumnPtr CatBoostModel::evaluate(const ColumnRawPtrs & columns) const -{ - if (!model) - throw Exception("CatBoost model was not loaded.", ErrorCodes::LOGICAL_ERROR); - - return model->evaluate(columns); -} - -} diff --git a/src/Interpreters/CatBoostModel.h b/src/Interpreters/CatBoostModel.h deleted file mode 100644 index 7bb1df92b67..00000000000 --- a/src/Interpreters/CatBoostModel.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include -#include -#include - - -namespace DB -{ - -class CatBoostLibHolder; -class CatBoostWrapperAPI; -class CatBoostModelImpl; - -class IDataType; -using DataTypePtr = std::shared_ptr; - -/// General ML model evaluator interface. -class IMLModel : public IExternalLoadable -{ -public: - IMLModel() = default; - virtual ColumnPtr evaluate(const ColumnRawPtrs & columns) const = 0; - virtual std::string getTypeName() const = 0; - virtual DataTypePtr getReturnType() const = 0; - virtual ~IMLModel() override = default; -}; - -class CatBoostModel : public IMLModel -{ -public: - CatBoostModel(std::string name, std::string model_path, - std::string lib_path, const ExternalLoadableLifetime & lifetime); - - ~CatBoostModel() override; - - ColumnPtr evaluate(const ColumnRawPtrs & columns) const override; - std::string getTypeName() const override { return "catboost"; } - - size_t getFloatFeaturesCount() const; - size_t getCatFeaturesCount() const; - size_t getTreeCount() const; - DataTypePtr getReturnType() const override; - - /// IExternalLoadable interface. - - const ExternalLoadableLifetime & getLifetime() const override { return lifetime; } - - std::string getLoadableName() const override { return name; } - - bool supportUpdates() const override { return true; } - - bool isModified() const override { return true; } - - std::shared_ptr clone() const override - { - return std::make_shared(name, model_path, lib_path, lifetime); - } - -private: - const std::string name; - std::string model_path; - std::string lib_path; - ExternalLoadableLifetime lifetime; - std::shared_ptr api_provider; - const CatBoostWrapperAPI * api; - - std::unique_ptr model; - - void init(); -}; - -} diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b3bd9d97005..d69878e6af0 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -153,7 +152,6 @@ struct ContextSharedPart : boost::noncopyable mutable std::mutex embedded_dictionaries_mutex; mutable std::mutex external_dictionaries_mutex; mutable std::mutex external_user_defined_executable_functions_mutex; - mutable std::mutex external_models_mutex; /// Separate mutex for storage policies. During server startup we may /// initialize some important storages (system logs with MergeTree engine) /// under context lock. @@ -191,9 +189,7 @@ struct ContextSharedPart : boost::noncopyable mutable std::unique_ptr embedded_dictionaries; /// Metrica's dictionaries. Have lazy initialization. 
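The `ContextSharedPart` hunks that follow remove the external models members from a shutdown pattern the remaining loaders still use: members are moved into local `delete_*` holders while the lock is held and destroyed only after it is released. A rough sketch of that shape under assumed, simplified types (this is not the real `Context`):

```cpp
#include <memory>
#include <mutex>

struct Loader { /* some heavy resource with a non-trivial destructor */ };

struct SharedState
{
    std::mutex mutex;
    std::unique_ptr<Loader> dictionaries_loader;

    void shutdown()
    {
        std::unique_ptr<Loader> delete_dictionaries_loader;
        {
            std::lock_guard lock(mutex);
            /// Detach the member while holding the lock...
            delete_dictionaries_loader = std::move(dictionaries_loader);
        }
        /// ...and run its destructor after the lock is released.
        delete_dictionaries_loader.reset();
    }
};

int main()
{
    SharedState state;
    state.dictionaries_loader = std::make_unique<Loader>();
    state.shutdown();
}
```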
mutable std::unique_ptr external_dictionaries_loader; mutable std::unique_ptr external_user_defined_executable_functions_loader; - mutable std::unique_ptr external_models_loader; - ExternalLoaderXMLConfigRepository * external_models_config_repository = nullptr; scope_guard models_repository_guard; ExternalLoaderXMLConfigRepository * external_dictionaries_config_repository = nullptr; @@ -359,8 +355,6 @@ struct ContextSharedPart : boost::noncopyable external_dictionaries_loader->enablePeriodicUpdates(false); if (external_user_defined_executable_functions_loader) external_user_defined_executable_functions_loader->enablePeriodicUpdates(false); - if (external_models_loader) - external_models_loader->enablePeriodicUpdates(false); Session::shutdownNamedSessions(); @@ -391,7 +385,6 @@ struct ContextSharedPart : boost::noncopyable std::unique_ptr delete_embedded_dictionaries; std::unique_ptr delete_external_dictionaries_loader; std::unique_ptr delete_external_user_defined_executable_functions_loader; - std::unique_ptr delete_external_models_loader; std::unique_ptr delete_buffer_flush_schedule_pool; std::unique_ptr delete_schedule_pool; std::unique_ptr delete_distributed_schedule_pool; @@ -430,7 +423,6 @@ struct ContextSharedPart : boost::noncopyable delete_embedded_dictionaries = std::move(embedded_dictionaries); delete_external_dictionaries_loader = std::move(external_dictionaries_loader); delete_external_user_defined_executable_functions_loader = std::move(external_user_defined_executable_functions_loader); - delete_external_models_loader = std::move(external_models_loader); delete_buffer_flush_schedule_pool = std::move(buffer_flush_schedule_pool); delete_schedule_pool = std::move(schedule_pool); delete_distributed_schedule_pool = std::move(distributed_schedule_pool); @@ -458,7 +450,6 @@ struct ContextSharedPart : boost::noncopyable delete_embedded_dictionaries.reset(); delete_external_dictionaries_loader.reset(); delete_external_user_defined_executable_functions_loader.reset(); - delete_external_models_loader.reset(); delete_ddl_worker.reset(); delete_buffer_flush_schedule_pool.reset(); delete_schedule_pool.reset(); @@ -1476,48 +1467,6 @@ ExternalUserDefinedExecutableFunctionsLoader & Context::getExternalUserDefinedEx return *shared->external_user_defined_executable_functions_loader; } -const ExternalModelsLoader & Context::getExternalModelsLoader() const -{ - return const_cast(this)->getExternalModelsLoader(); -} - -ExternalModelsLoader & Context::getExternalModelsLoader() -{ - std::lock_guard lock(shared->external_models_mutex); - return getExternalModelsLoaderUnlocked(); -} - -ExternalModelsLoader & Context::getExternalModelsLoaderUnlocked() -{ - if (!shared->external_models_loader) - shared->external_models_loader = - std::make_unique(getGlobalContext()); - return *shared->external_models_loader; -} - -void Context::loadOrReloadModels(const Poco::Util::AbstractConfiguration & config) -{ - auto patterns_values = getMultipleValuesFromConfig(config, "", "models_config"); - std::unordered_set patterns(patterns_values.begin(), patterns_values.end()); - - std::lock_guard lock(shared->external_models_mutex); - - auto & external_models_loader = getExternalModelsLoaderUnlocked(); - - if (shared->external_models_config_repository) - { - shared->external_models_config_repository->updatePatterns(patterns); - external_models_loader.reloadConfig(shared->external_models_config_repository->getName()); - return; - } - - auto app_path = getPath(); - auto config_path = getConfigRef().getString("config-file", 
"config.xml"); - auto repository = std::make_unique(app_path, config_path, patterns); - shared->external_models_config_repository = repository.get(); - shared->models_repository_guard = external_models_loader.addConfigRepository(std::move(repository)); -} - EmbeddedDictionaries & Context::getEmbeddedDictionariesImpl(const bool throw_on_error) const { std::lock_guard lock(shared->embedded_dictionaries_mutex); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index db3f701bf74..67cf584d5a7 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -53,7 +53,6 @@ class AccessRightsElements; enum class RowPolicyFilterType; class EmbeddedDictionaries; class ExternalDictionariesLoader; -class ExternalModelsLoader; class ExternalUserDefinedExecutableFunctionsLoader; class InterserverCredentials; using InterserverCredentialsPtr = std::shared_ptr; @@ -612,6 +611,7 @@ public: void killCurrentQuery(); + bool hasInsertionTable() const { return !insertion_table.empty(); } void setInsertionTable(StorageID db_and_table) { insertion_table = std::move(db_and_table); } const StorageID & getInsertionTable() const { return insertion_table; } @@ -644,19 +644,15 @@ public: const EmbeddedDictionaries & getEmbeddedDictionaries() const; const ExternalDictionariesLoader & getExternalDictionariesLoader() const; - const ExternalModelsLoader & getExternalModelsLoader() const; const ExternalUserDefinedExecutableFunctionsLoader & getExternalUserDefinedExecutableFunctionsLoader() const; EmbeddedDictionaries & getEmbeddedDictionaries(); ExternalDictionariesLoader & getExternalDictionariesLoader(); ExternalDictionariesLoader & getExternalDictionariesLoaderUnlocked(); ExternalUserDefinedExecutableFunctionsLoader & getExternalUserDefinedExecutableFunctionsLoader(); ExternalUserDefinedExecutableFunctionsLoader & getExternalUserDefinedExecutableFunctionsLoaderUnlocked(); - ExternalModelsLoader & getExternalModelsLoader(); - ExternalModelsLoader & getExternalModelsLoaderUnlocked(); void tryCreateEmbeddedDictionaries(const Poco::Util::AbstractConfiguration & config) const; void loadOrReloadDictionaries(const Poco::Util::AbstractConfiguration & config); void loadOrReloadUserDefinedExecutableFunctions(const Poco::Util::AbstractConfiguration & config); - void loadOrReloadModels(const Poco::Util::AbstractConfiguration & config); #if USE_NLP SynonymsExtensions & getSynonymsExtensions() const; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 7ddcc80c02a..e3c1fa4c271 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -61,18 +61,23 @@ public: return host_fqdn_id; } + std::string getQueueDir() const + { + return queue_dir; + } + void startup(); virtual void shutdown(); bool isCurrentlyActive() const { return initialized && !stop_flag; } -protected: /// Returns cached ZooKeeper session (possibly expired). ZooKeeperPtr tryGetZooKeeper() const; /// If necessary, creates a new session and caches it. 
ZooKeeperPtr getAndSetZooKeeper(); +protected: /// Iterates through queue tasks in ZooKeeper, runs execution of new tasks void scheduleTasks(bool reinitialized); diff --git a/src/Interpreters/ExternalModelsLoader.cpp b/src/Interpreters/ExternalModelsLoader.cpp deleted file mode 100644 index 317cf0bf1c9..00000000000 --- a/src/Interpreters/ExternalModelsLoader.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int INVALID_CONFIG_PARAMETER; -} - - -ExternalModelsLoader::ExternalModelsLoader(ContextPtr context_) - : ExternalLoader("external model", &Poco::Logger::get("ExternalModelsLoader")), WithContext(context_) -{ - setConfigSettings({"model", "name", {}, {}}); - enablePeriodicUpdates(true); -} - -std::shared_ptr ExternalModelsLoader::create( - const std::string & name, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, const std::string & /* repository_name */) const -{ - String type = config.getString(config_prefix + ".type"); - ExternalLoadableLifetime lifetime(config, config_prefix + ".lifetime"); - - /// TODO: add models factory. - if (type == "catboost") - { - return std::make_unique( - name, config.getString(config_prefix + ".path"), - getContext()->getConfigRef().getString("catboost_dynamic_library_path"), - lifetime - ); - } - else - { - throw Exception("Unknown model type: " + type, ErrorCodes::INVALID_CONFIG_PARAMETER); - } -} -} diff --git a/src/Interpreters/ExternalModelsLoader.h b/src/Interpreters/ExternalModelsLoader.h deleted file mode 100644 index 0eeb60008c3..00000000000 --- a/src/Interpreters/ExternalModelsLoader.h +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include - - -namespace DB -{ - -/// Manages user-defined models. -class ExternalModelsLoader : public ExternalLoader, WithContext -{ -public: - using ModelPtr = std::shared_ptr; - - /// Models will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. 
- explicit ExternalModelsLoader(ContextPtr context_); - - ModelPtr getModel(const std::string & model_name) const - { - return std::static_pointer_cast(load(model_name)); - } - - void reloadModel(const std::string & model_name) const - { - loadOrReload(model_name); - } - -protected: - LoadablePtr create(const std::string & name, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, const std::string & repository_name) const override; - - friend class StorageSystemModels; -}; - -} diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 18cd159ae0e..34504933436 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -36,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -387,17 +387,15 @@ BlockIO InterpreterSystemQuery::execute() case Type::RELOAD_MODEL: { getContext()->checkAccess(AccessType::SYSTEM_RELOAD_MODEL); - - auto & external_models_loader = system_context->getExternalModelsLoader(); - external_models_loader.reloadModel(query.target_model); + auto bridge_helper = std::make_unique(getContext(), query.target_model); + bridge_helper->removeModel(); break; } case Type::RELOAD_MODELS: { getContext()->checkAccess(AccessType::SYSTEM_RELOAD_MODEL); - - auto & external_models_loader = system_context->getExternalModelsLoader(); - external_models_loader.reloadAllTriedToLoad(); + auto bridge_helper = std::make_unique(getContext()); + bridge_helper->removeAllModels(); break; } case Type::RELOAD_FUNCTION: diff --git a/src/Interpreters/JIT/compileFunction.cpp b/src/Interpreters/JIT/compileFunction.cpp index 353ab84674c..99646084e5a 100644 --- a/src/Interpreters/JIT/compileFunction.cpp +++ b/src/Interpreters/JIT/compileFunction.cpp @@ -739,7 +739,10 @@ CompiledAggregateFunctions compileAggregateFunctions(CHJIT & jit, const std::vec { compileCreateAggregateStatesFunctions(module, functions, create_aggregate_states_functions_name); compileAddIntoAggregateStatesFunctions(module, functions, add_aggregate_states_functions_name); - compileAddIntoAggregateStatesFunctionsSinglePlace(module, functions, add_aggregate_states_functions_name_single_place); + /// FIXME: this leads to use-of-uninitialized-value in llvm + /// But for now, it is safe, since it is not used by Aggregator anyway + (void)compileAddIntoAggregateStatesFunctionsSinglePlace; + /// compileAddIntoAggregateStatesFunctionsSinglePlace(module, functions, add_aggregate_states_functions_name_single_place); compileMergeAggregatesStates(module, functions, merge_aggregate_states_functions_name); compileInsertAggregatesIntoResultColumns(module, functions, insert_aggregate_states_functions_name); }); @@ -752,7 +755,7 @@ CompiledAggregateFunctions compileAggregateFunctions(CHJIT & jit, const std::vec assert(create_aggregate_states_function); assert(add_into_aggregate_states_function); - assert(add_into_aggregate_states_function_single_place); + /// assert(add_into_aggregate_states_function_single_place); /// FIXME assert(merge_aggregate_states_function); assert(insert_aggregate_states_function); diff --git a/src/Interpreters/SortedBlocksWriter.cpp b/src/Interpreters/SortedBlocksWriter.cpp index 82d501451a6..8f598f3dd3f 100644 --- a/src/Interpreters/SortedBlocksWriter.cpp +++ b/src/Interpreters/SortedBlocksWriter.cpp @@ -28,6 +28,11 @@ namespace CurrentMetrics namespace DB { +namespace ErrorCodes +{ + extern 
const int LOGICAL_ERROR; +} + namespace { @@ -84,10 +89,13 @@ void SortedBlocksWriter::insert(Block && block) size_t bytes = 0; size_t flush_no = 0; + if (!block.rows()) + return; + { std::lock_guard lock{insert_mutex}; - /// insert bock into BlocksList undef lock + /// insert block into BlocksList under lock inserted_blocks.insert(std::move(block)); size_t total_row_count = inserted_blocks.row_count + row_count_in_flush; @@ -145,7 +153,7 @@ SortedBlocksWriter::TmpFilePtr SortedBlocksWriter::flush(const BlocksList & bloc pipes.emplace_back(std::make_shared(block.cloneEmpty(), Chunk(block.getColumns(), num_rows))); if (pipes.empty()) - return {}; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty block"); QueryPipelineBuilder pipeline; pipeline.init(Pipe::unitePipes(std::move(pipes))); diff --git a/src/Interpreters/StorageID.h b/src/Interpreters/StorageID.h index c60c3aec9c6..43710988243 100644 --- a/src/Interpreters/StorageID.h +++ b/src/Interpreters/StorageID.h @@ -69,6 +69,8 @@ struct StorageID return uuid != UUIDHelpers::Nil; } + bool hasDatabase() const { return !database_name.empty(); } + bool operator<(const StorageID & rhs) const; bool operator==(const StorageID & rhs) const; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index c501c1722ba..b6434955418 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -838,101 +838,117 @@ static std::tuple executeQueryImpl( { QueryStatus * process_list_elem = context->getProcessListElement(); - if (!process_list_elem) - return; - - /// Update performance counters before logging to query_log - CurrentThread::finalizePerformanceCounters(); - - QueryStatusInfo info = process_list_elem->getInfo(true, context->getSettingsRef().log_profile_events); - - double elapsed_seconds = info.elapsed_seconds; - - elem.type = QueryLogElementType::QUERY_FINISH; - - // construct event_time and event_time_microseconds using the same time point - // so that the two times will always be equal up to a precision of a second. 
- const auto finish_time = std::chrono::system_clock::now(); - elem.event_time = time_in_seconds(finish_time); - elem.event_time_microseconds = time_in_microseconds(finish_time); - status_info_to_query_log(elem, info, ast, context); - - if (pulling_pipeline) + if (process_list_elem) { - query_pipeline.tryGetResultRowsAndBytes(elem.result_rows, elem.result_bytes); - } - else /// will be used only for ordinary INSERT queries - { - auto progress_out = process_list_elem->getProgressOut(); - elem.result_rows = progress_out.written_rows; - elem.result_bytes = progress_out.written_bytes; - } + /// Update performance counters before logging to query_log + CurrentThread::finalizePerformanceCounters(); - auto progress_callback = context->getProgressCallback(); - if (progress_callback) - { - Progress p(WriteProgress{info.written_rows, info.written_bytes}); - p.incrementPiecewiseAtomically(Progress{ResultProgress{elem.result_rows, elem.result_bytes}}); - progress_callback(p); - } + QueryStatusInfo info = process_list_elem->getInfo(true, context->getSettingsRef().log_profile_events); - if (elem.read_rows != 0) - { - LOG_INFO(&Poco::Logger::get("executeQuery"), "Read {} rows, {} in {} sec., {} rows/sec., {}/sec.", - elem.read_rows, ReadableSize(elem.read_bytes), elapsed_seconds, - static_cast(elem.read_rows / elapsed_seconds), - ReadableSize(elem.read_bytes / elapsed_seconds)); - } + double elapsed_seconds = info.elapsed_seconds; - if (log_queries && elem.type >= log_queries_min_type && static_cast(elem.query_duration_ms) >= log_queries_min_query_duration_ms) - { - if (auto query_log = context->getQueryLog()) - query_log->add(elem); - } - if (log_processors_profiles) - { - if (auto processors_profile_log = context->getProcessorsProfileLog()) + elem.type = QueryLogElementType::QUERY_FINISH; + + // construct event_time and event_time_microseconds using the same time point + // so that the two times will always be equal up to a precision of a second. 
+ const auto finish_time = std::chrono::system_clock::now(); + elem.event_time = time_in_seconds(finish_time); + elem.event_time_microseconds = time_in_microseconds(finish_time); + status_info_to_query_log(elem, info, ast, context); + + if (pulling_pipeline) { - ProcessorProfileLogElement processor_elem; - processor_elem.event_time = time_in_seconds(finish_time); - processor_elem.event_time_microseconds = time_in_microseconds(finish_time); - processor_elem.query_id = elem.client_info.current_query_id; + query_pipeline.tryGetResultRowsAndBytes(elem.result_rows, elem.result_bytes); + } + else /// will be used only for ordinary INSERT queries + { + auto progress_out = process_list_elem->getProgressOut(); + elem.result_rows = progress_out.written_rows; + elem.result_bytes = progress_out.written_bytes; + } - auto get_proc_id = [](const IProcessor & proc) -> UInt64 - { - return reinterpret_cast(&proc); - }; + auto progress_callback = context->getProgressCallback(); + if (progress_callback) + { + Progress p(WriteProgress{info.written_rows, info.written_bytes}); + p.incrementPiecewiseAtomically(Progress{ResultProgress{elem.result_rows, elem.result_bytes}}); + progress_callback(p); + } - for (const auto & processor : query_pipeline.getProcessors()) + if (elem.read_rows != 0) + { + LOG_INFO(&Poco::Logger::get("executeQuery"), "Read {} rows, {} in {} sec., {} rows/sec., {}/sec.", + elem.read_rows, ReadableSize(elem.read_bytes), elapsed_seconds, + static_cast(elem.read_rows / elapsed_seconds), + ReadableSize(elem.read_bytes / elapsed_seconds)); + } + + if (log_queries && elem.type >= log_queries_min_type && static_cast(elem.query_duration_ms) >= log_queries_min_query_duration_ms) + { + if (auto query_log = context->getQueryLog()) + query_log->add(elem); + } + if (log_processors_profiles) + { + if (auto processors_profile_log = context->getProcessorsProfileLog()) { - std::vector parents; - for (const auto & port : processor->getOutputs()) + ProcessorProfileLogElement processor_elem; + processor_elem.event_time = time_in_seconds(finish_time); + processor_elem.event_time_microseconds = time_in_microseconds(finish_time); + processor_elem.query_id = elem.client_info.current_query_id; + + auto get_proc_id = [](const IProcessor & proc) -> UInt64 { - if (!port.isConnected()) - continue; - const IProcessor & next = port.getInputPort().getProcessor(); - parents.push_back(get_proc_id(next)); + return reinterpret_cast(&proc); + }; + + for (const auto & processor : query_pipeline.getProcessors()) + { + std::vector parents; + for (const auto & port : processor->getOutputs()) + { + if (!port.isConnected()) + continue; + const IProcessor & next = port.getInputPort().getProcessor(); + parents.push_back(get_proc_id(next)); + } + + processor_elem.id = get_proc_id(*processor); + processor_elem.parent_ids = std::move(parents); + + processor_elem.plan_step = reinterpret_cast(processor->getQueryPlanStep()); + processor_elem.plan_group = processor->getQueryPlanStepGroup(); + + processor_elem.processor_name = processor->getName(); + + processor_elem.elapsed_us = processor->getElapsedUs(); + processor_elem.input_wait_elapsed_us = processor->getInputWaitElapsedUs(); + processor_elem.output_wait_elapsed_us = processor->getOutputWaitElapsedUs(); + + auto stats = processor->getProcessorDataStats(); + processor_elem.input_rows = stats.input_rows; + processor_elem.input_bytes = stats.input_bytes; + processor_elem.output_rows = stats.output_rows; + processor_elem.output_bytes = stats.output_bytes; + + 
processors_profile_log->add(processor_elem); } + } + } - processor_elem.id = get_proc_id(*processor); - processor_elem.parent_ids = std::move(parents); - - processor_elem.plan_step = reinterpret_cast(processor->getQueryPlanStep()); - processor_elem.plan_group = processor->getQueryPlanStepGroup(); - - processor_elem.processor_name = processor->getName(); - - processor_elem.elapsed_us = processor->getElapsedUs(); - processor_elem.input_wait_elapsed_us = processor->getInputWaitElapsedUs(); - processor_elem.output_wait_elapsed_us = processor->getOutputWaitElapsedUs(); - - auto stats = processor->getProcessorDataStats(); - processor_elem.input_rows = stats.input_rows; - processor_elem.input_bytes = stats.input_bytes; - processor_elem.output_rows = stats.output_rows; - processor_elem.output_bytes = stats.output_bytes; - - processors_profile_log->add(processor_elem); + if (implicit_txn_control) + { + try + { + implicit_txn_control->executeCommit(context->getSessionContext()); + implicit_txn_control.reset(); + } + catch (const Exception &) + { + /// An exception might happen when trying to commit the transaction. For example we might get an immediate exception + /// because ZK is down and wait_changes_become_visible_after_commit_mode == WAIT_UNKNOWN + implicit_txn_control.reset(); + throw; } } } @@ -945,27 +961,11 @@ static std::tuple executeQueryImpl( query_span->addAttributeIfNotEmpty("clickhouse.tracestate", OpenTelemetry::CurrentContext().tracestate); query_span->addAttributeIfNotZero("clickhouse.read_rows", elem.read_rows); query_span->addAttributeIfNotZero("clickhouse.read_bytes", elem.read_bytes); - query_span->addAttributeIfNotZero("clickhouse.written_rows", info.written_rows); + query_span->addAttributeIfNotZero("clickhouse.written_rows", elem.written_rows); query_span->addAttributeIfNotZero("clickhouse.written_bytes", elem.written_bytes); query_span->addAttributeIfNotZero("clickhouse.memory_usage", elem.memory_usage); query_span->finish(); } - - if (implicit_txn_control) - { - try - { - implicit_txn_control->executeCommit(context->getSessionContext()); - implicit_txn_control.reset(); - } - catch (const Exception &) - { - /// An exception might happen when trying to commit the transaction. 
For example we might get an immediate exception - /// because ZK is down and wait_changes_become_visible_after_commit_mode == WAIT_UNKNOWN - implicit_txn_control.reset(); - throw; - } - } }; auto exception_callback = [elem, diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index e2bd208764e..091447e96ee 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -55,9 +56,13 @@ public: void addBuffer(std::unique_ptr buffer) { owned_buffers.emplace_back(std::move(buffer)); } + void setErrorsLogger(const InputFormatErrorsLoggerPtr & errors_logger_) { errors_logger = errors_logger_; } + protected: ColumnMappingPtr column_mapping{}; + InputFormatErrorsLoggerPtr errors_logger; + private: /// Number of currently parsed chunk (if parallel parsing is enabled) size_t current_unit_number = 0; diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 3df22002b82..52395338279 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -52,6 +52,31 @@ IRowInputFormat::IRowInputFormat(Block header, ReadBuffer & in_, Params params_) { } +void IRowInputFormat::logError() +{ + String diagnostic; + String raw_data; + try + { + std::tie(diagnostic, raw_data) = getDiagnosticAndRawData(); + } + catch (const Exception & exception) + { + diagnostic = "Cannot get diagnostic: " + exception.message(); + raw_data = "Cannot get raw data: " + exception.message(); + } + catch (...) + { + /// Error while trying to obtain verbose diagnostic. Ok to ignore. + } + trimLeft(diagnostic, '\n'); + trimRight(diagnostic, '\n'); + + auto now_time = time(nullptr); + + errors_logger->logError(InputFormatErrorsLogger::ErrorEntry{now_time, total_rows, diagnostic, raw_data}); +} + Chunk IRowInputFormat::generate() { if (total_rows == 0) @@ -112,6 +137,9 @@ Chunk IRowInputFormat::generate() if (params.allow_errors_num == 0 && params.allow_errors_ratio == 0) throw; + if (errors_logger) + logError(); + ++num_errors; Float64 current_error_ratio = static_cast(num_errors) / total_rows; diff --git a/src/Processors/Formats/IRowInputFormat.h b/src/Processors/Formats/IRowInputFormat.h index 87caadd93da..a11462549ff 100644 --- a/src/Processors/Formats/IRowInputFormat.h +++ b/src/Processors/Formats/IRowInputFormat.h @@ -65,6 +65,10 @@ protected: /// and collect as much as possible diagnostic information about error. /// If not implemented, returns empty string. virtual std::string getDiagnosticInfo() { return {}; } + /// Get diagnostic info and raw data for a row + virtual std::pair getDiagnosticAndRawData() { return std::make_pair("", ""); } + + void logError(); const BlockMissingValues & getMissingValues() const override { return block_missing_values; } diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index c7d8b87ab77..9365384f4b7 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -113,6 +113,11 @@ NamesAndTypesList IRowSchemaReader::readSchema() "Most likely setting input_format_max_rows_to_read_for_schema_inference is set to 0"); DataTypes data_types = readRowAndGetDataTypes(); + + /// Check that we read at list one column. + if (data_types.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data"); + /// If column names weren't set, use default names 'c1', 'c2', ... 
if (column_names.empty()) { @@ -122,9 +127,11 @@ NamesAndTypesList IRowSchemaReader::readSchema() } /// If column names were set, check that the number of names match the number of types. else if (column_names.size() != data_types.size()) + { throw Exception( ErrorCodes::INCORRECT_DATA, "The number of column names {} differs with the number of types {}", column_names.size(), data_types.size()); + } for (size_t i = 0; i != column_names.size(); ++i) { @@ -155,10 +162,6 @@ NamesAndTypesList IRowSchemaReader::readSchema() } } - /// Check that we read at list one column. - if (data_types.empty()) - throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data"); - NamesAndTypesList result; for (size_t i = 0; i != data_types.size(); ++i) { diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index b3d237fecfd..bfc4f726edb 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -2,6 +2,15 @@ #if USE_MSGPACK +/// FIXME: there is some issue with clang-15, that incorrectly detect a +/// "Attempt to free released memory" in msgpack::unpack(), because of delete +/// operator for zone (from msgpack/v1/detail/cpp11_zone.hpp), hence NOLINT +/// +/// NOTE: that I was not able to suppress it locally, only with +/// NOLINTBEGIN/NOLINTEND +// +// NOLINTBEGIN(clang-analyzer-cplusplus.NewDelete) + #include #include #include @@ -235,8 +244,10 @@ static void insertNull(IColumn & column, DataTypePtr type) assert_cast(column).insertDefault(); } -static void insertUUID(IColumn & column, DataTypePtr /*type*/, const char * value, size_t size) +static void insertUUID(IColumn & column, DataTypePtr type, const char * value, size_t size) { + if (!isUUID(type)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack UUID into column with type {}.", type->getName()); ReadBufferFromMemory buf(value, size); UUID uuid; readBinaryBigEndian(uuid.toUnderType().items[0], buf); @@ -551,6 +562,8 @@ void registerMsgPackSchemaReader(FormatFactory & factory) } +// NOLINTEND(clang-analyzer-cplusplus.NewDelete) + #else namespace DB diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp index 318bcaed466..e0693b489bd 100644 --- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp @@ -75,6 +75,7 @@ void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr threa InputFormatPtr input_format = internal_parser_creator(read_buffer); input_format->setCurrentUnitNumber(current_ticket_number); + input_format->setErrorsLogger(errors_logger); InternalParser parser(input_format); unit.chunk_ext.chunk.clear(); diff --git a/src/Processors/Formats/InputFormatErrorsLogger.cpp b/src/Processors/Formats/InputFormatErrorsLogger.cpp new file mode 100644 index 00000000000..e6f8cdd43ee --- /dev/null +++ b/src/Processors/Formats/InputFormatErrorsLogger.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + const String DEFAULT_OUTPUT_FORMAT = "CSV"; +} + +InputFormatErrorsLogger::InputFormatErrorsLogger(const ContextPtr & context) +{ + String output_format = context->getSettingsRef().errors_output_format; + if (!FormatFactory::instance().isOutputFormat(output_format)) + output_format = DEFAULT_OUTPUT_FORMAT; + if (context->hasInsertionTable()) + 
table = context->getInsertionTable().getTableName(); + if (context->getInsertionTable().hasDatabase()) + database = context->getInsertionTable().getDatabaseName(); + + String path_in_setting = context->getSettingsRef().input_format_record_errors_file_path; + errors_file_path = context->getApplicationType() == Context::ApplicationType::SERVER ? context->getUserFilesPath() + path_in_setting + : path_in_setting; + while (fs::exists(errors_file_path)) + { + errors_file_path += "_new"; + } + write_buf = std::make_shared(errors_file_path); + + header = Block{ + {std::make_shared(), "time"}, + {std::make_shared(std::make_shared()), "database"}, + {std::make_shared(std::make_shared()), "table"}, + {std::make_shared(), "offset"}, + {std::make_shared(), "reason"}, + {std::make_shared(), "raw_data"}}; + + writer = context->getOutputFormat(output_format, *write_buf, header); +} + +InputFormatErrorsLogger::~InputFormatErrorsLogger() +{ + writer->finalize(); + writer->flush(); + write_buf->finalize(); +} + +void InputFormatErrorsLogger::logErrorImpl(ErrorEntry entry) +{ + auto error = header.cloneEmpty(); + auto columns = error.mutateColumns(); + columns[0]->insert(entry.time); + database.empty() ? columns[1]->insertDefault() : columns[1]->insert(database); + table.empty() ? columns[2]->insertDefault() : columns[2]->insert(table); + columns[3]->insert(entry.offset); + columns[4]->insert(entry.reason); + columns[5]->insert(entry.raw_data); + error.setColumns(std::move(columns)); + + writer->write(error); +} + +void InputFormatErrorsLogger::logError(ErrorEntry entry) +{ + logErrorImpl(entry); +} + +ParallelInputFormatErrorsLogger::~ParallelInputFormatErrorsLogger() = default; + +void ParallelInputFormatErrorsLogger::logError(ErrorEntry entry) +{ + std::lock_guard lock(write_mutex); + logErrorImpl(entry); +} + +} diff --git a/src/Processors/Formats/InputFormatErrorsLogger.h b/src/Processors/Formats/InputFormatErrorsLogger.h new file mode 100644 index 00000000000..4b3766f4d37 --- /dev/null +++ b/src/Processors/Formats/InputFormatErrorsLogger.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +class InputFormatErrorsLogger +{ +public: + struct ErrorEntry + { + time_t time; + size_t offset; + String reason; + String raw_data; + }; + + InputFormatErrorsLogger(const ContextPtr & context); + + virtual ~InputFormatErrorsLogger(); + + virtual void logError(ErrorEntry entry); + void logErrorImpl(ErrorEntry entry); + +private: + Block header; + + String errors_file_path; + std::shared_ptr write_buf; + OutputFormatPtr writer; + + String database; + String table; +}; + +using InputFormatErrorsLoggerPtr = std::shared_ptr; + +class ParallelInputFormatErrorsLogger : public InputFormatErrorsLogger +{ +public: + ParallelInputFormatErrorsLogger(const ContextPtr & context) : InputFormatErrorsLogger(context) { } + + ~ParallelInputFormatErrorsLogger() override; + + void logError(ErrorEntry entry) override; + +private: + std::mutex write_mutex; +}; + +} diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index f4568830720..35a86bc476d 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -35,12 +35,15 @@ void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() offset_of_current_row = in->offset(); } -String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() +std::pair 
RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawDataImpl(bool is_errors_record) { - if (in->eof()) - return "Buffer has gone, cannot extract information about what has been parsed."; + WriteBufferFromOwnString out_diag; + WriteBufferFromOwnString out_data; - WriteBufferFromOwnString out; + if (in->eof()) + return std::make_pair( + "Buffer has gone, cannot extract information about what has been parsed.", + "Buffer has gone, cannot extract information about what has been parsed."); const auto & header = getPort().getHeader(); MutableColumns columns = header.cloneEmptyColumns(); @@ -49,8 +52,9 @@ String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() size_t bytes_read_at_start_of_buffer = in->count() - in->offset(); if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) { - out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; - return out.str(); + out_diag << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; + out_data << "Could not collect raw data because two last rows aren't in buffer (rare case)\n"; + return std::make_pair(out_diag.str(), out_data.str()); } max_length_of_column_name = 0; @@ -65,30 +69,49 @@ String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information. - if (offset_of_prev_row <= in->buffer().size()) + if (!is_errors_record && offset_of_prev_row <= in->buffer().size()) { in->position() = in->buffer().begin() + offset_of_prev_row; - out << "\nRow " << (row_num - 1) << ":\n"; - if (!parseRowAndPrintDiagnosticInfo(columns, out)) - return out.str(); + out_diag << "\nRow " << (row_num - 1) << ":\n"; + if (!parseRowAndPrintDiagnosticInfo(columns, out_diag)) + return std::make_pair(out_diag.str(), out_data.str()); } else { if (in->buffer().size() < offset_of_current_row) { - out << "Could not print diagnostic info because parsing of data hasn't started.\n"; - return out.str(); + out_diag << "Could not print diagnostic info because parsing of data hasn't started.\n"; + out_data << "Could not collect raw data because parsing of data hasn't started.\n"; + return std::make_pair(out_diag.str(), out_data.str()); } in->position() = in->buffer().begin() + offset_of_current_row; } - out << "\nRow " << row_num << ":\n"; - parseRowAndPrintDiagnosticInfo(columns, out); - out << "\n"; + char * data = in->position(); + while (data < in->buffer().end() && *data != '\n' && *data != '\r' && *data != '\0') + { + out_data << *data; + ++data; + } - return out.str(); + out_diag << "\nRow " << row_num << ":\n"; + parseRowAndPrintDiagnosticInfo(columns, out_diag); + out_diag << "\n"; + + return std::make_pair(out_diag.str(), out_data.str()); +} + +String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() +{ + auto diagnostic_and_raw_data = getDiagnosticAndRawDataImpl(false); + return std::get<0>(diagnostic_and_raw_data); +} + +std::pair RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawData() +{ + return getDiagnosticAndRawDataImpl(true); } bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index 5bad24cd482..49793fcd208 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -14,7 
+14,9 @@ class RowInputFormatWithDiagnosticInfo : public IRowInputFormat public: RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_); + std::pair getDiagnosticAndRawDataImpl(bool is_errors_record); String getDiagnosticInfo() override; + std::pair getDiagnosticAndRawData() override; void resetParser() override; diff --git a/src/Storages/Distributed/DirectoryMonitor.cpp b/src/Storages/Distributed/DirectoryMonitor.cpp index 16981d26146..e8d48431a9e 100644 --- a/src/Storages/Distributed/DirectoryMonitor.cpp +++ b/src/Storages/Distributed/DirectoryMonitor.cpp @@ -140,6 +140,11 @@ namespace size_t rows = 0; size_t bytes = 0; + UInt32 shard_num = 0; + std::string cluster; + std::string distributed_table; + std::string remote_table; + /// dumpStructure() of the header -- obsolete std::string block_header_string; Block block_header; @@ -195,6 +200,14 @@ namespace in.getFileName(), distributed_header.revision, DBMS_TCP_PROTOCOL_VERSION); } + if (header_buf.hasPendingData()) + { + readVarUInt(distributed_header.shard_num, header_buf); + readStringBinary(distributed_header.cluster, header_buf); + readStringBinary(distributed_header.distributed_table, header_buf); + readStringBinary(distributed_header.remote_table, header_buf); + } + /// Add handling new data here, for example: /// /// if (header_buf.hasPendingData()) @@ -621,18 +634,23 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa ReadBufferFromFile in(file_path); const auto & distributed_header = readDistributedHeader(in, log); - auto connection = pool->get(timeouts, &distributed_header.insert_settings); + thread_trace_context = std::make_unique(__PRETTY_FUNCTION__, + distributed_header.client_info.client_trace_context, + this->storage.getContext()->getOpenTelemetrySpanLog()); + thread_trace_context->root_span.addAttribute("clickhouse.shard_num", distributed_header.shard_num); + thread_trace_context->root_span.addAttribute("clickhouse.cluster", distributed_header.cluster); + thread_trace_context->root_span.addAttribute("clickhouse.distributed", distributed_header.distributed_table); + thread_trace_context->root_span.addAttribute("clickhouse.remote", distributed_header.remote_table); + thread_trace_context->root_span.addAttribute("clickhouse.rows", distributed_header.rows); + thread_trace_context->root_span.addAttribute("clickhouse.bytes", distributed_header.bytes); + auto connection = pool->get(timeouts, &distributed_header.insert_settings); LOG_DEBUG(log, "Sending `{}` to {} ({} rows, {} bytes)", file_path, connection->getDescription(), formatReadableQuantity(distributed_header.rows), formatReadableSizeWithBinarySuffix(distributed_header.bytes)); - thread_trace_context = std::make_unique(__PRETTY_FUNCTION__, - distributed_header.client_info.client_trace_context, - this->storage.getContext()->getOpenTelemetrySpanLog()); - RemoteInserter remote{*connection, timeouts, distributed_header.insert_query, distributed_header.insert_settings, diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index ae72fdd84e2..8099a7f2002 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -171,7 +171,6 @@ void DistributedSink::writeAsync(const Block & block) } else { - if (storage.getShardingKeyExpr() && (cluster->getShardsInfo().size() > 1)) return writeSplitAsync(block); @@ -291,6 +290,8 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si auto 
thread_group = CurrentThread::getGroup(); return [this, thread_group, &job, ¤t_block, num_shards]() { + OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); + if (thread_group) CurrentThread::attachToIfDetached(thread_group); setThreadName("DistrOutStrProc"); @@ -331,15 +332,19 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si const Block & shard_block = (num_shards > 1) ? job.current_shard_block : current_block; const Settings & settings = context->getSettingsRef(); - /// Do not initiate INSERT for empty block. size_t rows = shard_block.rows(); + + span.addAttribute("clickhouse.shard_num", shard_info.shard_num); + span.addAttribute("clickhouse.cluster", this->storage.cluster_name); + span.addAttribute("clickhouse.distributed", this->storage.getStorageID().getFullNameNotQuoted()); + span.addAttribute("clickhouse.remote", [this]() { return storage.remote_database + "." + storage.remote_table; }); + span.addAttribute("clickhouse.rows", rows); + span.addAttribute("clickhouse.bytes", [&shard_block]() { return toString(shard_block.bytes()); }); + + /// Do not initiate INSERT for empty block. if (rows == 0) return; - OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); - span.addAttribute("clickhouse.shard_num", shard_info.shard_num); - span.addAttribute("clickhouse.written_rows", rows); - if (!job.is_local_job || !settings.prefer_localhost_replica) { if (!job.executor) @@ -610,20 +615,15 @@ void DistributedSink::writeSplitAsync(const Block & block) void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) { - OpenTelemetry::SpanHolder span("DistributedSink::writeAsyncImpl()"); - const auto & shard_info = cluster->getShardsInfo()[shard_id]; const auto & settings = context->getSettingsRef(); Block block_to_send = removeSuperfluousColumns(block); - span.addAttribute("clickhouse.shard_num", shard_info.shard_num); - span.addAttribute("clickhouse.written_rows", block.rows()); - if (shard_info.hasInternalReplication()) { if (shard_info.isLocal() && settings.prefer_localhost_replica) /// Prefer insert into current instance directly - writeToLocal(block_to_send, shard_info.getLocalNodeCount()); + writeToLocal(shard_info, block_to_send, shard_info.getLocalNodeCount()); else { const auto & path = shard_info.insertPathForInternalReplication( @@ -631,13 +631,13 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) settings.use_compact_format_in_distributed_parts_names); if (path.empty()) throw Exception("Directory name for async inserts is empty", ErrorCodes::LOGICAL_ERROR); - writeToShard(block_to_send, {path}); + writeToShard(shard_info, block_to_send, {path}); } } else { if (shard_info.isLocal() && settings.prefer_localhost_replica) - writeToLocal(block_to_send, shard_info.getLocalNodeCount()); + writeToLocal(shard_info, block_to_send, shard_info.getLocalNodeCount()); std::vector dir_names; for (const auto & address : cluster->getShardsAddresses()[shard_id]) @@ -645,30 +645,44 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) dir_names.push_back(address.toFullString(settings.use_compact_format_in_distributed_parts_names)); if (!dir_names.empty()) - writeToShard(block_to_send, dir_names); + writeToShard(shard_info, block_to_send, dir_names); } } -void DistributedSink::writeToLocal(const Block & block, size_t repeats) +void DistributedSink::writeToLocal(const Cluster::ShardInfo & shard_info, const Block & block, size_t repeats) { OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); - 
span.addAttribute("db.statement", this->query_string); + span.addAttribute("clickhouse.shard_num", shard_info.shard_num); + span.addAttribute("clickhouse.cluster", this->storage.cluster_name); + span.addAttribute("clickhouse.distributed", this->storage.getStorageID().getFullNameNotQuoted()); + span.addAttribute("clickhouse.remote", [this]() { return storage.remote_database + "." + storage.remote_table; }); + span.addAttribute("clickhouse.rows", [&block]() { return toString(block.rows()); }); + span.addAttribute("clickhouse.bytes", [&block]() { return toString(block.bytes()); }); - InterpreterInsertQuery interp(query_ast, context, allow_materialized); + try + { + InterpreterInsertQuery interp(query_ast, context, allow_materialized); - auto block_io = interp.execute(); - PushingPipelineExecutor executor(block_io.pipeline); + auto block_io = interp.execute(); + PushingPipelineExecutor executor(block_io.pipeline); - executor.start(); - writeBlockConvert(executor, block, repeats, log); - executor.finish(); + executor.start(); + writeBlockConvert(executor, block, repeats, log); + executor.finish(); + } + catch (...) + { + span.addAttribute(std::current_exception()); + throw; + } } -void DistributedSink::writeToShard(const Block & block, const std::vector & dir_names) +void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const Block & block, const std::vector & dir_names) { OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); + span.addAttribute("clickhouse.shard_num", shard_info.shard_num); const auto & settings = context->getSettingsRef(); const auto & distributed_settings = storage.getDistributedSettingsRef(); @@ -759,6 +773,11 @@ void DistributedSink::writeToShard(const Block & block, const std::vectorstorage.cluster_name, header_buf); + writeStringBinary(this->storage.getStorageID().getFullNameNotQuoted(), header_buf); + writeStringBinary(this->storage.remote_database + "." + this->storage.remote_table, header_buf); + /// Add new fields here, for example: /// writeVarUInt(my_new_data, header_buf); /// And note that it is safe, because we have checksum and size for header. diff --git a/src/Storages/Distributed/DistributedSink.h b/src/Storages/Distributed/DistributedSink.h index 668cec22e8b..af0c64cbd78 100644 --- a/src/Storages/Distributed/DistributedSink.h +++ b/src/Storages/Distributed/DistributedSink.h @@ -69,9 +69,9 @@ private: Block removeSuperfluousColumns(Block block) const; /// Increments finished_writings_count after each repeat. - void writeToLocal(const Block & block, size_t repeats); + void writeToLocal(const Cluster::ShardInfo & shard_info, const Block & block, size_t repeats); - void writeToShard(const Block & block, const std::vector & dir_names); + void writeToShard(const Cluster::ShardInfo & shard_info, const Block & block, const std::vector & dir_names); /// Performs synchronous insertion to remote nodes. If timeout_exceeded flag was set, throws. 
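Editor's note: the DirectoryMonitor/DistributedSink hunks above extend the per-file distributed insert header with shard_num, cluster and table names (also mirrored into OpenTelemetry span attributes), writing the new fields at the very end and reading them only while header bytes remain, so files produced by older servers still parse. Below is a minimal self-contained C++ sketch of that append-only header idea; the struct, field names and stream-based I/O are illustrative only, not the real DistributedHeader/ReadBuffer code.

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

/// Hypothetical header with a mandatory part and an optional tail.
struct DistributedHeaderSketch
{
    uint64_t rows = 0;          /// written by every format revision
    uint32_t shard_num = 0;     /// appended by the newer revision
    std::string cluster;        /// appended by the newer revision
};

std::string writeHeader(const DistributedHeaderSketch & header, bool new_revision)
{
    std::ostringstream out;
    out << header.rows << '\n';
    /// New fields go strictly at the end; older readers simply never reach them.
    if (new_revision)
        out << header.shard_num << '\n' << header.cluster << '\n';
    return out.str();
}

DistributedHeaderSketch readHeader(const std::string & data)
{
    std::istringstream in(data);
    DistributedHeaderSketch header;
    in >> header.rows;
    /// Analogue of `if (header_buf.hasPendingData())`: read the optional tail
    /// only when bytes are left, so old files keep the default values.
    if (in >> header.shard_num)
        in >> header.cluster;
    return header;
}

int main()
{
    DistributedHeaderSketch header{42, 3, "test_cluster"};
    std::cout << readHeader(writeHeader(header, false)).shard_num << '\n';  /// prints 0 (old-format file)
    std::cout << readHeader(writeHeader(header, true)).cluster << '\n';     /// prints test_cluster
}

The checksum-and-size framing mentioned in the in-code comment ("it is safe, because we have checksum and size for header") is what makes appending safe in the real format; this sketch only demonstrates the ordering rule.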
diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index f93bc45d1a3..45caddb21ea 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -255,7 +255,7 @@ private: class HDFSSource::URISIterator::Impl { public: - explicit Impl(const std::vector & uris_, ContextPtr context) + explicit Impl(const std::vector & uris_, ContextPtr context) { auto path_and_uri = getPathFromUriAndUriWithoutPath(uris_[0]); HDFSBuilderWrapper builder = createHDFSBuilder(path_and_uri.second + "/", context->getGlobalContext()->getConfigRef()); @@ -293,7 +293,7 @@ String HDFSSource::DisclosedGlobIterator::next() return pimpl->next(); } -HDFSSource::URISIterator::URISIterator(const std::vector & uris_, ContextPtr context) +HDFSSource::URISIterator::URISIterator(const std::vector & uris_, ContextPtr context) : pimpl(std::make_shared(uris_, context)) { } diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index a0d61f4bd2a..896371f9685 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -86,7 +86,7 @@ private: const String & format_name, const ContextPtr & ctx); - std::vector uris; + std::vector uris; String format_name; String compression_method; const bool distributed_processing; @@ -116,7 +116,7 @@ public: class URISIterator { public: - URISIterator(const std::vector & uris_, ContextPtr context); + URISIterator(const std::vector & uris_, ContextPtr context); String next(); private: class Impl; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index e9d900c6d54..93c8516fa7b 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1291,7 +1291,7 @@ catch (Exception & e) bool IMergeTreeDataPart::wasInvolvedInTransaction() const { - assert(!version.creation_tid.isEmpty() || (state == MergeTreeDataPartState::Temporary /* && std::uncaught_exceptions() */)); + assert(!storage.data_parts_loading_finished || !version.creation_tid.isEmpty() || (state == MergeTreeDataPartState::Temporary /* && std::uncaught_exceptions() */)); bool created_by_transaction = !version.creation_tid.isPrehistoric(); bool removed_by_transaction = version.isRemovalTIDLocked() && version.removal_tid_lock != Tx::PrehistoricTID.getHash(); return created_by_transaction || removed_by_transaction; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 7c508568fe8..20323d87bed 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1560,6 +1560,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) calculateColumnAndSecondaryIndexSizesImpl(); LOG_DEBUG(log, "Loaded data parts ({} items)", data_parts_indexes.size()); + data_parts_loading_finished = true; } /// Is the part directory old. diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 4158517fc23..94bca094a86 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1034,6 +1034,8 @@ protected: /// True if at least one part was created/removed with transaction. mutable std::atomic_bool transactions_enabled = false; + std::atomic_bool data_parts_loading_finished = false; + /// Work with data parts struct TagByInfo{}; @@ -1242,6 +1244,9 @@ protected: /// Attaches restored parts to the storage. 
virtual void attachRestoredParts(MutableDataPartsVector && parts) = 0; + void resetObjectColumnsFromActiveParts(const DataPartsLock & lock); + void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock); + static void incrementInsertedPartsProfileEvent(MergeTreeDataPartType type); static void incrementMergedPartsProfileEvent(MergeTreeDataPartType type); @@ -1329,9 +1334,6 @@ private: DataPartsVector & duplicate_parts_to_remove, MutableDataPartsVector & parts_from_wal); - void resetObjectColumnsFromActiveParts(const DataPartsLock & lock); - void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock); - /// Create zero-copy exclusive lock for part and disk. Useful for coordination of /// distributed operations which can lead to data duplication. Implemented only in ReplicatedMergeTree. virtual std::optional tryCreateZeroCopyExclusiveLock(const String &, const DiskPtr &) { return std::nullopt; } diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 97900eef22b..95faef6aac7 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -483,16 +483,6 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( return temp_part; } -void MergeTreeDataWriter::deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block) -{ - if (!storage_snapshot->object_columns.empty()) - { - auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); - auto storage_columns = storage_snapshot->getColumns(options); - convertObjectsToTuples(block, storage_columns); - } -} - MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( const String & part_name, MergeTreeDataPartType part_type, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index 2f9ab1aae8b..00438a29fa1 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -45,8 +45,6 @@ public: */ static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); - static void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); - /// This structure contains not completely written temporary part. /// Some writes may happen asynchronously, e.g. for blob storages. /// You should call finalize() to wait until all data is written. 
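Editor's note: the IMergeTreeDataPart/MergeTreeData hunks above relax the creation-TID assertion until initial part loading has finished, by gating it on the new data_parts_loading_finished atomic that loadDataParts() sets last. A minimal self-contained sketch of that gating pattern follows; apart from data_parts_loading_finished, all names and types are illustrative, not the real classes.

#include <atomic>
#include <cassert>
#include <string>
#include <vector>

/// Toy model: an invariant (every part carries a creation TID) only holds
/// once initial loading has completed, so the assert is gated on an atomic
/// flag that the loader flips at the very end.
struct Part { std::string creation_tid; };

struct Storage
{
    std::vector<Part> parts;
    std::atomic_bool data_parts_loading_finished{false};

    void loadParts()
    {
        parts.push_back(Part{});                 /// TID not assigned yet
        parts.back().creation_tid = "tid-1";     /// assigned before loading ends
        data_parts_loading_finished = true;      /// flip the flag last
    }

    bool wasInvolvedInTransaction(const Part & part) const
    {
        /// Do not enforce the invariant while parts are still being loaded.
        assert(!data_parts_loading_finished || !part.creation_tid.empty());
        return !part.creation_tid.empty();
    }
};

int main()
{
    Storage storage;
    storage.loadParts();
    return storage.wasInvolvedInTransaction(storage.parts.front()) ? 0 : 1;
}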
diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 5eaa8ec8004..5d00db861a8 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include namespace ProfileEvents @@ -56,7 +57,7 @@ void MergeTreeSink::consume(Chunk chunk) { auto block = getHeader().cloneWithColumns(chunk.detachColumns()); - storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); + deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 6c7fbcb52d8..b9bd027cde2 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -161,7 +162,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) */ size_t replicas_num = checkQuorumPrecondition(zookeeper); - storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); + deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; @@ -203,11 +204,11 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) } block_id = temp_part.part->getZeroLevelPartBlockID(block_dedup_token); - LOG_DEBUG(log, "Wrote block with ID '{}', {} rows on {} replicas", block_id, current_block.block.rows(), replicas_num); + LOG_DEBUG(log, "Wrote block with ID '{}', {} rows{}", block_id, current_block.block.rows(), quorumLogMessage(replicas_num)); } else { - LOG_DEBUG(log, "Wrote block with {} rows on {} replicas", current_block.block.rows(), replicas_num); + LOG_DEBUG(log, "Wrote block with {} rows{}", current_block.block.rows(), quorumLogMessage(replicas_num)); } UInt64 elapsed_ns = watch.elapsed(); @@ -639,7 +640,7 @@ void ReplicatedMergeTreeSink::waitForQuorum( size_t replicas_num) const { /// We are waiting for quorum to be satisfied. 
- LOG_TRACE(log, "Waiting for quorum '{}' for part {} on {} replicas", quorum_path, part_name, replicas_num); + LOG_TRACE(log, "Waiting for quorum '{}' for part {}{}", quorum_path, part_name, quorumLogMessage(replicas_num)); try { @@ -684,6 +685,13 @@ void ReplicatedMergeTreeSink::waitForQuorum( LOG_TRACE(log, "Quorum '{}' for part {} satisfied", quorum_path, part_name); } +String ReplicatedMergeTreeSink::quorumLogMessage(size_t replicas_num) const +{ + if (!isQuorumEnabled()) + return ""; + return fmt::format(" (quorum {} of {} replicas)", getQuorumSize(replicas_num), replicas_num); +} + size_t ReplicatedMergeTreeSink::getQuorumSize(size_t replicas_num) const { if (!isQuorumEnabled()) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index 48e94ef5659..ab729e6edec 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -96,6 +96,7 @@ private: size_t getQuorumSize(size_t replicas_num) const; bool isQuorumEnabled() const; + String quorumLogMessage(size_t replicas_num) const; /// Used in logs for debug purposes size_t quorum_timeout_ms; size_t max_parts_per_block; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 5adc1974257..e4062734352 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -335,6 +335,13 @@ void StorageMergeTree::alter( mutation_version = startMutation(maybe_mutation_commands, local_context); } + { + /// Reset Object columns, because column of type + /// Object may be added or dropped by alter. + auto parts_lock = lockParts(); + resetObjectColumnsFromActiveParts(parts_lock); + } + /// Always execute required mutations synchronously, because alters /// should be executed in sequential order. if (!maybe_mutation_commands.empty()) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a2d10e57f8f..d704721abcc 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3649,7 +3649,7 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_ if (quorum_entry.replicas.size() >= quorum_entry.required_number_of_replicas) { /// The quorum is reached. Delete the node, and update information about the last part that was successfully written with quorum. - LOG_TRACE(log, "Got {} (of {}) replicas confirmed quorum {}, going to remove node", + LOG_TRACE(log, "Got {} (of {} required) replicas confirmed quorum {}, going to remove node", quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas, quorum_status_path); Coordination::Requests ops; @@ -4649,6 +4649,13 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer LOG_INFO(log, "Applied changes to the metadata of the table. Current metadata version: {}", metadata_version); } + { + /// Reset Object columns, because column of type + /// Object may be added or dropped by alter. + auto parts_lock = lockParts(); + resetObjectColumnsFromActiveParts(parts_lock); + } + /// This transaction may not happen, but it's OK, because on the next retry we will eventually create/update this node /// TODO Maybe do in in one transaction for Replicated database? 
zookeeper->createOrUpdate(fs::path(replica_path) / "metadata_version", std::to_string(metadata_version), zkutil::CreateMode::Persistent); diff --git a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp index 111ea343398..67867b6c577 100644 --- a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp +++ b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp @@ -205,9 +205,9 @@ static void fillStatusColumns(MutableColumns & res_columns, size_t & col, void StorageSystemDDLWorkerQueue::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const { - zkutil::ZooKeeperPtr zookeeper = context->getZooKeeper(); - fs::path ddl_zookeeper_path = context->getConfigRef().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/"); - + auto& ddl_worker = context->getDDLWorker(); + fs::path ddl_zookeeper_path = ddl_worker.getQueueDir(); + zkutil::ZooKeeperPtr zookeeper = ddl_worker.getAndSetZooKeeper(); Strings ddl_task_paths = zookeeper->getChildren(ddl_zookeeper_path); GetResponseFutures ddl_task_futures; diff --git a/src/Storages/System/StorageSystemModels.cpp b/src/Storages/System/StorageSystemModels.cpp index 4a4dbbc69df..d06f97a3f54 100644 --- a/src/Storages/System/StorageSystemModels.cpp +++ b/src/Storages/System/StorageSystemModels.cpp @@ -1,11 +1,11 @@ #include +#include #include #include #include #include #include -#include -#include +#include namespace DB @@ -14,45 +14,24 @@ namespace DB NamesAndTypesList StorageSystemModels::getNamesAndTypes() { return { - { "name", std::make_shared() }, - { "status", std::make_shared(getStatusEnumAllPossibleValues()) }, - { "origin", std::make_shared() }, + { "model_path", std::make_shared() }, { "type", std::make_shared() }, { "loading_start_time", std::make_shared() }, { "loading_duration", std::make_shared() }, - //{ "creation_time", std::make_shared() }, - { "last_exception", std::make_shared() }, }; } void StorageSystemModels::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const { - const auto & external_models_loader = context->getExternalModelsLoader(); - auto load_results = external_models_loader.getLoadResults(); + auto bridge_helper = std::make_unique(context); + ExternalModelInfos infos = bridge_helper->listModels(); - for (const auto & load_result : load_results) + for (const auto & info : infos) { - res_columns[0]->insert(load_result.name); - res_columns[1]->insert(static_cast(load_result.status)); - res_columns[2]->insert(load_result.config ? 
load_result.config->path : ""); - - if (load_result.object) - { - const auto model_ptr = std::static_pointer_cast(load_result.object); - res_columns[3]->insert(model_ptr->getTypeName()); - } - else - { - res_columns[3]->insertDefault(); - } - - res_columns[4]->insert(static_cast(std::chrono::system_clock::to_time_t(load_result.loading_start_time))); - res_columns[5]->insert(std::chrono::duration_cast>(load_result.loading_duration).count()); - - if (load_result.exception) - res_columns[6]->insert(getExceptionMessage(load_result.exception, false)); - else - res_columns[6]->insertDefault(); + res_columns[0]->insert(info.model_path); + res_columns[1]->insert(info.model_type); + res_columns[2]->insert(static_cast(std::chrono::system_clock::to_time_t(info.loading_start_time))); + res_columns[3]->insert(std::chrono::duration_cast>(info.loading_duration).count()); } } diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index f58c7a74dfe..d668dbe0498 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -291,7 +291,9 @@ def main(): logging.info("Will try to fetch cache for our build") try: - get_ccache_if_not_exists(ccache_path, s3_helper, pr_info.number, TEMP_PATH) + get_ccache_if_not_exists( + ccache_path, s3_helper, pr_info.number, TEMP_PATH, pr_info.release_pr + ) except Exception as e: # In case there are issues with ccache, remove the path and do not fail a build logging.info("Failed to get ccache, building without it. Error: %s", e) diff --git a/tests/ci/ccache_utils.py b/tests/ci/ccache_utils.py index cfe07363589..864b3a8f9b6 100644 --- a/tests/ci/ccache_utils.py +++ b/tests/ci/ccache_utils.py @@ -11,6 +11,7 @@ import requests # type: ignore from compress_files import decompress_fast, compress_fast from env_helper import S3_DOWNLOAD, S3_BUILDS_BUCKET +from s3_helper import S3Helper DOWNLOAD_RETRIES_COUNT = 5 @@ -57,12 +58,19 @@ def dowload_file_with_progress(url, path): def get_ccache_if_not_exists( - path_to_ccache_dir, s3_helper, current_pr_number, temp_path + path_to_ccache_dir: str, + s3_helper: S3Helper, + current_pr_number: int, + temp_path: str, + release_pr: int, ) -> int: """returns: number of PR for downloaded PR. 
-1 if ccache not found""" ccache_name = os.path.basename(path_to_ccache_dir) cache_found = False prs_to_check = [current_pr_number] + # Release PR is either 0 or defined + if release_pr: + prs_to_check.append(release_pr) ccache_pr = -1 if current_pr_number != 0: prs_to_check.append(0) diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index 064a0b3add1..d1c9d3d394c 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -44,11 +44,11 @@ from ssh import SSHKey class Labels: - LABEL_MUST_BACKPORT = "pr-must-backport" - LABEL_BACKPORT = "pr-backport" - LABEL_BACKPORTED = "pr-backported" - LABEL_CHERRYPICK = "pr-cherrypick" - LABEL_DO_NOT_TEST = "do not test" + MUST_BACKPORT = "pr-must-backport" + BACKPORT = "pr-backport" + BACKPORTS_CREATED = "pr-backports-created" + CHERRYPICK = "pr-cherrypick" + DO_NOT_TEST = "do not test" class ReleaseBranch: @@ -204,8 +204,8 @@ Merge it only if you intend to backport changes to the target branch, otherwise base=self.backport_branch, head=self.cherrypick_branch, ) - self.cherrypick_pr.add_to_labels(Labels.LABEL_CHERRYPICK) - self.cherrypick_pr.add_to_labels(Labels.LABEL_DO_NOT_TEST) + self.cherrypick_pr.add_to_labels(Labels.CHERRYPICK) + self.cherrypick_pr.add_to_labels(Labels.DO_NOT_TEST) self._assign_new_pr(self.cherrypick_pr) def create_backport(self): @@ -236,7 +236,7 @@ Merge it only if you intend to backport changes to the target branch, otherwise base=self.name, head=self.backport_branch, ) - self.backport_pr.add_to_labels(Labels.LABEL_BACKPORT) + self.backport_pr.add_to_labels(Labels.BACKPORT) self._assign_new_pr(self.backport_pr) def _assign_new_pr(self, new_pr: PullRequest): @@ -321,8 +321,8 @@ class Backport: tomorrow = date.today() + timedelta(days=1) logging.info("Receive PRs suppose to be backported") self.prs_for_backport = self.gh.get_pulls_from_search( - query=f"{self._query} -label:pr-backported", - label=",".join(self.labels_to_backport + [Labels.LABEL_MUST_BACKPORT]), + query=f"{self._query} -label:{Labels.BACKPORTS_CREATED}", + label=",".join(self.labels_to_backport + [Labels.MUST_BACKPORT]), merged=[since_date, tomorrow], ) logging.info( @@ -342,7 +342,7 @@ class Backport: def process_pr(self, pr: PullRequest): pr_labels = [label.name for label in pr.labels] - if Labels.LABEL_MUST_BACKPORT in pr_labels: + if Labels.MUST_BACKPORT in pr_labels: branches = [ ReleaseBranch(br, pr) for br in self.release_branches ] # type: List[ReleaseBranch] @@ -407,11 +407,11 @@ class Backport: if self.dry_run: logging.info("DRY RUN: would mark PR #%s as done", pr.number) return - pr.add_to_labels(Labels.LABEL_BACKPORTED) + pr.add_to_labels(Labels.BACKPORTS_CREATED) logging.info( "PR #%s is successfully labeled with `%s`", pr.number, - Labels.LABEL_BACKPORTED, + Labels.BACKPORTS_CREATED, ) @property diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index fa68d1982d2..a31f2298a58 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -8,7 +8,7 @@ BuildConfig = Dict[str, ConfValue] CI_CONFIG = { "build_config": { "package_release": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "", "sanitizer": "", "package_type": "deb", @@ -19,7 +19,7 @@ CI_CONFIG = { "with_coverage": False, }, "coverity": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "", "sanitizer": "", "package_type": "coverity", @@ -29,7 +29,7 @@ CI_CONFIG = { "official": False, }, "package_aarch64": { - "compiler": "clang-14-aarch64", + "compiler": "clang-15-aarch64", "build_type": "", "sanitizer": "", 
"package_type": "deb", @@ -40,7 +40,7 @@ CI_CONFIG = { "with_coverage": False, }, "package_asan": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "", "sanitizer": "address", "package_type": "deb", @@ -49,7 +49,7 @@ CI_CONFIG = { "with_coverage": False, }, "package_ubsan": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "", "sanitizer": "undefined", "package_type": "deb", @@ -67,7 +67,7 @@ CI_CONFIG = { "with_coverage": False, }, "package_msan": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "", "sanitizer": "memory", "package_type": "deb", @@ -76,7 +76,7 @@ CI_CONFIG = { "with_coverage": False, }, "package_debug": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "debug", "sanitizer": "", "package_type": "deb", @@ -85,7 +85,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_release": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -94,7 +94,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_tidy": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "debug", "sanitizer": "", "package_type": "binary", @@ -104,7 +104,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_shared": { - "compiler": "clang-14", + "compiler": "clang-15", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -113,7 +113,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_darwin": { - "compiler": "clang-14-darwin", + "compiler": "clang-15-darwin", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -123,7 +123,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_aarch64": { - "compiler": "clang-14-aarch64", + "compiler": "clang-15-aarch64", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -132,7 +132,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_freebsd": { - "compiler": "clang-14-freebsd", + "compiler": "clang-15-freebsd", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -142,7 +142,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_darwin_aarch64": { - "compiler": "clang-14-darwin-aarch64", + "compiler": "clang-15-darwin-aarch64", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -152,7 +152,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_ppc64le": { - "compiler": "clang-14-ppc64le", + "compiler": "clang-15-ppc64le", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -162,7 +162,7 @@ CI_CONFIG = { "with_coverage": False, }, "binary_amd64sse2": { - "compiler": "clang-14-amd64sse2", + "compiler": "clang-15-amd64sse2", "build_type": "", "sanitizer": "", "package_type": "binary", @@ -342,7 +342,7 @@ CI_CONFIG = { }, "Performance Comparison Aarch64": { "required_build": "package_aarch64", - "test_grep_exclude_filter": "constant_column_search", + "test_grep_exclude_filter": "", }, }, } # type: dict diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 038289406de..03e42726808 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -125,7 +125,7 @@ if __name__ == "__main__": logging.info("Will try to fetch cache for our build") ccache_for_pr = get_ccache_if_not_exists( - cache_path, s3_helper, pr_info.number, temp_path + cache_path, s3_helper, pr_info.number, temp_path, pr_info.release_pr ) upload_master_ccache = ccache_for_pr in (-1, 0) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 2acd0e4c811..77421ddac32 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -86,7 +86,7 @@ class PRInfo: 
self.changed_files = set() # type: Set[str] self.body = "" self.diff_urls = [] - self.release_pr = "" + self.release_pr = 0 ref = github_event.get("ref", "refs/head/master") if ref and ref.startswith("refs/heads/"): ref = ref[11:] diff --git a/tests/fuzz/all.dict b/tests/fuzz/all.dict index dff62cd68a7..a147878da9b 100644 --- a/tests/fuzz/all.dict +++ b/tests/fuzz/all.dict @@ -763,7 +763,6 @@ "MINUTE" "MM" "mod" -"modelEvaluate" "MODIFY" "MODIFY COLUMN" "MODIFY ORDER BY" diff --git a/tests/fuzz/dictionaries/functions.dict b/tests/fuzz/dictionaries/functions.dict index cbcad3c05da..b90697f0c3d 100644 --- a/tests/fuzz/dictionaries/functions.dict +++ b/tests/fuzz/dictionaries/functions.dict @@ -469,7 +469,6 @@ "subtractSeconds" "alphaTokens" "negate" -"modelEvaluate" "file" "roundAge" "MACStringToOUI" diff --git a/tests/integration/runner b/tests/integration/runner index e1b9a55b43e..f0d87b23a83 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -350,7 +350,8 @@ if __name__ == "__main__": # randomizer, we should remove it after Sep 2022 try: subprocess.check_call( - f"docker volume ls -q | grep '{VOLUME_NAME}_.*_volume' | xargs --no-run-if-empty docker volume rm", + "docker volume rm $(docker volume ls -q | " + f"grep '{VOLUME_NAME}_.*_volume')", shell=True, ) except Exception as ex: diff --git a/tests/integration/test_catboost_model_config_reload/__init__.py b/tests/integration/test_catboost_evaluate/__init__.py similarity index 100% rename from tests/integration/test_catboost_model_config_reload/__init__.py rename to tests/integration/test_catboost_evaluate/__init__.py diff --git a/tests/integration/test_catboost_evaluate/config/models_config.xml b/tests/integration/test_catboost_evaluate/config/models_config.xml new file mode 100644 index 00000000000..f63df06ee26 --- /dev/null +++ b/tests/integration/test_catboost_evaluate/config/models_config.xml @@ -0,0 +1,3 @@ + + /etc/clickhouse-server/model/libcatboostmodel.so + diff --git a/tests/integration/test_catboost_evaluate/model/amazon_model.bin b/tests/integration/test_catboost_evaluate/model/amazon_model.bin new file mode 100644 index 00000000000..4a37fbec310 Binary files /dev/null and b/tests/integration/test_catboost_evaluate/model/amazon_model.bin differ diff --git a/tests/integration/test_catboost_model_config_reload/model/libcatboostmodel.so b/tests/integration/test_catboost_evaluate/model/libcatboostmodel.so similarity index 100% rename from tests/integration/test_catboost_model_config_reload/model/libcatboostmodel.so rename to tests/integration/test_catboost_evaluate/model/libcatboostmodel.so diff --git a/tests/integration/test_catboost_model_config_reload/model/model.bin b/tests/integration/test_catboost_evaluate/model/simple_model.bin similarity index 100% rename from tests/integration/test_catboost_model_config_reload/model/model.bin rename to tests/integration/test_catboost_evaluate/model/simple_model.bin diff --git a/tests/integration/test_catboost_evaluate/test.py b/tests/integration/test_catboost_evaluate/test.py new file mode 100644 index 00000000000..a0915977ab6 --- /dev/null +++ b/tests/integration/test_catboost_evaluate/test.py @@ -0,0 +1,402 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +instance = cluster.add_instance( + "instance", stay_alive=True, 
main_configs=["config/models_config.xml"] +) + + +@pytest.fixture(scope="module") +def ch_cluster(): + try: + cluster.start() + + os.system( + "docker cp {local} {cont_id}:{dist}".format( + local=os.path.join(SCRIPT_DIR, "model/."), + cont_id=instance.docker_id, + dist="/etc/clickhouse-server/model", + ) + ) + instance.restart_clickhouse() + + yield cluster + + finally: + cluster.shutdown() + + +# --------------------------------------------------------------------------- +# simple_model.bin has 2 float features and 9 categorical features + + +def testConstantFeatures(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + expected = "-1.930268705869267\n" + assert result == expected + + +def testNonConstantFeatures(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + instance.query("DROP TABLE IF EXISTS T;") + instance.query( + "CREATE TABLE T(ID UInt32, F1 Float32, F2 Float32, F3 UInt32, F4 UInt32, F5 UInt32, F6 UInt32, F7 UInt32, F8 UInt32, F9 Float32, F10 Float32, F11 Float32) ENGINE MergeTree ORDER BY ID;" + ) + instance.query("INSERT INTO T VALUES(0, 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);") + + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11) from T;" + ) + expected = "-1.930268705869267\n" + assert result == expected + + instance.query("DROP TABLE IF EXISTS T;") + + +def testModelPathIsNotAConstString(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + err = instance.query_and_get_error( + "select catboostEvaluate(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + assert ( + "Illegal type UInt8 of first argument of function catboostEvaluate, expected a string" + in err + ) + + instance.query("DROP TABLE IF EXISTS T;") + instance.query("CREATE TABLE T(ID UInt32, A String) ENGINE MergeTree ORDER BY ID") + instance.query("INSERT INTO T VALUES(0, 'test');") + err = instance.query_and_get_error( + "select catboostEvaluate(A, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) FROM T;" + ) + assert ( + "First argument of function catboostEvaluate must be a constant string" in err + ) + instance.query("DROP TABLE IF EXISTS T;") + + +def testWrongNumberOfFeatureArguments(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + err = instance.query_and_get_error( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin');" + ) + assert "Function catboostEvaluate expects at least 2 arguments" in err + + err = instance.query_and_get_error( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2);" + ) + assert ( + "Number of columns is different with number of features: columns size 2 float features size 2 + cat features size 9" + in err + ) + + +def testFloatFeatureMustBeNumeric(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer 
cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + err = instance.query_and_get_error( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 'a', 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + assert "Column 1 should be numeric to make float feature" in err + + +def testCategoricalFeatureMustBeNumericOrString(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + err = instance.query_and_get_error( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, tuple(8), 9, 10, 11);" + ) + assert "Column 7 should be numeric or string" in err + + +def testOnLowCardinalityFeatures(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + # same but on domain-compressed data + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toLowCardinality(1.0), toLowCardinality(2.0), toLowCardinality(3), toLowCardinality(4), toLowCardinality(5), toLowCardinality(6), toLowCardinality(7), toLowCardinality(8), toLowCardinality(9), toLowCardinality(10), toLowCardinality(11));" + ) + expected = "-1.930268705869267\n" + assert result == expected + + +def testOnNullableFeatures(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toNullable(1.0), toNullable(2.0), toNullable(3), toNullable(4), toNullable(5), toNullable(6), toNullable(7), toNullable(8), toNullable(9), toNullable(10), toNullable(11));" + ) + expected = "-1.930268705869267\n" + assert result == expected + + # Actual NULLs are disallowed + err = instance.query_and_get_error( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL), toNullable(NULL));" + ) + assert "Column 0 should be numeric to make float feature" in err + + +def testInvalidLibraryPath(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + # temporarily move library elsewhere + instance.exec_in_container( + [ + "bash", + "-c", + "mv /etc/clickhouse-server/model/libcatboostmodel.so /etc/clickhouse-server/model/nonexistant.so", + ] + ) + + err = instance.query_and_get_error( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + assert ( + "Can't load library /etc/clickhouse-server/model/libcatboostmodel.so: file doesn't exist" + in err + ) + + # restore + instance.exec_in_container( + [ + "bash", + "-c", + "mv /etc/clickhouse-server/model/nonexistant.so /etc/clickhouse-server/model/libcatboostmodel.so", + ] + ) + + +def testInvalidModelPath(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system 
reload models") + + err = instance.query_and_get_error( + "select catboostEvaluate('', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + assert "Can't load model : file doesn't exist" in err + + err = instance.query_and_get_error( + "select catboostEvaluate('model_non_existant.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + assert "Can't load model model_non_existant.bin: file doesn't exist" in err + + +def testRecoveryAfterCrash(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + expected = "-1.930268705869267\n" + assert result == expected + + instance.exec_in_container( + ["bash", "-c", "kill -9 `pidof clickhouse-library-bridge`"], user="root" + ) + + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + assert result == expected + + +# --------------------------------------------------------------------------- +# amazon_model.bin has 0 float features and 9 categorical features + + +def testAmazonModelSingleRow(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9);" + ) + expected = "0.7774665009089274\n" + assert result == expected + + +def testAmazonModelManyRows(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + result = instance.query("drop table if exists amazon") + + result = instance.query( + "create table amazon ( DATE Date materialized today(), ACTION UInt8, RESOURCE UInt32, MGR_ID UInt32, ROLE_ROLLUP_1 UInt32, ROLE_ROLLUP_2 UInt32, ROLE_DEPTNAME UInt32, ROLE_TITLE UInt32, ROLE_FAMILY_DESC UInt32, ROLE_FAMILY UInt32, ROLE_CODE UInt32) engine = MergeTree order by DATE" + ) + + result = instance.query( + "insert into amazon select number % 256, number, number, number, number, number, number, number, number, number from numbers(7500)" + ) + + # First compute prediction, then as a very crude way to fingerprint and compare the result: sum and floor + # (the focus is to test that the exchange of large result sets between the server and the bridge works) + result = instance.query( + "SELECT floor(sum(catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', RESOURCE, MGR_ID, ROLE_ROLLUP_1, ROLE_ROLLUP_2, ROLE_DEPTNAME, ROLE_TITLE, ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE))) FROM amazon" + ) + + expected = "5834\n" + assert result == expected + + result = instance.query("drop table if exists amazon") + + +def testModelUpdate(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + query = "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + + result = instance.query(query) + expected = "-1.930268705869267\n" + assert result == expected + + # simulate an update of the model: temporarily move the 
amazon model in place of the simple model + instance.exec_in_container( + [ + "bash", + "-c", + "mv /etc/clickhouse-server/model/simple_model.bin /etc/clickhouse-server/model/simple_model.bin.bak", + ] + ) + instance.exec_in_container( + [ + "bash", + "-c", + "mv /etc/clickhouse-server/model/amazon_model.bin /etc/clickhouse-server/model/simple_model.bin", + ] + ) + + # unload simple model + result = instance.query( + "system reload model '/etc/clickhouse-server/model/simple_model.bin'" + ) + + # load the simple-model-camouflaged amazon model + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9);" + ) + expected = "0.7774665009089274\n" + assert result == expected + + # restore + instance.exec_in_container( + [ + "bash", + "-c", + "mv /etc/clickhouse-server/model/simple_model.bin /etc/clickhouse-server/model/amazon_model.bin", + ] + ) + instance.exec_in_container( + [ + "bash", + "-c", + "mv /etc/clickhouse-server/model/simple_model.bin.bak /etc/clickhouse-server/model/simple_model.bin", + ] + ) + + +def testSystemModelsAndModelRefresh(ch_cluster): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") + + result = instance.query("system reload models") + + # check model system view + result = instance.query("select * from system.models") + expected = "" + assert result == expected + + # load simple model + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/simple_model.bin', 1.0, 2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11);" + ) + expected = "-1.930268705869267\n" + assert result == expected + + # check model system view with one model loaded + result = instance.query("select * from system.models") + assert result.count("\n") == 1 + expected = "/etc/clickhouse-server/model/simple_model.bin" + assert expected in result + + # load amazon model + result = instance.query( + "select catboostEvaluate('/etc/clickhouse-server/model/amazon_model.bin', 1, 2, 3, 4, 5, 6, 7, 8, 9);" + ) + expected = "0.7774665009089274\n" + assert result == expected + + # check model system view with one model loaded + result = instance.query("select * from system.models") + assert result.count("\n") == 2 + expected = "/etc/clickhouse-server/model/simple_model.bin" + assert expected in result + expected = "/etc/clickhouse-server/model/amazon_model.bin" + assert expected in result + + # unload simple model + result = instance.query( + "system reload model '/etc/clickhouse-server/model/simple_model.bin'" + ) + + # check model system view, it should not display the removed model + result = instance.query("select * from system.models") + assert result.count("\n") == 1 + expected = "/etc/clickhouse-server/model/amazon_model.bin" + assert expected in result diff --git a/tests/integration/test_catboost_model_config_reload/config/catboost_lib.xml b/tests/integration/test_catboost_model_config_reload/config/catboost_lib.xml deleted file mode 100644 index 7aa06cc99ff..00000000000 --- a/tests/integration/test_catboost_model_config_reload/config/catboost_lib.xml +++ /dev/null @@ -1,3 +0,0 @@ - - /etc/clickhouse-server/model/libcatboostmodel.so - diff --git a/tests/integration/test_catboost_model_config_reload/config/models_config.xml b/tests/integration/test_catboost_model_config_reload/config/models_config.xml deleted file mode 100644 index 3cbf717bb67..00000000000 --- a/tests/integration/test_catboost_model_config_reload/config/models_config.xml +++ /dev/null 
@@ -1,2 +0,0 @@ - - diff --git a/tests/integration/test_catboost_model_config_reload/model/model_config.xml b/tests/integration/test_catboost_model_config_reload/model/model_config.xml deleted file mode 100644 index af9778097fa..00000000000 --- a/tests/integration/test_catboost_model_config_reload/model/model_config.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - catboost - model1 - /etc/clickhouse-server/model/model.bin - 0 - - diff --git a/tests/integration/test_catboost_model_config_reload/model/model_config2.xml b/tests/integration/test_catboost_model_config_reload/model/model_config2.xml deleted file mode 100644 index b81120ec900..00000000000 --- a/tests/integration/test_catboost_model_config_reload/model/model_config2.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - catboost - model2 - /etc/clickhouse-server/model/model.bin - 0 - - diff --git a/tests/integration/test_catboost_model_config_reload/test.py b/tests/integration/test_catboost_model_config_reload/test.py deleted file mode 100644 index c12c28e2338..00000000000 --- a/tests/integration/test_catboost_model_config_reload/test.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -import sys -import time - -import pytest - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) - -from helpers.cluster import ClickHouseCluster - -cluster = ClickHouseCluster(__file__) -node = cluster.add_instance( - "node", - stay_alive=True, - main_configs=["config/models_config.xml", "config/catboost_lib.xml"], -) - - -def copy_file_to_container(local_path, dist_path, container_id): - os.system( - "docker cp {local} {cont_id}:{dist}".format( - local=local_path, cont_id=container_id, dist=dist_path - ) - ) - - -config = """ - /etc/clickhouse-server/model/{model_config} -""" - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - - copy_file_to_container( - os.path.join(SCRIPT_DIR, "model/."), - "/etc/clickhouse-server/model", - node.docker_id, - ) - node.restart_clickhouse() - - yield cluster - - finally: - cluster.shutdown() - - -def change_config(model_config): - node.replace_config( - "/etc/clickhouse-server/config.d/models_config.xml", - config.format(model_config=model_config), - ) - node.query("SYSTEM RELOAD CONFIG;") - - -def test(started_cluster): - if node.is_built_with_memory_sanitizer(): - pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") - - # Set config with the path to the first model. - change_config("model_config.xml") - - node.query("SELECT modelEvaluate('model1', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);") - - # Change path to the second model in config. - change_config("model_config2.xml") - - # Check that the new model is loaded. - node.query("SELECT modelEvaluate('model2', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);") - - # Check that the old model was unloaded. 
- node.query_and_get_error( - "SELECT modelEvaluate('model1', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);" - ) diff --git a/tests/integration/test_catboost_model_first_evaluate/config/models_config.xml b/tests/integration/test_catboost_model_first_evaluate/config/models_config.xml deleted file mode 100644 index 26f5c4d57f6..00000000000 --- a/tests/integration/test_catboost_model_first_evaluate/config/models_config.xml +++ /dev/null @@ -1,4 +0,0 @@ - - /etc/clickhouse-server/model/libcatboostmodel.so - /etc/clickhouse-server/model/model_config.xml - diff --git a/tests/integration/test_catboost_model_first_evaluate/model/libcatboostmodel.so b/tests/integration/test_catboost_model_first_evaluate/model/libcatboostmodel.so deleted file mode 100755 index 388d9f887b4..00000000000 Binary files a/tests/integration/test_catboost_model_first_evaluate/model/libcatboostmodel.so and /dev/null differ diff --git a/tests/integration/test_catboost_model_first_evaluate/model/model.bin b/tests/integration/test_catboost_model_first_evaluate/model/model.bin deleted file mode 100644 index 118e099d176..00000000000 Binary files a/tests/integration/test_catboost_model_first_evaluate/model/model.bin and /dev/null differ diff --git a/tests/integration/test_catboost_model_first_evaluate/model/model_config.xml b/tests/integration/test_catboost_model_first_evaluate/model/model_config.xml deleted file mode 100644 index 2c328167a94..00000000000 --- a/tests/integration/test_catboost_model_first_evaluate/model/model_config.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - catboost - titanic - /etc/clickhouse-server/model/model.bin - 0 - - diff --git a/tests/integration/test_catboost_model_first_evaluate/test.py b/tests/integration/test_catboost_model_first_evaluate/test.py deleted file mode 100644 index b15f481c0e9..00000000000 --- a/tests/integration/test_catboost_model_first_evaluate/test.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import sys -import time - -import pytest - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) - -from helpers.cluster import ClickHouseCluster - -cluster = ClickHouseCluster(__file__) -node = cluster.add_instance( - "node", stay_alive=True, main_configs=["config/models_config.xml"] -) - - -def copy_file_to_container(local_path, dist_path, container_id): - os.system( - "docker cp {local} {cont_id}:{dist}".format( - local=local_path, cont_id=container_id, dist=dist_path - ) - ) - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - - copy_file_to_container( - os.path.join(SCRIPT_DIR, "model/."), - "/etc/clickhouse-server/model", - node.docker_id, - ) - node.restart_clickhouse() - - yield cluster - - finally: - cluster.shutdown() - - -def test(started_cluster): - if node.is_built_with_memory_sanitizer(): - pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") - - node.query("select modelEvaluate('titanic', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);") diff --git a/tests/integration/test_catboost_model_reload/config/catboost_lib.xml b/tests/integration/test_catboost_model_reload/config/catboost_lib.xml deleted file mode 100644 index 7aa06cc99ff..00000000000 --- a/tests/integration/test_catboost_model_reload/config/catboost_lib.xml +++ /dev/null @@ -1,3 +0,0 @@ - - /etc/clickhouse-server/model/libcatboostmodel.so - diff --git a/tests/integration/test_catboost_model_reload/config/models_config.xml b/tests/integration/test_catboost_model_reload/config/models_config.xml deleted file 
mode 100644 index 84378df0e8f..00000000000 --- a/tests/integration/test_catboost_model_reload/config/models_config.xml +++ /dev/null @@ -1,3 +0,0 @@ - - /etc/clickhouse-server/model/model_config.xml - diff --git a/tests/integration/test_catboost_model_reload/model/conjunction.cbm b/tests/integration/test_catboost_model_reload/model/conjunction.cbm deleted file mode 100644 index 7b75fb5f886..00000000000 Binary files a/tests/integration/test_catboost_model_reload/model/conjunction.cbm and /dev/null differ diff --git a/tests/integration/test_catboost_model_reload/model/disjunction.cbm b/tests/integration/test_catboost_model_reload/model/disjunction.cbm deleted file mode 100644 index 8145c24637f..00000000000 Binary files a/tests/integration/test_catboost_model_reload/model/disjunction.cbm and /dev/null differ diff --git a/tests/integration/test_catboost_model_reload/model/libcatboostmodel.so b/tests/integration/test_catboost_model_reload/model/libcatboostmodel.so deleted file mode 100755 index 388d9f887b4..00000000000 Binary files a/tests/integration/test_catboost_model_reload/model/libcatboostmodel.so and /dev/null differ diff --git a/tests/integration/test_catboost_model_reload/model/model_config.xml b/tests/integration/test_catboost_model_reload/model/model_config.xml deleted file mode 100644 index 7cbda165ce9..00000000000 --- a/tests/integration/test_catboost_model_reload/model/model_config.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - catboost - model - /etc/clickhouse-server/model/model.cbm - 0 - - diff --git a/tests/integration/test_catboost_model_reload/test.py b/tests/integration/test_catboost_model_reload/test.py deleted file mode 100644 index 3bf7ca18cdd..00000000000 --- a/tests/integration/test_catboost_model_reload/test.py +++ /dev/null @@ -1,132 +0,0 @@ -import os -import sys -import time - -import pytest - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) - -from helpers.cluster import ClickHouseCluster - -cluster = ClickHouseCluster(__file__) -node = cluster.add_instance( - "node", - stay_alive=True, - main_configs=["config/models_config.xml", "config/catboost_lib.xml"], -) - - -def copy_file_to_container(local_path, dist_path, container_id): - os.system( - "docker cp {local} {cont_id}:{dist}".format( - local=local_path, cont_id=container_id, dist=dist_path - ) - ) - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - - copy_file_to_container( - os.path.join(SCRIPT_DIR, "model/."), - "/etc/clickhouse-server/model", - node.docker_id, - ) - node.query("CREATE TABLE binary (x UInt64, y UInt64) ENGINE = TinyLog()") - node.query("INSERT INTO binary VALUES (1, 1), (1, 0), (0, 1), (0, 0)") - - node.restart_clickhouse() - - yield cluster - - finally: - cluster.shutdown() - - -def test_model_reload(started_cluster): - if node.is_built_with_memory_sanitizer(): - pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") - - node.exec_in_container( - ["bash", "-c", "rm -f /etc/clickhouse-server/model/model.cbm"] - ) - node.exec_in_container( - [ - "bash", - "-c", - "ln /etc/clickhouse-server/model/conjunction.cbm /etc/clickhouse-server/model/model.cbm", - ] - ) - node.query("SYSTEM RELOAD MODEL model") - - result = node.query( - """ - WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability - SELECT if(probability > 0.5, 1, 0) FROM binary; - """ - ) - assert result == "1\n0\n0\n0\n" - - 
node.exec_in_container(["bash", "-c", "rm /etc/clickhouse-server/model/model.cbm"]) - node.exec_in_container( - [ - "bash", - "-c", - "ln /etc/clickhouse-server/model/disjunction.cbm /etc/clickhouse-server/model/model.cbm", - ] - ) - node.query("SYSTEM RELOAD MODEL model") - - result = node.query( - """ - WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability - SELECT if(probability > 0.5, 1, 0) FROM binary; - """ - ) - assert result == "1\n1\n1\n0\n" - - -def test_models_reload(started_cluster): - if node.is_built_with_memory_sanitizer(): - pytest.skip("Memory Sanitizer cannot work with third-party shared libraries") - - node.exec_in_container( - ["bash", "-c", "rm -f /etc/clickhouse-server/model/model.cbm"] - ) - node.exec_in_container( - [ - "bash", - "-c", - "ln /etc/clickhouse-server/model/conjunction.cbm /etc/clickhouse-server/model/model.cbm", - ] - ) - node.query("SYSTEM RELOAD MODELS") - - result = node.query( - """ - WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability - SELECT if(probability > 0.5, 1, 0) FROM binary; - """ - ) - assert result == "1\n0\n0\n0\n" - - node.exec_in_container(["bash", "-c", "rm /etc/clickhouse-server/model/model.cbm"]) - node.exec_in_container( - [ - "bash", - "-c", - "ln /etc/clickhouse-server/model/disjunction.cbm /etc/clickhouse-server/model/model.cbm", - ] - ) - node.query("SYSTEM RELOAD MODELS") - - result = node.query( - """ - WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability - SELECT if(probability > 0.5, 1, 0) FROM binary; - """ - ) - assert result == "1\n1\n1\n0\n" diff --git a/tests/integration/test_join_set_family_s3/test.py b/tests/integration/test_join_set_family_s3/test.py index 38b56b7b15b..b09d5735628 100644 --- a/tests/integration/test_join_set_family_s3/test.py +++ b/tests/integration/test_join_set_family_s3/test.py @@ -27,7 +27,7 @@ def cluster(): def assert_objects_count(cluster, objects_count, path="data/"): minio = cluster.minio_client - s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) + s3_objects = list(minio.list_objects(cluster.minio_bucket, path)) if objects_count != len(s3_objects): for s3_object in s3_objects: object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) diff --git a/tests/integration/test_keeper_session/configs/keeper_config.xml b/tests/integration/test_keeper_session/configs/keeper_config1.xml similarity index 67% rename from tests/integration/test_keeper_session/configs/keeper_config.xml rename to tests/integration/test_keeper_session/configs/keeper_config1.xml index ed0bb52bd51..fd308fe8a2f 100644 --- a/tests/integration/test_keeper_session/configs/keeper_config.xml +++ b/tests/integration/test_keeper_session/configs/keeper_config1.xml @@ -1,4 +1,4 @@ - + 9181 1 @@ -19,9 +19,19 @@ 1 node1 9234 - true - 3 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true - + diff --git a/tests/integration/test_keeper_session/configs/keeper_config2.xml b/tests/integration/test_keeper_session/configs/keeper_config2.xml new file mode 100644 index 00000000000..ad558fbccad --- /dev/null +++ b/tests/integration/test_keeper_session/configs/keeper_config2.xml @@ -0,0 +1,37 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + * + + + 5000 + 10000 + 5000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + 
node2 + 9234 + true + + + 3 + node3 + 9234 + true + + + + diff --git a/tests/integration/test_keeper_session/configs/keeper_config3.xml b/tests/integration/test_keeper_session/configs/keeper_config3.xml new file mode 100644 index 00000000000..2a21f959816 --- /dev/null +++ b/tests/integration/test_keeper_session/configs/keeper_config3.xml @@ -0,0 +1,37 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + * + + + 5000 + 10000 + 5000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true + + + + diff --git a/tests/integration/test_keeper_session/test.py b/tests/integration/test_keeper_session/test.py index 30db4d9548c..4b3aa7e3fdf 100644 --- a/tests/integration/test_keeper_session/test.py +++ b/tests/integration/test_keeper_session/test.py @@ -10,7 +10,15 @@ from kazoo.client import KazooClient cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance( - "node1", main_configs=["configs/keeper_config.xml"], stay_alive=True + "node1", main_configs=["configs/keeper_config1.xml"], stay_alive=True +) + +node2 = cluster.add_instance( + "node2", main_configs=["configs/keeper_config2.xml"], stay_alive=True +) + +node3 = cluster.add_instance( + "node3", main_configs=["configs/keeper_config3.xml"], stay_alive=True ) bool_struct = struct.Struct("B") @@ -61,7 +69,7 @@ def wait_node(node): def wait_nodes(): - for n in [node1]: + for n in [node1, node2, node3]: wait_node(n) @@ -165,3 +173,21 @@ def test_session_timeout(started_cluster): negotiated_timeout, _ = handshake(node1.name, session_timeout=20000, session_id=0) assert negotiated_timeout == 10000 + + +def test_session_close_shutdown(started_cluster): + wait_nodes() + + node1_zk = get_fake_zk(node1.name) + node2_zk = get_fake_zk(node2.name) + + eph_node = "/test_node" + node2_zk.create(eph_node, ephemeral=True) + assert node1_zk.exists(eph_node) != None + + # shutdown while session is active + node2.stop_clickhouse() + + assert node1_zk.exists(eph_node) == None + + node2.start_clickhouse() diff --git a/tests/integration/test_log_family_s3/test.py b/tests/integration/test_log_family_s3/test.py index bed379d098b..76ff0930db3 100644 --- a/tests/integration/test_log_family_s3/test.py +++ b/tests/integration/test_log_family_s3/test.py @@ -25,7 +25,7 @@ def cluster(): def assert_objects_count(cluster, objects_count, path="data/"): minio = cluster.minio_client - s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) + s3_objects = list(minio.list_objects(cluster.minio_bucket, path)) if objects_count != len(s3_objects): for s3_object in s3_objects: object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) diff --git a/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml b/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml index 3ee49744a61..f3505f53339 100644 --- a/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml +++ b/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml @@ -38,6 +38,20 @@ /jbod1/ 1000000000 + + s3 + http://minio1:9001/root/data/ + minio + minio123 + 33554432 + + + cache + s3_r + /s3_cache_r/ + 1000000000 + 1 + @@ -78,6 +92,13 @@ + + +
+                    <disk>s3_cache_r</disk>
+                </main>
+            </volumes>
+        </s3_cache_r>
diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index bee22c03689..4ce5fd5a069 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -6,7 +6,6 @@ import pytest from helpers.cluster import ClickHouseCluster from helpers.utility import generate_values, replace_config, SafeThread - SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -36,6 +35,7 @@ def cluster(): "/jbod1:size=2M", ], ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -121,17 +121,11 @@ def run_s3_mocks(cluster): def wait_for_delete_s3_objects(cluster, expected, timeout=30): minio = cluster.minio_client while timeout > 0: - if ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == expected - ): + if len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == expected: return timeout -= 1 time.sleep(1) - assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == expected - ) + assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == expected @pytest.fixture(autouse=True) @@ -147,9 +141,7 @@ def drop_table(cluster, node_name): wait_for_delete_s3_objects(cluster, 0) finally: # Remove extra objects to prevent tests cascade failing - for obj in list( - minio.list_objects(cluster.minio_bucket, "data/", recursive=True) - ): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -171,7 +163,7 @@ def test_simple_insert_select( node.query("INSERT INTO s3_test VALUES {}".format(values1)) assert node.query("SELECT * FROM s3_test order by dt, id FORMAT Values") == values1 assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + files_per_part ) @@ -182,7 +174,7 @@ def test_simple_insert_select( == values1 + "," + values2 ) assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + files_per_part * 2 ) @@ -226,7 +218,7 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical, node_name): node.query("SELECT count(distinct(id)) FROM s3_test FORMAT Values") == "(8192)" ) assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD_PER_PART_WIDE * 6 + FILES_OVERHEAD ) @@ -315,28 +307,28 @@ def test_attach_detach_partition(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test DETACH PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test ATTACH PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + 
len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test DROP PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE ) @@ -347,8 +339,7 @@ def test_attach_detach_partition(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -366,21 +357,21 @@ def test_move_partition_to_another_disk(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE ) node.query("ALTER TABLE s3_test MOVE PARTITION '2020-01-04' TO DISK 's3'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -401,7 +392,7 @@ def test_table_manipulations(cluster, node_name): node.query("RENAME TABLE s3_test TO s3_renamed") assert node.query("SELECT count(*) FROM s3_renamed FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("RENAME TABLE s3_renamed TO s3_test") @@ -412,15 +403,14 @@ def test_table_manipulations(cluster, node_name): node.query("ATTACH TABLE s3_test") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("TRUNCATE TABLE s3_test") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -445,7 +435,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -459,7 +449,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): 
assert node.query("SELECT count(*) FROM s3_clone FORMAT Values") == "(8192)" # Number of objects in S3 should be unchanged. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -473,7 +463,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 6 ) @@ -494,14 +484,14 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" # Data should remain in S3 assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) node.query("ALTER TABLE s3_test FREEZE") # Number S3 objects should be unchanged. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -510,7 +500,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): wait_for_delete_s3_objects(cluster, FILES_OVERHEAD_PER_PART_WIDE * 4) - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -531,7 +521,7 @@ def test_freeze_unfreeze(cluster, node_name): node.query("TRUNCATE TABLE s3_test") assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -546,8 +536,7 @@ def test_freeze_unfreeze(cluster, node_name): # Data should be removed from S3. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -570,7 +559,7 @@ def test_freeze_system_unfreeze(cluster, node_name): node.query("TRUNCATE TABLE s3_test") node.query("DROP TABLE s3_test_removed NO DELAY") assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -581,8 +570,7 @@ def test_freeze_system_unfreeze(cluster, node_name): # Data should be removed from S3. 
assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -709,7 +697,7 @@ def test_lazy_seek_optimization_for_async_read(cluster, node_name): node.query("SELECT * FROM s3_test WHERE value LIKE '%abc%' ORDER BY value LIMIT 10") node.query("DROP TABLE IF EXISTS s3_test NO DELAY") minio = cluster.minio_client - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -754,3 +742,79 @@ def test_store_cleanup_disk_s3(cluster, node_name): "CREATE TABLE s3_test UUID '00000000-1000-4000-8000-000000000001' (n UInt64) Engine=MergeTree() ORDER BY n SETTINGS storage_policy='s3';" ) node.query("INSERT INTO s3_test SELECT 1") + + +@pytest.mark.parametrize("node_name", ["node"]) +def test_cache_setting_compatibility(cluster, node_name): + node = cluster.instances[node_name] + + node.query("DROP TABLE IF EXISTS s3_test NO DELAY") + + node.query( + "CREATE TABLE s3_test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_r';" + ) + node.query( + "INSERT INTO s3_test SELECT * FROM generateRandom('key UInt32, value String') LIMIT 500" + ) + + result = node.query("SYSTEM DROP FILESYSTEM CACHE") + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) == 0 + + node.query("SELECT * FROM s3_test") + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) > 0 + + config_path = os.path.join( + SCRIPT_DIR, + f"./{cluster.instances_dir_name}/node/configs/config.d/storage_conf.xml", + ) + + replace_config( + config_path, + "1", + "0", + ) + + result = node.query("DESCRIBE CACHE 's3_cache_r'") + assert result.strip().endswith("1") + + node.restart_clickhouse() + + result = node.query("DESCRIBE CACHE 's3_cache_r'") + assert result.strip().endswith("0") + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) > 0 + + node.query("SELECT * FROM s3_test FORMAT Null") + + assert not node.contains_in_log("No such file or directory: Cache info:") + + replace_config( + config_path, + "0", + "1", + ) + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) > 0 + + node.restart_clickhouse() + + result = node.query("DESCRIBE CACHE 's3_cache_r'") + assert result.strip().endswith("1") + + node.query("SELECT * FROM s3_test FORMAT Null") + + assert not node.contains_in_log("No such file or directory: Cache info:") diff --git a/tests/integration/test_profile_events_s3/test.py b/tests/integration/test_profile_events_s3/test.py index 18f1c5ee9ad..a0f664df000 100644 --- a/tests/integration/test_profile_events_s3/test.py +++ b/tests/integration/test_profile_events_s3/test.py @@ -62,7 +62,7 @@ init_list = { def get_s3_events(instance): result = init_list.copy() events = instance.query( - "SELECT event, value FROM system.events WHERE event LIKE '%S3%'" + "SELECT event,value FROM system.events WHERE event LIKE '%S3%'" ).split("\n") for event in events: ev = event.split("\t") @@ -85,20 +85,20 @@ def get_minio_stat(cluster): ) ).text.split("\n") for line in stat: - x = 
re.search(r"s3_requests_total(\{.*\})?\s(\d+)(\s.*)?", line) + x = re.search("s3_requests_total(\{.*\})?\s(\d+)(\s.*)?", line) if x != None: y = re.search('.*api="(get|list|head|select).*', x.group(1)) if y != None: result["get_requests"] += int(x.group(2)) else: result["set_requests"] += int(x.group(2)) - x = re.search(r"s3_errors_total(\{.*\})?\s(\d+)(\s.*)?", line) + x = re.search("s3_errors_total(\{.*\})?\s(\d+)(\s.*)?", line) if x != None: result["errors"] += int(x.group(2)) - x = re.search(r"s3_rx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) + x = re.search("s3_rx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) if x != None: result["tx_bytes"] += float(x.group(2)) - x = re.search(r"s3_tx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) + x = re.search("s3_tx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) if x != None: result["rx_bytes"] += float(x.group(2)) return result @@ -128,10 +128,8 @@ def get_query_stat(instance, hint): def get_minio_size(cluster): minio = cluster.minio_client size = 0 - for obj_level1 in minio.list_objects( - cluster.minio_bucket, prefix="data/", recursive=True - ): - size += obj_level1.size + for obj in minio.list_objects(cluster.minio_bucket, "data/"): + size += obj.size return size @@ -147,7 +145,7 @@ def test_profile_events(cluster): metrics0 = get_s3_events(instance) minio0 = get_minio_stat(cluster) - query1 = "CREATE TABLE test_s3.test_s3 (key UInt32, value UInt32) ENGINE=MergeTree PRIMARY KEY key ORDER BY key SETTINGS storage_policy = 's3'" + query1 = "CREATE TABLE test_s3.test_s3 (key UInt32, value UInt32) ENGINE=MergeTree PRIMARY KEY key ORDER BY key SETTINGS storage_policy='s3'" instance.query(query1) size1 = get_minio_size(cluster) @@ -169,7 +167,7 @@ def test_profile_events(cluster): metrics1["WriteBufferFromS3Bytes"] - metrics0["WriteBufferFromS3Bytes"] == size1 ) - query2 = "INSERT INTO test_s3.test_s3 VALUES" + query2 = "INSERT INTO test_s3.test_s3 FORMAT Values" instance.query(query2 + " (1,1)") size2 = get_minio_size(cluster) @@ -184,12 +182,9 @@ def test_profile_events(cluster): metrics2["S3WriteRequestsCount"] - metrics1["S3WriteRequestsCount"] == minio2["set_requests"] - minio1["set_requests"] ) - stat2 = get_query_stat(instance, query2) - for metric in stat2: assert stat2[metric] == metrics2[metric] - metrics1[metric] - assert ( metrics2["WriteBufferFromS3Bytes"] - metrics1["WriteBufferFromS3Bytes"] == size2 - size1 @@ -210,7 +205,6 @@ def test_profile_events(cluster): == minio3["set_requests"] - minio2["set_requests"] ) stat3 = get_query_stat(instance, query3) - # With async reads profile events are not updated fully because reads are done in a separate thread. 
# for metric in stat3: # print(metric) diff --git a/tests/integration/test_replicated_merge_tree_s3/test.py b/tests/integration/test_replicated_merge_tree_s3/test.py index 0d978bb6967..37027d07969 100644 --- a/tests/integration/test_replicated_merge_tree_s3/test.py +++ b/tests/integration/test_replicated_merge_tree_s3/test.py @@ -113,7 +113,7 @@ def drop_table(cluster): minio = cluster.minio_client # Remove extra objects to prevent tests cascade failing - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -130,9 +130,9 @@ def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_par insert(cluster, node_idxs=[1, 2, 3], verify=True) minio = cluster.minio_client - assert len( - list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)) - ) == 3 * (FILES_OVERHEAD + files_per_part * 3) + assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == 3 * ( + FILES_OVERHEAD + files_per_part * 3 + ) def test_drop_cache_on_cluster(cluster): diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py index 60a1b9b9746..73b611ad169 100644 --- a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py @@ -87,7 +87,7 @@ def drop_table(cluster): minio = cluster.minio_client # Remove extra objects to prevent tests cascade failing - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -124,6 +124,6 @@ def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_par ) minio = cluster.minio_client - assert len( - list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)) - ) == (3 * FILES_OVERHEAD) + (files_per_part * 3) + assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == ( + 3 * FILES_OVERHEAD + ) + (files_per_part * 3) diff --git a/tests/integration/test_s3_aws_sdk_is_total_garbage/__init__.py b/tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/__init__.py similarity index 100% rename from tests/integration/test_s3_aws_sdk_is_total_garbage/__init__.py rename to tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/__init__.py diff --git a/tests/integration/test_s3_aws_sdk_is_total_garbage/configs/storage_conf.xml b/tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/configs/storage_conf.xml similarity index 100% rename from tests/integration/test_s3_aws_sdk_is_total_garbage/configs/storage_conf.xml rename to tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/configs/storage_conf.xml diff --git a/tests/integration/test_s3_aws_sdk_is_total_garbage/configs/upload_min_size.xml b/tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/configs/upload_min_size.xml similarity index 100% rename from tests/integration/test_s3_aws_sdk_is_total_garbage/configs/upload_min_size.xml rename to tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/configs/upload_min_size.xml diff --git a/tests/integration/test_s3_aws_sdk_is_total_garbage/s3_endpoint/endpoint.py b/tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/s3_endpoint/endpoint.py 
similarity index 100% rename from tests/integration/test_s3_aws_sdk_is_total_garbage/s3_endpoint/endpoint.py rename to tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/s3_endpoint/endpoint.py diff --git a/tests/integration/test_s3_aws_sdk_is_total_garbage/test.py b/tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/test.py similarity index 100% rename from tests/integration/test_s3_aws_sdk_is_total_garbage/test.py rename to tests/integration/test_s3_aws_sdk_has_slightly_unreliable_behaviour/test.py diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 860b83d4ed1..7b7fb9d21ad 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -39,9 +39,7 @@ def cluster(): def get_large_objects_count(cluster, size=100, folder="data"): minio = cluster.minio_client counter = 0 - for obj in minio.list_objects( - cluster.minio_bucket, "{}/".format(folder), recursive=True - ): + for obj in minio.list_objects(cluster.minio_bucket, "{}/".format(folder)): if obj.size is not None and obj.size >= size: counter = counter + 1 return counter diff --git a/tests/integration/test_send_crash_reports/test.py b/tests/integration/test_send_crash_reports/test.py index 90a6c684de7..83c0827f891 100644 --- a/tests/integration/test_send_crash_reports/test.py +++ b/tests/integration/test_send_crash_reports/test.py @@ -36,8 +36,10 @@ def started_node(): def test_send_segfault(started_node): + # NOTE: another option is to increase waiting time. if ( started_node.is_built_with_thread_sanitizer() + or started_node.is_built_with_address_sanitizer() or started_node.is_built_with_memory_sanitizer() ): pytest.skip("doesn't fit in timeouts for stacktrace generation") diff --git a/tests/jepsen.clickhouse-keeper/resources/keeper_config.xml b/tests/jepsen.clickhouse-keeper/resources/keeper_config.xml index 2ab747fbd71..677de5f6769 100644 --- a/tests/jepsen.clickhouse-keeper/resources/keeper_config.xml +++ b/tests/jepsen.clickhouse-keeper/resources/keeper_config.xml @@ -21,7 +21,8 @@ 1000 2000 4000 - {quorum_reads} + 0 + fastlinear {snapshot_distance} {stale_log_gap} {reserved_log_items} diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj index 60b29bd799a..e6e94371501 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj @@ -27,7 +27,12 @@ (invoke! 
[this test op] (case (:f op) - :read (exec-with-retries 30 (fn [] + :read (try + (assoc op + :type :ok + :value (count (zk-list conn root-path))) + (catch Exception _ (assoc op :type :info, :error :connect-error))) + :final-read (exec-with-retries 30 (fn [] (assoc op :type :ok :value (count (zk-list conn root-path))))) @@ -49,7 +54,5 @@ :checker (checker/compose {:counter (checker/counter) :perf (checker/perf)}) - :generator (->> (range) - (map (fn [x] - (->> (gen/mix [r add]))))) - :final-generator (gen/once {:type :invoke, :f :read, :value nil})}) + :generator (gen/mix [r add]) + :final-generator (gen/once {:type :invoke, :f :final-read, :value nil})}) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj index c354e36e430..9e85b37dd75 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj @@ -98,7 +98,6 @@ #"\{srv2\}" (get nodes 1) #"\{srv3\}" (get nodes 2) #"\{id\}" (str (inc (.indexOf nodes node))) - #"\{quorum_reads\}" (str (boolean (:quorum test))) #"\{snapshot_distance\}" (str (:snapshot-distance test)) #"\{stale_log_gap\}" (str (:stale-log-gap test)) #"\{reserved_log_items\}" (str (:reserved-log-items test))}] diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj index cd1aa540e45..1919c8ce3ec 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj @@ -103,7 +103,7 @@ current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] (merge tests/noop-test opts - {:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts))) + {:name (str "clickhouse-keeper-" (name (:workload opts)) "-" (name (:nemesis opts))) :os ubuntu/os :db (get-db opts) :pure-generators true diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj index a1605192b51..228cb3f46ef 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj @@ -20,7 +20,8 @@ (assoc this :conn (zk-connect node 9181 30000))) (setup! [this test] - (zk-create-range conn 300)) ; 300 nodes to be sure + (exec-with-retries 30 (fn [] + (zk-create-range conn 300)))) (invoke! 
[_ test op] (let [[k v] (:value op) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj index 3625b24b4f9..cdb25ba0a2d 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj @@ -45,7 +45,7 @@ (defn zk-connect [host port timeout] - (exec-with-retries 30 (fn [] (zk/connect (str host ":" port) :timeout-msec timeout)))) + (zk/connect (str host ":" port) :timeout-msec timeout)) (defn zk-create-range [conn n] diff --git a/tests/performance/avg_weighted.xml b/tests/performance/avg_weighted.xml index df992ad682a..5aa89b08c35 100644 --- a/tests/performance/avg_weighted.xml +++ b/tests/performance/avg_weighted.xml @@ -32,5 +32,21 @@ SELECT avgWeighted(num_u, num) FROM perf_avg FORMAT Null SELECT avgWeighted(num_u, num_u) FROM perf_avg FORMAT Null + SELECT avgWeighted(num_f, num_f) FROM perf_avg FORMAT Null + SELECT avgWeighted(toNullable(num_f), num_f) FROM perf_avg FORMAT Null + SELECT avgWeighted(num_f, toNullable(num_f)) FROM perf_avg FORMAT Null + SELECT avgWeighted(toNullable(num_f), toNullable(num_f)) FROM perf_avg FORMAT Null + + SELECT avgWeightedIf(num_f, num_f, num % 10) FROM perf_avg FORMAT Null + SELECT avgWeightedIf(toNullable(num_f), num_f, num % 10) FROM perf_avg FORMAT Null + SELECT avgWeightedIf(num_f, toNullable(num_f), num % 10) FROM perf_avg FORMAT Null + SELECT avgWeightedIf(toNullable(num_f), toNullable(num_f), num % 10) FROM perf_avg FORMAT Null + + SELECT avgWeightedIf(num_f, num_f, toNullable(num) % 10) FROM perf_avg FORMAT Null + SELECT avgWeightedIf(toNullable(num_f), num_f, toNullable(num) % 10) FROM perf_avg FORMAT Null + SELECT avgWeightedIf(num_f, toNullable(num_f), toNullable(num) % 10) FROM perf_avg FORMAT Null + SELECT avgWeightedIf(toNullable(num_f), toNullable(num_f), toNullable(num) % 10) FROM perf_avg FORMAT Null + + DROP TABLE IF EXISTS perf_avg diff --git a/tests/performance/uniq_stored.xml b/tests/performance/uniq_stored.xml new file mode 100644 index 00000000000..75fb9847aab --- /dev/null +++ b/tests/performance/uniq_stored.xml @@ -0,0 +1,58 @@ + + + create table matview_1 + ( + a String, + b_count AggregateFunction(uniq, UInt64) + ) Engine=MergeTree partition by tuple() + ORDER by tuple() + SETTINGS index_granularity = 1024; + + + + create table matview_10000 + ( + a String, + b_count AggregateFunction(uniq, String) + ) Engine=MergeTree partition by tuple() + ORDER by tuple() + SETTINGS index_granularity = 1024; + + + + DROP TABLE IF EXISTS matview_1 + DROP TABLE IF EXISTS matview_10000 + + + INSERT INTO matview_10000 + SELECT a, uniqState(b) b_count + FROM + ( + SELECT toString(rand() % 1000) a, toString(number % 10000) b + FROM numbers_mt(20000000) + ) + GROUP BY a + SETTINGS max_insert_threads=8; + + OPTIMIZE TABLE matview_10000 FINAL + + + INSERT INTO matview_1 + SELECT '1', uniqState(number) b_count + FROM + ( + SELECT * + FROM numbers_mt(2000000) + ) + GROUP BY number + SETTINGS max_insert_threads=8; + + OPTIMIZE TABLE matview_1 FINAL + + + select a, uniqMerge(b_count) as b_count from matview_10000 prewhere a='55' group by a FORMAT Null SETTINGS max_threads=1; + select uniqMerge(b_count) as b_count from matview_10000 FORMAT Null SETTINGS max_threads=1; + + + select uniqMerge(b_count) as b_count FROM matview_1 FORMAT Null SETTINGS max_threads=1; + diff --git a/tests/queries/0_stateless/01161_all_system_tables.sh 
b/tests/queries/0_stateless/01161_all_system_tables.sh index 9988c1f3625..1d886374c07 100755 --- a/tests/queries/0_stateless/01161_all_system_tables.sh +++ b/tests/queries/0_stateless/01161_all_system_tables.sh @@ -16,7 +16,7 @@ function run_selects() { thread_num=$1 readarray -t tables_arr < <(${CLICKHOUSE_CLIENT} -q "SELECT database || '.' || name FROM system.tables - WHERE database in ('system', 'information_schema', 'INFORMATION_SCHEMA') and name!='zookeeper' and name!='merge_tree_metadata_cache' + WHERE database in ('system', 'information_schema', 'INFORMATION_SCHEMA') and name!='zookeeper' and name!='merge_tree_metadata_cache' and name!='models' AND sipHash64(name || toString($RAND)) % $THREADS = $thread_num") for t in "${tables_arr[@]}" diff --git a/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.reference b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.reference new file mode 100644 index 00000000000..0c720206065 --- /dev/null +++ b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.reference @@ -0,0 +1,2 @@ +2020-01-01 01:00:00 1 +2020-01-01 01:00:00 999 diff --git a/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.sql b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.sql new file mode 100644 index 00000000000..4aa52fe6ae8 --- /dev/null +++ b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.sql @@ -0,0 +1,7 @@ +SELECT + toStartOfHour(c1) AS _c1, + c2 +FROM values((toDateTime('2020-01-01 01:01:01'), 999), (toDateTime('2020-01-01 01:01:59'), 1)) +ORDER BY + _c1 ASC, + c2 ASC diff --git a/tests/queries/0_stateless/01576_alias_column_rewrite.sql b/tests/queries/0_stateless/01576_alias_column_rewrite.sql index 8424eb11f9b..1f28225bef8 100644 --- a/tests/queries/0_stateless/01576_alias_column_rewrite.sql +++ b/tests/queries/0_stateless/01576_alias_column_rewrite.sql @@ -17,7 +17,7 @@ INSERT INTO test_table(timestamp, value) SELECT toDateTime('2020-01-01 12:00:00' INSERT INTO test_table(timestamp, value) SELECT toDateTime('2020-01-02 12:00:00'), 1 FROM numbers(10); INSERT INTO test_table(timestamp, value) SELECT toDateTime('2020-01-03 12:00:00'), 1 FROM numbers(10); -set optimize_respect_aliases = 1; +set optimize_respect_aliases = 1, optimize_monotonous_functions_in_order_by = 1; SELECT 'test-partition-prune'; SELECT COUNT() = 10 FROM test_table WHERE day = '2020-01-01' SETTINGS max_rows_to_read = 10; diff --git a/tests/queries/0_stateless/01646_rewrite_sum_if_bug.reference b/tests/queries/0_stateless/01646_rewrite_sum_if_bug.reference new file mode 100644 index 00000000000..dda2df4fd48 --- /dev/null +++ b/tests/queries/0_stateless/01646_rewrite_sum_if_bug.reference @@ -0,0 +1,2 @@ +67 +0 100 diff --git a/tests/queries/0_stateless/01646_rewrite_sum_if_bug.sql b/tests/queries/0_stateless/01646_rewrite_sum_if_bug.sql new file mode 100644 index 00000000000..3e6a7b92dbf --- /dev/null +++ b/tests/queries/0_stateless/01646_rewrite_sum_if_bug.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS t; +create table t( s String ) Engine=Memory as select arrayJoin (['a','b','c']); + +SELECT round((sum(multiIf(s IN ('a', 'b'), 1, 0)) / count()) * 100) AS r +FROM cluster('test_cluster_two_shards', currentDatabase(), t); + +DROP TABLE t; + + +DROP TABLE IF EXISTS test_alias; + +CREATE TABLE test_alias(`a` Int64, `b` Int64, `c` Int64, `day` Date, `rtime` DateTime) ENGINE = Memory +as select 0, 0, 0, '2022-01-01', 0 from zeros(10); + +WITH + sum(if((a >= 0) AND (b != 100) AND (c = 0), 1, 0)) 
AS r1, + sum(if((a >= 0) AND (b != 100) AND (c > 220), 1, 0)) AS r2 +SELECT + (intDiv(toUInt32(rtime), 20) * 20) * 1000 AS t, + (r1 * 100) / (r1 + r2) AS m +FROM cluster('test_cluster_two_shards', currentDatabase(), test_alias) +WHERE day = '2022-01-01' +GROUP BY t +ORDER BY t ASC; + +DROP TABLE test_alias; diff --git a/tests/queries/0_stateless/01825_type_json_add_column.reference.j2 b/tests/queries/0_stateless/01825_type_json_add_column.reference.j2 new file mode 100644 index 00000000000..da724aef01a --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_add_column.reference.j2 @@ -0,0 +1,6 @@ +{% for storage in ["MergeTree", "ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_add_column/', 'r1')"] -%} +{"id":"1","s":{"k1":0}} +{"id":"2","s":{"k1":100}} +{"id":"1"} +{"id":"2"} +{% endfor -%} diff --git a/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 b/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 new file mode 100644 index 00000000000..87c76c042a6 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 @@ -0,0 +1,23 @@ +-- Tags: no-fasttest + +{% for storage in ["MergeTree", "ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_add_column/', 'r1')"] -%} + +DROP TABLE IF EXISTS t_json_add_column; +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json_add_column (id UInt64) ENGINE = {{ storage }} ORDER BY tuple(); + +INSERT INTO t_json_add_column VALUES (1); +ALTER TABLE t_json_add_column ADD COLUMN s JSON; + +INSERT INTO t_json_add_column VALUES(2, '{"k1": 100}'); + +SELECT * FROM t_json_add_column ORDER BY id FORMAT JSONEachRow; + +ALTER TABLE t_json_add_column DROP COLUMN s; + +SELECT * FROM t_json_add_column ORDER BY id FORMAT JSONEachRow; + +DROP TABLE t_json_add_column; + +{% endfor -%} diff --git a/tests/integration/test_catboost_model_first_evaluate/__init__.py b/tests/queries/0_stateless/02070_join_on_disk.reference similarity index 100% rename from tests/integration/test_catboost_model_first_evaluate/__init__.py rename to tests/queries/0_stateless/02070_join_on_disk.reference diff --git a/tests/queries/0_stateless/02070_join_on_disk.sql b/tests/queries/0_stateless/02070_join_on_disk.sql new file mode 100644 index 00000000000..eabf31df25f --- /dev/null +++ b/tests/queries/0_stateless/02070_join_on_disk.sql @@ -0,0 +1,21 @@ +-- Regression test when Join stores data on disk and receive empty block. +-- Because of this it does not create empty file, while expect it. + +SET max_threads = 1; +SET join_algorithm = 'auto'; +SET max_rows_in_join = 1000; +SET optimize_aggregation_in_order = 1; +SET max_block_size = 1000; + +DROP TABLE IF EXISTS join_on_disk; + +SYSTEM STOP MERGES join_on_disk; + +CREATE TABLE join_on_disk (id Int) Engine=MergeTree() ORDER BY id; + +INSERT INTO join_on_disk SELECT number as id FROM numbers_mt(50000); +INSERT INTO join_on_disk SELECT number as id FROM numbers_mt(1000); + +SELECT id FROM join_on_disk lhs LEFT JOIN (SELECT id FROM join_on_disk GROUP BY id) rhs USING (id) FORMAT Null; + +DROP TABLE join_on_disk; diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 9e2f676bb55..d087bb55622 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -364,18 +364,6 @@ CREATE TABLE system.metrics ) ENGINE = SystemMetrics COMMENT 'SYSTEM TABLE is built on the fly.' 
-CREATE TABLE system.models -( - `name` String, - `status` Enum8('NOT_LOADED' = 0, 'LOADED' = 1, 'FAILED' = 2, 'LOADING' = 3, 'FAILED_AND_RELOADING' = 4, 'LOADED_AND_RELOADING' = 5, 'NOT_EXIST' = 6), - `origin` String, - `type` String, - `loading_start_time` DateTime, - `loading_duration` Float32, - `last_exception` String -) -ENGINE = SystemModels -COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.mutations ( `database` String, diff --git a/tests/queries/0_stateless/02117_show_create_table_system.sql b/tests/queries/0_stateless/02117_show_create_table_system.sql index 9a5726a0780..8b75ed60eec 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.sql +++ b/tests/queries/0_stateless/02117_show_create_table_system.sql @@ -45,7 +45,6 @@ show create table macros format TSVRaw; show create table merge_tree_settings format TSVRaw; show create table merges format TSVRaw; show create table metrics format TSVRaw; -show create table models format TSVRaw; show create table mutations format TSVRaw; show create table numbers format TSVRaw; show create table numbers_mt format TSVRaw; diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql index 4dfcbb9bf80..44c1c12be35 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql @@ -56,7 +56,13 @@ ENGINE = MergeTree ORDER BY (toStartOfDay(dt), d); INSERT INTO t_read_in_order SELECT toDateTime('2020-10-10 00:00:00') + number, 1 / (number % 100 + 1), number FROM numbers(1000); EXPLAIN PIPELINE SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 5; -SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 5; +SELECT * from ( + SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 50000000000 + -- subquery with limit 50000000 to stabilize a test result and prevent order by d pushdown +) order by d limit 5; EXPLAIN PIPELINE SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 5; -SELECT toStartOfDay(dt) as date, d FROM t_read_in_order WHERE date = '2020-10-10' ORDER BY round(d) LIMIT 5; +SELECT * from ( + SELECT toStartOfDay(dt) as date, d FROM t_read_in_order WHERE date = '2020-10-10' ORDER BY round(d) LIMIT 50000000000 + -- subquery with limit 50000000 to stabilize a test result and prevent order by d pushdown +) order by d limit 5; diff --git a/tests/queries/0_stateless/02293_grouping_function.reference b/tests/queries/0_stateless/02293_grouping_function.reference index e71d6812ab5..7d745a0e0fa 100644 --- a/tests/queries/0_stateless/02293_grouping_function.reference +++ b/tests/queries/0_stateless/02293_grouping_function.reference @@ -8,7 +8,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 1 0 1 0 2 @@ -30,7 +31,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 1 0 2 0 2 @@ -52,7 +54,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -73,7 +76,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, grouping(number, number % 2) = 1; +ORDER BY number, grouping(number, number % 2) = 1 +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 @@ -97,7 +101,8 @@ GROUP BY 
(number, number % 2), () ) -ORDER BY (gr, number); +ORDER BY (gr, number) +SETTINGS force_grouping_standard_compatibility=0; 0 10 0 0 1 2 1 1 2 @@ -129,7 +134,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 2 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; 0 1 2 @@ -150,7 +155,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 1 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; 0 0 SELECT @@ -161,7 +166,8 @@ GROUP BY GROUPING SETS ( (number), (number % 2)) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 diff --git a/tests/queries/0_stateless/02293_grouping_function.sql b/tests/queries/0_stateless/02293_grouping_function.sql index 169fc09c324..cf076c8e51c 100644 --- a/tests/queries/0_stateless/02293_grouping_function.sql +++ b/tests/queries/0_stateless/02293_grouping_function.sql @@ -19,7 +19,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -30,7 +31,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -41,7 +43,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number @@ -51,7 +54,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, grouping(number, number % 2) = 1; +ORDER BY number, grouping(number, number % 2) = 1 +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -64,7 +68,8 @@ GROUP BY (number, number % 2), () ) -ORDER BY (gr, number); +ORDER BY (gr, number) +SETTINGS force_grouping_standard_compatibility=0; SELECT number @@ -76,7 +81,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 2 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; SELECT number @@ -88,7 +93,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 1 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; SELECT number, @@ -98,4 +103,5 @@ GROUP BY GROUPING SETS ( (number), (number % 2)) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; diff --git a/tests/queries/0_stateless/02293_grouping_function_group_by.reference b/tests/queries/0_stateless/02293_grouping_function_group_by.reference index 7f87aecd4bd..49cdca1411e 100644 --- a/tests/queries/0_stateless/02293_grouping_function_group_by.reference +++ b/tests/queries/0_stateless/02293_grouping_function_group_by.reference @@ -6,7 +6,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; 0 1 1 1 2 1 @@ -25,7 +26,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; 0 1 1 1 1 1 2 1 1 @@ -45,7 +47,8 @@ GROUP BY number % 2 WITH ROLLUP ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 2 0 3 @@ -74,7 +77,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, 
number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 2 0 3 @@ -105,7 +109,8 @@ GROUP BY number % 2 WITH CUBE ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -136,7 +141,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -168,7 +174,8 @@ GROUP BY CUBE(number, number % 2) HAVING grouping(number) != 0 ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 5 0 6 1 5 @@ -205,7 +212,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -247,7 +255,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 2 0 3 diff --git a/tests/queries/0_stateless/02293_grouping_function_group_by.sql b/tests/queries/0_stateless/02293_grouping_function_group_by.sql index 9bf9d43478b..d438a8a5277 100644 --- a/tests/queries/0_stateless/02293_grouping_function_group_by.sql +++ b/tests/queries/0_stateless/02293_grouping_function_group_by.sql @@ -15,7 +15,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -25,7 +26,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -36,7 +38,8 @@ GROUP BY number % 2 WITH ROLLUP ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -45,7 +48,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -56,7 +60,8 @@ GROUP BY number % 2 WITH CUBE ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -65,7 +70,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -75,7 +81,8 @@ GROUP BY CUBE(number, number % 2) HAVING grouping(number) != 0 ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -94,7 +101,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -113,4 +121,5 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; diff --git a/tests/queries/0_stateless/02315_grouping_constant_folding.reference b/tests/queries/0_stateless/02315_grouping_constant_folding.reference index 5aa979b1453..6e591de2661 100644 --- a/tests/queries/0_stateless/02315_grouping_constant_folding.reference +++ b/tests/queries/0_stateless/02315_grouping_constant_folding.reference @@ -1,5 +1,5 @@ -- { echoOn } -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); +SELECT count() AS 
amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; 1 0 0 3 1 0 2 3 1 0 4 3 @@ -13,7 +13,7 @@ SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING 5 0 0 2 5 1 0 2 10 0 0 0 -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; 1 0 0 3 1 0 2 3 1 0 4 3 diff --git a/tests/queries/0_stateless/02315_grouping_constant_folding.sql b/tests/queries/0_stateless/02315_grouping_constant_folding.sql index c4ef087a308..ff259b7be79 100644 --- a/tests/queries/0_stateless/02315_grouping_constant_folding.sql +++ b/tests/queries/0_stateless/02315_grouping_constant_folding.sql @@ -5,9 +5,9 @@ CREATE TABLE test02315(a UInt64, b UInt64) ENGINE=MergeTree() ORDER BY (a, b); INSERT INTO test02315 SELECT number % 2 as a, number as b FROM numbers(10); -- { echoOn } -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; -- { echoOff } DROP TABLE test02315; diff --git a/tests/queries/0_stateless/02374_combine_multi_if_and_count_if_opt.sql b/tests/queries/0_stateless/02374_combine_multi_if_and_count_if_opt.sql index 06e1e4bfd55..c3367873042 100644 --- a/tests/queries/0_stateless/02374_combine_multi_if_and_count_if_opt.sql +++ b/tests/queries/0_stateless/02374_combine_multi_if_and_count_if_opt.sql @@ -4,6 +4,10 @@ create table m (a int) engine Log; insert into m values (1); +set optimize_rewrite_sum_if_to_count_if=1; + explain syntax select sum(multiIf(a = 1, 1, 0)) from m; +set optimize_rewrite_sum_if_to_count_if=0; + drop table m; diff --git a/tests/queries/0_stateless/02413_model_evaluate_smoke.reference b/tests/queries/0_stateless/02413_model_evaluate_smoke.reference deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/queries/0_stateless/02413_model_evaluate_smoke.sql b/tests/queries/0_stateless/02413_model_evaluate_smoke.sql deleted file mode 100644 index 3b20067abfe..00000000000 --- a/tests/queries/0_stateless/02413_model_evaluate_smoke.sql +++ /dev/null @@ -1,2 +0,0 @@ --- This model does not exist: -SELECT modelEvaluate('hello', 1, 2, 3); -- { serverError 36 } diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index cbd92d0e8f4..6e0e41f11b8 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -192,6 +192,7 @@ caseWithExpr caseWithExpression caseWithoutExpr caseWithoutExpression +catboostEvaluate cbrt ceil char @@ -475,7 +476,6 @@ min2 minSampleSizeContinous minSampleSizeConversion minus -modelEvaluate modulo moduloLegacy moduloOrZero diff --git 
a/tests/queries/0_stateless/02416_grouping_function_compatibility.reference b/tests/queries/0_stateless/02416_grouping_function_compatibility.reference new file mode 100644 index 00000000000..c9a3ad2f593 --- /dev/null +++ b/tests/queries/0_stateless/02416_grouping_function_compatibility.reference @@ -0,0 +1,29 @@ +-- { echoOn } +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); +1 0 0 0 +1 0 2 0 +1 0 4 0 +1 0 6 0 +1 0 8 0 +1 1 1 0 +1 1 3 0 +1 1 5 0 +1 1 7 0 +1 1 9 0 +5 0 0 1 +5 1 0 1 +10 0 0 3 +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); +1 0 0 0 +1 0 2 0 +1 0 4 0 +1 0 6 0 +1 0 8 0 +1 1 1 0 +1 1 3 0 +1 1 5 0 +1 1 7 0 +1 1 9 0 +5 0 0 1 +5 1 0 1 +10 0 0 3 diff --git a/tests/queries/0_stateless/02416_grouping_function_compatibility.sql b/tests/queries/0_stateless/02416_grouping_function_compatibility.sql new file mode 100644 index 00000000000..ed21055ade5 --- /dev/null +++ b/tests/queries/0_stateless/02416_grouping_function_compatibility.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS test02416; + +CREATE TABLE test02416(a UInt64, b UInt64) ENGINE=MergeTree() ORDER BY (a, b); + +INSERT INTO test02416 SELECT number % 2 as a, number as b FROM numbers(10); + +-- { echoOn } +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); + +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); + +-- { echoOff } +DROP TABLE test02416; + diff --git a/tests/queries/0_stateless/02417_null_variadic_behaviour.reference b/tests/queries/0_stateless/02417_null_variadic_behaviour.reference new file mode 100644 index 00000000000..bedb69f99b0 --- /dev/null +++ b/tests/queries/0_stateless/02417_null_variadic_behaviour.reference @@ -0,0 +1,65 @@ +-- { echo } +SELECT avgWeighted(number, number) t, toTypeName(t) FROM numbers(1); +nan Float64 +SELECT avgWeighted(number, number + 1) t, toTypeName(t) FROM numbers(0); +nan Float64 +SELECT avgWeighted(toNullable(number), number) t, toTypeName(t) FROM numbers(1); +nan Nullable(Float64) +SELECT avgWeighted(if(number < 10000, NULL, number), number) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeighted(if(number < 50, NULL, number), number) t, toTypeName(t) FROM numbers(100); +77.29530201342281 Nullable(Float64) +SELECT avgWeighted(number, if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeighted(number, if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); +77.29530201342281 Nullable(Float64) +SELECT avgWeighted(toNullable(number), if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeighted(toNullable(number), if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); +77.29530201342281 Nullable(Float64) +SELECT avgWeighted(if(number < 10000, NULL, number), toNullable(number)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeighted(if(number < 50, NULL, number), toNullable(number)) t, toTypeName(t) FROM numbers(100); +77.29530201342281 Nullable(Float64) +SELECT avgWeighted(if(number < 10000, NULL, number), if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeighted(if(number < 50, NULL, number), if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT 
avgWeighted(if(number < 10000, NULL, number), if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeighted(if(number < 50, NULL, number), if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); +77.29530201342281 Nullable(Float64) +SELECT avgWeightedIf(number, number, number % 10) t, toTypeName(t) FROM numbers(100); +66.63333333333334 Float64 +SELECT avgWeightedIf(number, number, toNullable(number % 10)) t, toTypeName(t) FROM numbers(100); +66.63333333333334 Float64 +SELECT avgWeightedIf(number, number, if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +nan Float64 +SELECT avgWeightedIf(number, number, if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +77.75555555555556 Float64 +SELECT avgWeightedIf(number, number, if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +66.63333333333334 Float64 +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 10000, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 10000, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 50, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 50, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 10000, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 10000, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 50, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 50, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +77.75555555555556 Nullable(Float64) +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 10000, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 10000, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 50, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +\N Nullable(Float64) +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 50, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +77.75555555555556 Nullable(Float64) diff --git a/tests/queries/0_stateless/02417_null_variadic_behaviour.sql b/tests/queries/0_stateless/02417_null_variadic_behaviour.sql new file mode 100644 index 00000000000..566cf27bb90 --- /dev/null +++ b/tests/queries/0_stateless/02417_null_variadic_behaviour.sql @@ -0,0 +1,41 @@ +-- { echo } +SELECT avgWeighted(number, number) t, toTypeName(t) FROM numbers(1); +SELECT avgWeighted(number, number + 1) t, 
toTypeName(t) FROM numbers(0); + +SELECT avgWeighted(toNullable(number), number) t, toTypeName(t) FROM numbers(1); +SELECT avgWeighted(if(number < 10000, NULL, number), number) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(if(number < 50, NULL, number), number) t, toTypeName(t) FROM numbers(100); + +SELECT avgWeighted(number, if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(number, if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); + +SELECT avgWeighted(toNullable(number), if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(toNullable(number), if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(if(number < 10000, NULL, number), toNullable(number)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(if(number < 50, NULL, number), toNullable(number)) t, toTypeName(t) FROM numbers(100); + +SELECT avgWeighted(if(number < 10000, NULL, number), if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(if(number < 50, NULL, number), if(number < 10000, NULL, number)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(if(number < 10000, NULL, number), if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeighted(if(number < 50, NULL, number), if(number < 50, NULL, number)) t, toTypeName(t) FROM numbers(100); + +SELECT avgWeightedIf(number, number, number % 10) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(number, number, toNullable(number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(number, number, if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(number, number, if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(number, number, if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); + +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 10000, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 10000, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 50, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 50, NULL, number), if(number < 10000, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); + +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 10000, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 10000, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 50, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 50, NULL, number), if(number < 50, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); + +SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 10000, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 10000, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); 
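-- The if(number < N, NULL, ...) wrappers in these statements are what force Nullable
-- argument types: with N = 10000 every row of numbers(100) is NULL, so the aggregate
-- itself returns NULL, while with N = 50 only the first 50 rows are NULL and they are
-- skipped. A minimal illustration taken from the reference output above (a reader's
-- sketch, not part of the committed test file):
-- SELECT avgWeighted(if(number < 10000, NULL, number), number) t, toTypeName(t) FROM numbers(100); -- \N                 Nullable(Float64)
-- SELECT avgWeighted(if(number < 50, NULL, number), number) t, toTypeName(t) FROM numbers(100);    -- 77.29530201342281  Nullable(Float64)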
+SELECT avgWeightedIf(if(number < 10000, NULL, number), if(number < 50, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); +SELECT avgWeightedIf(if(number < 50, NULL, number), if(number < 50, NULL, number), if(number < 0, NULL, number % 10)) t, toTypeName(t) FROM numbers(100); diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference new file mode 100644 index 00000000000..dde07d4540d --- /dev/null +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference @@ -0,0 +1,8 @@ +{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"void DB::StorageDistributedDirectoryMonitor::processFile(const std::string &)","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"void DB::StorageDistributedDirectoryMonitor::processFile(const std::string &)","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh new file mode 100755 index 00000000000..9ac5f061d4a --- /dev/null +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, distributed + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +# This function takes 4 arguments: +# $1 - OpenTelemetry Trace Id +# $2 - value of insert_distributed_sync +# $3 - value of prefer_localhost_replica +# $4 - a String that helps to debug +function insert() +{ + echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=$2, prefer_localhost_replica=$3 VALUES(1),(2)" | + ${CLICKHOUSE_CURL} \ + -X POST \ + -H "traceparent: 00-$1-5150000000000515-01" \ + -H "tracestate: $4" \ + "${CLICKHOUSE_URL}" \ + --data @- +} + +function check_span() +{ +${CLICKHOUSE_CLIENT} -nq " + SYSTEM FLUSH LOGS; + + SELECT operation_name, + attribute['clickhouse.cluster'] AS cluster, + attribute['clickhouse.shard_num'] AS shard, + attribute['clickhouse.rows'] AS rows, + attribute['clickhouse.bytes'] AS bytes + FROM system.opentelemetry_span_log + WHERE finish_date >= yesterday() + AND lower(hex(trace_id)) = '${1}' + AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' + AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' + ORDER BY attribute['clickhouse.shard_num'] + Format JSONEachRow + ;" +} + + +# +# Prepare tables for tests +# +${CLICKHOUSE_CLIENT} -nq " +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.dist_opentelemetry; +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.local_opentelemetry; + +CREATE TABLE ${CLICKHOUSE_DATABASE}.dist_opentelemetry (key UInt64) Engine=Distributed('test_cluster_two_shards_localhost', ${CLICKHOUSE_DATABASE}, local_opentelemetry, key % 2); +CREATE TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry (key UInt64) Engine=MergeTree ORDER BY key; +" + +# +# test1 +# +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +insert $trace_id 0 1 "async-insert-writeToLocal" +check_span $trace_id + +# +# test2 +# +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +insert $trace_id 0 0 "async-insert-writeToRemote" +check_span $trace_id + +# +# test3 +# +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +insert $trace_id 1 1 "sync-insert-writeToLocal" +check_span $trace_id + +# +# test4 +# +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +insert $trace_id 1 0 "sync-insert-writeToRemote" +check_span $trace_id + +# +# Cleanup +# +${CLICKHOUSE_CLIENT} -nq " +DROP TABLE ${CLICKHOUSE_DATABASE}.dist_opentelemetry; +DROP TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry; +" diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference new file mode 100644 index 00000000000..67ec09b70b7 --- /dev/null +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference @@ -0,0 +1,6 @@ +default data 2 Row 2:\nColumn 0, name: c1, type: UInt8, parsed text: "2"\nColumn 1, name: c2, type: UInt8, ERROR: text "ab,34,4" is not like UInt8 2,a +default data 3 Row 3:\nColumn 0, name: c1, type: UInt8, ERROR: text "b,34,45," is not like UInt8 b,3 +default data 5 Row 5:\nColumn 0, name: c1, type: UInt8, parsed text: "5"\nColumn 1, name: c2, type: UInt8, ERROR: text "c6,6" is not like UInt8 5,c +\N data 2 Row 2:\nColumn 0, name: A, type: UInt8, parsed text: "2"\nColumn 1, name: B, type: UInt8, ERROR: text "ab,34,4" is not like UInt8 2,a +\N data 3 Row 3:\nColumn 0, name: A, type: UInt8, ERROR: text "b,34,45," is not like UInt8 b,3 +\N data 5 Row 5:\nColumn 0, name: A, type: UInt8, parsed text: "5"\nColumn 1, name: B, type: UInt8, ERROR: text 
"c6,6" is not like UInt8 5,c diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh new file mode 100755 index 00000000000..dda61512936 --- /dev/null +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +set -eu + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Data preparation. + +CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/ +echo -e "1,1\n2,a\nb,3\n4,4\n5,c\n6,6" > ${CLICKHOUSE_USER_FILES_PATH}/a.csv + +${CLICKHOUSE_CLIENT} --query "drop table if exists data;" +${CLICKHOUSE_CLIENT} --query "create table data (A UInt8, B UInt8) engine=MergeTree() order by A;" + +# Server side +${CLICKHOUSE_CLIENT} --input_format_allow_errors_num 4 --input_format_record_errors_file_path "errors_server" --query "insert into data select * from file('a.csv', 'CSV', 'c1 UInt8, c2 UInt8');" +sleep 2 +${CLICKHOUSE_CLIENT} --query "select * except (time) from file('errors_server', 'CSV', 'time DateTime, database Nullable(String), table Nullable(String), offset UInt32, reason String, raw_data String');" + +# Client side +${CLICKHOUSE_CLIENT} --input_format_allow_errors_num 4 --input_format_record_errors_file_path "${CLICKHOUSE_USER_FILES_PATH}/errors_client" --query "insert into data(A, B) format CSV" < ${CLICKHOUSE_USER_FILES_PATH}/a.csv +sleep 2 +${CLICKHOUSE_CLIENT} --query "select * except (time) from file('errors_client', 'CSV', 'time DateTime, database Nullable(String), table Nullable(String), offset UInt32, reason String, raw_data String');" + +# Restore +${CLICKHOUSE_CLIENT} --query "drop table if exists data;" +rm ${CLICKHOUSE_USER_FILES_PATH}/a.csv +rm ${CLICKHOUSE_USER_FILES_PATH}/errors_server +rm ${CLICKHOUSE_USER_FILES_PATH}/errors_client + diff --git a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference new file mode 100644 index 00000000000..d167d905636 --- /dev/null +++ b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference @@ -0,0 +1,4 @@ +{"query":"show processlist format Null\n "} +{"query":"show databases format Null\n "} +{"query":"insert into opentelemetry_test values","read_rows":"3","written_rows":"3"} +{"query":"select * from opentelemetry_test format Null\n ","read_rows":"3","written_rows":""} diff --git a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh new file mode 100755 index 00000000000..98b571c5968 --- /dev/null +++ b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +# This function takes 2 arguments: +# $1 - query id +# $2 - query +function execute_query() +{ + ${CLICKHOUSE_CLIENT} --opentelemetry_start_trace_probability=1 --query_id $1 -nq " + ${2} + " +} + +# For some queries, it's not possible to know how many bytes/rows are read when tests are executed on CI, +# so we only to check the db.statement only +function check_query_span_query_only() +{ +${CLICKHOUSE_CLIENT} -nq " + SYSTEM FLUSH LOGS; + SELECT attribute['db.statement'] as query + FROM system.opentelemetry_span_log + WHERE finish_date >= yesterday() + AND operation_name = 'query' + AND attribute['clickhouse.query_id'] = '${1}' + Format JSONEachRow + ;" +} + +function check_query_span() +{ +${CLICKHOUSE_CLIENT} -nq " + SYSTEM FLUSH LOGS; + SELECT attribute['db.statement'] as query, + attribute['clickhouse.read_rows'] as read_rows, + attribute['clickhouse.written_rows'] as written_rows + FROM system.opentelemetry_span_log + WHERE finish_date >= yesterday() + AND operation_name = 'query' + AND attribute['clickhouse.query_id'] = '${1}' + Format JSONEachRow + ;" +} + +# +# Set up +# +${CLICKHOUSE_CLIENT} -nq " +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.opentelemetry_test; +CREATE TABLE ${CLICKHOUSE_DATABASE}.opentelemetry_test (id UInt64) Engine=MergeTree Order By id; +" + +# test 1, a query that has special path in the code +# Format Null is used to make sure no output is generated so that it won't pollute the reference file +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +execute_query $query_id 'show processlist format Null' +check_query_span_query_only "$query_id" + +# test 2, a normal show command +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +execute_query $query_id 'show databases format Null' +check_query_span_query_only "$query_id" + +# test 3, a normal insert query on local table +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +execute_query $query_id 'insert into opentelemetry_test values(1)(2)(3)' +check_query_span "$query_id" + +# test 4, a normal select query +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +execute_query $query_id 'select * from opentelemetry_test format Null' +check_query_span $query_id + + +# +# Tear down +# +${CLICKHOUSE_CLIENT} -q " +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.opentelemetry_test; +" \ No newline at end of file diff --git a/tests/integration/test_catboost_model_reload/__init__.py b/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.reference similarity index 100% rename from tests/integration/test_catboost_model_reload/__init__.py rename to tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.reference diff --git a/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql b/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql new file mode 100644 index 00000000000..4d790354d51 --- /dev/null +++ b/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql @@ -0,0 +1,4 @@ +-- Tags: no-parallel, no-fasttest + +insert into function file(02422_data.msgpack) select toUUID('f4cdd80d-5d15-4bdc-9527-adcca635ec1f') as uuid settings output_format_msgpack_uuid_representation='ext'; +select * from file(02422_data.msgpack, auto, 'x Int32'); -- {serverError ILLEGAL_COLUMN} diff --git a/utils/changelog/README.md b/utils/changelog/README.md index 8218af83d96..739229b49c9 100644 --- a/utils/changelog/README.md +++ b/utils/changelog/README.md @@ -13,6 +13,8 @@ python3 changelog.py -h Usage example: ``` +git fetch --tags # changelog.py 
depends on having the tags available, this will fetch them + python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$GITHUB_TOKEN" v21.6.2.7-prestable python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$USER" --gh-password="$PASSWORD" v21.6.2.7-prestable ``` diff --git a/utils/list-versions/update-docker-version.sh b/utils/list-versions/update-docker-version.sh new file mode 100755 index 00000000000..429da330a9f --- /dev/null +++ b/utils/list-versions/update-docker-version.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +# Take the latest version (first line of version_date.tsv) and write it into the server Dockerfiles +GIT_ROOT=$(git rev-parse --show-cdup) +GIT_ROOT=${GIT_ROOT:-.} +VERSION=$(sed -e '1 s/^v//; 1 s/-.*//p; d' "$GIT_ROOT"/utils/list-versions/version_date.tsv) + +find "$GIT_ROOT/docker/server/" -name 'Dockerfile.*' -print0 | xargs -0 sed -i "/^ARG VERSION=/ s/^.*$/ARG VERSION=\"$VERSION\"/" diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index f2c8cfc4c76..f7df0345842 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v22.8.5.29-lts 2022-09-13 v22.8.4.7-lts 2022-08-31 v22.8.3.13-lts 2022-08-29 v22.8.2.11-lts 2022-08-23
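A note on how the new update-docker-version.sh is meant to be used: it reads only the first line of utils/list-versions/version_date.tsv (stripping the leading "v" and everything after the first "-") and rewrites the `ARG VERSION=` default in the docker/server Dockerfiles. A minimal usage sketch, shown from the repository root (the script itself resolves the root via `git rev-parse --show-cdup`, so other working directories also work); the exact invocation is an assumption for illustration, not prescribed by the patch:

```
# After v22.8.5.29-lts is prepended to version_date.tsv, refresh the server Dockerfiles:
./utils/list-versions/update-docker-version.sh

# The sed expression keeps only line 1 of the tsv, so the Dockerfiles now contain:
#   ARG VERSION="22.8.5.29"
git diff docker/server/
```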