diff --git a/.gitmodules b/.gitmodules index 3b9faea3cc1..68016bf8c5b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -245,6 +245,12 @@ [submodule "contrib/idxd-config"] path = contrib/idxd-config url = https://github.com/intel/idxd-config +[submodule "contrib/QAT-ZSTD-Plugin"] + path = contrib/QAT-ZSTD-Plugin + url = https://github.com/intel/QAT-ZSTD-Plugin +[submodule "contrib/qatlib"] + path = contrib/qatlib + url = https://github.com/intel/qatlib [submodule "contrib/wyhash"] path = contrib/wyhash url = https://github.com/wangyi-fudan/wyhash diff --git a/README.md b/README.md index c56b3c2fd0d..d356e429892 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ curl https://clickhouse.com/ | sh ## Upcoming Events -Keep an eye out for upcoming meetups around the world. Somewhere else you want us to be? Please feel free to reach out to tyler clickhouse com. +Keep an eye out for upcoming meetups around the world. Somewhere else you want us to be? Please feel free to reach out to tyler `` clickhouse `` com. ## Recent Recordings * **Recent Meetup Videos**: [Meetup Playlist](https://www.youtube.com/playlist?list=PL0Z2YDlm0b3iNDUzpY1S3L_iV4nARda_U) Whenever possible recordings of the ClickHouse Community Meetups are edited and presented as individual talks. Current featuring "Modern SQL in 2023", "Fast, Concurrent, and Consistent Asynchronous INSERTS in ClickHouse", and "Full-Text Indices: Design and Experiments" diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 02cb19d4c07..c6d1dcb41e6 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -172,9 +172,9 @@ add_contrib (s2geometry-cmake s2geometry) add_contrib (c-ares-cmake c-ares) if (OS_LINUX AND ARCH_AMD64 AND ENABLE_SSE42) - option (ENABLE_QPL "Enable Intel® Query Processing Library" ${ENABLE_LIBRARIES}) + option (ENABLE_QPL "Enable Intel® Query Processing Library (QPL)" ${ENABLE_LIBRARIES}) elseif(ENABLE_QPL) - message (${RECONFIGURE_MESSAGE_LEVEL} "QPL library is only supported on x86_64 arch with SSE 4.2 or higher") + message (${RECONFIGURE_MESSAGE_LEVEL} "QPL library is only supported on x86_64 with SSE 4.2 or higher") endif() if (ENABLE_QPL) add_contrib (idxd-config-cmake idxd-config) @@ -183,6 +183,28 @@ else() message(STATUS "Not using QPL") endif () +if (OS_LINUX AND ARCH_AMD64) + option (ENABLE_QATLIB "Enable Intel® QuickAssist Technology Library (QATlib)" ${ENABLE_LIBRARIES}) +elseif(ENABLE_QATLIB) + message (${RECONFIGURE_MESSAGE_LEVEL} "QATLib is only supported on x86_64") +endif() +if (ENABLE_QATLIB) + option (ENABLE_QAT_USDM_DRIVER "A User Space DMA-able Memory (USDM) component which allocates/frees DMA-able memory" OFF) + option (ENABLE_QAT_OUT_OF_TREE_BUILD "Using out-of-tree driver, user needs to customize ICP_ROOT variable" OFF) + set(ICP_ROOT "" CACHE STRING "ICP_ROOT variable to define the path of out-of-tree driver package") + if (ENABLE_QAT_OUT_OF_TREE_BUILD) + if (ICP_ROOT STREQUAL "") + message(FATAL_ERROR "Please define the path of out-of-tree driver package with -DICP_ROOT=xxx or disable out-of-tree build with -DENABLE_QAT_OUT_OF_TREE_BUILD=OFF; \ + If you want out-of-tree build but have no package available, please download and build ICP package from: https://www.intel.com/content/www/us/en/download/765501.html") + endif () + else() + add_contrib (qatlib-cmake qatlib) # requires: isa-l + endif () + add_contrib (QAT-ZSTD-Plugin-cmake QAT-ZSTD-Plugin) +else() + message(STATUS "Not using QATLib") +endif () + add_contrib (morton-nd-cmake morton-nd) if (ARCH_S390X) 
add_contrib(crc32-s390x-cmake crc32-s390x) diff --git a/contrib/NuRaft b/contrib/NuRaft index 2f5f52c4d8c..b7ea89b817a 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 2f5f52c4d8c87c2a3a3d101ca3a0194c9b77526f +Subproject commit b7ea89b817a18dc0eafc1f909d568869f02d2d04 diff --git a/contrib/QAT-ZSTD-Plugin b/contrib/QAT-ZSTD-Plugin new file mode 160000 index 00000000000..e5a134e12d2 --- /dev/null +++ b/contrib/QAT-ZSTD-Plugin @@ -0,0 +1 @@ +Subproject commit e5a134e12d2ea8a5b0f3b83c5b1c325fda4eb0a8 diff --git a/contrib/QAT-ZSTD-Plugin-cmake/CMakeLists.txt b/contrib/QAT-ZSTD-Plugin-cmake/CMakeLists.txt new file mode 100644 index 00000000000..72d21a8572b --- /dev/null +++ b/contrib/QAT-ZSTD-Plugin-cmake/CMakeLists.txt @@ -0,0 +1,85 @@ +# Intel® QuickAssist Technology ZSTD Plugin (QAT ZSTD Plugin) is a plugin to Zstandard*(ZSTD*) for accelerating compression by QAT. +# ENABLE_QAT_OUT_OF_TREE_BUILD = 1 means kernel don't have native support, user will build and install driver from external package: https://www.intel.com/content/www/us/en/download/765501.html +# meanwhile, user need to set ICP_ROOT environment variable which point to the root directory of QAT driver source tree. +# ENABLE_QAT_OUT_OF_TREE_BUILD = 0 means kernel has built-in qat driver, QAT-ZSTD-PLUGIN just has dependency on qatlib. + +if (ENABLE_QAT_OUT_OF_TREE_BUILD) + message(STATUS "Intel QATZSTD out-of-tree build, ICP_ROOT:${ICP_ROOT}") + + set(QATZSTD_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/QAT-ZSTD-Plugin/src") + set(QATZSTD_SRC "${QATZSTD_SRC_DIR}/qatseqprod.c") + set(ZSTD_LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/zstd/lib") + set(QAT_INCLUDE_DIR "${ICP_ROOT}/quickassist/include") + set(QAT_DC_INCLUDE_DIR "${ICP_ROOT}/quickassist/include/dc") + set(QAT_AL_INCLUDE_DIR "${ICP_ROOT}/quickassist/lookaside/access_layer/include") + set(QAT_USDM_INCLUDE_DIR "${ICP_ROOT}/quickassist/utilities/libusdm_drv") + set(USDM_LIBRARY "${ICP_ROOT}/build/libusdm_drv_s.so") + set(QAT_S_LIBRARY "${ICP_ROOT}/build/libqat_s.so") + if (ENABLE_QAT_USDM_DRIVER) + add_definitions(-DENABLE_USDM_DRV) + endif() + add_library(_qatzstd_plugin ${QATZSTD_SRC}) + target_link_libraries (_qatzstd_plugin PUBLIC ${USDM_LIBRARY} ${QAT_S_LIBRARY}) + target_include_directories(_qatzstd_plugin + SYSTEM PUBLIC "${QATZSTD_SRC_DIR}" + PRIVATE ${QAT_INCLUDE_DIR} + ${QAT_DC_INCLUDE_DIR} + ${QAT_AL_INCLUDE_DIR} + ${QAT_USDM_INCLUDE_DIR} + ${ZSTD_LIBRARY_DIR}) + target_compile_definitions(_qatzstd_plugin PRIVATE -DDEBUGLEVEL=0 PUBLIC -DENABLE_ZSTD_QAT_CODEC) + add_library (ch_contrib::qatzstd_plugin ALIAS _qatzstd_plugin) +else () # In-tree build + message(STATUS "Intel QATZSTD in-tree build") + set(QATZSTD_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/QAT-ZSTD-Plugin/src") + set(QATZSTD_SRC "${QATZSTD_SRC_DIR}/qatseqprod.c") + set(ZSTD_LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/zstd/lib") + + # please download&build ICP package from: https://www.intel.com/content/www/us/en/download/765501.html + set(ICP_ROOT "${ClickHouse_SOURCE_DIR}/contrib/qatlib") + set(QAT_INCLUDE_DIR "${ICP_ROOT}/quickassist/include") + set(QAT_DC_INCLUDE_DIR "${ICP_ROOT}/quickassist/include/dc") + set(QAT_AL_INCLUDE_DIR "${ICP_ROOT}/quickassist/lookaside/access_layer/include") + set(QAT_USDM_INCLUDE_DIR "${ICP_ROOT}/quickassist/utilities/libusdm_drv") + set(USDM_LIBRARY "${ICP_ROOT}/build/libusdm_drv_s.so") + set(QAT_S_LIBRARY "${ICP_ROOT}/build/libqat_s.so") + set(LIBQAT_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/qatlib") + set(LIBQAT_HEADER_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/include") + + file(MAKE_DIRECTORY + "${LIBQAT_HEADER_DIR}/qat" + ) + file(COPY "${LIBQAT_ROOT_DIR}/quickassist/include/cpa.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" + ) + file(COPY "${LIBQAT_ROOT_DIR}/quickassist/include/dc/cpa_dc.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" + ) + file(COPY "${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include/icp_sal_poll.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" + ) + file(COPY "${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include/icp_sal_user.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" + ) + file(COPY "${LIBQAT_ROOT_DIR}/quickassist/utilities/libusdm_drv/qae_mem.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" + ) + + if (ENABLE_QAT_USDM_DRIVER) + add_definitions(-DENABLE_USDM_DRV) + endif() + + add_library(_qatzstd_plugin ${QATZSTD_SRC}) + target_link_libraries (_qatzstd_plugin PUBLIC ch_contrib::qatlib ch_contrib::usdm) + target_include_directories(_qatzstd_plugin PRIVATE + ${QAT_INCLUDE_DIR} + ${QAT_DC_INCLUDE_DIR} + ${QAT_AL_INCLUDE_DIR} + ${QAT_USDM_INCLUDE_DIR} + ${ZSTD_LIBRARY_DIR} + ${LIBQAT_HEADER_DIR}) + target_compile_definitions(_qatzstd_plugin PRIVATE -DDEBUGLEVEL=0 PUBLIC -DENABLE_ZSTD_QAT_CODEC -DINTREE) + target_include_directories(_qatzstd_plugin SYSTEM PUBLIC $ $) + add_library (ch_contrib::qatzstd_plugin ALIAS _qatzstd_plugin) +endif () + diff --git a/contrib/qatlib b/contrib/qatlib new file mode 160000 index 00000000000..abe15d7bfc0 --- /dev/null +++ b/contrib/qatlib @@ -0,0 +1 @@ +Subproject commit abe15d7bfc083117bfbb4baee0b49ffcd1c03c5c diff --git a/contrib/qatlib-cmake/CMakeLists.txt b/contrib/qatlib-cmake/CMakeLists.txt new file mode 100644 index 00000000000..d599775035a --- /dev/null +++ b/contrib/qatlib-cmake/CMakeLists.txt @@ -0,0 +1,213 @@ +# Intel® QuickAssist Technology Library (QATlib). + +message(STATUS "Intel QATlib ON") +set(LIBQAT_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/qatlib") +set(LIBQAT_DIR "${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src") +set(LIBOSAL_DIR "${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/src") +set(OPENSSL_DIR "${ClickHouse_SOURCE_DIR}/contrib/openssl") + +# Build 3 libraries: _qatmgr, _osal, _qatlib +# Produce ch_contrib::qatlib by linking these libraries. 
+ +# _qatmgr + +SET(LIBQATMGR_sources ${LIBQAT_DIR}/qat_direct/vfio/qat_mgr_client.c + ${LIBQAT_DIR}/qat_direct/vfio/qat_mgr_lib.c + ${LIBQAT_DIR}/qat_direct/vfio/qat_log.c + ${LIBQAT_DIR}/qat_direct/vfio/vfio_lib.c + ${LIBQAT_DIR}/qat_direct/vfio/adf_pfvf_proto.c + ${LIBQAT_DIR}/qat_direct/vfio/adf_pfvf_vf_msg.c + ${LIBQAT_DIR}/qat_direct/vfio/adf_vfio_pf.c) + +add_library(_qatmgr ${LIBQATMGR_sources}) + +target_include_directories(_qatmgr PRIVATE + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/qat_direct/vfio + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include + ${LIBQAT_ROOT_DIR}/quickassist/include + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/include + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/src/linux/user_space/include + ${LIBQAT_ROOT_DIR}/quickassist/qat/drivers/crypto/qat/qat_common + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/qat_direct/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/qat_direct/common/include + ${ClickHouse_SOURCE_DIR}/contrib/sysroot/linux-x86_64-musl/include) + +target_compile_definitions(_qatmgr PRIVATE -DUSER_SPACE) +target_compile_options(_qatmgr PRIVATE -Wno-error=int-conversion) + +# _osal + +SET(LIBOSAL_sources + ${LIBOSAL_DIR}/linux/user_space/OsalSemaphore.c + ${LIBOSAL_DIR}/linux/user_space/OsalThread.c + ${LIBOSAL_DIR}/linux/user_space/OsalMutex.c + ${LIBOSAL_DIR}/linux/user_space/OsalSpinLock.c + ${LIBOSAL_DIR}/linux/user_space/OsalAtomic.c + ${LIBOSAL_DIR}/linux/user_space/OsalServices.c + ${LIBOSAL_DIR}/linux/user_space/OsalUsrKrnProxy.c + ${LIBOSAL_DIR}/linux/user_space/OsalCryptoInterface.c) + +add_library(_osal ${LIBOSAL_sources}) + +target_include_directories(_osal PRIVATE + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/src/linux/user_space + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/include + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/src/linux/user_space/include + ${OPENSSL_DIR}/include + ${ClickHouse_SOURCE_DIR}/contrib/openssl-cmake/linux_x86_64/include + ${ClickHouse_SOURCE_DIR}/contrib/qatlib-cmake/include) + +target_compile_definitions(_osal PRIVATE -DOSAL_ENSURE_ON -DUSE_OPENSSL) + +# _qatlib +SET(LIBQAT_sources + ${LIBQAT_DIR}/common/compression/dc_buffers.c + ${LIBQAT_DIR}/common/compression/dc_chain.c + ${LIBQAT_DIR}/common/compression/dc_datapath.c + ${LIBQAT_DIR}/common/compression/dc_dp.c + ${LIBQAT_DIR}/common/compression/dc_header_footer.c + ${LIBQAT_DIR}/common/compression/dc_header_footer_lz4.c + ${LIBQAT_DIR}/common/compression/dc_session.c + ${LIBQAT_DIR}/common/compression/dc_stats.c + ${LIBQAT_DIR}/common/compression/dc_err_sim.c + ${LIBQAT_DIR}/common/compression/dc_ns_datapath.c + ${LIBQAT_DIR}/common/compression/dc_ns_header_footer.c + ${LIBQAT_DIR}/common/compression/dc_crc32.c + ${LIBQAT_DIR}/common/compression/dc_crc64.c + ${LIBQAT_DIR}/common/compression/dc_xxhash32.c + ${LIBQAT_DIR}/common/compression/icp_sal_dc_err_sim.c + ${LIBQAT_DIR}/common/crypto/asym/diffie_hellman/lac_dh_control_path.c + ${LIBQAT_DIR}/common/crypto/asym/diffie_hellman/lac_dh_data_path.c + ${LIBQAT_DIR}/common/crypto/asym/diffie_hellman/lac_dh_interface_check.c + ${LIBQAT_DIR}/common/crypto/asym/diffie_hellman/lac_dh_stats.c + ${LIBQAT_DIR}/common/crypto/asym/dsa/lac_dsa.c + ${LIBQAT_DIR}/common/crypto/asym/dsa/lac_dsa_interface_check.c + ${LIBQAT_DIR}/common/crypto/asym/ecc/lac_ec.c + ${LIBQAT_DIR}/common/crypto/asym/ecc/lac_ec_common.c + ${LIBQAT_DIR}/common/crypto/asym/ecc/lac_ec_montedwds.c + ${LIBQAT_DIR}/common/crypto/asym/ecc/lac_ec_nist_curves.c + 
${LIBQAT_DIR}/common/crypto/asym/ecc/lac_ecdh.c + ${LIBQAT_DIR}/common/crypto/asym/ecc/lac_ecdsa.c + ${LIBQAT_DIR}/common/crypto/asym/ecc/lac_ecsm2.c + ${LIBQAT_DIR}/common/crypto/asym/ecc/lac_kpt_ecdsa.c + ${LIBQAT_DIR}/common/crypto/asym/large_number/lac_ln.c + ${LIBQAT_DIR}/common/crypto/asym/large_number/lac_ln_interface_check.c + ${LIBQAT_DIR}/common/crypto/asym/pke_common/lac_pke_mmp.c + ${LIBQAT_DIR}/common/crypto/asym/pke_common/lac_pke_qat_comms.c + ${LIBQAT_DIR}/common/crypto/asym/pke_common/lac_pke_utils.c + ${LIBQAT_DIR}/common/crypto/asym/prime/lac_prime.c + ${LIBQAT_DIR}/common/crypto/asym/prime/lac_prime_interface_check.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_rsa.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_rsa_control_path.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_rsa_decrypt.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_rsa_encrypt.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_rsa_interface_check.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_rsa_keygen.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_rsa_stats.c + ${LIBQAT_DIR}/common/crypto/asym/rsa/lac_kpt_rsa_decrypt.c + ${LIBQAT_DIR}/common/crypto/sym/drbg/lac_sym_drbg_api.c + ${LIBQAT_DIR}/common/crypto/sym/key/lac_sym_key.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_alg_chain.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_api.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_auth_enc.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_cb.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_cipher.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_compile_check.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_dp.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_hash.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_partial.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_queue.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_stats.c + ${LIBQAT_DIR}/common/crypto/sym/nrbg/lac_sym_nrbg_api.c + ${LIBQAT_DIR}/common/crypto/sym/qat/lac_sym_qat.c + ${LIBQAT_DIR}/common/crypto/sym/qat/lac_sym_qat_cipher.c + ${LIBQAT_DIR}/common/crypto/sym/qat/lac_sym_qat_constants_table.c + ${LIBQAT_DIR}/common/crypto/sym/qat/lac_sym_qat_hash.c + ${LIBQAT_DIR}/common/crypto/sym/qat/lac_sym_qat_hash_defs_lookup.c + ${LIBQAT_DIR}/common/crypto/sym/qat/lac_sym_qat_key.c + ${LIBQAT_DIR}/common/crypto/sym/lac_sym_hash_sw_precomputes.c + ${LIBQAT_DIR}/common/crypto/kpt/provision/lac_kpt_provision.c + ${LIBQAT_DIR}/common/ctrl/sal_compression.c + ${LIBQAT_DIR}/common/ctrl/sal_create_services.c + ${LIBQAT_DIR}/common/ctrl/sal_ctrl_services.c + ${LIBQAT_DIR}/common/ctrl/sal_list.c + ${LIBQAT_DIR}/common/ctrl/sal_crypto.c + ${LIBQAT_DIR}/common/ctrl/sal_dc_chain.c + ${LIBQAT_DIR}/common/ctrl/sal_instances.c + ${LIBQAT_DIR}/common/qat_comms/sal_qat_cmn_msg.c + ${LIBQAT_DIR}/common/utils/lac_buffer_desc.c + ${LIBQAT_DIR}/common/utils/lac_log_message.c + ${LIBQAT_DIR}/common/utils/lac_mem.c + ${LIBQAT_DIR}/common/utils/lac_mem_pools.c + ${LIBQAT_DIR}/common/utils/lac_sw_responses.c + ${LIBQAT_DIR}/common/utils/lac_sync.c + ${LIBQAT_DIR}/common/utils/sal_service_state.c + ${LIBQAT_DIR}/common/utils/sal_statistics.c + ${LIBQAT_DIR}/common/utils/sal_misc_error_stats.c + ${LIBQAT_DIR}/common/utils/sal_string_parse.c + ${LIBQAT_DIR}/common/utils/sal_user_process.c + ${LIBQAT_DIR}/common/utils/sal_versions.c + ${LIBQAT_DIR}/common/device/sal_dev_info.c + ${LIBQAT_DIR}/user/sal_user.c + ${LIBQAT_DIR}/user/sal_user_dyn_instance.c + ${LIBQAT_DIR}/qat_direct/common/adf_process_proxy.c + ${LIBQAT_DIR}/qat_direct/common/adf_user_cfg.c + ${LIBQAT_DIR}/qat_direct/common/adf_user_device.c + 
${LIBQAT_DIR}/qat_direct/common/adf_user_dyn.c + ${LIBQAT_DIR}/qat_direct/common/adf_user_ETring_mgr_dp.c + ${LIBQAT_DIR}/qat_direct/common/adf_user_init.c + ${LIBQAT_DIR}/qat_direct/common/adf_user_ring.c + ${LIBQAT_DIR}/qat_direct/common/adf_user_transport_ctrl.c + ${LIBQAT_DIR}/qat_direct/vfio/adf_vfio_cfg.c + ${LIBQAT_DIR}/qat_direct/vfio/adf_vfio_ring.c + ${LIBQAT_DIR}/qat_direct/vfio/adf_vfio_user_bundles.c + ${LIBQAT_DIR}/qat_direct/vfio/adf_vfio_user_proxy.c + ${LIBQAT_DIR}/common/compression/dc_crc_base.c) + +add_library(_qatlib ${LIBQAT_sources}) + +target_include_directories(_qatlib PRIVATE + ${CMAKE_SYSROOT}/usr/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include + ${LIBQAT_ROOT_DIR}/quickassist/utilities/libusdm_drv + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/include + ${LIBOSAL_DIR}/linux/user_space/include + ${LIBQAT_ROOT_DIR}/quickassist/include + ${LIBQAT_ROOT_DIR}/quickassist/include/lac + ${LIBQAT_ROOT_DIR}/quickassist/include/dc + ${LIBQAT_ROOT_DIR}/quickassist/qat/drivers/crypto/qat/qat_common + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/common/compression/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/common/crypto/sym/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/common/crypto/asym/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/firmware/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/common/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/qat_direct/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/qat_direct/common/include + ${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/src/qat_direct/vfio + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/src/linux/user_space + ${LIBQAT_ROOT_DIR}/quickassist/utilities/osal/src/linux/user_space/include + ${ClickHouse_SOURCE_DIR}/contrib/sysroot/linux-x86_64-musl/include) + +target_link_libraries(_qatlib PRIVATE _qatmgr _osal OpenSSL::SSL ch_contrib::isal) +target_compile_definitions(_qatlib PRIVATE -DUSER_SPACE -DLAC_BYTE_ORDER=__LITTLE_ENDIAN -DOSAL_ENSURE_ON) +target_link_options(_qatlib PRIVATE -pie -z relro -z now -z noexecstack) +target_compile_options(_qatlib PRIVATE -march=native) +add_library (ch_contrib::qatlib ALIAS _qatlib) + +# _usdm + +set(LIBUSDM_DIR "${ClickHouse_SOURCE_DIR}/contrib/qatlib/quickassist/utilities/libusdm_drv") +set(LIBUSDM_sources + ${LIBUSDM_DIR}/user_space/vfio/qae_mem_utils_vfio.c + ${LIBUSDM_DIR}/user_space/qae_mem_utils_common.c + ${LIBUSDM_DIR}/user_space/vfio/qae_mem_hugepage_utils_vfio.c) + +add_library(_usdm ${LIBUSDM_sources}) + +target_include_directories(_usdm PRIVATE + ${ClickHouse_SOURCE_DIR}/contrib/sysroot/linux-x86_64-musl/include + ${LIBUSDM_DIR} + ${LIBUSDM_DIR}/include + ${LIBUSDM_DIR}/user_space) + +add_library (ch_contrib::usdm ALIAS _usdm) diff --git a/contrib/qatlib-cmake/include/mqueue.h b/contrib/qatlib-cmake/include/mqueue.h new file mode 100644 index 00000000000..7b1125074a8 --- /dev/null +++ b/contrib/qatlib-cmake/include/mqueue.h @@ -0,0 +1,14 @@ +/* This is a workaround for a build conflict issue +1. __GLIBC_PREREQ (referenced in OsalServices.c) is only defined in './sysroot/linux-x86_64/include/features.h' +2. mqueue.h only exist under './sysroot/linux-x86_64-musl/' +This cause target_include_directories for _osal has a conflict between './sysroot/linux-x86_64/include' and './sysroot/linux-x86_64-musl/' +hence create mqueue.h separately under ./qatlib-cmake/include as an alternative. 
+*/ + +/* Major and minor version number of the GNU C library package. Use + these macros to test for features in specific releases. */ +#define __GLIBC__ 2 +#define __GLIBC_MINOR__ 27 + +#define __GLIBC_PREREQ(maj, min) \ + ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min)) diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index d94ffb893e1..b9c7ea34a36 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -41,6 +41,10 @@ readarray -t DISKS_PATHS < <(clickhouse extract-from-config --config-file "$CLIC readarray -t DISKS_METADATA_PATHS < <(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key='storage_configuration.disks.*.metadata_path' || true) CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}" +CLICKHOUSE_PASSWORD_FILE="${CLICKHOUSE_PASSWORD_FILE:-}" +if [[ -n "${CLICKHOUSE_PASSWORD_FILE}" && -f "${CLICKHOUSE_PASSWORD_FILE}" ]]; then + CLICKHOUSE_PASSWORD="$(cat "${CLICKHOUSE_PASSWORD_FILE}")" +fi CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}" CLICKHOUSE_DB="${CLICKHOUSE_DB:-}" CLICKHOUSE_ACCESS_MANAGEMENT="${CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT:-0}" diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index c9ce5697182..9079246429f 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -44,6 +44,9 @@ if [[ -n "$USE_S3_STORAGE_FOR_MERGE_TREE" ]] && [[ "$USE_S3_STORAGE_FOR_MERGE_TR # It is not needed, we will explicitly create tables on s3. # We do not have statefull tests with s3 storage run in public repository, but this is needed for another repository. rm /etc/clickhouse-server/config.d/s3_storage_policy_for_merge_tree_by_default.xml + + rm /etc/clickhouse-server/config.d/storage_metadata_with_full_object_key.xml + rm /etc/clickhouse-server/config.d/s3_storage_policy_with_template_object_key.xml fi function start() diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 67056cc1bc1..bca8800ab2b 100644 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -193,6 +193,7 @@ stop # Let's enable S3 storage by default export USE_S3_STORAGE_FOR_MERGE_TREE=1 +export $RANDOMIZE_OBJECT_KEY_TYPE=1 export ZOOKEEPER_FAULT_INJECTION=1 configure diff --git a/docs/en/engines/table-engines/integrations/s3queue.md b/docs/en/engines/table-engines/integrations/s3queue.md index 8b7f86cce5c..8ebab80423f 100644 --- a/docs/en/engines/table-engines/integrations/s3queue.md +++ b/docs/en/engines/table-engines/integrations/s3queue.md @@ -11,7 +11,7 @@ This engine provides integration with [Amazon S3](https://aws.amazon.com/s3/) ec ``` sql CREATE TABLE s3_queue_engine_table (name String, value UInt32) - ENGINE = S3Queue(path [, NOSIGN | aws_access_key_id, aws_secret_access_key,] format, [compression]) + ENGINE = S3Queue(path, [NOSIGN, | aws_access_key_id, aws_secret_access_key,] format, [compression]) [SETTINGS] [mode = 'unordered',] [after_processing = 'keep',] diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index ed413959ca6..d4251e7e74c 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1143,6 +1143,8 @@ Optional parameters: - `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`. 
- `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). - `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). +- `key_template` — Defines the format with which the object keys are generated. By default, ClickHouse takes the `root path` from the `endpoint` option and adds a randomly generated suffix. That suffix is a directory with 3 random symbols and a file name with 29 random symbols. With this option you have full control over how the object keys are generated. Some usage scenarios require having random symbols in the prefix or in the middle of the object key. For example: `[a-z]{3}-prefix-random/constant-part/random-middle-[a-z]{3}/random-suffix-[a-z]{29}`. The value is parsed with [`re2`](https://github.com/google/re2/wiki/Syntax). Only a subset of the syntax is supported. Check whether your preferred format is supported before using this option. The disk is not initialized if ClickHouse is unable to generate a key from the value of `key_template`. It requires the feature flag [storage_metadata_write_full_object_key](/docs/en/operations/settings/settings#storage_metadata_write_full_object_key) to be enabled. It forbids declaring the `root path` in the `endpoint` option. It requires the option `key_compatibility_prefix` to be defined. +- `key_compatibility_prefix` — This option is required when the option `key_template` is in use. In order to read the object keys which were stored in metadata files with a metadata version lower than `VERSION_FULL_OBJECT_KEY`, the previous `root path` from the `endpoint` option should be set here. ### Configuring the cache diff --git a/docs/en/operations/settings/mysql-binlog-client.md b/docs/en/operations/settings/mysql-binlog-client.md new file mode 100644 index 00000000000..1e1a2449e1c --- /dev/null +++ b/docs/en/operations/settings/mysql-binlog-client.md @@ -0,0 +1,176 @@ +# The MySQL Binlog Client + +The MySQL Binlog Client provides a mechanism in ClickHouse to share the binlog from a MySQL instance among multiple [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md) databases. This avoids consuming unnecessary bandwidth and CPU when replicating more than one schema/database. + +The implementation is resilient against crashes and disk issues. The executed GTID sets of the binlog itself and the consuming databases are persisted only after the data they describe has been safely persisted as well. The implementation also tolerates re-doing aborted operations (at-least-once delivery). + +# Settings + +## use_binlog_client + +Forces reuse of an existing MySQL binlog connection, or creates a new one if none exists. The connection is defined by `user:pass@host:port`. + +Default value: 0 + +**Example** + +```sql +-- create MaterializedMySQL databases that read the events from the binlog client +CREATE DATABASE db1 ENGINE = MaterializedMySQL('host:port', 'db1', 'user', 'password') SETTINGS use_binlog_client=1 +CREATE DATABASE db2 ENGINE = MaterializedMySQL('host:port', 'db2', 'user', 'password') SETTINGS use_binlog_client=1 +CREATE DATABASE db3 ENGINE = MaterializedMySQL('host:port', 'db3', 'user2', 'password2') SETTINGS use_binlog_client=1 +``` + +Databases `db1` and `db2` will use the same binlog connection, since they use the same `user:pass@host:port`.
Database `db3` will use a separate binlog connection. + +## max_bytes_in_binlog_queue + +Defines the limit of bytes in the binlog event queue. If the number of bytes in the queue exceeds this limit, reading new events from MySQL stops until space for new events is freed. This acts as a memory limit. A very high value could consume all available memory. A very low value could make the databases wait for new events. + +Default value: 67108864 + +**Example** + +```sql +CREATE DATABASE db1 ENGINE = MaterializedMySQL('host:port', 'db1', 'user', 'password') SETTINGS use_binlog_client=1, max_bytes_in_binlog_queue=33554432 +CREATE DATABASE db2 ENGINE = MaterializedMySQL('host:port', 'db2', 'user', 'password') SETTINGS use_binlog_client=1 +``` + +If database `db1` is unable to consume binlog events fast enough and the size of the events queue exceeds `33554432` bytes, reading of new events from MySQL is postponed until `db1` +consumes the events and releases some space. + +NOTE: This also impacts `db2`, which will be waiting for new events too, since they share the same connection. + +## max_milliseconds_to_wait_in_binlog_queue + +Defines the maximum number of milliseconds to wait when `max_bytes_in_binlog_queue` is exceeded. After that, the database is detached from the current binlog connection and a new one is established, to prevent other databases from waiting for this database. + +Default value: 10000 + +**Example** + +```sql +CREATE DATABASE db1 ENGINE = MaterializedMySQL('host:port', 'db1', 'user', 'password') SETTINGS use_binlog_client=1, max_bytes_in_binlog_queue=33554432, max_milliseconds_to_wait_in_binlog_queue=1000 +CREATE DATABASE db2 ENGINE = MaterializedMySQL('host:port', 'db2', 'user', 'password') SETTINGS use_binlog_client=1 +``` + +If the event queue of database `db1` is full, the binlog connection waits for `1000` ms, and if the database is still not able to consume the events, it is detached from the connection so that another one can be created. + +NOTE: If database `db1` has been detached from the shared connection and a new one has been created, then once the binlog connections for `db1` and `db2` reach the same position they will be merged into one, and `db1` and `db2` will use the same connection again. + +## max_bytes_in_binlog_dispatcher_buffer + +Defines the maximum number of bytes in the binlog dispatcher's buffer before it is flushed to the attached binlogs. Events from the MySQL binlog connection are buffered before being sent to the attached databases. This increases the event throughput from the binlog to the databases. + +Default value: 1048576 + +## max_flush_milliseconds_in_binlog_dispatcher + +Defines the maximum number of milliseconds the binlog dispatcher's buffer waits before it is flushed to the attached binlogs. If no events are received from the MySQL binlog connection for a while, the buffered events are eventually sent to the attached databases. + +Default value: 1000 + +# Design + +## The Binlog Events Dispatcher + +Currently each MaterializedMySQL database opens its own connection to MySQL to subscribe to binlog events. There is a need to have only one connection and _dispatch_ the binlog events to all databases that replicate from the same MySQL instance. + +## Each MaterializedMySQL Database Has Its Own Event Queue + +To prevent slowing down other instances there should be an _event queue_ per MaterializedMySQL database to handle the events independently of the speed of other instances. The dispatcher reads an event from the binlog and sends it to every MaterializedMySQL database that needs it.
Each database handles its events in separate threads. + +## Catching up + +If several databases have the same binlog position, they can use the same dispatcher. If a newly created database (or one that has been detached for some time) requests events that have already been processed, we need to create another communication _channel_ to the binlog. We do this by creating another temporary dispatcher for such databases. When the new dispatcher _catches up with_ the old one, the new/temporary dispatcher is not needed anymore and all databases getting events from this dispatcher can be moved to the old one. + +## Memory Limit + +There is a _memory limit_ to control event queue memory consumption per MySQL Client. If a database is not able to handle events fast enough, and the event queue is getting full, we have the following options: + +1. The dispatcher is blocked until the slowest database frees up space for new events. All other databases are waiting for the slowest one. (Preferred) +2. The dispatcher is _never_ blocked, but suspends incremental sync for the slow database and continues dispatching events to the remaining databases. + +## Performance + +A lot of CPU can be saved by not processing every event in every database. The binlog contains events for all databases; it is wasteful to distribute row events to a database that will not process them, especially if there are a lot of databases. This requires some sort of per-database binlog filtering and buffering. + +Currently all events are sent to all MaterializedMySQL databases, but parsing the events, which consumes CPU, is up to each database. + +# Detailed Design + +1. If a client (e.g. database) wants to read a stream of events from the MySQL binlog, it creates a connection to the remote binlog using host/user/password and _executed GTID set_ params. +2. If another client wants to read events from the binlog but for a different _executed GTID set_, it is **not** possible to reuse the existing connection to MySQL, so another connection to the same remote binlog has to be created. (_This is how it is implemented today_). +3. When these 2 connections reach the same binlog position, they read the same events. It is logical to drop the duplicate connection and move all its users out. Now one connection dispatches binlog events to several clients. Obviously, only connections to the same binlog should be merged. + +## Classes + +1. One connection can send (or dispatch) events to several clients and might be called `BinlogEventsDispatcher`. +2. Several dispatchers are grouped by _user:password@host:port_ in a `BinlogClient`, since they point to the same binlog. +3. The clients should communicate only with the public API of `BinlogClient`. The result of using `BinlogClient` is an object that implements `IBinlog` to read events from. This implementation of `IBinlog` must be compatible with the old implementation `MySQLFlavor` -> when replacing the old implementation with the new one, the behavior must not change.
+ +## SQL + +```sql +-- create MaterializedMySQL databases that read the events from the binlog client +CREATE DATABASE db1_client1 ENGINE = MaterializedMySQL('host:port', 'db', 'user', 'password') SETTINGS use_binlog_client=1, max_bytes_in_binlog_queue=1024; +CREATE DATABASE db2_client1 ENGINE = MaterializedMySQL('host:port', 'db', 'user', 'password') SETTINGS use_binlog_client=1; +CREATE DATABASE db3_client1 ENGINE = MaterializedMySQL('host:port', 'db2', 'user', 'password') SETTINGS use_binlog_client=1; +CREATE DATABASE db4_client2 ENGINE = MaterializedMySQL('host2:port', 'db', 'user', 'password') SETTINGS use_binlog_client=1; +CREATE DATABASE db5_client3 ENGINE = MaterializedMySQL('host:port', 'db', 'user1', 'password') SETTINGS use_binlog_client=1; +CREATE DATABASE db6_old ENGINE = MaterializedMySQL('host:port', 'db', 'user1', 'password') SETTINGS use_binlog_client=0; +``` + +Databases `db1_client1`, `db2_client1` and `db3_client1` share one instance of `BinlogClient` since they have the same params. `BinlogClient` will create 3 connections to the MySQL server and thus 3 instances of `BinlogEventsDispatcher`, but if these connections reach the same binlog position, they should be merged into one connection. This means all clients will be moved to one dispatcher and the others will be closed. Databases `db4_client2` and `db5_client3` will use 2 different independent `BinlogClient` instances. Database `db6_old` will use the old implementation. NOTE: By default `use_binlog_client` is disabled. Setting `max_bytes_in_binlog_queue` defines the max allowed bytes in the binlog queue. By default, it is `1073741824` bytes. If the number of bytes exceeds this limit, dispatching is stopped until space is freed for new events. + +## Binlog Table Structure + +To see the status of all `BinlogClient` instances there is the `system.mysql_binlogs` system table. It shows the list of all created and _alive_ `IBinlog` instances with information about their `BinlogEventsDispatcher` and `BinlogClient`. + +Example: + +``` +SELECT * FROM system.mysql_binlogs FORMAT Vertical +Row 1: +────── +binlog_client_name: root@127.0.0.1:3306 +name: test_Clickhouse1 +mysql_binlog_name: binlog.001154 +mysql_binlog_pos: 7142294 +mysql_binlog_timestamp: 1660082447 +mysql_binlog_executed_gtid_set: a9d88f83-c14e-11ec-bb36-244bfedf7766:1-30523304 +dispatcher_name: Applier +dispatcher_mysql_binlog_name: binlog.001154 +dispatcher_mysql_binlog_pos: 7142294 +dispatcher_mysql_binlog_timestamp: 1660082447 +dispatcher_mysql_binlog_executed_gtid_set: a9d88f83-c14e-11ec-bb36-244bfedf7766:1-30523304 +size: 0 +bytes: 0 +max_bytes: 0 +``` + +### Tests + +Unit tests: + +``` +$ ./unit_tests_dbms --gtest_filter=MySQLBinlog.* +``` + +Integration tests: + +``` +$ pytest -s -vv test_materialized_mysql_database/test.py::test_binlog_client +``` + +Dump events from a file: + +``` +$ ./utils/check-mysql-binlog/check-mysql-binlog --binlog binlog.001392 +``` + +Dump events from a server: + +``` +$ ./utils/check-mysql-binlog/check-mysql-binlog --host 127.0.0.1 --port 3306 --user root --password pass --gtid a9d88f83-c14e-11ec-bb36-244bfedf7766:1-30462856 +``` diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index d4ee8106320..f085fe1abcd 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4773,6 +4773,45 @@ Type: Int64 Default: 0 +## enable_deflate_qpl_codec {#enable_deflate_qpl_codec} + +If turned on, the DEFLATE_QPL codec may be used to compress columns.
+ +Possible values: + +- 0 - Disabled +- 1 - Enabled + +Type: Bool + +## enable_zstd_qat_codec {#enable_zstd_qat_codec} + +If turned on, the ZSTD_QAT codec may be used to compress columns. + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +Type: Bool + +## output_format_compression_level + +Default compression level if query output is compressed. The setting is applied when `SELECT` query has `INTO OUTFILE` or when writing to table functions `file`, `url`, `hdfs`, `s3`, or `azureBlobStorage`. + +Possible values: from `1` to `22` + +Default: `3` + + +## output_format_compression_zstd_window_log + +Can be used when the output compression method is `zstd`. If greater than `0`, this setting explicitly sets compression window size (power of `2`) and enables a long-range mode for zstd compression. This can help to achieve a better compression ratio. + +Possible values: non-negative numbers. Note that if the value is too small or too big, `zstdlib` will throw an exception. Typical values are from `20` (window size = `1MB`) to `30` (window size = `1GB`). + +Default: `0` + ## rewrite_count_distinct_if_with_count_distinct_implementation Allows you to rewrite `countDistcintIf` with [count_distinct_implementation](#count_distinct_implementation) setting. @@ -5157,4 +5196,4 @@ The value 0 means that you can delete all tables without any restrictions. :::note This query setting overwrites its server setting equivalent, see [max_table_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-table-size-to-drop) -::: \ No newline at end of file +::: diff --git a/docs/en/operations/utilities/clickhouse-format.md b/docs/en/operations/utilities/clickhouse-format.md index 3e4295598aa..879bf9d71ac 100644 --- a/docs/en/operations/utilities/clickhouse-format.md +++ b/docs/en/operations/utilities/clickhouse-format.md @@ -11,6 +11,8 @@ Keys: - `--query` — Format queries of any length and complexity. - `--hilite` — Add syntax highlight with ANSI terminal escape sequences. - `--oneline` — Format in single line. +- `--max_line_length` — Format in single line queries with length less than specified. +- `--comments` — Keep comments in the output. - `--quiet` or `-q` — Just check syntax, no output on success. - `--multiquery` or `-n` — Allow multiple queries in the same file. - `--obfuscate` — Obfuscate instead of formatting. diff --git a/docs/en/operations/utilities/clickhouse-keeper-client.md b/docs/en/operations/utilities/clickhouse-keeper-client.md index d6e11fb9613..4588f68cacd 100644 --- a/docs/en/operations/utilities/clickhouse-keeper-client.md +++ b/docs/en/operations/utilities/clickhouse-keeper-client.md @@ -24,7 +24,7 @@ A client application to interact with clickhouse-keeper by its native protocol. 
## Example {#clickhouse-keeper-client-example} ```bash -./clickhouse-keeper-client -h localhost:9181 --connection-timeout 30 --session-timeout 30 --operation-timeout 30 +./clickhouse-keeper-client -h localhost -p 9181 --connection-timeout 30 --session-timeout 30 --operation-timeout 30 Connected to ZooKeeper at [::1]:9181 with session_id 137 / :) ls keeper foo bar diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 3461ab28bf9..8ff155ee39d 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1605,6 +1605,78 @@ Result: Alias: levenshteinDistance +## damerauLevenshteinDistance + +Calculates the [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) between two byte strings. + +**Syntax** + +```sql +damerauLevenshteinDistance(string1, string2) +``` + +**Examples** + +``` sql +SELECT damerauLevenshteinDistance('clickhouse', 'mouse'); +``` + +Result: + +``` text +┌─damerauLevenshteinDistance('clickhouse', 'mouse')─┐ +│ 6 │ +└───────────────────────────────────────────────────┘ +``` + +## jaroSimilarity + +Calculates the [Jaro similarity](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity) between two byte strings. + +**Syntax** + +```sql +jaroSimilarity(string1, string2) +``` + +**Examples** + +``` sql +SELECT jaroSimilarity('clickhouse', 'click'); +``` + +Result: + +``` text +┌─jaroSimilarity('clickhouse', 'click')─┐ +│ 0.8333333333333333 │ +└───────────────────────────────────────┘ +``` + +## jaroWinklerSimilarity + +Calculates the [Jaro-Winkler similarity](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro%E2%80%93Winkler_similarity) between two byte strings. + +**Syntax** + +```sql +jaroWinklerSimilarity(string1, string2) +``` + +**Examples** + +``` sql +SELECT jaroWinklerSimilarity('clickhouse', 'click'); +``` + +Result: + +``` text +┌─jaroWinklerSimilarity('clickhouse', 'click')─┐ +│ 0.8999999999999999 │ +└──────────────────────────────────────────────┘ +``` + ## initcap Convert the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 602feb69d8a..0258c64e422 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -372,15 +372,23 @@ ClickHouse supports general purpose codecs and specialized codecs. #### ZSTD -`ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default value: 1. +`ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default level: 1. High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage. +#### ZSTD_QAT + +`ZSTD_QAT[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable level, implemented by [Intel® QATlib](https://github.com/intel/qatlib) and [Intel® QAT ZSTD Plugin](https://github.com/intel/QAT-ZSTD-Plugin). Possible levels: \[1, 12\]. Default level: 1. Recommended level range: \[6, 12\]. 
Some limitations apply: + +- ZSTD_QAT is disabled by default and can only be used after enabling configuration setting [enable_zstd_qat_codec](../../../operations/settings/settings.md#enable_zstd_qat_codec). +- For compression, ZSTD_QAT tries to use an Intel® QAT offloading device ([QuickAssist Technology](https://www.intel.com/content/www/us/en/developer/topic-technology/open/quick-assist-technology/overview.html)). If no such device is found, it falls back to ZSTD compression in software. +- Decompression is always performed in software. + #### DEFLATE_QPL `DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply: -- DEFLATE_QPL is disabled by default and can only be used after setting configuration parameter `enable_deflate_qpl_codec = 1`. +- DEFLATE_QPL is disabled by default and can only be used after enabling configuration setting [enable_deflate_qpl_codec](../../../operations/settings/settings.md#enable_deflate_qpl_codec). - DEFLATE_QPL requires a ClickHouse build compiled with SSE 4.2 instructions (by default, this is the case). Refer to [Build Clickhouse with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Build-Clickhouse-with-DEFLATE_QPL) for more details. - DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device. Refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) and [Benchmark with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Run-Benchmark-with-DEFLATE_QPL) for more details. - DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with SSE 4.2 enabled.
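As a usage sketch for the new codec (the table and column names below are illustrative, not part of this change), enabling the session setting and declaring the codec on a column could look like this:

```sql
-- Allow the ZSTD_QAT codec for this session (see enable_zstd_qat_codec above)
SET enable_zstd_qat_codec = 1;

-- Compress the `payload` column with ZSTD_QAT at a level from the recommended range [6, 12]
CREATE TABLE zstd_qat_example
(
    id UInt64,
    payload String CODEC(ZSTD_QAT(6))
)
ENGINE = MergeTree
ORDER BY id;
```

If no QAT offloading device is available, compression falls back to software ZSTD, and decompression is always done in software, as noted in the limitations above.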
diff --git a/programs/format/Format.cpp b/programs/format/Format.cpp index ecf02283ab7..a1c51565ae3 100644 --- a/programs/format/Format.cpp +++ b/programs/format/Format.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include @@ -30,22 +32,49 @@ #include #include #include +#include +namespace DB::ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +namespace +{ + +void skipSpacesAndComments(const char*& pos, const char* end, bool print_comments) +{ + do + { + /// skip spaces to avoid throw exception after last query + while (pos != end && std::isspace(*pos)) + ++pos; + + const char * comment_begin = pos; + /// for skip comment after the last query and to not throw exception + if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-') + { + pos += 2; + /// skip until the end of the line + while (pos != end && *pos != '\n') + ++pos; + if (print_comments) + std::cout << std::string_view(comment_begin, pos - comment_begin) << "\n"; + } + /// need to parse next sql + else + break; + } while (pos != end); +} + +} + #pragma GCC diagnostic ignored "-Wunused-function" #pragma GCC diagnostic ignored "-Wmissing-declarations" extern const char * auto_time_zones[]; - -namespace DB -{ -namespace ErrorCodes -{ -extern const int INVALID_FORMAT_INSERT_QUERY_WITH_DATA; -} -} - int mainEntryClickHouseFormat(int argc, char ** argv) { using namespace DB; @@ -56,8 +85,10 @@ int mainEntryClickHouseFormat(int argc, char ** argv) desc.add_options() ("query", po::value(), "query to format") ("help,h", "produce help message") + ("comments", "keep comments in the output") ("hilite", "add syntax highlight with ANSI terminal escape sequences") ("oneline", "format in single line") + ("max_line_length", po::value()->default_value(0), "format in single line queries with length less than specified") ("quiet,q", "just check syntax, no output on success") ("multiquery,n", "allow multiple queries in the same file") ("obfuscate", "obfuscate instead of formatting") @@ -89,6 +120,8 @@ int mainEntryClickHouseFormat(int argc, char ** argv) bool oneline = options.count("oneline"); bool quiet = options.count("quiet"); bool multiple = options.count("multiquery"); + bool print_comments = options.count("comments"); + size_t max_line_length = options["max_line_length"].as(); bool obfuscate = options.count("obfuscate"); bool backslash = options.count("backslash"); bool allow_settings_after_format_in_insert = options.count("allow_settings_after_format_in_insert"); @@ -105,6 +138,19 @@ int mainEntryClickHouseFormat(int argc, char ** argv) return 2; } + if (oneline && max_line_length) + { + std::cerr << "Options 'oneline' and 'max_line_length' are mutually exclusive." << std::endl; + return 2; + } + + if (max_line_length > 255) + { + std::cerr << "Option 'max_line_length' must be less than 256." << std::endl; + return 2; + } + + String query; if (options.count("query")) @@ -125,7 +171,6 @@ int mainEntryClickHouseFormat(int argc, char ** argv) if (options.count("seed")) { - std::string seed; hash_func.update(options["seed"].as()); } @@ -181,30 +226,75 @@ int mainEntryClickHouseFormat(int argc, char ** argv) { const char * pos = query.data(); const char * end = pos + query.size(); + skipSpacesAndComments(pos, end, print_comments); ParserQuery parser(end, allow_settings_after_format_in_insert); - do + while (pos != end) { + size_t approx_query_length = multiple ? 
find_first_symbols<';'>(pos, end) - pos : end - pos; + ASTPtr res = parseQueryAndMovePosition( parser, pos, end, "query", multiple, cmd_settings.max_query_size, cmd_settings.max_parser_depth); - /// For insert query with data(INSERT INTO ... VALUES ...), that will lead to the formatting failure, - /// we should throw an exception early, and make exception message more readable. - if (const auto * insert_query = res->as(); insert_query && insert_query->data) + std::unique_ptr insert_query_payload = nullptr; + /// If the query is INSERT ... VALUES, then we will try to parse the data. + if (auto * insert_query = res->as(); insert_query && insert_query->data) { - throw Exception(DB::ErrorCodes::INVALID_FORMAT_INSERT_QUERY_WITH_DATA, - "Can't format ASTInsertQuery with data, since data will be lost"); + if ("Values" != insert_query->format) + throw Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Can't format INSERT query with data format '{}'", insert_query->format); + + /// Reset format to default to have `INSERT INTO table VALUES` instead of `INSERT INTO table VALUES FORMAT Values` + insert_query->format = {}; + + /// We assume that data ends with a newline character (same as client does) + const char * this_query_end = find_first_symbols<'\n'>(insert_query->data, end); + insert_query->end = this_query_end; + pos = this_query_end; + insert_query_payload = getReadBufferFromASTInsertQuery(res); } if (!quiet) { if (!backslash) { - WriteBufferFromOStream res_buf(std::cout, 4096); - formatAST(*res, res_buf, hilite, oneline); - res_buf.finalize(); - if (multiple) - std::cout << "\n;\n"; + WriteBufferFromOwnString str_buf; + formatAST(*res, str_buf, hilite, oneline || approx_query_length < max_line_length); + + if (insert_query_payload) + { + str_buf.write(' '); + copyData(*insert_query_payload, str_buf); + } + + String res_string = str_buf.str(); + const char * s_pos = res_string.data(); + const char * s_end = s_pos + res_string.size(); + /// remove trailing spaces + while (s_end > s_pos && isWhitespaceASCIIOneLine(*(s_end - 1))) + --s_end; + WriteBufferFromOStream res_cout(std::cout, 4096); + /// For multiline queries we print ';' at new line, + /// but for single line queries we print ';' at the same line + bool has_multiple_lines = false; + while (s_pos != s_end) + { + if (*s_pos == '\n') + has_multiple_lines = true; + res_cout.write(*s_pos++); + } + res_cout.finalize(); + + if (multiple && !insert_query_payload) + { + if (oneline || !has_multiple_lines) + std::cout << ";\n"; + else + std::cout << "\n;\n"; + } + else if (multiple && insert_query_payload) + /// Do not need to add ; because it's already in the insert_query_payload + std::cout << "\n"; + std::cout << std::endl; } /// add additional '\' at the end of each line; @@ -232,27 +322,10 @@ int mainEntryClickHouseFormat(int argc, char ** argv) std::cout << std::endl; } } - - do - { - /// skip spaces to avoid throw exception after last query - while (pos != end && std::isspace(*pos)) - ++pos; - - /// for skip comment after the last query and to not throw exception - if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-') - { - pos += 2; - /// skip until the end of the line - while (pos != end && *pos != '\n') - ++pos; - } - /// need to parse next sql - else - break; - } while (pos != end); - - } while (multiple && pos != end); + skipSpacesAndComments(pos, end, print_comments); + if (!multiple) + break; + } } } catch (...) 
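The options wired up in the Format.cpp changes above (`--comments`, `--max_line_length`) match the flags documented in `clickhouse-format.md` earlier in this change. A minimal invocation sketch, assuming queries are read from a local file `queries.sql` (the file name is hypothetical):

```bash
# Format several queries from a file, keeping comments in the output and
# printing queries shorter than the given length on a single line.
# Per the code above, --max_line_length must be < 256 and is mutually
# exclusive with --oneline.
clickhouse-format --multiquery --comments --max_line_length 120 < queries.sql
```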
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 86cb9acd056..083b959c4b6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -551,13 +551,18 @@ endif () target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::lz4) if (TARGET ch_contrib::qpl) -dbms_target_link_libraries(PUBLIC ch_contrib::qpl) + dbms_target_link_libraries(PUBLIC ch_contrib::qpl) endif () if (TARGET ch_contrib::accel-config) dbms_target_link_libraries(PUBLIC ch_contrib::accel-config) endif () +if (TARGET ch_contrib::qatzstd_plugin) + dbms_target_link_libraries(PUBLIC ch_contrib::qatzstd_plugin) + target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::qatzstd_plugin) +endif () + target_link_libraries(clickhouse_common_io PUBLIC boost::context) dbms_target_link_libraries(PUBLIC boost::context) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 75ca66f2647..352d2a53892 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -651,7 +651,13 @@ void Connection::sendQuery( if (method == "ZSTD") level = settings->network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs, settings->enable_deflate_qpl_codec); + CompressionCodecFactory::instance().validateCodec( + method, + level, + !settings->allow_suspicious_codecs, + settings->allow_experimental_codecs, + settings->enable_deflate_qpl_codec, + settings->enable_zstd_qat_codec); compression_codec = CompressionCodecFactory::instance().get(method, level); } else diff --git a/src/Common/AsyncLoader.cpp b/src/Common/AsyncLoader.cpp index 962adb8b052..0e0fa25e7a1 100644 --- a/src/Common/AsyncLoader.cpp +++ b/src/Common/AsyncLoader.cpp @@ -43,6 +43,19 @@ void logAboutProgress(Poco::Logger * log, size_t processed, size_t total, Atomic } } +void cancelOnDependencyFailure(const LoadJobPtr & self, const LoadJobPtr & dependency, std::exception_ptr & cancel) +{ + cancel = std::make_exception_ptr(Exception(ErrorCodes::ASYNC_LOAD_CANCELED, + "Load job '{}' -> {}", + self->name, + getExceptionMessage(dependency->exception(), /* with_stacktrace = */ false))); +} + +void ignoreDependencyFailure(const LoadJobPtr &, const LoadJobPtr &, std::exception_ptr &) +{ + // No-op +} + LoadStatus LoadJob::status() const { std::unique_lock lock{mutex}; @@ -96,7 +109,10 @@ size_t LoadJob::canceled(const std::exception_ptr & ptr) size_t LoadJob::finish() { - func = {}; // To ensure job function is destructed before `AsyncLoader::wait()` return + // To ensure functions are destructed before `AsyncLoader::wait()` return + func = {}; + dependency_failure = {}; + finish_time = std::chrono::system_clock::now(); if (waiters > 0) finished.notify_all(); @@ -327,17 +343,19 @@ void AsyncLoader::schedule(const LoadJobSet & jobs_to_schedule) if (dep_status == LoadStatus::FAILED || dep_status == LoadStatus::CANCELED) { - // Dependency on already failed or canceled job -- it's okay. Cancel all dependent jobs. - std::exception_ptr e; + // Dependency on already failed or canceled job -- it's okay. + // Process as usual (may lead to cancel of all dependent jobs). 
+ std::exception_ptr cancel; NOEXCEPT_SCOPE({ ALLOW_ALLOCATIONS_IN_SCOPE; - e = std::make_exception_ptr(Exception(ErrorCodes::ASYNC_LOAD_CANCELED, - "Load job '{}' -> {}", - job->name, - getExceptionMessage(dep->exception(), /* with_stacktrace = */ false))); + if (job->dependency_failure) + job->dependency_failure(job, dep, cancel); }); - finish(job, LoadStatus::CANCELED, e, lock); - break; // This job is now finished, stop its dependencies processing + if (cancel) + { + finish(job, LoadStatus::CANCELED, cancel, lock); + break; // This job is now finished, stop its dependencies processing + } } } } @@ -515,63 +533,76 @@ String AsyncLoader::checkCycle(const LoadJobPtr & job, LoadJobSet & left, LoadJo return {}; } -void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock & lock) +void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr reason, std::unique_lock & lock) { chassert(scheduled_jobs.contains(job)); // Job was pending + + // Notify waiters size_t resumed_workers = 0; // Number of workers resumed in the execution pool of the job if (status == LoadStatus::OK) - { - // Notify waiters - resumed_workers += job->ok(); + resumed_workers = job->ok(); + else if (status == LoadStatus::FAILED) + resumed_workers = job->failed(reason); + else if (status == LoadStatus::CANCELED) + resumed_workers = job->canceled(reason); - // Update dependent jobs and enqueue if ready - for (const auto & dep : scheduled_jobs[job].dependent_jobs) + // Adjust suspended workers count + if (resumed_workers) + { + Pool & pool = pools[job->executionPool()]; + pool.suspended_workers -= resumed_workers; + } + + Info & info = scheduled_jobs[job]; + if (info.isReady()) + { + // Job could be in ready queue (on cancel) -- must be dequeued + pools[job->pool_id].ready_queue.erase(info.ready_seqno); + info.ready_seqno = 0; + } + + // To avoid container modification during recursion (during clean dependency graph edges below) + LoadJobSet dependent; + dependent.swap(info.dependent_jobs); + + // Update dependent jobs + for (const auto & dpt : dependent) + { + if (auto dpt_info = scheduled_jobs.find(dpt); dpt_info != scheduled_jobs.end()) { - chassert(scheduled_jobs.contains(dep)); // All depended jobs must be pending - Info & dep_info = scheduled_jobs[dep]; - dep_info.dependencies_left--; - if (!dep_info.isBlocked()) - enqueue(dep_info, dep, lock); + dpt_info->second.dependencies_left--; + if (!dpt_info->second.isBlocked()) + enqueue(dpt_info->second, dpt, lock); + + if (status != LoadStatus::OK) + { + std::exception_ptr cancel; + NOEXCEPT_SCOPE({ + ALLOW_ALLOCATIONS_IN_SCOPE; + if (dpt->dependency_failure) + dpt->dependency_failure(dpt, job, cancel); + }); + // Recurse into dependent job if it should be canceled + if (cancel) + finish(dpt, LoadStatus::CANCELED, cancel, lock); + } + } + else + { + // Job has already been canceled. Do not enter twice into the same job during finish recursion. + // This happens in {A<-B; A<-C; B<-D; C<-D} graph for D if A is failed or canceled. 
+ chassert(status == LoadStatus::CANCELED); } } - else + + // Clean dependency graph edges pointing to canceled jobs + if (status != LoadStatus::OK) { - // Notify waiters - if (status == LoadStatus::FAILED) - resumed_workers += job->failed(exception_from_job); - else if (status == LoadStatus::CANCELED) - resumed_workers += job->canceled(exception_from_job); - - Info & info = scheduled_jobs[job]; - if (info.isReady()) - { - pools[job->pool_id].ready_queue.erase(info.ready_seqno); - info.ready_seqno = 0; - } - - // Recurse into all dependent jobs - LoadJobSet dependent; - dependent.swap(info.dependent_jobs); // To avoid container modification during recursion - for (const auto & dep : dependent) - { - if (!scheduled_jobs.contains(dep)) - continue; // Job has already been canceled - std::exception_ptr e; - NOEXCEPT_SCOPE({ - ALLOW_ALLOCATIONS_IN_SCOPE; - e = std::make_exception_ptr( - Exception(ErrorCodes::ASYNC_LOAD_CANCELED, - "Load job '{}' -> {}", - dep->name, - getExceptionMessage(exception_from_job, /* with_stacktrace = */ false))); - }); - finish(dep, LoadStatus::CANCELED, e, lock); - } - - // Clean dependency graph edges pointing to canceled jobs for (const auto & dep : job->dependencies) + { if (auto dep_info = scheduled_jobs.find(dep); dep_info != scheduled_jobs.end()) dep_info->second.dependent_jobs.erase(job); + } } // Job became finished @@ -582,12 +613,6 @@ void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::excepti if (log_progress) logAboutProgress(log, finished_jobs.size() - old_jobs, finished_jobs.size() + scheduled_jobs.size() - old_jobs, stopwatch); }); - - if (resumed_workers) - { - Pool & pool = pools[job->executionPool()]; - pool.suspended_workers -= resumed_workers; - } } void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock & lock) @@ -612,6 +637,9 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::un } job->pool_id.store(new_pool_id); + // TODO(serxa): we should adjust suspended_workers and suspended_waiters here. + // Otherwise suspended_workers we be left inconsistent. Fix it and add a test. + // Scenario: schedule a job A, wait for it from a job B in the same pool, prioritize A // Recurse into dependencies for (const auto & dep : job->dependencies) diff --git a/src/Common/AsyncLoader.h b/src/Common/AsyncLoader.h index 95a2273a0f4..85de309b153 100644 --- a/src/Common/AsyncLoader.h +++ b/src/Common/AsyncLoader.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -57,12 +58,13 @@ enum class LoadStatus class LoadJob : private boost::noncopyable { public: - template - LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, Func && func_) + template + LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, DFFunc && dependency_failure_, Func && func_) : dependencies(std::forward(dependencies_)) , name(std::move(name_)) , execution_pool_id(pool_id_) , pool_id(pool_id_) + , dependency_failure(std::forward(dependency_failure_)) , func(std::forward(func_)) {} @@ -108,6 +110,14 @@ private: std::atomic job_id{0}; std::atomic execution_pool_id; std::atomic pool_id; + + // Handler for failed or canceled dependencies. + // If job needs to be canceled on `dependency` failure, then function should set `cancel` to a specific reason. + // Note that implementation should be fast and cannot use AsyncLoader, because it is called under `AsyncLoader::mutex`. + // Note that `dependency_failure` is called only on pending jobs. 
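+ // Usage sketch: makeLoadJob({dep}, "job", ignoreDependencyFailure, func) still runs the job once all dependencies have finished even if `dep` failed, whereas the default overloads pass cancelOnDependencyFailure and cancel it.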
+ std::function dependency_failure; + + // Function to be called to execute the job. std::function func; mutable std::mutex mutex; @@ -123,35 +133,54 @@ private: std::atomic finish_time{TimePoint{}}; }; -struct EmptyJobFunc -{ - void operator()(AsyncLoader &, const LoadJobPtr &) {} -}; +// For LoadJob::dependency_failure. Cancels the job on the first dependency failure or cancel. +void cancelOnDependencyFailure(const LoadJobPtr & self, const LoadJobPtr & dependency, std::exception_ptr & cancel); -template -LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, Func && func = EmptyJobFunc()) +// For LoadJob::dependency_failure. Never cancels the job due to dependency failure or cancel. +void ignoreDependencyFailure(const LoadJobPtr & self, const LoadJobPtr & dependency, std::exception_ptr & cancel); + +template concept LoadJobDependencyFailure = std::invocable; +template concept LoadJobFunc = std::invocable; + +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) { - return std::make_shared(std::move(dependencies), std::move(name), 0, std::forward(func)); + return std::make_shared(std::move(dependencies), std::move(name), 0, std::forward(dependency_failure), std::forward(func)); } -template -LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, String name, Func && func = EmptyJobFunc()) +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, String name, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) { - return std::make_shared(dependencies, std::move(name), 0, std::forward(func)); + return std::make_shared(dependencies, std::move(name), 0, std::forward(dependency_failure), std::forward(func)); } -template -LoadJobPtr makeLoadJob(LoadJobSet && dependencies, size_t pool_id, String name, Func && func = EmptyJobFunc()) +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, size_t pool_id, String name, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) { - return std::make_shared(std::move(dependencies), std::move(name), pool_id, std::forward(func)); + return std::make_shared(std::move(dependencies), std::move(name), pool_id, std::forward(dependency_failure), std::forward(func)); } -template -LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String name, Func && func = EmptyJobFunc()) +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String name, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) { - return std::make_shared(dependencies, std::move(name), pool_id, std::forward(func)); + return std::make_shared(dependencies, std::move(name), pool_id, std::forward(dependency_failure), std::forward(func)); } +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, LoadJobFunc auto && func) +{ + return std::make_shared(std::move(dependencies), std::move(name), 0, cancelOnDependencyFailure, std::forward(func)); +} + +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, String name, LoadJobFunc auto && func) +{ + return std::make_shared(dependencies, std::move(name), 0, cancelOnDependencyFailure, std::forward(func)); +} + +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, size_t pool_id, String name, LoadJobFunc auto && func) +{ + return std::make_shared(std::move(dependencies), std::move(name), pool_id, cancelOnDependencyFailure, std::forward(func)); +} + +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String name, LoadJobFunc auto && 
func) +{ + return std::make_shared(dependencies, std::move(name), pool_id, cancelOnDependencyFailure, std::forward(func)); +} // Represents a logically connected set of LoadJobs required to achieve some goals (final LoadJob in the set). class LoadTask : private boost::noncopyable @@ -277,7 +306,7 @@ private: { size_t dependencies_left = 0; // Current number of dependencies on pending jobs. UInt64 ready_seqno = 0; // Zero means that job is not in ready queue. - LoadJobSet dependent_jobs; // Set of jobs dependent on this job. + LoadJobSet dependent_jobs; // Set of jobs dependent on this job. Contains only scheduled jobs. // Three independent states of a scheduled job. bool isBlocked() const { return dependencies_left > 0; } @@ -371,7 +400,7 @@ public: private: void checkCycle(const LoadJobSet & jobs, std::unique_lock & lock); String checkCycle(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock & lock); - void finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock & lock); + void finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr reason, std::unique_lock & lock); void gatherNotScheduled(const LoadJobPtr & job, LoadJobSet & jobs, std::unique_lock & lock); void prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock & lock); void enqueue(Info & info, const LoadJobPtr & job, std::unique_lock & lock); diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 2613e9ec116..724b6ab62f7 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -242,7 +242,7 @@ M(FilesystemCacheDelayedCleanupElements, "Filesystem cache elements in background cleanup queue") \ M(FilesystemCacheHoldFileSegments, "Filesystem cache file segment which are currently hold as unreleasable") \ M(AsyncInsertCacheSize, "Number of async insert hash id in cache") \ - M(S3Requests, "S3 requests") \ + M(S3Requests, "S3 requests count") \ M(KeeperAliveConnections, "Number of alive connections") \ M(KeeperOutstandingRequets, "Number of outstanding requests") \ M(ThreadsInOvercommitTracker, "Number of waiting threads inside of OvercommitTracker") \ diff --git a/src/Common/MatchGenerator.cpp b/src/Common/MatchGenerator.cpp new file mode 100644 index 00000000000..f047c21b470 --- /dev/null +++ b/src/Common/MatchGenerator.cpp @@ -0,0 +1,494 @@ +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" +# pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +# pragma clang diagnostic ignored "-Wnested-anon-types" +# pragma clang diagnostic ignored "-Wunused-parameter" +# pragma clang diagnostic ignored "-Wshadow-field-in-constructor" +# pragma clang diagnostic ignored "-Wdtor-name" +#endif +#include +#include +#include +#ifdef __clang__ +# pragma clang diagnostic pop +#endif + +#ifdef LOG_INFO +#undef LOG_INFO +#undef LOG_WARNING +#undef LOG_ERROR +#undef LOG_FATAL +#endif + +#include "MatchGenerator.h" + +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +extern const int LOGICAL_ERROR; +} +} + + +namespace re2 +{ + +class RandomStringPrepareWalker : public Regexp::Walker +{ +private: + static constexpr int ImplicitMax = 100; + + using Children = std::vector; + + class Generators; + + /// This function objects look much prettier than lambda expression when stack traces are printed + class NodeFunction + { + public: + virtual size_t operator() 
(char * out, size_t size) = 0; + virtual size_t getRequiredSize() = 0; + virtual ~NodeFunction() = default; + }; + + using NodeFunctionPtr = std::shared_ptr; + using NodeFuncs = std::vector; + + static NodeFuncs getFuncs(const Children & children_, const Generators & generators_) + { + NodeFuncs result; + result.reserve(children_.size()); + + for (auto * child: children_) + { + result.push_back(generators_.at(child)); + } + + return result; + } + + class Generators: public std::map {}; + + class RegexpConcatFunction : public NodeFunction + { + public: + RegexpConcatFunction(const Children & children_, const Generators & generators_) + : children(getFuncs(children_, generators_)) + { + } + + size_t operator () (char * out, size_t size) override + { + size_t total_size = 0; + + for (auto & child: children) + { + size_t consumed = child->operator()(out, size); + chassert(consumed <= size); + out += consumed; + size -= consumed; + total_size += consumed; + } + + return total_size; + } + + size_t getRequiredSize() override + { + size_t total_size = 0; + for (auto & child: children) + total_size += child->getRequiredSize(); + return total_size; + } + + private: + NodeFuncs children; + }; + + class RegexpAlternateFunction : public NodeFunction + { + public: + RegexpAlternateFunction(const Children & children_, const Generators & generators_) + : children(getFuncs(children_, generators_)) + { + } + + size_t operator () (char * out, size_t size) override + { + std::uniform_int_distribution distribution(0, static_cast(children.size()-1)); + int chosen = distribution(thread_local_rng); + size_t consumed = children[chosen]->operator()(out, size); + chassert(consumed <= size); + return consumed; + } + + size_t getRequiredSize() override + { + size_t total_size = 0; + for (auto & child: children) + total_size = std::max(total_size, child->getRequiredSize()); + return total_size; + } + + private: + NodeFuncs children; + }; + + class RegexpRepeatFunction : public NodeFunction + { + public: + RegexpRepeatFunction(Regexp * re_, const Generators & generators_, int min_repeat_, int max_repeat_) + : func(generators_.at(re_)) + , min_repeat(min_repeat_) + , max_repeat(max_repeat_) + { + } + + size_t operator () (char * out, size_t size) override + { + std::uniform_int_distribution distribution(min_repeat, max_repeat); + int ntimes = distribution(thread_local_rng); + + size_t total_size = 0; + for (int i = 0; i < ntimes; ++i) + { + size_t consumed =func->operator()(out, size); + chassert(consumed <= size); + out += consumed; + size -= consumed; + total_size += consumed; + } + return total_size; + } + + size_t getRequiredSize() override + { + return max_repeat * func->getRequiredSize(); + } + + private: + NodeFunctionPtr func; + int min_repeat = 0; + int max_repeat = 0; + }; + + class RegexpCharClassFunction : public NodeFunction + { + using CharRanges = std::vector>; + + public: + explicit RegexpCharClassFunction(Regexp * re_) + { + CharClass * cc = re_->cc(); + chassert(cc); + if (cc->empty()) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "kRegexpCharClass is empty"); + + char_count = cc->size(); + char_ranges.reserve(std::distance(cc->begin(), cc->end())); + + for (const auto range: *cc) + { + char_ranges.emplace_back(range.lo, range.hi); + } + } + + size_t operator () (char * out, size_t size) override + { + chassert(UTFmax <= size); + + std::uniform_int_distribution distribution(1, char_count); + int chosen = distribution(thread_local_rng); + int count_down = chosen; + + auto it = char_ranges.begin(); 
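+ // Walk the sorted ranges, subtracting each range's width from `count_down` until the chosen index falls inside the current range; the resulting rune is then offset from that range's lower bound.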
+ for (; it != char_ranges.end(); ++it) + { + auto [lo, hi] = *it; + auto range_len = hi - lo + 1; + if (count_down <= range_len) + break; + count_down -= range_len; + } + + if (it == char_ranges.end()) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, + "Unable to choose the rune. Runes {}, ranges {}, chosen {}", + char_count, char_ranges.size(), chosen); + + auto [lo, _] = *it; + Rune r = lo + count_down - 1; + return re2::runetochar(out, &r); + } + + size_t getRequiredSize() override + { + return UTFmax; + } + + private: + int char_count = 0; + CharRanges char_ranges; + }; + + class RegexpLiteralStringFunction : public NodeFunction + { + public: + explicit RegexpLiteralStringFunction(Regexp * re_) + { + if (re_->nrunes() == 0) + return; + + char buffer[UTFmax]; + for (int i = 0; i < re_->nrunes(); ++i) + { + int n = re2::runetochar(buffer, &re_->runes()[i]); + literal_string += String(buffer, n); + } + } + + size_t operator () (char * out, size_t size) override + { + chassert(literal_string.size() <= size); + + memcpy(out, literal_string.data(), literal_string.size()); + return literal_string.size(); + } + + size_t getRequiredSize() override + { + return literal_string.size(); + } + + private: + String literal_string; + }; + + class RegexpLiteralFunction : public NodeFunction + { + public: + explicit RegexpLiteralFunction(Regexp * re_) + { + char buffer[UTFmax]; + + Rune r = re_->rune(); + int n = re2::runetochar(buffer, &r); + literal = String(buffer, n); + } + + size_t operator () (char * out, size_t size) override + { + chassert(literal.size() <= size); + + memcpy(out, literal.data(), literal.size()); + return literal.size(); + } + + size_t getRequiredSize() override + { + return literal.size(); + } + + private: + String literal; + }; + + class ThrowExceptionFunction : public NodeFunction + { + public: + explicit ThrowExceptionFunction(Regexp * re_) + : operation(magic_enum::enum_name(re_->op())) + { + } + + size_t operator () (char *, size_t) override + { + throw DB::Exception( + DB::ErrorCodes::BAD_ARGUMENTS, + "RandomStringPrepareWalker: regexp node '{}' is not supported for generating a random match", + operation); + } + + size_t getRequiredSize() override + { + return 0; + } + + private: + String operation; + }; + + +public: + std::function getGenerator() + { + if (root == nullptr) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "no root has been set"); + + if (generators.empty()) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "no generators"); + + auto root_func = generators.at(root); + auto required_buffer_size = root_func->getRequiredSize(); + auto generator_func = [=] () + -> String + { + auto buffer = String(required_buffer_size, '\0'); + size_t size = root_func->operator()(buffer.data(), buffer.size()); + buffer.resize(size); + return buffer; + }; + + root = nullptr; + generators = {}; + + return std::move(generator_func); + } + +private: + Children CopyChildrenArgs(Regexp ** children, int nchild) + { + Children result; + result.reserve(nchild); + for (int i = 0; i < nchild; ++i) + result.push_back(Copy(children[i])); + return result; + } + + Regexp * ShortVisit(Regexp* /*re*/, Regexp * /*parent_arg*/) override + { + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "ShortVisit should not be called"); + } + + Regexp * PreVisit(Regexp * re, Regexp * parent_arg, bool* /*stop*/) override /*noexcept*/ + { + if (parent_arg == nullptr) + { + chassert(root == nullptr); + chassert(re != nullptr); + root = re; + } + + return re; + } + + Regexp * PostVisit(Regexp * 
re, Regexp * /*parent_arg*/, Regexp * pre_arg, + Regexp ** child_args, int nchild_args) override /*noexcept*/ + { + switch (re->op()) + { + case kRegexpConcat: // Matches concatenation of sub_[0..nsub-1]. + generators[re] = std::make_shared(CopyChildrenArgs(child_args, nchild_args), generators); + break; + case kRegexpAlternate: // Matches union of sub_[0..nsub-1]. + generators[re] = std::make_shared(CopyChildrenArgs(child_args, nchild_args), generators); + break; + case kRegexpQuest: // Matches sub_[0] zero or one times. + chassert(nchild_args == 1); + generators[re] = std::make_shared(child_args[0], generators, 0, 1); + break; + case kRegexpStar: // Matches sub_[0] zero or more times. + chassert(nchild_args == 1); + generators[re] = std::make_shared(child_args[0], generators, 0, ImplicitMax); + break; + case kRegexpPlus: // Matches sub_[0] one or more times. + chassert(nchild_args == 1); + generators[re] = std::make_shared(child_args[0], generators, 1, ImplicitMax); + break; + case kRegexpCharClass: // Matches character class given by cc_. + chassert(nchild_args == 0); + generators[re] = std::make_shared(re); + break; + case kRegexpLiteralString: // Matches runes_. + chassert(nchild_args == 0); + generators[re] = std::make_shared(re); + break; + case kRegexpLiteral: // Matches rune_. + chassert(nchild_args == 0); + generators[re] = std::make_shared(re); + break; + case kRegexpCapture: // Parenthesized (capturing) subexpression. + chassert(nchild_args == 1); + generators[re] = generators.at(child_args[0]); + break; + + case kRegexpNoMatch: // Matches no strings. + case kRegexpEmptyMatch: // Matches empty string. + case kRegexpRepeat: // Matches sub_[0] at least min_ times, at most max_ times. + case kRegexpAnyChar: // Matches any character. + case kRegexpAnyByte: // Matches any byte [sic]. + case kRegexpBeginLine: // Matches empty string at beginning of line. + case kRegexpEndLine: // Matches empty string at end of line. + case kRegexpWordBoundary: // Matches word boundary "\b". + case kRegexpNoWordBoundary: // Matches not-a-word boundary "\B". + case kRegexpBeginText: // Matches empty string at beginning of text. + case kRegexpEndText: // Matches empty string at end of text. 
+ case kRegexpHaveMatch: // Forces match of entire expression + generators[re] = std::make_shared(re); + } + + return pre_arg; + } + + Regexp * root = nullptr; + Generators generators; +}; + +} + + +namespace DB +{ + +void RandomStringGeneratorByRegexp::RegexpPtrDeleter::operator() (re2::Regexp * re) const noexcept +{ + re->Decref(); +} + +RandomStringGeneratorByRegexp::RandomStringGeneratorByRegexp(const String & re_str) +{ + re2::RE2::Options options; + options.set_case_sensitive(true); + options.set_encoding(re2::RE2::Options::EncodingLatin1); + auto flags = static_cast(options.ParseFlags()); + + re2::RegexpStatus status; + regexp.reset(re2::Regexp::Parse(re_str, flags, &status)); + + if (!regexp) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, + "Error parsing regexp '{}': {}", + re_str, status.Text()); + + regexp.reset(regexp->Simplify()); + + auto walker = re2::RandomStringPrepareWalker(); + walker.Walk(regexp.get(), {}); + generatorFunc = walker.getGenerator(); + + { + auto test_check = generate(); + auto matched = RE2::FullMatch(test_check, re2::RE2(re_str)); + if (!matched) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, + "Generator is unable to produce random string for regexp '{}': {}", + re_str, test_check); + } +} + +String RandomStringGeneratorByRegexp::generate() const +{ + chassert(generatorFunc); + return generatorFunc(); +} + +} diff --git a/src/Common/MatchGenerator.h b/src/Common/MatchGenerator.h new file mode 100644 index 00000000000..68b22404d5a --- /dev/null +++ b/src/Common/MatchGenerator.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include + +namespace re2 +{ + class Regexp; +} + +namespace DB +{ + +class RandomStringGeneratorByRegexp +{ +public: + explicit RandomStringGeneratorByRegexp(const String & re_str); + String generate() const; + +private: + struct RegexpPtrDeleter + { + void operator()(re2::Regexp * re) const noexcept; + }; + using RegexpPtr = std::unique_ptr; + + RegexpPtr regexp; + std::function generatorFunc; +}; + +} diff --git a/src/Common/ObjectStorageKey.cpp b/src/Common/ObjectStorageKey.cpp index ca5617c8aa2..feda1d9ac29 100644 --- a/src/Common/ObjectStorageKey.cpp +++ b/src/Common/ObjectStorageKey.cpp @@ -65,4 +65,5 @@ ObjectStorageKey ObjectStorageKey::createAsAbsolute(String key_) object_key.is_relative = false; return object_key; } + } diff --git a/src/Common/ObjectStorageKeyGenerator.cpp b/src/Common/ObjectStorageKeyGenerator.cpp new file mode 100644 index 00000000000..7b4507a3abc --- /dev/null +++ b/src/Common/ObjectStorageKeyGenerator.cpp @@ -0,0 +1,94 @@ +#include "ObjectStorageKeyGenerator.h" + +#include +#include + +#include + + +class GeneratorWithTemplate : public DB::IObjectStorageKeysGenerator +{ +public: + explicit GeneratorWithTemplate(String key_template_) + : key_template(std::move(key_template_)) + , re_gen(key_template) + { + } + DB::ObjectStorageKey generate(const String &) const override + { + return DB::ObjectStorageKey::createAsAbsolute(re_gen.generate()); + } + +private: + String key_template; + DB::RandomStringGeneratorByRegexp re_gen; +}; + + +class GeneratorWithPrefix : public DB::IObjectStorageKeysGenerator +{ +public: + explicit GeneratorWithPrefix(String key_prefix_) + : key_prefix(std::move(key_prefix_)) + {} + + DB::ObjectStorageKey generate(const String &) const override + { + /// Path to store the new S3 object. + + /// Total length is 32 a-z characters for enough randomness. 
+ /// First 3 characters are used as a prefix for + /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/ + constexpr size_t key_name_total_size = 32; + constexpr size_t key_name_prefix_size = 3; + + /// Path to store new S3 object. + String key = fmt::format("{}/{}", + DB::getRandomASCIIString(key_name_prefix_size), + DB::getRandomASCIIString(key_name_total_size - key_name_prefix_size)); + + /// what ever key_prefix value is, consider that key as relative + return DB::ObjectStorageKey::createAsRelative(key_prefix, key); + } + +private: + String key_prefix; +}; + + +class GeneratorAsIsWithPrefix : public DB::IObjectStorageKeysGenerator +{ +public: + explicit GeneratorAsIsWithPrefix(String key_prefix_) + : key_prefix(std::move(key_prefix_)) + {} + + DB::ObjectStorageKey generate(const String & path) const override + { + return DB::ObjectStorageKey::createAsRelative(key_prefix, path); + } + +private: + String key_prefix; +}; + + +namespace DB +{ + +ObjectStorageKeysGeneratorPtr createObjectStorageKeysGeneratorAsIsWithPrefix(String key_prefix) +{ + return std::make_shared(std::move(key_prefix)); +} + +ObjectStorageKeysGeneratorPtr createObjectStorageKeysGeneratorByPrefix(String key_prefix) +{ + return std::make_shared(std::move(key_prefix)); +} + +ObjectStorageKeysGeneratorPtr createObjectStorageKeysGeneratorByTemplate(String key_template) +{ + return std::make_shared(std::move(key_template)); +} + +} diff --git a/src/Common/ObjectStorageKeyGenerator.h b/src/Common/ObjectStorageKeyGenerator.h new file mode 100644 index 00000000000..29f2a4a22c2 --- /dev/null +++ b/src/Common/ObjectStorageKeyGenerator.h @@ -0,0 +1,22 @@ +#pragma once + +#include "ObjectStorageKey.h" +#include + +namespace DB +{ + +class IObjectStorageKeysGenerator +{ +public: + virtual ObjectStorageKey generate(const String & path) const = 0; + virtual ~IObjectStorageKeysGenerator() = default; +}; + +using ObjectStorageKeysGeneratorPtr = std::shared_ptr; + +ObjectStorageKeysGeneratorPtr createObjectStorageKeysGeneratorAsIsWithPrefix(String key_prefix); +ObjectStorageKeysGeneratorPtr createObjectStorageKeysGeneratorByPrefix(String key_prefix); +ObjectStorageKeysGeneratorPtr createObjectStorageKeysGeneratorByTemplate(String key_template); + +} diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 119e0d99143..101959dbf83 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -391,6 +391,9 @@ The server successfully detected this situation and will download merged part fr M(DiskS3PutObject, "Number of DiskS3 API PutObject calls.") \ M(DiskS3GetObject, "Number of DiskS3 API GetObject calls.") \ \ + M(S3Clients, "Number of created S3 clients.") \ + M(TinyS3Clients, "Number of S3 clients copies which reuse an existing auth provider from another client.") \ + \ M(EngineFileLikeReadFiles, "Number of files read in table engines working with files (like File/S3/URL/HDFS).") \ \ M(ReadBufferFromS3Microseconds, "Time spent on reading from S3.") \ diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index 21235914f7c..fe513199ac2 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -296,6 +296,9 @@ constexpr std::pair replacements[] // Replace parts from @c replacements with shorter aliases String demangleAndCollapseNames(std::string_view file, const char * const symbol_name) { + if (!symbol_name) + return "?"; + std::string_view file_copy = file; if (auto trim_pos = file.find_last_of('/'); trim_pos != file.npos) 
file_copy.remove_suffix(file.size() - trim_pos); diff --git a/src/Common/randomNumber.h b/src/Common/randomNumber.h new file mode 100644 index 00000000000..b795f32baca --- /dev/null +++ b/src/Common/randomNumber.h @@ -0,0 +1,12 @@ +#pragma once + +#include +#include + +inline UInt32 randomNumber() +{ + pcg64_fast rng{randomSeed()}; + std::uniform_int_distribution dist6( + std::numeric_limits::min(), std::numeric_limits::max()); + return static_cast(dist6(rng)); +} diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp index 5c54dedbbde..ea8485fee92 100644 --- a/src/Common/tests/gtest_async_loader.cpp +++ b/src/Common/tests/gtest_async_loader.cpp @@ -2,6 +2,8 @@ #include #include +#include +#include #include #include #include @@ -544,6 +546,99 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies) } } +TEST(AsyncLoader, IgnoreDependencyFailure) +{ + AsyncLoaderTest t; + std::atomic success{false}; + t.loader.start(); + + std::string_view error_message = "test job failure"; + + auto failed_job_func = [&] (AsyncLoader &, const LoadJobPtr &) { + throw Exception(ErrorCodes::ASYNC_LOAD_FAILED, "{}", error_message); + }; + auto dependent_job_func = [&] (AsyncLoader &, const LoadJobPtr &) { + success.store(true); + }; + + auto failed_job = makeLoadJob({}, "failed_job", failed_job_func); + auto dependent_job = makeLoadJob({failed_job}, + "dependent_job", ignoreDependencyFailure, dependent_job_func); + auto task = t.schedule({ failed_job, dependent_job }); + + t.loader.wait(); + + ASSERT_EQ(failed_job->status(), LoadStatus::FAILED); + ASSERT_EQ(dependent_job->status(), LoadStatus::OK); + ASSERT_EQ(success.load(), true); +} + +TEST(AsyncLoader, CustomDependencyFailure) +{ + AsyncLoaderTest t(16); + int error_count = 0; + std::atomic good_count{0}; + std::barrier canceled_sync(4); + t.loader.start(); + + std::string_view error_message = "test job failure"; + + auto evil_dep_func = [&] (AsyncLoader &, const LoadJobPtr &) { + throw Exception(ErrorCodes::ASYNC_LOAD_FAILED, "{}", error_message); + }; + auto good_dep_func = [&] (AsyncLoader &, const LoadJobPtr &) { + good_count++; + }; + auto late_dep_func = [&] (AsyncLoader &, const LoadJobPtr &) { + canceled_sync.arrive_and_wait(); // wait for fail (A) before this job is finished + }; + auto collect_job_func = [&] (AsyncLoader &, const LoadJobPtr &) { + FAIL(); // job should be canceled, so we never get here + }; + auto dependent_job_func = [&] (AsyncLoader &, const LoadJobPtr &) { + FAIL(); // job should be canceled, so we never get here + }; + auto fail_after_two = [&] (const LoadJobPtr & self, const LoadJobPtr &, std::exception_ptr & cancel) { + if (++error_count == 2) + cancel = std::make_exception_ptr(Exception(ErrorCodes::ASYNC_LOAD_CANCELED, + "Load job '{}' canceled: too many dependencies have failed", + self->name)); + }; + + auto evil_dep1 = makeLoadJob({}, "evil_dep1", evil_dep_func); + auto evil_dep2 = makeLoadJob({}, "evil_dep2", evil_dep_func); + auto evil_dep3 = makeLoadJob({}, "evil_dep3", evil_dep_func); + auto good_dep1 = makeLoadJob({}, "good_dep1", good_dep_func); + auto good_dep2 = makeLoadJob({}, "good_dep2", good_dep_func); + auto good_dep3 = makeLoadJob({}, "good_dep3", good_dep_func); + auto late_dep1 = makeLoadJob({}, "late_dep1", late_dep_func); + auto late_dep2 = makeLoadJob({}, "late_dep2", late_dep_func); + auto late_dep3 = makeLoadJob({}, "late_dep3", late_dep_func); + auto collect_job = makeLoadJob({ + evil_dep1, evil_dep2, evil_dep3, + good_dep1, good_dep2, good_dep3, + 
late_dep1, late_dep2, late_dep3 + }, "collect_job", fail_after_two, collect_job_func); + auto dependent_job1 = makeLoadJob({ collect_job }, "dependent_job1", dependent_job_func); + auto dependent_job2 = makeLoadJob({ collect_job }, "dependent_job2", dependent_job_func); + auto dependent_job3 = makeLoadJob({ collect_job }, "dependent_job3", dependent_job_func); + auto task = t.schedule({ dependent_job1, dependent_job2, dependent_job3 }); // Other jobs should be discovery automatically + + t.loader.wait(collect_job, true); + canceled_sync.arrive_and_wait(); // (A) + + t.loader.wait(); + + ASSERT_EQ(late_dep1->status(), LoadStatus::OK); + ASSERT_EQ(late_dep2->status(), LoadStatus::OK); + ASSERT_EQ(late_dep3->status(), LoadStatus::OK); + ASSERT_EQ(collect_job->status(), LoadStatus::CANCELED); + ASSERT_EQ(dependent_job1->status(), LoadStatus::CANCELED); + ASSERT_EQ(dependent_job2->status(), LoadStatus::CANCELED); + ASSERT_EQ(dependent_job3->status(), LoadStatus::CANCELED); + ASSERT_EQ(good_count.load(), 3); +} + TEST(AsyncLoader, TestConcurrency) { AsyncLoaderTest t(10); diff --git a/src/Common/tests/gtest_generate_random_by_regexp.cpp b/src/Common/tests/gtest_generate_random_by_regexp.cpp new file mode 100644 index 00000000000..2f6260891c6 --- /dev/null +++ b/src/Common/tests/gtest_generate_random_by_regexp.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +#include + +void routine(String s) +{ + std::cerr << "case '"<< s << "'"; + auto gen = DB::RandomStringGeneratorByRegexp(s); + [[maybe_unused]] auto res = gen.generate(); + std::cerr << " result '"<< res << "'" << std::endl; +} + +TEST(GenerateRandomString, Positive) +{ + routine("."); + routine("[[:xdigit:]]"); + routine("[0-9a-f]"); + routine("[a-z]"); + routine("prefix-[0-9a-f]-suffix"); + routine("prefix-[a-z]-suffix"); + routine("[0-9a-f]{3}"); + routine("prefix-[0-9a-f]{3}-suffix"); + routine("prefix-[a-z]{3}-suffix/[0-9a-f]{20}"); + routine("left|right"); + routine("[a-z]{0,3}"); + routine("just constant string"); + routine("[a-z]?"); + routine("[a-z]*"); + routine("[a-z]+"); + routine("[^a-z]"); + routine("[[:lower:]]{3}/suffix"); + routine("prefix-(A|B|[0-9a-f]){3}"); + routine("mergetree/[a-z]{3}/[a-z]{29}"); +} + +TEST(GenerateRandomString, Negative) +{ + EXPECT_THROW(routine("[[:do_not_exists:]]"), DB::Exception); + EXPECT_THROW(routine("[:do_not_exis..."), DB::Exception); + EXPECT_THROW(routine("^abc"), DB::Exception); +} + +TEST(GenerateRandomString, DifferentResult) +{ + std::cerr << "100 different keys" << std::endl; + auto gen = DB::RandomStringGeneratorByRegexp("prefix-[a-z]{3}-suffix/[0-9a-f]{20}"); + std::set deduplicate; + for (int i = 0; i < 100; ++i) + ASSERT_TRUE(deduplicate.insert(gen.generate()).second); + std::cerr << "100 different keys: ok" << std::endl; +} + +TEST(GenerateRandomString, FullRange) +{ + std::cerr << "all possible letters" << std::endl; + auto gen = DB::RandomStringGeneratorByRegexp("[a-z]"); + std::set deduplicate; + int count = 'z' - 'a' + 1; + while (deduplicate.size() < count) + if (deduplicate.insert(gen.generate()).second) + std::cerr << " +1 "; + std::cerr << "all possible letters, ok" << std::endl; +} + +UInt64 elapsed(DB::ObjectStorageKeysGeneratorPtr generator) +{ + String path = "some_path"; + + Stopwatch watch; + + for (int i = 0; i < 100000; ++i) + { + [[ maybe_unused ]] auto result = generator->generate(path).serialize(); + } + + return watch.elapsedMicroseconds(); +} + +TEST(ObjectStorageKey, Performance) +{ + auto elapsed_old = 
elapsed(DB::createObjectStorageKeysGeneratorByPrefix( + "xx-xx-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/mergetree/")); + std::cerr << "old: " << elapsed_old << std::endl; + + auto elapsed_new = elapsed(DB::createObjectStorageKeysGeneratorByTemplate( + "xx-xx-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/mergetree/[a-z]{3}/[a-z]{29}")); + std::cerr << "new: " << elapsed_new << std::endl; + + if (elapsed_new > elapsed_old) + { + std::cerr << "slow ratio: +" << float(elapsed_new) / elapsed_old << std::endl; + ASSERT_LT(elapsed_new, 1.2 * elapsed_old); + } + else + std::cerr << "fast ratio: " << float(elapsed_old) / elapsed_new << std::endl; + +} diff --git a/src/Compression/CompressionCodecZSTD.cpp b/src/Compression/CompressionCodecZSTD.cpp index ec37ec6a7b5..7aecb652efc 100644 --- a/src/Compression/CompressionCodecZSTD.cpp +++ b/src/Compression/CompressionCodecZSTD.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -9,42 +9,11 @@ #include #include #include - +#include +#include namespace DB { - -class CompressionCodecZSTD : public ICompressionCodec -{ -public: - static constexpr auto ZSTD_DEFAULT_LEVEL = 1; - static constexpr auto ZSTD_DEFAULT_LOG_WINDOW = 24; - - explicit CompressionCodecZSTD(int level_); - CompressionCodecZSTD(int level_, int window_log); - - uint8_t getMethodByte() const override; - - UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; - - void updateHash(SipHash & hash) const override; - -protected: - - UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; - - void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; - - bool isCompression() const override { return true; } - bool isGenericCompression() const override { return true; } - -private: - const int level; - const bool enable_long_range; - const int window_log; -}; - - namespace ErrorCodes { extern const int CANNOT_COMPRESS; @@ -82,7 +51,7 @@ UInt32 CompressionCodecZSTD::doCompressData(const char * source, UInt32 source_s ZSTD_freeCCtx(cctx); if (ZSTD_isError(compressed_size)) - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress with ZSTD codec: {}", std::string(ZSTD_getErrorName(compressed_size))); + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress with ZSTD codec: {}", ZSTD_getErrorName(compressed_size)); return static_cast(compressed_size); } @@ -96,13 +65,19 @@ void CompressionCodecZSTD::doDecompressData(const char * source, UInt32 source_s throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress ZSTD-encoded data: {}", std::string(ZSTD_getErrorName(res))); } -CompressionCodecZSTD::CompressionCodecZSTD(int level_, int window_log_) : level(level_), enable_long_range(true), window_log(window_log_) +CompressionCodecZSTD::CompressionCodecZSTD(int level_, int window_log_) + : level(level_) + , enable_long_range(true) + , window_log(window_log_) { setCodecDescription( "ZSTD", {std::make_shared(static_cast(level)), std::make_shared(static_cast(window_log))}); } -CompressionCodecZSTD::CompressionCodecZSTD(int level_) : level(level_), enable_long_range(false), window_log(0) +CompressionCodecZSTD::CompressionCodecZSTD(int level_) + : level(level_) + , enable_long_range(false) + , window_log(0) { setCodecDescription("ZSTD", {std::make_shared(static_cast(level))}); } diff --git a/src/Compression/CompressionCodecZSTD.h b/src/Compression/CompressionCodecZSTD.h new file mode 100644 index 00000000000..cdded9fc08a --- /dev/null +++
b/src/Compression/CompressionCodecZSTD.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +namespace DB +{ + +class CompressionCodecZSTD : public ICompressionCodec +{ +public: + static constexpr auto ZSTD_DEFAULT_LEVEL = 1; + static constexpr auto ZSTD_DEFAULT_LOG_WINDOW = 24; + + explicit CompressionCodecZSTD(int level_); + CompressionCodecZSTD(int level_, int window_log); + + uint8_t getMethodByte() const override; + + UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; + + void updateHash(SipHash & hash) const override; + +protected: + + UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; + + void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; + + bool isCompression() const override { return true; } + bool isGenericCompression() const override { return true; } + +private: + const int level; + const bool enable_long_range; + const int window_log; +}; + +} diff --git a/src/Compression/CompressionCodecZSTDQAT.cpp b/src/Compression/CompressionCodecZSTDQAT.cpp new file mode 100644 index 00000000000..4828a71a515 --- /dev/null +++ b/src/Compression/CompressionCodecZSTDQAT.cpp @@ -0,0 +1,113 @@ +#ifdef ENABLE_ZSTD_QAT_CODEC + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int CANNOT_COMPRESS; + extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE; + extern const int ILLEGAL_CODEC_PARAMETER; +} + +/// Hardware-accelerated ZSTD. Supports only compression so far. +class CompressionCodecZSTDQAT : public CompressionCodecZSTD +{ +public: + static constexpr auto ZSTDQAT_SUPPORTED_MIN_LEVEL = 1; + static constexpr auto ZSTDQAT_SUPPORTED_MAX_LEVEL = 12; + static constexpr int ZSTDQAT_DEVICE_UNINITIALIZED = 0XFFFF; + + explicit CompressionCodecZSTDQAT(int level_); + +protected: + bool isZstdQat() const override { return true; } + UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; + +private: + const int level; + Poco::Logger * log; + static std::atomic qat_state; /// Global initialization status of QAT device, we fall back back to software compression if uninitialized +}; + +std::atomic CompressionCodecZSTDQAT::qat_state = ZSTDQAT_DEVICE_UNINITIALIZED; + +UInt32 CompressionCodecZSTDQAT::doCompressData(const char * source, UInt32 source_size, char * dest) const +{ + if (qat_state == ZSTDQAT_DEVICE_UNINITIALIZED) + { + qat_state = QZSTD_startQatDevice(); + if (qat_state == QZSTD_OK) + LOG_DEBUG(log, "Initialization of hardware-assissted ZSTD_QAT codec successful"); + else + LOG_WARNING(log, "Initialization of hardware-assisted ZSTD_QAT codec failed, falling back to software ZSTD codec -> status: {}", qat_state); + } + + ZSTD_CCtx * cctx = ZSTD_createCCtx(); + ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level); + + void * sequence_producer_state = nullptr; + if (qat_state == QZSTD_OK) + { + sequence_producer_state = QZSTD_createSeqProdState(); + ZSTD_registerSequenceProducer(cctx, sequence_producer_state, qatSequenceProducer); + ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1); + } + + size_t compressed_size = ZSTD_compress2(cctx, dest, ZSTD_compressBound(source_size), source, source_size); + QZSTD_freeSeqProdState(sequence_producer_state); + ZSTD_freeCCtx(cctx); + + if (ZSTD_isError(compressed_size)) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress with ZSTD_QAT codec: {}", ZSTD_getErrorName(compressed_size)); + + 
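+ // Note: if QZSTD_startQatDevice() did not succeed, no sequence producer is registered and this is plain software ZSTD; with the producer registered, ZSTD_c_enableSeqProducerFallback lets zstd fall back to software if hardware sequence production fails.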
return static_cast(compressed_size); +} + +void registerCodecZSTDQAT(CompressionCodecFactory & factory) +{ + UInt8 method_code = static_cast(CompressionMethodByte::ZSTD_QPL); + factory.registerCompressionCodec("ZSTD_QAT", method_code, [&](const ASTPtr & arguments) -> CompressionCodecPtr + { + int level = CompressionCodecZSTD::ZSTD_DEFAULT_LEVEL; + if (arguments && !arguments->children.empty()) + { + if (arguments->children.size() > 1) + throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE, "ZSTD_QAT codec must have 1 parameter, given {}", arguments->children.size()); + + const auto children = arguments->children; + const auto * literal = children[0]->as(); + if (!literal) + throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "ZSTD_QAT codec argument must be integer"); + + level = static_cast(literal->value.safeGet()); + if (level < CompressionCodecZSTDQAT::ZSTDQAT_SUPPORTED_MIN_LEVEL || level > CompressionCodecZSTDQAT::ZSTDQAT_SUPPORTED_MAX_LEVEL) + /// that's a hardware limitation + throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, + "ZSTDQAT codec doesn't support level more than {} and lower than {} , given {}", + CompressionCodecZSTDQAT::ZSTDQAT_SUPPORTED_MAX_LEVEL, CompressionCodecZSTDQAT::ZSTDQAT_SUPPORTED_MIN_LEVEL, level); + } + + return std::make_shared(level); + }); +} + +CompressionCodecZSTDQAT::CompressionCodecZSTDQAT(int level_) + : CompressionCodecZSTD(level_) + , level(level_) + , log(&Poco::Logger::get("CompressionCodecZSTDQAT")) +{ + setCodecDescription("ZSTD_QAT", {std::make_shared(static_cast(level))}); +} + +} + +#endif diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp index 7959c431328..f4413401667 100644 --- a/src/Compression/CompressionFactory.cpp +++ b/src/Compression/CompressionFactory.cpp @@ -167,6 +167,9 @@ void registerCodecNone(CompressionCodecFactory & factory); void registerCodecLZ4(CompressionCodecFactory & factory); void registerCodecLZ4HC(CompressionCodecFactory & factory); void registerCodecZSTD(CompressionCodecFactory & factory); +#ifdef ENABLE_ZSTD_QAT_CODEC +void registerCodecZSTDQAT(CompressionCodecFactory & factory); +#endif void registerCodecMultiple(CompressionCodecFactory & factory); #ifdef ENABLE_QPL_COMPRESSION void registerCodecDeflateQpl(CompressionCodecFactory & factory); @@ -189,6 +192,9 @@ CompressionCodecFactory::CompressionCodecFactory() registerCodecNone(*this); registerCodecLZ4(*this); registerCodecZSTD(*this); +#ifdef ENABLE_ZSTD_QAT_CODEC + registerCodecZSTDQAT(*this); +#endif registerCodecLZ4HC(*this); registerCodecMultiple(*this); #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD diff --git a/src/Compression/CompressionFactory.h b/src/Compression/CompressionFactory.h index 4f2627587a3..e71476d564d 100644 --- a/src/Compression/CompressionFactory.h +++ b/src/Compression/CompressionFactory.h @@ -40,10 +40,10 @@ public: CompressionCodecPtr getDefaultCodec() const; /// Validate codecs AST specified by user and parses codecs description (substitute default parameters) - ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const; + ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec, bool enable_zstd_qat_codec) const; /// Validate codecs AST specified by user - void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool 
allow_experimental_codecs, bool enable_deflate_qpl_codec) const; + void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec, bool enable_zstd_qat_codec) const; /// Get codec by AST and possible column_type. Some codecs can use /// information about type to improve inner settings, but every codec should diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index 98e9e7480da..f4d993f628e 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -34,7 +34,7 @@ namespace ErrorCodes void CompressionCodecFactory::validateCodec( - const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const + const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec, bool enable_zstd_qat_codec) const { if (family_name.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Compression codec name cannot be empty"); @@ -43,13 +43,13 @@ void CompressionCodecFactory::validateCodec( { auto literal = std::make_shared(static_cast(*level)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", makeASTFunction(Poco::toUpper(family_name), literal)), - {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec); + {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec, enable_zstd_qat_codec); } else { auto identifier = std::make_shared(Poco::toUpper(family_name)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", identifier), - {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec); + {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec, enable_zstd_qat_codec); } } @@ -77,7 +77,7 @@ bool innerDataTypeIsFloat(const DataTypePtr & type) } ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( - const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const + const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec, bool enable_zstd_qat_codec) const { if (const auto * func = ast->as()) { @@ -165,6 +165,12 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( " You can enable it with the 'enable_deflate_qpl_codec' setting.", codec_family_name); + if (!enable_zstd_qat_codec && result_codec->isZstdQat()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Codec {} is disabled by default." + " You can enable it with the 'enable_zstd_qat_codec' setting.", + codec_family_name); + codecs_descriptions->children.emplace_back(result_codec->getCodecDesc()); } diff --git a/src/Compression/CompressionInfo.h b/src/Compression/CompressionInfo.h index 1b4025fed1d..ee4b3e38653 100644 --- a/src/Compression/CompressionInfo.h +++ b/src/Compression/CompressionInfo.h @@ -48,6 +48,7 @@ enum class CompressionMethodByte : uint8_t FPC = 0x98, DeflateQpl = 0x99, GCD = 0x9a, + ZSTD_QPL = 0x9b, }; } diff --git a/src/Compression/ICompressionCodec.h b/src/Compression/ICompressionCodec.h index ca794511268..18ff543d908 100644 --- a/src/Compression/ICompressionCodec.h +++ b/src/Compression/ICompressionCodec.h @@ -121,6 +121,9 @@ public: /// Is this the DEFLATE_QPL codec? virtual bool isDeflateQpl() const { return false; } + /// Is this the ZSTD_QAT codec? 
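+ /// Checked in CompressionCodecFactory::validateCodecAndGetPreprocessedAST: ZSTD_QAT is rejected unless the `enable_zstd_qat_codec` setting is enabled.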
+ virtual bool isZstdQat() const { return false; } + /// If it does nothing. virtual bool isNone() const { return false; } diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index 910615bf6ef..716184e07d0 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -70,7 +70,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo { std::lock_guard client_lock{snapshot_s3_client_mutex}; // if client is not changed (same auth settings, same endpoint) we don't need to update - if (snapshot_s3_client && snapshot_s3_client->client && auth_settings == snapshot_s3_client->auth_settings + if (snapshot_s3_client && snapshot_s3_client->client && !snapshot_s3_client->auth_settings.hasUpdates(auth_settings) && snapshot_s3_client->uri.uri == new_uri.uri) return; } diff --git a/src/Core/MySQL/MySQLGtid.cpp b/src/Core/MySQL/MySQLGtid.cpp index 2b46c3d14ad..7916f882979 100644 --- a/src/Core/MySQL/MySQLGtid.cpp +++ b/src/Core/MySQL/MySQLGtid.cpp @@ -188,4 +188,46 @@ String GTIDSets::toPayload() const return buffer.str(); } +bool GTIDSet::contains(const GTIDSet & gtid_set) const +{ + //we contain the other set if each of its intervals are contained in any of our intervals. + //use the fact that intervals are sorted to make this linear instead of quadratic. + if (uuid != gtid_set.uuid) { return false; } + + auto mine = intervals.begin(), other = gtid_set.intervals.begin(); + auto my_end = intervals.end(), other_end = gtid_set.intervals.end(); + while (mine != my_end && other != other_end) + { + bool mine_contains_other = mine->start <= other->start && mine->end >= other->end; + if (mine_contains_other) + { + ++other; + } + else + { + ++mine; + } + } + + return other == other_end; //if we've iterated through all intervals in the argument, all its intervals are contained in this +} + +bool GTIDSets::contains(const GTIDSet & gtid_set) const +{ + for (const auto & my_gtid_set : sets) + { + if (my_gtid_set.contains(gtid_set)) { return true; } + } + return false; +} + +bool GTIDSets::contains(const GTIDSets & gtid_sets) const +{ + for (const auto & gtid_set : gtid_sets.sets) + { + if (!this->contains(gtid_set)) { return false; } + } + return true; +} + } diff --git a/src/Core/MySQL/MySQLGtid.h b/src/Core/MySQL/MySQLGtid.h index 45eeaf02fa2..b7cff39cca6 100644 --- a/src/Core/MySQL/MySQLGtid.h +++ b/src/Core/MySQL/MySQLGtid.h @@ -28,6 +28,8 @@ public: void tryMerge(size_t i); static void tryShrink(GTIDSet & set, unsigned int i, Interval & current); + + bool contains(const GTIDSet & gtid_set) const; }; class GTIDSets @@ -40,6 +42,31 @@ public: String toString() const; String toPayload() const; + bool contains(const GTIDSet & gtid_set) const; + bool contains(const GTIDSets & gtid_sets) const; }; +inline bool operator==(const GTID & left, const GTID & right) +{ + return left.uuid == right.uuid + && left.seq_no == right.seq_no; +} + +inline bool operator==(const GTIDSet::Interval & left, const GTIDSet::Interval & right) +{ + return left.start == right.start + && left.end == right.end; +} + +inline bool operator==(const GTIDSet & left, const GTIDSet & right) +{ + return left.uuid == right.uuid + && left.intervals == right.intervals; +} + +inline bool operator==(const GTIDSets & left, const GTIDSets & right) +{ + return left.sets == right.sets; +} + } diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index dcf42134b0b..403f98360c1 100644 --- 
a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -967,6 +967,59 @@ namespace MySQLReplication out << "[DryRun Event]" << '\n'; } + void UnparsedRowsEvent::dump(WriteBuffer & out) const + { + std::lock_guard lock(mutex); + header.dump(out); + out << "[UnparsedRowsEvent Event]" << '\n'; + out << "Unparsed Data Size: " << unparsed_data.size() << '\n'; + } + + void UnparsedRowsEvent::parseImpl(ReadBuffer & payload_) + { + char c = 0; + if (payload_.position() < payload_.buffer().end()) + unparsed_data.reserve(payload_.buffer().end() - payload_.position()); + /// Prevent reading after the end + /// payload.available() might have incorrect value + while (payload_.position() <= payload_.buffer().end() && payload_.read(c)) + unparsed_data.push_back(c); + if (!payload_.eof()) + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all data. Available {} bytes but not eof", payload_.available()); + } + + std::shared_ptr UnparsedRowsEvent::parse() + { + std::lock_guard lock(mutex); + if (!unparsed_data.empty()) + { + RowsEventHeader rows_header(header.type); + rows_header.table_id = table_id; + rows_header.flags = flags; + switch (header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + parsed_event = std::make_shared(table_map, EventHeader(header), rows_header); + break; + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + parsed_event = std::make_shared(table_map, EventHeader(header), rows_header); + break; + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + parsed_event = std::make_shared(table_map, EventHeader(header), rows_header); + break; + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown event type: {}", magic_enum::enum_name(header.type)); + } + ReadBufferFromMemory payload(unparsed_data.data(), unparsed_data.size()); + parsed_event->parseEvent(payload); + unparsed_data.clear(); + } + return parsed_event; + } + /// Update binlog name/position/gtid based on the event type. 
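+ /// The event timestamp is recorded as well (see `Position::timestamp`).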
void Position::update(BinlogEventPtr event) { @@ -998,7 +1051,8 @@ namespace MySQLReplication case ROTATE_EVENT: { auto rotate = std::static_pointer_cast(event); binlog_name = rotate->next_binlog; - binlog_pos = event->header.log_pos; + /// If binlog name has changed, need to use position from next binlog + binlog_pos = rotate->position; break; } case GTID_EVENT: { @@ -1012,13 +1066,18 @@ namespace MySQLReplication default: throw ReplicationError(ErrorCodes::LOGICAL_ERROR, "Position update with unsupported event"); } + if (event->header.timestamp > 0) + { + timestamp = event->header.timestamp; + } } - void Position::update(UInt64 binlog_pos_, const String & binlog_name_, const String & gtid_sets_) + void Position::update(UInt64 binlog_pos_, const String & binlog_name_, const String & gtid_sets_, UInt32 binlog_time_) { binlog_pos = binlog_pos_; binlog_name = binlog_name_; gtid_sets.parse(gtid_sets_); + timestamp = binlog_time_; } void Position::dump(WriteBuffer & out) const diff --git a/src/Core/MySQL/MySQLReplication.h b/src/Core/MySQL/MySQLReplication.h index 1584dbd42ac..6ba507245b3 100644 --- a/src/Core/MySQL/MySQLReplication.h +++ b/src/Core/MySQL/MySQLReplication.h @@ -181,6 +181,7 @@ namespace MySQLReplication MYSQL_WRITE_ROWS_EVENT = 2, MYSQL_UPDATE_ROWS_EVENT = 3, MYSQL_DELETE_ROWS_EVENT = 4, + MYSQL_UNPARSED_ROWS_EVENT = 100, }; class ReplicationError : public DB::Exception @@ -274,6 +275,8 @@ namespace MySQLReplication String status; String schema; String query; + String query_database_name; + String query_table_name; QueryType typ = QUERY_EVENT_DDL; bool transaction_complete = true; @@ -446,7 +449,6 @@ namespace MySQLReplication void parseImpl(ReadBuffer & payload) override; void parseRow(ReadBuffer & payload, Bitmap & bitmap); - private: std::shared_ptr table_map; }; @@ -497,17 +499,38 @@ namespace MySQLReplication void parseImpl(ReadBuffer & payload) override; }; + class UnparsedRowsEvent : public RowsEvent + { + public: + UnparsedRowsEvent(const std::shared_ptr & table_map_, EventHeader && header_, const RowsEventHeader & rows_header) + : RowsEvent(table_map_, std::move(header_), rows_header) + { + } + + void dump(WriteBuffer & out) const override; + MySQLEventType type() const override { return MYSQL_UNPARSED_ROWS_EVENT; } + std::shared_ptr parse(); + + protected: + void parseImpl(ReadBuffer & payload) override; + std::vector unparsed_data; + std::shared_ptr parsed_event; + mutable std::mutex mutex; + }; + class Position { public: UInt64 binlog_pos; String binlog_name; GTIDSets gtid_sets; + UInt32 timestamp; - Position() : binlog_pos(0) { } + Position() : binlog_pos(0), timestamp(0) { } void update(BinlogEventPtr event); - void update(UInt64 binlog_pos_, const String & binlog_name_, const String & gtid_sets_); + void update(UInt64 binlog_pos_, const String & binlog_name_, const String & gtid_sets_, UInt32 binlog_time_); void dump(WriteBuffer & out) const; + void resetPendingGTID() { pending_gtid.reset(); } private: std::optional pending_gtid; diff --git a/src/Core/MySQL/tests/gtest_MySQLGtid.cpp b/src/Core/MySQL/tests/gtest_MySQLGtid.cpp new file mode 100644 index 00000000000..e31a87aaa39 --- /dev/null +++ b/src/Core/MySQL/tests/gtest_MySQLGtid.cpp @@ -0,0 +1,40 @@ +#include +#include + +using namespace DB; + + +GTEST_TEST(GTIDSetsContains, Tests) +{ + GTIDSets gtid_set, + contained1, contained2, contained3, contained4, contained5, + not_contained1, not_contained2, not_contained3, not_contained4, not_contained5, not_contained6; + + 
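+ // A set is considered contained when each of its intervals is covered by some interval of `gtid_set` (see GTIDSet::contains).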
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60"); + contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60"); + contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:2-3:11:47-49"); + contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:11"); + contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:47-49:60"); + contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:60"); + + not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60"); + not_contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:0-3:11:47-49"); + not_contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:99"); + not_contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:46-49:60"); + not_contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:99"); + not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60, 00000000-0000-0000-0000-000000000000"); + + + ASSERT_TRUE(gtid_set.contains(contained1)); + ASSERT_TRUE(gtid_set.contains(contained2)); + ASSERT_TRUE(gtid_set.contains(contained3)); + ASSERT_TRUE(gtid_set.contains(contained4)); + ASSERT_TRUE(gtid_set.contains(contained5)); + + ASSERT_FALSE(gtid_set.contains(not_contained1)); + ASSERT_FALSE(gtid_set.contains(not_contained2)); + ASSERT_FALSE(gtid_set.contains(not_contained3)); + ASSERT_FALSE(gtid_set.contains(not_contained4)); + ASSERT_FALSE(gtid_set.contains(not_contained5)); + ASSERT_FALSE(gtid_set.contains(not_contained6)); +} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 58b7cbab4c9..0e6da579b10 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -204,6 +204,8 @@ class IColumn; M(Bool, input_format_parallel_parsing, true, "Enable parallel parsing for some data formats.", 0) \ M(UInt64, min_chunk_bytes_for_parallel_parsing, (10 * 1024 * 1024), "The minimum chunk size in bytes, which each thread will parse in parallel.", 0) \ M(Bool, output_format_parallel_formatting, true, "Enable parallel formatting for some data formats.", 0) \ + M(UInt64, output_format_compression_level, 3, "Default compression level if query output is compressed. The setting is applied when `SELECT` query has `INTO OUTFILE` or when inserting to table function `file`, `url`, `hdfs`, `s3`, and `azureBlobStorage`.", 0) \ + M(UInt64, output_format_compression_zstd_window_log, 0, "Can be used when the output compression method is `zstd`. 
If greater than `0`, this setting explicitly sets compression window size (power of `2`) and enables a long-range mode for zstd compression.", 0) \ \ M(UInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized.", 0) \ M(UInt64, merge_tree_min_bytes_for_concurrent_read, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized.", 0) \ @@ -352,6 +354,7 @@ class IColumn; M(Bool, allow_suspicious_codecs, false, "If it is set to true, allow to specify meaningless compression codecs.", 0) \ M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ M(Bool, enable_deflate_qpl_codec, false, "Enable/disable the DEFLATE_QPL codec.", 0) \ + M(Bool, enable_zstd_qat_codec, false, "Enable/disable the ZSTD_QAT codec.", 0) \ M(UInt64, query_profiler_real_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for real clock timer of query profiler (in nanoseconds). Set 0 value to turn off the real clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(UInt64, query_profiler_cpu_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for CPU clock timer of query profiler (in nanoseconds). Set 0 value to turn off the CPU clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(Bool, metrics_perf_events_enabled, false, "If enabled, some of the perf events will be measured throughout queries' execution.", 0) \ diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 8973b533720..ba1b2cdacad 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -227,11 +227,17 @@ LoadTaskPtr DatabaseOrdinary::startupDatabaseAsync( LoadJobSet startup_after, LoadingStrictnessLevel /*mode*/) { - // NOTE: this task is empty, but it is required for correct dependency handling (startup should be done after tables loading) auto job = makeLoadJob( std::move(startup_after), TablesLoaderBackgroundStartupPoolId, - fmt::format("startup Ordinary database {}", getDatabaseName())); + fmt::format("startup Ordinary database {}", getDatabaseName()), + ignoreDependencyFailure, + [] (AsyncLoader &, const LoadJobPtr &) + { + // NOTE: this job is no-op, but it is required for correct dependency handling + // 1) startup should be done after tables loading + // 2) load or startup errors for tables should not lead to not starting up the whole database + }); return startup_database_task = makeLoadTask(async_loader, {job}); } diff --git a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp index cbb080a0baa..99dd337189c 100644 --- a/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp +++ b/src/Databases/MySQL/DatabaseMaterializedMySQL.cpp @@ -10,6 +10,7 @@ # include # include # include +# include # include # include # include @@ -39,10 +40,11 @@ DatabaseMaterializedMySQL::DatabaseMaterializedMySQL( const String & mysql_database_name_, mysqlxx::Pool && pool_, MySQLClient && client_, + const MySQLReplication::BinlogClientPtr & binlog_client_, std::unique_ptr settings_) : DatabaseAtomic(database_name_, metadata_path_, uuid, 
"DatabaseMaterializedMySQL(" + database_name_ + ")", context_) , settings(std::move(settings_)) - , materialize_thread(context_, database_name_, mysql_database_name_, std::move(pool_), std::move(client_), settings.get()) + , materialize_thread(context_, database_name_, mysql_database_name_, std::move(pool_), std::move(client_), binlog_client_, settings.get()) { } @@ -197,6 +199,7 @@ void registerDatabaseMaterializedMySQL(DatabaseFactory & factory) if (!engine->arguments) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Engine `{}` must have arguments", engine_name); + MySQLReplication::BinlogClientPtr binlog_client; StorageMySQL::Configuration configuration; ASTs & arguments = engine->arguments->children; auto mysql_settings = std::make_unique(); @@ -241,6 +244,12 @@ void registerDatabaseMaterializedMySQL(DatabaseFactory & factory) if (engine_define->settings) materialize_mode_settings->loadFromQuery(*engine_define); + if (materialize_mode_settings->use_binlog_client) + binlog_client = DB::MySQLReplication::BinlogClientFactory::instance().getClient( + configuration.host, configuration.port, configuration.username, configuration.password, + materialize_mode_settings->max_bytes_in_binlog_dispatcher_buffer, + materialize_mode_settings->max_flush_milliseconds_in_binlog_dispatcher); + if (args.uuid == UUIDHelpers::Nil) { auto print_create_ast = args.create_query.clone(); @@ -261,6 +270,7 @@ void registerDatabaseMaterializedMySQL(DatabaseFactory & factory) configuration.database, std::move(mysql_pool), std::move(client), + binlog_client, std::move(materialize_mode_settings)); }; factory.registerDatabase("MaterializeMySQL", create_fn); diff --git a/src/Databases/MySQL/DatabaseMaterializedMySQL.h b/src/Databases/MySQL/DatabaseMaterializedMySQL.h index 895498723fd..4d7871d49d6 100644 --- a/src/Databases/MySQL/DatabaseMaterializedMySQL.h +++ b/src/Databases/MySQL/DatabaseMaterializedMySQL.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ public: const String & mysql_database_name_, mysqlxx::Pool && pool_, MySQLClient && client_, + const MySQLReplication::BinlogClientPtr & binlog_client_, std::unique_ptr settings_); void rethrowExceptionIfNeeded() const; diff --git a/src/Databases/MySQL/MaterializedMySQLSettings.h b/src/Databases/MySQL/MaterializedMySQLSettings.h index 82342f8c76d..557d48be85b 100644 --- a/src/Databases/MySQL/MaterializedMySQLSettings.h +++ b/src/Databases/MySQL/MaterializedMySQLSettings.h @@ -17,6 +17,11 @@ class ASTStorage; M(Int64, max_wait_time_when_mysql_unavailable, 1000, "Retry interval when MySQL is not available (milliseconds). Negative value disable retry.", 0) \ M(Bool, allows_query_when_mysql_lost, false, "Allow query materialized table when mysql is lost.", 0) \ M(String, materialized_mysql_tables_list, "", "a comma-separated list of mysql database tables, which will be replicated by MaterializedMySQL database engine. 
Default value: empty list — means whole tables will be replicated.", 0) \ + M(Bool, use_binlog_client, false, "Use MySQL Binlog Client.", 0) \ + M(UInt64, max_bytes_in_binlog_queue, 64 * 1024 * 1024, "Max bytes in binlog's queue created from MySQL Binlog Client.", 0) \ + M(UInt64, max_milliseconds_to_wait_in_binlog_queue, 10000, "Max milliseconds to wait when max bytes exceeded in a binlog queue.", 0) \ + M(UInt64, max_bytes_in_binlog_dispatcher_buffer, DBMS_DEFAULT_BUFFER_SIZE, "Max bytes in the binlog dispatcher's buffer before it is flushed to attached binlogs.", 0) \ + M(UInt64, max_flush_milliseconds_in_binlog_dispatcher, 1000, "Max milliseconds in the binlog dispatcher's buffer to wait before it is flushed to attached binlogs.", 0) \ DECLARE_SETTINGS_TRAITS(MaterializedMySQLSettingsTraits, LIST_OF_MATERIALIZE_MODE_SETTINGS) diff --git a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp index 14cd89e1ff6..5834fb96dc6 100644 --- a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp @@ -26,14 +26,13 @@ #include #include #include -#include +#include #include #include #include #include #include #include -#include namespace DB { @@ -48,8 +47,43 @@ namespace ErrorCodes extern const int UNKNOWN_DATABASE; extern const int UNKNOWN_EXCEPTION; extern const int CANNOT_READ_ALL_DATA; + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int NETWORK_ERROR; + extern const int UNKNOWN_TABLE; + extern const int CANNOT_GET_CREATE_TABLE_QUERY; + extern const int THERE_IS_NO_QUERY; + extern const int QUERY_WAS_CANCELLED; + extern const int TABLE_ALREADY_EXISTS; + extern const int DATABASE_ALREADY_EXISTS; + extern const int DATABASE_NOT_EMPTY; + extern const int TABLE_IS_DROPPED; + extern const int TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT; + extern const int CANNOT_CREATE_CHARSET_CONVERTER; + extern const int UNKNOWN_FUNCTION; + extern const int UNKNOWN_IDENTIFIER; + extern const int UNKNOWN_TYPE; + extern const int TIMEOUT_EXCEEDED; + extern const int MEMORY_LIMIT_EXCEEDED; + extern const int MYSQL_SYNTAX_ERROR; } +// USE MySQL ERROR CODE: +// https://dev.mysql.com/doc/mysql-errors/5.7/en/server-error-reference.html +constexpr int ER_ACCESS_DENIED_ERROR = 1045; /// NOLINT +constexpr int ER_DBACCESS_DENIED_ERROR = 1044; /// NOLINT +constexpr int ER_BAD_DB_ERROR = 1049; /// NOLINT +constexpr int ER_MASTER_HAS_PURGED_REQUIRED_GTIDS = 1789; /// NOLINT +constexpr int ER_MASTER_FATAL_ERROR_READING_BINLOG = 1236; /// NOLINT + +// https://dev.mysql.com/doc/mysql-errors/8.0/en/client-error-reference.html +constexpr int CR_CONN_HOST_ERROR = 2003; /// NOLINT +constexpr int CR_SERVER_GONE_ERROR = 2006; /// NOLINT +constexpr int CR_SERVER_LOST = 2013; /// NOLINT +constexpr int ER_SERVER_SHUTDOWN = 1053; /// NOLINT +constexpr int ER_LOCK_DEADLOCK = 1213; /// NOLINT +constexpr int ER_LOCK_WAIT_TIMEOUT = 1205; /// NOLINT +constexpr int ER_OPTION_PREVENTS_STATEMENT = 1290; /// NOLINT + static constexpr auto MYSQL_BACKGROUND_THREAD_NAME = "MySQLDBSync"; static ContextMutablePtr createQueryContext(ContextPtr context) @@ -157,12 +191,68 @@ static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection, const S } } +static bool shouldReconnectOnException(const std::exception_ptr & e) +{ + try + { + std::rethrow_exception(e); + } + catch (const mysqlxx::ConnectionFailed &) {} /// NOLINT + catch (const mysqlxx::ConnectionLost &) {} /// NOLINT + catch (const Poco::Net::ConnectionResetException &) {} 
/// NOLINT + catch (const Poco::Net::ConnectionRefusedException &) {} /// NOLINT + catch (const DB::NetException &) {} /// NOLINT + catch (const Poco::Net::NetException & e) + { + if (e.code() != POCO_ENETDOWN && + e.code() != POCO_ENETUNREACH && + e.code() != POCO_ENETRESET && + e.code() != POCO_ESYSNOTREADY) + return false; + } + catch (const mysqlxx::BadQuery & e) + { + // Lost connection to MySQL server during query + if (e.code() != CR_SERVER_LOST && + e.code() != ER_SERVER_SHUTDOWN && + e.code() != CR_SERVER_GONE_ERROR && + e.code() != CR_CONN_HOST_ERROR && + e.code() != ER_LOCK_DEADLOCK && + e.code() != ER_LOCK_WAIT_TIMEOUT && + e.code() != ER_OPTION_PREVENTS_STATEMENT) + return false; + } + catch (const mysqlxx::Exception & e) + { + // ER_SERVER_SHUTDOWN is thrown in different types under different conditions. + // E.g. checkError() in Common/mysqlxx/Exception.cpp will throw mysqlxx::Exception. + if (e.code() != CR_SERVER_LOST && e.code() != ER_SERVER_SHUTDOWN && e.code() != CR_SERVER_GONE_ERROR && e.code() != CR_CONN_HOST_ERROR) + return false; + } + catch (const Poco::Exception & e) + { + if (e.code() != ErrorCodes::NETWORK_ERROR && + e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED && + e.code() != ErrorCodes::UNKNOWN_TABLE && // Since we have ignored the DDL exception when the tables without primary key, insert into those tables will get UNKNOWN_TABLE. + e.code() != ErrorCodes::CANNOT_READ_ALL_DATA && + e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::TIMEOUT_EXCEEDED) + return false; + } + catch (...) + { + return false; + } + return true; +} + MaterializedMySQLSyncThread::MaterializedMySQLSyncThread( ContextPtr context_, const String & database_name_, const String & mysql_database_name_, mysqlxx::Pool && pool_, MySQLClient && client_, + const MySQLReplication::BinlogClientPtr & binlog_client_, MaterializedMySQLSettings * settings_) : WithContext(context_->getGlobalContext()) , log(&Poco::Logger::get("MaterializedMySQLSyncThread")) @@ -170,6 +260,7 @@ MaterializedMySQLSyncThread::MaterializedMySQLSyncThread( , mysql_database_name(mysql_database_name_) , pool(std::move(pool_)) /// NOLINT , client(std::move(client_)) + , binlog_client(binlog_client_) , settings(settings_) { query_prefix = "EXTERNAL DDL FROM MySQL(" + backQuoteIfNeed(database_name) + ", " + backQuoteIfNeed(mysql_database_name) + ") "; @@ -216,14 +307,23 @@ void MaterializedMySQLSyncThread::synchronization() UInt64 elapsed_ms = watch.elapsedMilliseconds(); if (elapsed_ms < max_flush_time) { - BinlogEventPtr binlog_event = client.readOneBinlogEvent(max_flush_time - elapsed_ms); - if (binlog_event) + const auto timeout_ms = max_flush_time - elapsed_ms; + BinlogEventPtr binlog_event; + if (binlog) + binlog->tryReadEvent(binlog_event, timeout_ms); + else + binlog_event = client.readOneBinlogEvent(timeout_ms); + if (binlog_event && !ignoreEvent(binlog_event)) onEvent(buffers, binlog_event, metadata); } } catch (const Exception & e) { - if (e.code() != ErrorCodes::CANNOT_READ_ALL_DATA || settings->max_wait_time_when_mysql_unavailable < 0) + if (settings->max_wait_time_when_mysql_unavailable < 0) + throw; + bool binlog_was_purged = e.code() == ER_MASTER_FATAL_ERROR_READING_BINLOG || + e.code() == ER_MASTER_HAS_PURGED_REQUIRED_GTIDS; + if (!binlog_was_purged && !shouldReconnectOnException(std::current_exception())) throw; flushBuffersData(buffers, metadata); @@ -246,6 +346,7 @@ void MaterializedMySQLSyncThread::synchronization() catch (...) 
{ client.disconnect(); + binlog = nullptr; tryLogCurrentException(log); setSynchronizationThreadException(std::current_exception()); } @@ -259,6 +360,7 @@ void MaterializedMySQLSyncThread::stopSynchronization() if (background_thread_pool->joinable()) background_thread_pool->join(); client.disconnect(); + binlog = nullptr; } } @@ -428,14 +530,6 @@ static inline void dumpDataForTables( } } -static inline UInt32 randomNumber() -{ - pcg64_fast rng{randomSeed()}; - std::uniform_int_distribution dist6( - std::numeric_limits::min(), std::numeric_limits::max()); - return static_cast(dist6(rng)); -} - bool MaterializedMySQLSyncThread::prepareSynchronized(MaterializeMetadata & metadata) { bool opened_transaction = false; @@ -463,7 +557,7 @@ bool MaterializedMySQLSyncThread::prepareSynchronized(MaterializeMetadata & meta if (!need_dumping_tables.empty()) { Position position; - position.update(metadata.binlog_position, metadata.binlog_file, metadata.executed_gtid_set); + position.update(metadata.binlog_position, metadata.binlog_file, metadata.executed_gtid_set, 0); metadata.transaction(position, [&]() { @@ -487,8 +581,20 @@ bool MaterializedMySQLSyncThread::prepareSynchronized(MaterializeMetadata & meta if (opened_transaction) connection->query("COMMIT").execute(); - client.connect(); - client.startBinlogDumpGTID(randomNumber(), mysql_database_name, materialized_tables_list, metadata.executed_gtid_set, metadata.binlog_checksum); + if (binlog_client) + { + binlog_client->setBinlogChecksum(metadata.binlog_checksum); + binlog = binlog_client->createBinlog(metadata.executed_gtid_set, + database_name, + {mysql_database_name}, + settings->max_bytes_in_binlog_queue, + settings->max_milliseconds_to_wait_in_binlog_queue); + } + else + { + client.connect(); + client.startBinlogDumpGTID(randomNumber(), mysql_database_name, materialized_tables_list, metadata.executed_gtid_set, metadata.binlog_checksum); + } setSynchronizationThreadException(nullptr); return true; @@ -500,20 +606,11 @@ bool MaterializedMySQLSyncThread::prepareSynchronized(MaterializeMetadata & meta if (opened_transaction) connection->query("ROLLBACK").execute(); - try - { + if (settings->max_wait_time_when_mysql_unavailable < 0) + throw; + + if (!shouldReconnectOnException(std::current_exception())) throw; - } - catch (const mysqlxx::ConnectionFailed & ex) - { - LOG_TRACE(log, "Connection to MySQL failed {}", ex.displayText()); - } - catch (const mysqlxx::BadQuery & e) - { - // Lost connection to MySQL server during query - if (e.code() != CR_SERVER_LOST || settings->max_wait_time_when_mysql_unavailable < 0) - throw; - } setSynchronizationThreadException(std::current_exception()); /// Avoid busy loop when MySQL is not available. 
@@ -524,17 +621,55 @@ bool MaterializedMySQLSyncThread::prepareSynchronized(MaterializeMetadata & meta return false; } +bool MaterializedMySQLSyncThread::isTableIgnored(const String & table_name) const +{ + return !materialized_tables_list.empty() && !materialized_tables_list.contains(table_name); +} + +bool MaterializedMySQLSyncThread::ignoreEvent(const BinlogEventPtr & event) const +{ + switch (event->type()) + { + case MYSQL_WRITE_ROWS_EVENT: + case MYSQL_DELETE_ROWS_EVENT: + case MYSQL_UPDATE_ROWS_EVENT: + case MYSQL_UNPARSED_ROWS_EVENT: + { + auto table_name = static_cast(*event).table; + if (!table_name.empty() && isTableIgnored(table_name)) + { + switch (event->header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + break; + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown event type: {}", magic_enum::enum_name(event->header.type)); + } + return true; + } + } break; + default: + break; + } + return false; +} + void MaterializedMySQLSyncThread::flushBuffersData(Buffers & buffers, MaterializeMetadata & metadata) { if (buffers.data.empty()) return; - metadata.transaction(client.getPosition(), [&]() { buffers.commit(getContext()); }); + metadata.transaction(getPosition(), [&]() { buffers.commit(getContext()); }); const auto & position_message = [&]() { WriteBufferFromOwnString buf; - client.getPosition().dump(buf); + getPosition().dump(buf); return buf.str(); }; LOG_INFO(log, "MySQL executed position: \n {}", position_message()); @@ -783,10 +918,33 @@ void MaterializedMySQLSyncThread::onEvent(Buffers & buffers, const BinlogEventPt else if (receive_event->type() == MYSQL_QUERY_EVENT) { QueryEvent & query_event = static_cast(*receive_event); + /// Skip events for different databases if any + if (!query_event.query_database_name.empty() && query_event.query_database_name != mysql_database_name) + { + LOG_WARNING( + log, + "Skipped QueryEvent, current mysql database name: {}, ddl schema: {}, query: {}", + mysql_database_name, + query_event.query_database_name, + query_event.query); + return; + } + if (!query_event.query_table_name.empty() && isTableIgnored(query_event.query_table_name)) + { + LOG_WARNING(log, "Due to the table filter rules, query_event on {} is ignored.", database_name); + return; + } + Position position_before_ddl; - position_before_ddl.update(metadata.binlog_position, metadata.binlog_file, metadata.executed_gtid_set); + position_before_ddl.update(metadata.binlog_position, metadata.binlog_file, metadata.executed_gtid_set, query_event.header.timestamp); metadata.transaction(position_before_ddl, [&]() { buffers.commit(getContext()); }); - metadata.transaction(client.getPosition(),[&](){ executeDDLAtomic(query_event); }); + metadata.transaction(getPosition(),[&]() { executeDDLAtomic(query_event); }); + } + else if (receive_event->type() == MYSQL_UNPARSED_ROWS_EVENT) + { + UnparsedRowsEvent & unparsed_event = static_cast(*receive_event); + auto nested_event = unparsed_event.parse(); + onEvent(buffers, nested_event, metadata); } else { @@ -796,7 +954,10 @@ void MaterializedMySQLSyncThread::onEvent(Buffers & buffers, const BinlogEventPt /// Some behaviors(such as changing the value of "binlog_checksum") rotate the binlog file. 
/// To ensure that the synchronization continues, we need to handle these events metadata.fetchMasterVariablesValue(pool.get(/* wait_timeout= */ UINT64_MAX)); - client.setBinlogChecksum(metadata.binlog_checksum); + if (binlog_client) + binlog_client->setBinlogChecksum(metadata.binlog_checksum); + else + client.setBinlogChecksum(metadata.binlog_checksum); } else if (receive_event->header.type != HEARTBEAT_EVENT) { @@ -827,7 +988,7 @@ void MaterializedMySQLSyncThread::executeDDLAtomic(const QueryEvent & query_even auto table_id = tryParseTableIDFromDDL(query, query_event.schema); if (!table_id.table_name.empty()) { - if (table_id.database_name != mysql_database_name || !materialized_tables_list.contains(table_id.table_name)) + if (table_id.database_name != mysql_database_name || isTableIgnored(table_id.table_name)) { LOG_DEBUG(log, "Skip MySQL DDL for {}.{}:\n{}", table_id.database_name, table_id.table_name, query); return; @@ -845,8 +1006,28 @@ void MaterializedMySQLSyncThread::executeDDLAtomic(const QueryEvent & query_even tryLogCurrentException(log); /// If some DDL query was not successfully parsed and executed - /// Then replication may fail on next binlog events anyway - if (exception.code() != ErrorCodes::SYNTAX_ERROR) + /// Then replication may fail on next binlog events anyway. + /// We can skip the erroneous binlog events and continue to execute the correct ones. + /// E.g. the user creates a table without a primary key, finds it is wrong, then + /// drops it and creates a correct one. We guarantee the correct one can be executed. + + if (exception.code() != ErrorCodes::SYNTAX_ERROR && + exception.code() != ErrorCodes::MYSQL_SYNTAX_ERROR && + exception.code() != ErrorCodes::NOT_IMPLEMENTED && + exception.code() != ErrorCodes::UNKNOWN_TABLE && + exception.code() != ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY && + exception.code() != ErrorCodes::THERE_IS_NO_QUERY && + exception.code() != ErrorCodes::QUERY_WAS_CANCELLED && + exception.code() != ErrorCodes::TABLE_ALREADY_EXISTS && + exception.code() != ErrorCodes::UNKNOWN_DATABASE && + exception.code() != ErrorCodes::DATABASE_ALREADY_EXISTS && + exception.code() != ErrorCodes::DATABASE_NOT_EMPTY && + exception.code() != ErrorCodes::TABLE_IS_DROPPED && + exception.code() != ErrorCodes::TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT && + exception.code() != ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER && + exception.code() != ErrorCodes::UNKNOWN_FUNCTION && + exception.code() != ErrorCodes::UNKNOWN_IDENTIFIER && + exception.code() != ErrorCodes::UNKNOWN_TYPE) throw; } } diff --git a/src/Databases/MySQL/MaterializedMySQLSyncThread.h b/src/Databases/MySQL/MaterializedMySQLSyncThread.h index 4abea5e72df..004a4d67d32 100644 --- a/src/Databases/MySQL/MaterializedMySQLSyncThread.h +++ b/src/Databases/MySQL/MaterializedMySQLSyncThread.h @@ -11,6 +11,7 @@ # include # include # include +# include # include # include # include @@ -45,6 +46,7 @@ public: const String & mysql_database_name_, mysqlxx::Pool && pool_, MySQLClient && client_, + const MySQLReplication::BinlogClientPtr & binlog_client_, MaterializedMySQLSettings * settings_); void stopSynchronization(); @@ -61,19 +63,12 @@ private: mutable mysqlxx::Pool pool; mutable MySQLClient client; + BinlogClientPtr binlog_client; + BinlogPtr binlog; MaterializedMySQLSettings * settings; String query_prefix; NameSet materialized_tables_list; - // USE MySQL ERROR CODE: - // https://dev.mysql.com/doc/mysql-errors/5.7/en/server-error-reference.html - const int ER_ACCESS_DENIED_ERROR = 1045; /// NOLINT - const int
ER_DBACCESS_DENIED_ERROR = 1044; /// NOLINT - const int ER_BAD_DB_ERROR = 1049; /// NOLINT - - // https://dev.mysql.com/doc/mysql-errors/8.0/en/client-error-reference.html - const int CR_SERVER_LOST = 2013; /// NOLINT - struct Buffers { String database; @@ -99,12 +94,16 @@ private: BufferAndSortingColumnsPtr getTableDataBuffer(const String & table, ContextPtr context); }; + Position getPosition() const { return binlog ? binlog->getPosition() : client.getPosition(); } void synchronization(); bool isCancelled() { return sync_quit.load(std::memory_order_relaxed); } bool prepareSynchronized(MaterializeMetadata & metadata); + bool isTableIgnored(const String & table_name) const; + bool ignoreEvent(const BinlogEventPtr & event) const; + void flushBuffersData(Buffers & buffers, MaterializeMetadata & metadata); void onEvent(Buffers & buffers, const MySQLReplication::BinlogEventPtr & event, MaterializeMetadata & metadata); diff --git a/src/Databases/MySQL/MySQLBinlog.cpp b/src/Databases/MySQL/MySQLBinlog.cpp new file mode 100644 index 00000000000..3e3aca220bb --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlog.cpp @@ -0,0 +1,500 @@ +#include "MySQLBinlog.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ +using namespace Replication; +using namespace Authentication; +using namespace ConnectionPhase; + +namespace ErrorCodes +{ + extern const int UNKNOWN_EXCEPTION; + extern const int UNKNOWN_PACKET_FROM_SERVER; + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int CANNOT_READ_ALL_DATA; + extern const int LOGICAL_ERROR; + extern const int NETWORK_ERROR; +} + +namespace MySQLReplication +{ + +class WriteCommand : public IMySQLWritePacket +{ +public: + const char command; + const String query; + + WriteCommand(char command_, const String & query_) : command(command_), query(query_) { } + + size_t getPayloadSize() const override { return 1 + query.size(); } + + void writePayloadImpl(WriteBuffer & buffer) const override + { + buffer.write(command); + if (!query.empty()) + buffer.write(query.data(), query.size()); + } +}; + +IBinlog::Checksum IBinlog::checksumFromString(const String & checksum) +{ + auto str = Poco::toUpper(checksum); + if (str == "CRC32") + return IBinlog::CRC32; + if (str != "NONE") + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown checksum: {}", checksum); + return IBinlog::NONE; +} + +void BinlogParser::setChecksum(Checksum checksum) +{ + switch (checksum) + { + case Checksum::CRC32: + checksum_signature_length = 4; + break; + case Checksum::NONE: + checksum_signature_length = 0; + break; + } +} + +void BinlogParser::parseEvent(EventHeader & event_header, ReadBuffer & event_payload) +{ + switch (event_header.type) + { + case FORMAT_DESCRIPTION_EVENT: + { + event = std::make_shared(EventHeader(event_header)); + event->parseEvent(event_payload); + break; + } + case ROTATE_EVENT: + { + event = std::make_shared(EventHeader(event_header)); + event->parseEvent(event_payload); + break; + } + case QUERY_EVENT: + { + event = std::make_shared(EventHeader(event_header)); + event->parseEvent(event_payload); + + auto query = std::static_pointer_cast(event); + switch (query->typ) + { + case QUERY_EVENT_MULTI_TXN_FLAG: + case QUERY_EVENT_XA: + case QUERY_SAVEPOINT: + { + event = std::make_shared(EventHeader(query->header)); + break; + } + default: + { + String quoted_query = query->query; + 
tryQuoteUnrecognizedTokens(quoted_query); + tryConvertStringLiterals(quoted_query); + auto table_id = tryParseTableIDFromDDL(query->query, query->schema); + query->query_database_name = table_id.database_name; + query->query_table_name = table_id.table_name; + break; + } + } + break; + } + case XID_EVENT: + { + event = std::make_shared(EventHeader(event_header)); + event->parseEvent(event_payload); + break; + } + case TABLE_MAP_EVENT: + { + TableMapEventHeader map_event_header; + map_event_header.parse(event_payload); + event = std::make_shared(EventHeader(event_header), map_event_header, flavor_charset); + try + { + event->parseEvent(event_payload); + auto table_map = std::static_pointer_cast(event); + table_maps[table_map->table_id] = table_map; + } + catch (const Poco::Exception & exc) + { + /// Ignore parsing issues + if (exc.code() != ErrorCodes::UNKNOWN_EXCEPTION) + throw; + event = std::make_shared(std::move(event_header)); + event->parseEvent(event_payload); + } + break; + } + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + { + RowsEventHeader rows_header(event_header.type); + rows_header.parse(event_payload); + if (table_maps.contains(rows_header.table_id)) + event = std::make_shared(table_maps.at(rows_header.table_id), EventHeader(event_header), rows_header); + else + event = std::make_shared(std::move(event_header)); + event->parseEvent(event_payload); + if (rows_header.flags & ROWS_END_OF_STATEMENT) + table_maps.clear(); + break; + } + case GTID_EVENT: + { + event = std::make_shared(EventHeader(event_header)); + event->parseEvent(event_payload); + break; + } + default: + { + event = std::make_shared(EventHeader(event_header)); + event->parseEvent(event_payload); + break; + } + } + updatePosition(event, position); +} + +void BinlogParser::updatePosition(const BinlogEventPtr & event, Position & position) +{ + const UInt64 binlog_pos_prev = position.binlog_pos; + position.binlog_pos = event->header.log_pos; + if (event->header.timestamp > 0) + position.timestamp = event->header.timestamp; + + switch (event->header.type) + { + case QUERY_EVENT: + if (event->type() == MYSQL_UNHANDLED_EVENT) + break; + [[fallthrough]]; + case GTID_EVENT: + case XID_EVENT: + case ROTATE_EVENT: + position.update(event); + break; + default: + break; + } + + if (event->header.type != ROTATE_EVENT) + { + /// UInt32 overflow when Pos > End_log_pos + /// https://dev.mysql.com/doc/refman/8.0/en/show-binlog-events.html + /// binlog_pos - The position at which the next event begins, which is equal to Pos plus the size of the event + const UInt64 binlog_pos_correct = binlog_pos_prev + event->header.event_size; + if (position.binlog_pos < binlog_pos_prev && binlog_pos_correct > std::numeric_limits::max()) + position.binlog_pos = binlog_pos_correct; + } +} + +bool BinlogParser::isNew(const Position & older, const Position & newer) +{ + if (older.gtid_sets.contains(newer.gtid_sets)) + return false; + /// Check if all sets in newer position have the same UUID from older sets + std::set older_set; + for (const auto & set : older.gtid_sets.sets) + older_set.insert(set.uuid); + for (const auto & set : newer.gtid_sets.sets) + { + if (!older_set.contains(set.uuid)) + return false; + } + return true; +} + +void BinlogFromSocket::connect(const String & host, UInt16 port, const String & user, const String & password) +{ + if (connected) + disconnect(); + + const Poco::Timespan 
connection_timeout(10'000'000'000); + const Poco::Timespan receive_timeout(5'000'000'000); + const Poco::Timespan send_timeout(5'000'000'000); + + socket = std::make_unique(); + address = DNSResolver::instance().resolveAddress(host, port); + socket->connect(*address, connection_timeout); + socket->setReceiveTimeout(receive_timeout); + socket->setSendTimeout(send_timeout); + socket->setNoDelay(true); + connected = true; + + in = std::make_unique(*socket); + out = std::make_unique(*socket); + packet_endpoint = std::make_shared(*in, *out, sequence_id); + + handshake(user, password); +} + +void BinlogFromSocket::disconnect() +{ + in = nullptr; + out = nullptr; + if (socket) + socket->close(); + socket = nullptr; + connected = false; + sequence_id = 0; + + GTIDSets sets; + position.gtid_sets = sets; + position.resetPendingGTID(); +} + +/// https://dev.mysql.com/doc/internals/en/connection-phase-packets.html +void BinlogFromSocket::handshake(const String & user, const String & password) +{ + const String mysql_native_password = "mysql_native_password"; + Handshake handshake; + packet_endpoint->receivePacket(handshake); + if (handshake.auth_plugin_name != mysql_native_password) + { + throw Exception( + ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, + "Only support {} auth plugin name, but got {}", + mysql_native_password, + handshake.auth_plugin_name); + } + + Native41 native41(password, handshake.auth_plugin_data); + String auth_plugin_data = native41.getAuthPluginData(); + + const UInt8 charset_utf8 = 33; + HandshakeResponse handshake_response( + client_capabilities, MAX_PACKET_LENGTH, charset_utf8, user, "", auth_plugin_data, mysql_native_password); + packet_endpoint->sendPacket(handshake_response, true); + + ResponsePacket packet_response(client_capabilities, true); + packet_endpoint->receivePacket(packet_response); + packet_endpoint->resetSequenceId(); + + if (packet_response.getType() == PACKET_ERR) + throw Exception::createDeprecated(packet_response.err.error_message, ErrorCodes::UNKNOWN_PACKET_FROM_SERVER); + else if (packet_response.getType() == PACKET_AUTH_SWITCH) + throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Access denied for user {}", user); +} + +void BinlogFromSocket::writeCommand(char command, const String & query) +{ + WriteCommand write_command(command, query); + packet_endpoint->sendPacket(write_command, true); + + ResponsePacket packet_response(client_capabilities); + packet_endpoint->receivePacket(packet_response); + switch (packet_response.getType()) + { + case PACKET_ERR: + throw Exception::createDeprecated(packet_response.err.error_message, ErrorCodes::UNKNOWN_PACKET_FROM_SERVER); + case PACKET_OK: + break; + default: + break; + } + packet_endpoint->resetSequenceId(); +} + +void BinlogFromSocket::registerSlaveOnMaster(UInt32 slave_id) +{ + RegisterSlave register_slave(slave_id); + packet_endpoint->sendPacket(register_slave, true); + + ResponsePacket packet_response(client_capabilities); + packet_endpoint->receivePacket(packet_response); + packet_endpoint->resetSequenceId(); + if (packet_response.getType() == PACKET_ERR) + throw Exception::createDeprecated(packet_response.err.error_message, ErrorCodes::UNKNOWN_PACKET_FROM_SERVER); +} + +void BinlogFromSocket::start(UInt32 slave_id, const String & executed_gtid_set) +{ + if (!connected) + return; + + /// Maybe CRC32 or NONE. mysqlbinlog.cc use NONE, see its below comments: + /// Make a notice to the server that this client is checksum-aware. + /// It does not need the first fake Rotate necessary checksummed. 
+ writeCommand(Command::COM_QUERY, "SET @master_binlog_checksum = 'CRC32'"); + + /// Set heartbeat 1s + const UInt64 period_ns = 1'000'000'000; + writeCommand(Command::COM_QUERY, "SET @master_heartbeat_period = " + std::to_string(period_ns)); + + /// Register slave. + registerSlaveOnMaster(slave_id); + + position.gtid_sets = {}; + position.gtid_sets.parse(executed_gtid_set); + + BinlogDumpGTID binlog_dump(slave_id, position.gtid_sets.toPayload()); + packet_endpoint->sendPacket(binlog_dump, true); +} + +class ReadPacketFromSocket : public IMySQLReadPacket +{ +public: + using ReadPayloadFunc = std::function; + explicit ReadPacketFromSocket(ReadPayloadFunc fn) : read_payload_func(std::move(fn)) { } + void readPayloadImpl(ReadBuffer & payload) override; + ReadPayloadFunc read_payload_func; +}; + +void ReadPacketFromSocket::readPayloadImpl(ReadBuffer & payload) +{ + if (payload.eof()) + throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after EOF."); + + UInt8 header = static_cast(*payload.position()); + switch (header) // NOLINT(bugprone-switch-missing-default-case) + { + case PACKET_EOF: + throw ReplicationError(ErrorCodes::CANNOT_READ_ALL_DATA, "Master maybe lost"); + case PACKET_ERR: + { + ERRPacket err; + err.readPayloadWithUnpacked(payload); + throw ReplicationError::createDeprecated(err.error_message, ErrorCodes::UNKNOWN_EXCEPTION); + } + default: + break; + } + /// Skip the generic response packets header flag + payload.ignore(1); + read_payload_func(payload); +} + +bool BinlogFromSocket::tryReadEvent(BinlogEventPtr & to, UInt64 ms) +{ + ReadPacketFromSocket packet([this](ReadBuffer & payload) + { + MySQLBinlogEventReadBuffer event_payload(payload, checksum_signature_length); + + EventHeader event_header; + event_header.parse(event_payload); + + parseEvent(event_header, event_payload); + }); + + if (packet_endpoint && packet_endpoint->tryReceivePacket(packet, ms)) + { + to = event; + return static_cast(to); + } + + return false; +} + +void BinlogFromFile::open(const String & filename) +{ + in = std::make_unique(filename); + assertString("\xfe\x62\x69\x6e", *in); /// magic number +} + +bool BinlogFromFile::tryReadEvent(BinlogEventPtr & to, UInt64 /*ms*/) +{ + if (in && !in->eof()) + { + EventHeader event_header; + event_header.parse(*in); + + LimitReadBuffer limit_read_buffer(*in, event_header.event_size - EVENT_HEADER_LENGTH, /* throw_exception */ false, /* exact_limit */ {}); + MySQLBinlogEventReadBuffer event_payload(limit_read_buffer, checksum_signature_length); + parseEvent(event_header, event_payload); + to = event; + return static_cast(to); + } + + return false; +} + +BinlogFromFileFactory::BinlogFromFileFactory(const String & filename_) + : filename(filename_) +{ +} + +BinlogPtr BinlogFromFileFactory::createBinlog(const String & executed_gtid_set) +{ + auto ret = std::make_shared(); + ret->open(filename); + if (!executed_gtid_set.empty()) + { + /// NOTE: Used for testing only! 
+ GTIDSets sets; + sets.parse(executed_gtid_set); + if (sets.sets.size() != 1 || sets.sets[0].intervals.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Too many intervals: {}", executed_gtid_set); + BinlogEventPtr event; + while (ret->tryReadEvent(event, 0)) + { + const auto & s = ret->getPosition().gtid_sets.sets; + if (!s.empty() && !s[0].intervals.empty() && s[0].intervals[0].end == sets.sets[0].intervals[0].end) + break; + } + + auto pos = ret->getPosition(); + pos.gtid_sets.sets.front().intervals.front().start = sets.sets.front().intervals.front().start; + ret->setPosition(pos); + } + return ret; +} + +BinlogFromSocketFactory::BinlogFromSocketFactory(const String & host_, UInt16 port_, const String & user_, const String & password_) + : host(host_) + , port(port_) + , user(user_) + , password(password_) +{ +} + +BinlogPtr BinlogFromSocketFactory::createBinlog(const String & executed_gtid_set) +{ + auto ret = std::make_shared(); + ret->connect(host, port, user, password); + ret->start(randomNumber(), executed_gtid_set); + auto pos = ret->getPosition(); + if (pos.gtid_sets.sets.empty() || pos.gtid_sets.sets.front().intervals.front().start != 1) + throw Exception(ErrorCodes::NETWORK_ERROR, "Could not create: Wrong executed_gtid_set: {} -> {}", executed_gtid_set, pos.gtid_sets.toString()); + return ret; +} + +/// Should be in MySQLReplication namespace +bool operator==(const Position & left, const Position & right) +{ + return left.binlog_name == right.binlog_name && + left.binlog_pos == right.binlog_pos && + left.gtid_sets == right.gtid_sets; +} + +} +} diff --git a/src/Databases/MySQL/MySQLBinlog.h b/src/Databases/MySQL/MySQLBinlog.h new file mode 100644 index 00000000000..0b8f7543590 --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlog.h @@ -0,0 +1,120 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +using namespace MySQLProtocol; +using namespace Generic; + +namespace MySQLReplication +{ + +class IBinlog +{ +public: + virtual ~IBinlog() = default; + virtual bool tryReadEvent(BinlogEventPtr & to, UInt64 ms) = 0; + virtual Position getPosition() const = 0; + enum Checksum : UInt8 + { + NONE = 0, + CRC32 = 1 + }; + virtual void setChecksum(Checksum /*checksum*/) { } + static Checksum checksumFromString(const String & checksum); +}; + +using BinlogPtr = std::shared_ptr; + +class BinlogParser : public IBinlog +{ +public: + Position getPosition() const override { return position; } + void setPosition(const Position & position_) { position = position_; } + void setChecksum(Checksum checksum) override; + static void updatePosition(const BinlogEventPtr & event, Position & position); + /// Checks if \a older is older position than \a newer + static bool isNew(const Position & older, const Position & newer); + +protected: + Position position; + BinlogEventPtr event; + std::map> table_maps; + size_t checksum_signature_length = 4; + MySQLCharsetPtr flavor_charset = std::make_shared(); + void parseEvent(EventHeader & event_header, ReadBuffer & event_payload); +}; + +class BinlogFromSocket : public BinlogParser +{ +public: + void connect(const String & host, UInt16 port, const String & user, const String & password); + void start(UInt32 slave_id, const String & executed_gtid_set); + bool tryReadEvent(BinlogEventPtr & to, UInt64 ms) override; + +private: + void disconnect(); + bool connected = false; + uint8_t sequence_id = 0; + const uint32_t client_capabilities = CLIENT_PROTOCOL_41 | CLIENT_PLUGIN_AUTH | CLIENT_SECURE_CONNECTION; + + std::unique_ptr in; + 
std::unique_ptr out; + std::unique_ptr socket; + std::optional address; + std::shared_ptr packet_endpoint; + + void handshake(const String & user, const String & password); + void registerSlaveOnMaster(UInt32 slave_id); + void writeCommand(char command, const String & query); +}; + +class BinlogFromFile : public BinlogParser +{ +public: + void open(const String & filename); + bool tryReadEvent(BinlogEventPtr & to, UInt64 ms) override; + +private: + std::unique_ptr in; +}; + +class IBinlogFactory +{ +public: + virtual ~IBinlogFactory() = default; + virtual BinlogPtr createBinlog(const String & executed_gtid_set) = 0; +}; + +using BinlogFactoryPtr = std::shared_ptr; + +class BinlogFromFileFactory : public IBinlogFactory +{ +public: + BinlogFromFileFactory(const String & filename_); + BinlogPtr createBinlog(const String & executed_gtid_set) override; + +private: + const String filename; +}; + +class BinlogFromSocketFactory : public IBinlogFactory +{ +public: + BinlogFromSocketFactory(const String & host_, UInt16 port_, const String & user_, const String & password_); + BinlogPtr createBinlog(const String & executed_gtid_set) override; + +private: + const String host; + const UInt16 port; + const String user; + const String password; +}; + +bool operator==(const Position & left, const Position & right); + +} +} diff --git a/src/Databases/MySQL/MySQLBinlogClient.cpp b/src/Databases/MySQL/MySQLBinlogClient.cpp new file mode 100644 index 00000000000..e7d707f76ce --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlogClient.cpp @@ -0,0 +1,104 @@ +#include "MySQLBinlogClient.h" +#include + +namespace DB::ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace DB::MySQLReplication +{ + +BinlogClient::BinlogClient(const BinlogFactoryPtr & factory_, + const String & name, + UInt64 max_bytes_in_buffer_, + UInt64 max_flush_ms_) + : factory(factory_) + , binlog_client_name(name) + , max_bytes_in_buffer(max_bytes_in_buffer_) + , max_flush_ms(max_flush_ms_) + , logger(&Poco::Logger::get("BinlogClient(" + name + ")")) +{ +} + +BinlogPtr BinlogClient::createBinlog(const String & executed_gtid_set, + const String & name, + const NameSet & mysql_database_names, + size_t max_bytes, + UInt64 max_waiting_ms) +{ + std::lock_guard lock(mutex); + BinlogPtr ret; + for (auto it = dispatchers.begin(); it != dispatchers.end();) + { + auto & dispatcher = *it; + if (!ret) + { + const auto metadata = dispatcher->getDispatcherMetadata(); + LOG_DEBUG(logger, "({} -> {}): Trying dispatcher: {}, size: {} -> {}:{}.{}", + name, executed_gtid_set, metadata.name, metadata.binlogs.size(), + metadata.position.binlog_name, metadata.position.gtid_sets.toString(), metadata.position.binlog_pos); + ret = dispatcher->attach(executed_gtid_set, name, mysql_database_names, max_bytes, max_waiting_ms); + if (ret) + LOG_DEBUG(logger, "({} -> {}): Reused dispatcher: {}, size: {} -> {}:{}.{}", + name, executed_gtid_set, metadata.name, metadata.binlogs.size(), + metadata.position.binlog_name, metadata.position.gtid_sets.toString(), metadata.position.binlog_pos); + } + + if (dispatcher->cleanupBinlogsAndStop()) + { + const auto metadata = dispatcher->getDispatcherMetadata(); + LOG_DEBUG(logger, "({} -> {}): Deleting dispatcher: {}, size: {}, total dispatchers: {}", + name, executed_gtid_set, metadata.name, metadata.binlogs.size(), dispatchers.size()); + it = dispatchers.erase(it); + continue; + } + ++it; + } + + if (!ret) + { + String dispatcher_name = name + ":" + std::to_string(dispatchers_count++); + LOG_DEBUG(logger, "({} -> {}): Creating 
dispatcher: {}, total dispatchers: {}", + name, executed_gtid_set, dispatcher_name, dispatchers.size()); + auto dispatcher = std::make_shared(dispatcher_name, max_bytes_in_buffer, max_flush_ms); + if (!binlog_checksum.empty()) + dispatcher->setBinlogChecksum(binlog_checksum); + for (const auto & it : dispatchers) + dispatcher->syncTo(it); + ret = dispatcher->start(factory->createBinlog(executed_gtid_set), name, mysql_database_names, max_bytes, max_waiting_ms); + if (!ret) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not create binlog: {}", executed_gtid_set); + dispatchers.push_back(dispatcher); + } + + return ret; +} + +BinlogClient::Metadata BinlogClient::getMetadata() const +{ + std::lock_guard lock(mutex); + Metadata ret; + ret.binlog_client_name = binlog_client_name; + for (const auto & dispatcher : dispatchers) + { + auto metadata = dispatcher->getDispatcherMetadata(); + if (!metadata.binlogs.empty()) + ret.dispatchers.push_back(metadata); + } + return ret; +} + +void BinlogClient::setBinlogChecksum(const String & checksum) +{ + std::lock_guard lock(mutex); + if (binlog_checksum != checksum) + { + LOG_DEBUG(logger, "Setting binlog_checksum: {} -> {}, total dispatchers: {}", binlog_checksum, checksum, dispatchers.size()); + binlog_checksum = checksum; + for (const auto & dispatcher : dispatchers) + dispatcher->setBinlogChecksum(checksum); + } +} + +} diff --git a/src/Databases/MySQL/MySQLBinlogClient.h b/src/Databases/MySQL/MySQLBinlogClient.h new file mode 100644 index 00000000000..b76934d08cf --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlogClient.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include + +namespace DB::MySQLReplication +{ + +/** It is supposed to reduce the number of connections to the remote MySQL binlog by reusing one connection between several consumers. + * Reusing the connection makes the time spent reading from the remote binlog independent of the number of consumers. + * It tracks a list of BinlogEventsDispatcher instances for consumers with different binlog positions. + * The dispatchers with the same binlog position will be merged into one.
+ */ +class BinlogClient +{ +public: + BinlogClient(const BinlogFactoryPtr & factory, + const String & name = {}, + UInt64 max_bytes_in_buffer_ = DBMS_DEFAULT_BUFFER_SIZE, + UInt64 max_flush_ms_ = 1000); + BinlogClient(const BinlogClient & other) = delete; + ~BinlogClient() = default; + BinlogClient & operator=(const BinlogClient & other) = delete; + + /// Creates a binlog to receive events + BinlogPtr createBinlog(const String & executed_gtid_set = {}, + const String & name = {}, + const NameSet & mysql_database_names = {}, + size_t max_bytes = 0, + UInt64 max_waiting_ms = 0); + + /// The binlog checksum is related to entire connection + void setBinlogChecksum(const String & checksum); + + struct Metadata + { + String binlog_client_name; + std::vector dispatchers; + }; + /// Returns only not empty dispatchers + Metadata getMetadata() const; + +private: + BinlogFactoryPtr factory; + const String binlog_client_name; + UInt64 max_bytes_in_buffer = 0; + UInt64 max_flush_ms = 0; + std::vector dispatchers; + String binlog_checksum; + mutable std::mutex mutex; + Poco::Logger * logger = nullptr; + int dispatchers_count = 0; +}; + +using BinlogClientPtr = std::shared_ptr; + +} diff --git a/src/Databases/MySQL/MySQLBinlogClientFactory.cpp b/src/Databases/MySQL/MySQLBinlogClientFactory.cpp new file mode 100644 index 00000000000..03a777ff352 --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlogClientFactory.cpp @@ -0,0 +1,46 @@ +#include + +namespace DB::MySQLReplication +{ + +BinlogClientFactory & BinlogClientFactory::instance() +{ + static BinlogClientFactory ret; + return ret; +} + +BinlogClientPtr BinlogClientFactory::getClient(const String & host, UInt16 port, const String & user, const String & password, UInt64 max_bytes_in_buffer, UInt64 max_flush_ms) +{ + std::lock_guard lock(mutex); + String binlog_client_name; + WriteBufferFromString stream(binlog_client_name); + stream << user << "@" << host << ":" << port; + stream.finalize(); + String binlog_client_key = binlog_client_name + ":" + password; + auto it = clients.find(binlog_client_key); + BinlogClientPtr ret = it != clients.end() ? it->second.lock() : nullptr; + if (ret) + return ret; + auto factory = std::make_shared(host, port, user, password); + auto client = std::make_shared(factory, binlog_client_name, max_bytes_in_buffer, max_flush_ms); + clients[binlog_client_key] = client; + return client; +} + +std::vector BinlogClientFactory::getMetadata() const +{ + std::lock_guard lock(mutex); + std::vector ret; + for (const auto & it : clients) + { + if (auto c = it.second.lock()) + { + auto metadata = c->getMetadata(); + if (!metadata.dispatchers.empty()) + ret.push_back(metadata); + } + } + return ret; +} + +} diff --git a/src/Databases/MySQL/MySQLBinlogClientFactory.h b/src/Databases/MySQL/MySQLBinlogClientFactory.h new file mode 100644 index 00000000000..544b88e3201 --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlogClientFactory.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include + +namespace DB::MySQLReplication +{ + +/** Global instance to create or reuse MySQL Binlog Clients. + * If a binlog client already exists for specific params, + * it will be returned and reused to read binlog events from MySQL. + * Otherwise new instance will be created. 
+ */ +class BinlogClientFactory final : boost::noncopyable +{ +public: + static BinlogClientFactory & instance(); + + BinlogClientPtr getClient(const String & host, + UInt16 port, + const String & user, + const String & password, + UInt64 max_bytes_in_buffer, + UInt64 max_flush_ms); + + /// Returns info of all registered clients + std::vector getMetadata() const; + +private: + BinlogClientFactory() = default; + + // Keeps track of already destroyed clients + std::unordered_map> clients; + mutable std::mutex mutex; +}; + +} diff --git a/src/Databases/MySQL/MySQLBinlogEventsDispatcher.cpp b/src/Databases/MySQL/MySQLBinlogEventsDispatcher.cpp new file mode 100644 index 00000000000..4af307f9c0f --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlogEventsDispatcher.cpp @@ -0,0 +1,626 @@ +#include "MySQLBinlogEventsDispatcher.h" +#include +#include + +namespace DB::ErrorCodes +{ + extern const int UNKNOWN_EXCEPTION; + extern const int TIMEOUT_EXCEEDED; +} + +namespace DB::MySQLReplication +{ + +class BinlogFromDispatcher : public IBinlog +{ +public: + BinlogFromDispatcher(const String & name_, const NameSet & mysql_database_names_, size_t max_bytes_, UInt64 max_waiting_ms_) + : name(name_) + , mysql_database_names(mysql_database_names_) + , max_bytes(max_bytes_) + , max_waiting_ms(max_waiting_ms_) + , logger(&Poco::Logger::get("BinlogFromDispatcher(" + name + ")")) + { + } + + ~BinlogFromDispatcher() override + { + stop(); + } + + void stop() + { + { + std::lock_guard lock(mutex); + if (is_cancelled) + return; + is_cancelled = true; + } + cv.notify_all(); + } + + std::string getName() const + { + return name; + } + + bool tryReadEvent(BinlogEventPtr & to, UInt64 ms) override; + Position getPosition() const override; + void setPosition(const Position & initial, const Position & wait); + void setException(const std::exception_ptr & exception_); + void push(const BinlogEventsDispatcher::Buffer & buffer); + BinlogEventsDispatcher::BinlogMetadata getBinlogMetadata() const; + +private: + const String name; + const NameSet mysql_database_names; + const size_t max_bytes = 0; + const UInt64 max_waiting_ms = 0; + + Position position; + GTIDSets gtid_sets_wait; + + BinlogEventsDispatcher::Buffer buffer; + mutable std::mutex mutex; + + std::condition_variable cv; + bool is_cancelled = false; + Poco::Logger * logger = nullptr; + std::exception_ptr exception; +}; + +static String getBinlogNames(const std::vector> & binlogs) +{ + std::vector names; + for (const auto & it : binlogs) + { + if (auto binlog = it.lock()) + names.push_back(binlog->getName()); + } + return boost::algorithm::join(names, ", "); +} + +BinlogEventsDispatcher::BinlogEventsDispatcher(const String & logger_name_, size_t max_bytes_in_buffer_, UInt64 max_flush_ms_) + : logger_name(logger_name_) + , max_bytes_in_buffer(max_bytes_in_buffer_) + , max_flush_ms(max_flush_ms_) + , logger(&Poco::Logger::get("BinlogEventsDispatcher(" + logger_name + ")")) + , dispatching_thread(std::make_unique([this]() { dispatchEvents(); })) +{ +} + +BinlogEventsDispatcher::~BinlogEventsDispatcher() +{ + { + std::lock_guard lock(mutex); + is_cancelled = true; + auto exc = std::make_exception_ptr(Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Dispatcher {} has been already destroyed", logger_name)); + try + { + cleanupLocked([&](const auto & binlog) + { + /// Notify the binlogs that the dispatcher is already destroyed + /// and it needs to recreate new binlogs if needed + binlog->setException(exc); + }); + } + catch (const std::exception & exc) + { + LOG_ERROR(logger, 
"Unexpected exception: {}", exc.what()); + } + } + cv.notify_all(); + if (dispatching_thread) + dispatching_thread->join(); +} + +static void flushTimers(Stopwatch & watch, UInt64 & total_time, UInt64 & size, float & size_per_sec, UInt64 & bytes, float & bytes_per_sec, float threshold_flush, float threshold_reset) +{ + total_time += watch.elapsedMicroseconds(); + const float elapsed_seconds = total_time * 1e-6f; + if (elapsed_seconds >= threshold_flush) + { + size_per_sec = size / elapsed_seconds; + bytes_per_sec = bytes / elapsed_seconds; + } + if (elapsed_seconds >= threshold_reset) + { + size = 0; + bytes = 0; + total_time = 0; + } +} + +void BinlogEventsDispatcher::flushBufferLocked() +{ + Stopwatch watch; + if (buffer.bytes) + cleanupLocked([&](const auto & b) { b->push(buffer); }); + events_flush += buffer.events.size(); + bytes_flush += buffer.bytes; + flushTimers(watch, events_flush_total_time, events_flush, events_flush_per_sec, bytes_flush, bytes_flush_per_sec, 0.1f, 1.0); + buffer = {}; +} + +static bool isDispatcherEventIgnored(const BinlogEventPtr & event) +{ + switch (event->header.type) + { + /// Sending to all databases: + case GTID_EVENT: /// Catch up requested executed gtid set, used only in BinlogFromDispatcher + case ROTATE_EVENT: /// Change binlog_checksum + case XID_EVENT: /// Commit transaction + /// Sending to all attached binlogs without filtering on dispatcher thread + /// to keep the connection as up-to-date as possible, + /// but these events should be filtered on databases' threads + /// and sent only to requested databases: + case QUERY_EVENT: /// Apply DDL + case WRITE_ROWS_EVENT_V1: /// Apply DML + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + return false; + default: + break; + } + return true; +} + +void BinlogEventsDispatcher::dispatchEvents() +{ + LOG_TRACE(logger, "{}: started", __FUNCTION__); + BinlogEventPtr event; + BinlogPtr binlog_; + Stopwatch watch; + UInt64 events_read = 0; + UInt64 bytes_read = 0; + UInt64 events_read_total_time = 0; + Stopwatch watch_events_read; + + while (!is_cancelled) + { + try + { + { + std::unique_lock lock(mutex); + cv.wait(lock, [&] { return is_cancelled || (binlog_read_from && !binlogs.empty()); }); + if (is_cancelled) + break; + + for (auto it = sync_to.begin(); it != sync_to.end() && !binlogs.empty();) + { + if (auto d = it->lock()) + { + /// If we can catch up the position of a dispatcher we synced to, + /// need to move all binlogs out + if (trySyncLocked(d)) + { + /// Don't keep connection longer than needed + stopLocked(); + break; + } + ++it; + } + else + { + it = sync_to.erase(it); + } + } + + if (binlog_read_from) + binlog_read_from->setChecksum(binlog_checksum); + binlog_ = binlog_read_from; + if (watch.elapsedMilliseconds() >= max_flush_ms || buffer.bytes >= max_bytes_in_buffer) + { + flushBufferLocked(); + watch.restart(); + } + } + + watch_events_read.restart(); + if (!is_cancelled && binlog_ && binlog_->tryReadEvent(event, max_flush_ms) && event) + { + ++events_read; + bytes_read += event->header.event_size; + { + std::lock_guard lock(mutex); + flushTimers(watch_events_read, events_read_total_time, events_read, events_read_per_sec, bytes_read, bytes_read_per_sec, 1.0, 5.0); + BinlogParser::updatePosition(event, position); + /// Ignore meaningless events + if (isDispatcherEventIgnored(event)) + continue; + buffer.events.push_back(event); + buffer.bytes += event->header.event_size; + buffer.position = 
position; + /// Deliver ROTATE event ASAP if there binlog_checksum should be changed + if (event->header.type == ROTATE_EVENT) + flushBufferLocked(); + } + } + } + catch (const std::exception & exc) + { + std::lock_guard lock(mutex); + LOG_ERROR(logger, "Exception: {}", exc.what()); + stopLocked(); + /// All attached binlogs should be recreated + cleanupLocked([&](const auto & b) { b->setException(std::current_exception()); }); + binlogs.clear(); + buffer = {}; + position = {}; + } + } + LOG_TRACE(logger, "{}: finished", __FUNCTION__); +} + +bool BinlogEventsDispatcher::cleanupLocked(const std::function & binlog)> & fn) +{ + for (auto it = binlogs.begin(); it != binlogs.end();) + { + if (auto binlog = it->lock()) + { + if (fn) + fn(binlog); + ++it; + } + else + { + it = binlogs.erase(it); + } + } + + return binlogs.empty(); +} + +bool BinlogEventsDispatcher::cleanupBinlogsAndStop() +{ + std::lock_guard lock(mutex); + const bool is_empty = cleanupLocked(); + if (is_empty && binlog_read_from) + stopLocked(); + return is_empty; +} + +void BinlogEventsDispatcher::stopLocked() +{ + if (!binlog_read_from) + { + LOG_DEBUG(logger, "Could not stop. Already stopped"); + return; + } + + cleanupLocked(); + binlog_read_from = nullptr; + LOG_DEBUG(logger, "Stopped: {}:{}.{}: ({})", position.binlog_name, position.gtid_sets.toString(), position.binlog_pos, getBinlogNames(binlogs)); +} + +BinlogPtr BinlogEventsDispatcher::createBinlogLocked(const String & name_, + const NameSet & mysql_database_names, + size_t max_bytes, + UInt64 max_waiting_ms, + const Position & pos_initial, + const Position & pos_wait) +{ + static int client_cnt = 0; + const String client_id = !name_.empty() ? name_ : "binlog_" + std::to_string(++client_cnt); + auto binlog = std::make_shared(client_id, mysql_database_names, max_bytes, max_waiting_ms); + binlogs.push_back(binlog); + binlog->setPosition(pos_initial, pos_wait); + LOG_DEBUG(logger, "Created binlog: {} -> {}", name_, binlog->getPosition().gtid_sets.toString()); + return binlog; +} + +BinlogPtr BinlogEventsDispatcher::start(const BinlogPtr & binlog_read_from_, + const String & name_, + const NameSet & mysql_database_names, + size_t max_bytes, + UInt64 max_waiting_ms) +{ + BinlogPtr ret; + { + std::lock_guard lock(mutex); + if (is_started) + return {}; + binlog_read_from = binlog_read_from_; + /// It is used for catching up + /// binlog_read_from should return position with requested executed GTID set: 1-N + position = binlog_read_from->getPosition(); + ret = createBinlogLocked(name_, mysql_database_names, max_bytes, max_waiting_ms, position); + is_started = true; + } + cv.notify_all(); + return ret; +} + +BinlogPtr BinlogEventsDispatcher::attach(const String & executed_gtid_set, + const String & name_, + const NameSet & mysql_database_names, + size_t max_bytes, + UInt64 max_waiting_ms) +{ + BinlogPtr ret; + { + std::lock_guard lock(mutex); + /// Check if binlog_read_from can be reused: + /// Attach to only active dispatchers + /// and if executed_gtid_set is higher value than current + if (!binlog_read_from || !is_started || cleanupLocked() || executed_gtid_set.empty()) + return {}; + Position pos_wait; + pos_wait.gtid_sets.parse(executed_gtid_set); + if (!BinlogParser::isNew(position, pos_wait)) + return {}; + ret = createBinlogLocked(name_, mysql_database_names, max_bytes, max_waiting_ms, position, pos_wait); + } + cv.notify_all(); + return ret; +} + +void BinlogEventsDispatcher::syncToLocked(const BinlogEventsDispatcherPtr & to) +{ + if (to && this != to.get()) + { + 
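+        /// syncTo() only records the target dispatcher here; the actual hand-over happens on the
+        /// dispatching thread, where trySyncLocked() moves all attached binlogs to the target once
+        /// both dispatchers are at the same position, and this dispatcher is then stopped so that
+        /// only one source connection is kept.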
std::vector names; + for (const auto & it : sync_to) + { + if (auto dispatcher = it.lock()) + names.push_back(dispatcher->logger_name); + } + LOG_DEBUG(logger, "Syncing -> ({}) + ({})", boost::algorithm::join(names, ", "), to->logger_name); + sync_to.emplace_back(to); + } +} + +void BinlogEventsDispatcher::syncTo(const BinlogEventsDispatcherPtr & to) +{ + std::lock_guard lock(mutex); + syncToLocked(to); +} + +Position BinlogEventsDispatcher::getPosition() const +{ + std::lock_guard lock(mutex); + return position; +} + +bool BinlogEventsDispatcher::trySyncLocked(BinlogEventsDispatcherPtr & to) +{ + { + std::lock_guard lock(to->mutex); + /// Don't catch up if positions do not have GTIDs yet + const auto & cur_sets = position.gtid_sets.sets; + const auto & sets = to->position.gtid_sets.sets; + /// Sync to only started dispatchers + if (!to->binlog_read_from || (cur_sets.empty() && sets.empty()) || to->position != position) + return false; + + flushBufferLocked(); + to->flushBufferLocked(); + LOG_DEBUG(logger, "Synced up: {} -> {}: {}:{}.{}: ({}) + ({})", logger_name, to->logger_name, + position.binlog_name, position.gtid_sets.toString(), position.binlog_pos, getBinlogNames(to->binlogs), getBinlogNames(binlogs)); + std::move(binlogs.begin(), binlogs.end(), std::back_inserter(to->binlogs)); + } + + /// Notify that new binlogs arrived + to->cv.notify_all(); + return true; +} + +void BinlogEventsDispatcher::setBinlogChecksum(const String & checksum) +{ + std::lock_guard lock(mutex); + LOG_DEBUG(logger, "Setting binlog_checksum: {}", checksum); + binlog_checksum = IBinlog::checksumFromString(checksum); +} + +void BinlogFromDispatcher::push(const BinlogEventsDispatcher::Buffer & buffer_) +{ + std::unique_lock lock(mutex); + cv.wait_for(lock, std::chrono::milliseconds(max_waiting_ms), + [&] + { + bool ret = is_cancelled || exception || max_bytes == 0 || buffer.bytes < max_bytes; + if (!ret) + LOG_TRACE(logger, "Waiting: bytes: {} >= {}", buffer.bytes, max_bytes); + return ret; + }); + + if (is_cancelled || exception) + return; + + if (max_bytes != 0 && buffer.bytes >= max_bytes) + { + lock.unlock(); + setException(std::make_exception_ptr( + Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Timeout exceeded: Waiting: bytes: {} >= {}", buffer.bytes, max_bytes))); + return; + } + + auto it = buffer_.events.begin(); + size_t bytes = buffer_.bytes; + if (!gtid_sets_wait.sets.empty()) + { + if (!buffer_.position.gtid_sets.contains(gtid_sets_wait)) + { + LOG_TRACE(logger, "(wait_until: {} / {}) Skipped bytes: {}", + gtid_sets_wait.toString(), buffer_.position.gtid_sets.toString(), buffer_.bytes); + return; + } + + std::vector seqs; + for (auto & s : gtid_sets_wait.sets) + { + GTID g; + g.uuid = s.uuid; + for (auto & in : s.intervals) + { + g.seq_no = in.end; + seqs.push_back(g); + } + } + for (; it != buffer_.events.end(); ++it) + { + const auto & event = *it; + auto find_if_func = [&](auto & a) + { + return std::static_pointer_cast(event)->gtid == a; + }; + if (event->header.type != GTID_EVENT || std::find_if(seqs.begin(), seqs.end(), find_if_func) == seqs.end()) + { + LOG_TRACE(logger, "(wait_until: {} / {}) Skipped {}", + gtid_sets_wait.toString(), buffer_.position.gtid_sets.toString(), magic_enum::enum_name(event->header.type)); + bytes -= event->header.event_size; + continue; + } + LOG_DEBUG(logger, "(wait_until: {} / {}) Starting {}: gtid seq_no: {}", + gtid_sets_wait.toString(), buffer_.position.gtid_sets.toString(), magic_enum::enum_name(event->header.type), + 
std::static_pointer_cast(event)->gtid.seq_no); + break; + } + gtid_sets_wait = {}; + } + + if (it != buffer_.events.end()) + { + std::copy(it, buffer_.events.end(), std::back_inserter(buffer.events)); + buffer.bytes += bytes; + buffer.position = buffer_.position; + } + lock.unlock(); + /// Notify that added some event + cv.notify_all(); +} + +static void rethrowIfNeeded(const std::exception_ptr & exception, size_t events_size) +{ + try + { + std::rethrow_exception(exception); + } + catch (const Exception & e) + { + /// If timeout exceeded, it is safe to read all events before rethrowning + if (e.code() == ErrorCodes::TIMEOUT_EXCEEDED && events_size > 0) + return; + throw; + } +} + +static bool isBinlogEventIgnored(const NameSet & mysql_database_names, const BinlogEventPtr & event) +{ + bool ret = false; + switch (event->header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + ret = !mysql_database_names.empty() && !mysql_database_names.contains(std::static_pointer_cast(event)->schema); + break; + case QUERY_EVENT: + if (event->type() != MYSQL_UNHANDLED_EVENT) + { + auto query_event = std::static_pointer_cast(event); + ret = !mysql_database_names.empty() && + !query_event->query_database_name.empty() && + !mysql_database_names.contains(query_event->query_database_name); + } + break; + default: + break; + } + return ret; +} + +bool BinlogFromDispatcher::tryReadEvent(BinlogEventPtr & to, UInt64 ms) +{ + auto wake_up_func = [&] + { + if (exception) + rethrowIfNeeded(exception, buffer.events.size()); + return is_cancelled || !buffer.events.empty(); + }; + to = nullptr; + std::unique_lock lock(mutex); + if (!cv.wait_for(lock, std::chrono::milliseconds(ms), wake_up_func) || is_cancelled || buffer.events.empty()) + return false; + to = buffer.events.front(); + buffer.events.pop_front(); + BinlogParser::updatePosition(to, position); + buffer.bytes -= to->header.event_size; + if (isBinlogEventIgnored(mysql_database_names, to)) + to = std::make_shared(EventHeader(to->header)); + lock.unlock(); + /// Notify that removed some event + cv.notify_all(); + return true; +} + +Position BinlogFromDispatcher::getPosition() const +{ + std::lock_guard lock(mutex); + return position; +} + +void BinlogFromDispatcher::setPosition(const Position & initial, const Position & wait) +{ + std::lock_guard lock(mutex); + if (wait.gtid_sets.sets.empty()) + { + position = initial; + } + else + { + position = wait; + gtid_sets_wait = wait.gtid_sets; + } +} + +void BinlogFromDispatcher::setException(const std::exception_ptr & exception_) +{ + { + std::lock_guard lock(mutex); + exception = exception_; + } + cv.notify_all(); +} + +BinlogEventsDispatcher::BinlogMetadata BinlogFromDispatcher::getBinlogMetadata() const +{ + std::lock_guard lock(mutex); + BinlogEventsDispatcher::BinlogMetadata ret; + ret.name = name; + ret.position_write = buffer.position; + ret.position_read = position; + ret.size = buffer.events.size(); + ret.bytes = buffer.bytes; + ret.max_bytes = max_bytes; + ret.max_waiting_ms = max_waiting_ms; + return ret; +} + +BinlogEventsDispatcher::DispatcherMetadata BinlogEventsDispatcher::getDispatcherMetadata() const +{ + std::lock_guard lock(mutex); + DispatcherMetadata ret; + ret.name = logger_name; + ret.position = position; + ret.events_read_per_sec = events_read_per_sec; + ret.bytes_read_per_sec = bytes_read_per_sec; + ret.events_flush_per_sec = events_flush_per_sec; + 
ret.bytes_flush_per_sec = bytes_flush_per_sec; + + for (const auto & it : binlogs) + { + if (auto binlog = it.lock()) + ret.binlogs.push_back(binlog->getBinlogMetadata()); + } + return ret; +} + +} diff --git a/src/Databases/MySQL/MySQLBinlogEventsDispatcher.h b/src/Databases/MySQL/MySQLBinlogEventsDispatcher.h new file mode 100644 index 00000000000..43379697015 --- /dev/null +++ b/src/Databases/MySQL/MySQLBinlogEventsDispatcher.h @@ -0,0 +1,136 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB::MySQLReplication +{ + +class BinlogEventsDispatcher; +using BinlogEventsDispatcherPtr = std::shared_ptr; +class BinlogFromDispatcher; + +/** Reads the binlog events from one source and dispatches them over consumers. + * If it can catch up the position of the another dispatcher, it will move all consumers to this dispatcher. + */ +class BinlogEventsDispatcher final : boost::noncopyable +{ +public: + BinlogEventsDispatcher(const String & logger_name_ = "BinlogDispatcher", size_t max_bytes_in_buffer_ = 1_MiB, UInt64 max_flush_ms_ = 1000); + ~BinlogEventsDispatcher(); + + /// Moves all IBinlog objects to \a to if it has the same position + /// Supports syncing to multiple dispatchers + void syncTo(const BinlogEventsDispatcherPtr & to); + + /** Creates a binlog and starts the dispatching + * binlog_read_from Source binlog to read events from + * name Identifies the binlog, could be not unique + * mysql_database_names Returns events only from these databases + * max_bytes Defines a limit in bytes for this binlog + * Note: Dispatching will be stopped for all binlogs if bytes in queue increases this limit + * max_waiting_ms Max wait time when max_bytes exceeded + */ + BinlogPtr start(const BinlogPtr & binlog_read_from_, + const String & name = {}, + const NameSet & mysql_database_names = {}, + size_t max_bytes = 0, + UInt64 max_waiting_ms = 0); + + /** Creates a binlog if the dispatcher is started + * executed_gtid_set Can be higher value than current, + * otherwise not possible to attach + * name Identifies the binlog, could be not unique + * mysql_database_names Returns events only from these databases + * max_bytes Defines a limit in bytes for this binlog + * max_waiting_ms Max wait time when max_bytes exceeded + */ + BinlogPtr attach(const String & executed_gtid_set, + const String & name = {}, + const NameSet & mysql_database_names = {}, + size_t max_bytes = 0, + UInt64 max_waiting_ms = 0); + + /// Cleans the destroyed binlogs up and returns true if empty + bool cleanupBinlogsAndStop(); + + /// Changes binlog_checksum for binlog_read_from + void setBinlogChecksum(const String & checksum); + + Position getPosition() const; + + struct BinlogMetadata + { + String name; + /// Position that was written to + Position position_write; + /// Position that was read from + Position position_read; + size_t size = 0; + size_t bytes = 0; + size_t max_bytes = 0; + UInt64 max_waiting_ms = 0; + }; + struct DispatcherMetadata + { + String name; + Position position; + float events_read_per_sec = 0; + float bytes_read_per_sec = 0; + float events_flush_per_sec = 0; + float bytes_flush_per_sec = 0; + std::vector binlogs; + }; + DispatcherMetadata getDispatcherMetadata() const; + + struct Buffer + { + std::deque events; + size_t bytes = 0; + Position position; + }; + +private: + bool cleanupLocked(const std::function & binlog)> & fn = {}); + bool startLocked(const String & executed_gtid_set); + void stopLocked(); + BinlogPtr createBinlogLocked(const String & name = {}, + const NameSet & 
mysql_database_names = {}, + size_t max_bytes = 0, + UInt64 max_waiting_ms = 0, + const Position & pos_initial = {}, + const Position & pos_wait = {}); + void syncToLocked(const BinlogEventsDispatcherPtr & to); + bool trySyncLocked(BinlogEventsDispatcherPtr & to); + void flushBufferLocked(); + void dispatchEvents(); + + const String logger_name; + const size_t max_bytes_in_buffer = 0; + const UInt64 max_flush_ms = 0; + Poco::Logger * logger = nullptr; + + BinlogPtr binlog_read_from; + + Position position; + std::vector> sync_to; + std::vector> binlogs; + std::atomic_bool is_cancelled{false}; + mutable std::mutex mutex; + std::condition_variable cv; + std::unique_ptr dispatching_thread; + IBinlog::Checksum binlog_checksum = IBinlog::CRC32; + bool is_started = false; + Buffer buffer; + float events_read_per_sec = 0; + float bytes_read_per_sec = 0; + UInt64 events_flush = 0; + UInt64 events_flush_total_time = 0; + float events_flush_per_sec = 0; + UInt64 bytes_flush = 0; + float bytes_flush_per_sec = 0; +}; + +} diff --git a/src/Databases/MySQL/tests/data/binlog.000016 b/src/Databases/MySQL/tests/data/binlog.000016 new file mode 100644 index 00000000000..e27a2bac9ff Binary files /dev/null and b/src/Databases/MySQL/tests/data/binlog.000016 differ diff --git a/src/Databases/MySQL/tests/data/binlog.001390 b/src/Databases/MySQL/tests/data/binlog.001390 new file mode 100644 index 00000000000..1fbaae08a98 Binary files /dev/null and b/src/Databases/MySQL/tests/data/binlog.001390 differ diff --git a/src/Databases/MySQL/tests/gtest_mysql_binlog.cpp b/src/Databases/MySQL/tests/gtest_mysql_binlog.cpp new file mode 100644 index 00000000000..df8433f7cce --- /dev/null +++ b/src/Databases/MySQL/tests/gtest_mysql_binlog.cpp @@ -0,0 +1,1754 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace DB; +using namespace DB::MySQLReplication; + +#define TRY_LOOP_IMPL(expr, timeout) \ + const unsigned long _test_step = (timeout) < 350 ? (timeout) / 7 + 1 : 50; \ + for (int _i = 0; _i < (timeout) && !(expr); _i += _test_step) \ + std::this_thread::sleep_for(std::chrono::milliseconds(_test_step)); \ + +#define TRY_ASSERT_EQ(expr, expected, timeout) \ +do { \ + TRY_LOOP_IMPL(((expr) == (expected)), timeout) \ + ASSERT_EQ((expr), expected); \ +} while (false) + +#define TRY_ASSERT_TRUE(expr, timeout) \ + TRY_ASSERT_EQ((expr), true, timeout) + +static std::string getTestDataRoot() +{ + static auto root = []() -> std::string + { + std::filesystem::path testdata_path("src/Databases/MySQL/tests/data"); + auto basedir = std::filesystem::current_path(); + while (basedir != basedir.parent_path()) + { + if (std::filesystem::exists(basedir / testdata_path)) + { + testdata_path = basedir / testdata_path; + break; + } + basedir = basedir.parent_path(); + } + auto path = basedir / testdata_path; + return std::filesystem::exists(path) ? 
path.string() : ""; + }(); + return root; +} + +static String getTestDataPath(const String & testdata_file) +{ + return (std::filesystem::path(getTestDataRoot()) / testdata_file).string(); +} + +class MySQLBinlog : public ::testing::Test +{ +protected: + void SetUp() override + { + if (getTestDataRoot().empty()) + GTEST_SKIP() << "Skipping all tests since no test data files found"; + } + + UInt64 timeout = 25000; +}; + +TEST_F(MySQLBinlog, positionEndLogPosOverflow) +{ + Position position; + EventHeader header; + header.event_size = 8161; + header.log_pos = 4294958114; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, header.log_pos); + ASSERT_TRUE(position.binlog_name.empty()); + ASSERT_TRUE(position.gtid_sets.toString().empty()); + ASSERT_EQ(position.timestamp, 0); + + header.log_pos = 4294966149; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, header.log_pos); + UInt64 prev = position.binlog_pos; + + header.log_pos = 7014; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, prev + header.event_size); + prev = position.binlog_pos; + + header.event_size = 8107; + header.log_pos = 15121; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, prev + header.event_size); + prev = position.binlog_pos; + + header.event_size = 8131; + header.log_pos = 23252; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, prev + header.event_size); + + position.binlog_pos = 4724501662; + prev = position.binlog_pos; + + header.event_size = 8125; + header.log_pos = 429542491; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, prev + header.event_size); + + position.binlog_pos = 5474055640; + prev = position.binlog_pos; + + header.event_size = 31; + header.log_pos = 1179088375; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, prev + header.event_size); + + position = {}; + header.log_pos = 4294965445; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, header.log_pos); + prev = position.binlog_pos; + + header.event_size = 7927; + header.log_pos = 6076; + BinlogParser::updatePosition(std::make_shared(EventHeader(header)), position); + ASSERT_EQ(position.binlog_pos, prev + header.event_size); +} + +TEST_F(MySQLBinlog, positionEquals) +{ + Position p1; + Position p2; + ASSERT_EQ(p1, p2); + p1.binlog_pos = 1; + ASSERT_NE(p1, p2); + p2.binlog_pos = 1; + ASSERT_EQ(p1, p2); + p1.gtid_sets.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:87828"); + ASSERT_NE(p1, p2); + p2.gtid_sets.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:87828"); + ASSERT_EQ(p1, p2); + p1.binlog_name = "name"; + ASSERT_NE(p1, p2); + p2.binlog_name = "name"; + ASSERT_EQ(p1, p2); +} + +TEST_F(MySQLBinlog, positionMultimaster) +{ + Position p1; + Position p2; + p1.gtid_sets.parse("f189aee3-3cd2-11ed-a407-fa163ea7d4ed:1-3602,ff9de833-3cd2-11ed-87b7-fa163e99d975:1-172"); + p2.gtid_sets.parse("ff9de833-3cd2-11ed-87b7-fa163e99d975:1-172"); + ASSERT_TRUE(p1.gtid_sets.contains(p2.gtid_sets)); + ASSERT_FALSE(p2.gtid_sets.contains(p1.gtid_sets)); + ASSERT_FALSE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + 
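+    /// BinlogParser::isNew(p1, p2) should be true only when p2 advertises GTIDs beyond what p1
+    /// already contains; equal sets, subsets and unknown server UUIDs are not considered new.
+    /// Below: with p1 already at 1-172 for the second UUID, "1-10" and "172" are not new,
+    /// while "171-173" and "173" are.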
p2.gtid_sets.parse("ff9de833-3cd2-11ed-87b7-fa163e99d975:1-10"); + ASSERT_FALSE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + p2.gtid_sets.parse("ff9de833-3cd2-11ed-87b7-fa163e99d975:172"); + ASSERT_FALSE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + p2.gtid_sets.parse("ff9de833-3cd2-11ed-87b7-fa163e99d975:171-172"); + ASSERT_FALSE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + p2.gtid_sets.parse("ff9de833-3cd2-11ed-87b7-fa163e99d975:171-173"); + ASSERT_TRUE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + p2.gtid_sets.parse("ff9de833-3cd2-11ed-87b7-fa163e99d975:173"); + ASSERT_TRUE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + p2.gtid_sets.parse("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx:173"); + ASSERT_FALSE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + p2.gtid_sets.parse("f189aee3-3cd2-11ed-a407-fa163ea7d4ed:1-3602,ff9de833-3cd2-11ed-87b7-fa163e99d975:1-172"); + ASSERT_FALSE(BinlogParser::isNew(p1, p2)); + + p2.gtid_sets = {}; + p2.gtid_sets.parse("f189aee3-3cd2-11ed-a407-fa163ea7d4ed:1-3602,ff9de833-3cd2-11ed-87b7-fa163e99d975:1-173"); + ASSERT_TRUE(BinlogParser::isNew(p1, p2)); +} + +static void testFile1(IBinlog & binlog, UInt64 timeout, bool filtered = false) +{ + BinlogEventPtr event; + int count = 0; + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, FORMAT_DESCRIPTION_EVENT); + ASSERT_EQ(event->header.timestamp, 1651442421); + ASSERT_EQ(event->header.event_size, 122); + ASSERT_EQ(event->header.log_pos, 126); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, PREVIOUS_GTIDS_EVENT); + ASSERT_EQ(event->header.timestamp, 1651442421); + ASSERT_EQ(event->header.event_size, 71); + ASSERT_EQ(event->header.log_pos, 197); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651475081); + ASSERT_EQ(event->header.event_size, 79); + ASSERT_EQ(event->header.log_pos, 276); + + auto gtid_event = std::static_pointer_cast(event); + ASSERT_TRUE(gtid_event); + ASSERT_EQ(gtid_event->commit_flag, 0); + GTIDSets gtid_expected; + gtid_expected.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:87828"); + GTIDSets gtid_actual; + gtid_actual.update(gtid_event->gtid); + ASSERT_EQ(gtid_actual.toString(), gtid_expected.toString()); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(event->header.timestamp, 1651475081); + ASSERT_EQ(event->header.event_size, 73); + ASSERT_EQ(event->header.log_pos, 349); + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, TABLE_MAP_EVENT); + ASSERT_EQ(event->header.timestamp, 1651475081); + ASSERT_EQ(event->header.event_size, 48); + ASSERT_EQ(event->header.log_pos, 397); + + auto table_event = std::static_pointer_cast(event); + ASSERT_TRUE(table_event); + ASSERT_EQ(table_event->table_id, 7566); + ASSERT_EQ(table_event->flags, 1); + ASSERT_EQ(table_event->schema_len, 2u); + ASSERT_EQ(table_event->schema, "db"); + ASSERT_EQ(table_event->table_len, 1u); + ASSERT_EQ(table_event->table, "a"); + ASSERT_EQ(table_event->column_count, 4); + std::vector column_type = {3u, 3u, 3u, 3u}; + ASSERT_EQ(table_event->column_type, column_type); + std::vector column_meta = {0, 0, 0, 0}; + 
ASSERT_EQ(table_event->column_meta, column_meta); + std::vector column_charset = {}; + ASSERT_EQ(table_event->column_charset, column_charset); + ASSERT_EQ(table_event->default_charset, 255u); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, WRITE_ROWS_EVENT_V2); + ASSERT_EQ(event->header.timestamp, 1651475081); + ASSERT_EQ(event->header.event_size, 52); + ASSERT_EQ(event->header.log_pos, 449); + + ASSERT_EQ(event->type(), MYSQL_UNPARSED_ROWS_EVENT); + event = std::static_pointer_cast(event)->parse(); + + ASSERT_TRUE(event); + auto write_event = std::static_pointer_cast(event); + ASSERT_TRUE(write_event); + ASSERT_EQ(write_event->number_columns, 4); + ASSERT_EQ(write_event->schema, "db"); + ASSERT_EQ(write_event->table, "a"); + ASSERT_EQ(write_event->rows.size(), 1); + ASSERT_EQ(write_event->rows[0].getType(), Field::Types::Tuple); + auto row_data = write_event->rows[0].get(); + ASSERT_EQ(row_data.size(), 4u); + ASSERT_EQ(row_data[0].get(), 1u); + ASSERT_EQ(row_data[1].get(), 1u); + ASSERT_EQ(row_data[2].get(), 1u); + ASSERT_EQ(row_data[3].get(), 1u); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, XID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651475081); + ASSERT_EQ(event->header.event_size, 31); + ASSERT_EQ(event->header.log_pos, 480); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651475244); + ASSERT_EQ(event->header.event_size, 79); + ASSERT_EQ(event->header.log_pos, 559); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(event->header.timestamp, 1651475244); + ASSERT_EQ(event->header.event_size, 82); + ASSERT_EQ(event->header.log_pos, 641); + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, TABLE_MAP_EVENT); + ASSERT_EQ(event->header.timestamp, 1651475244); + ASSERT_EQ(event->header.event_size, 48); + ASSERT_EQ(event->header.log_pos, 689); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.timestamp, 1651475244); + ASSERT_EQ(event->header.event_size, 70); + ASSERT_EQ(event->header.type, UPDATE_ROWS_EVENT_V2); + ASSERT_EQ(event->header.log_pos, 759); + + ASSERT_EQ(event->type(), MYSQL_UNPARSED_ROWS_EVENT); + event = std::static_pointer_cast(event)->parse(); + + ASSERT_TRUE(event); + auto update_event = std::static_pointer_cast(event); + ASSERT_TRUE(update_event); + ASSERT_EQ(update_event->number_columns, 4); + ASSERT_EQ(update_event->schema, "db"); + ASSERT_EQ(update_event->table, "a"); + ASSERT_EQ(update_event->rows.size(), 2); + ASSERT_EQ(update_event->rows[0].getType(), Field::Types::Tuple); + row_data = update_event->rows[0].get(); + ASSERT_EQ(row_data.size(), 4u); + ASSERT_EQ(row_data[0].get(), 1u); + ASSERT_EQ(row_data[1].get(), 1u); + ASSERT_EQ(row_data[2].get(), 1u); + ASSERT_EQ(row_data[3].get(), 1u); + row_data = update_event->rows[1].get(); + ASSERT_EQ(row_data.size(), 4u); + ASSERT_EQ(row_data[0].get(), 1u); + ASSERT_EQ(row_data[1].get(), 2u); + ASSERT_EQ(row_data[2].get(), 1u); + ASSERT_EQ(row_data[3].get(), 1u); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, XID_EVENT); + 
ASSERT_EQ(event->header.timestamp, 1651475244); + ASSERT_EQ(event->header.event_size, 31); + ASSERT_EQ(event->header.log_pos, 790); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651482394); + ASSERT_EQ(event->header.event_size, 79); + ASSERT_EQ(event->header.log_pos, 869); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(event->header.timestamp, 1651482394); + ASSERT_EQ(event->header.event_size, 82); + ASSERT_EQ(event->header.log_pos, 951); + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, TABLE_MAP_EVENT); + ASSERT_EQ(event->header.timestamp, 1651482394); + ASSERT_EQ(event->header.event_size, 48); + ASSERT_EQ(event->header.log_pos, 999); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, UPDATE_ROWS_EVENT_V2); + ASSERT_EQ(event->header.timestamp, 1651482394); + ASSERT_EQ(event->header.event_size, 70); + ASSERT_EQ(event->header.log_pos, 1069); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, XID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651482394); + ASSERT_EQ(event->header.event_size, 31); + ASSERT_EQ(event->header.log_pos, 1100); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651483072); + ASSERT_EQ(event->header.event_size, 79); + ASSERT_EQ(event->header.log_pos, 1179); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(event->header.timestamp, 1651483072); + ASSERT_EQ(event->header.event_size, 82); + ASSERT_EQ(event->header.log_pos, 1261); + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, TABLE_MAP_EVENT); + ASSERT_EQ(event->header.timestamp, 1651483072); + ASSERT_EQ(event->header.event_size, 48); + ASSERT_EQ(event->header.log_pos, 1309); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, UPDATE_ROWS_EVENT_V2); + ASSERT_EQ(event->header.timestamp, 1651483072); + ASSERT_EQ(event->header.event_size, 70); + ASSERT_EQ(event->header.log_pos, 1379); + + ASSERT_EQ(binlog.getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:87828-87830"); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, XID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651483072); + ASSERT_EQ(event->header.event_size, 31); + ASSERT_EQ(event->header.log_pos, 1410); + + ASSERT_EQ(binlog.getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:87828-87831"); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.timestamp, 1651483336); + ASSERT_EQ(event->header.event_size, 79); + ASSERT_EQ(event->header.log_pos, 1489); + gtid_event = std::static_pointer_cast(event); + ASSERT_TRUE(gtid_event); + ASSERT_EQ(gtid_event->commit_flag, 0); + gtid_expected = {}; + 
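+    /// The GTID_EVENT only announces the next transaction (87832); binlog.getPosition() keeps
+    /// reporting the previously committed set (87828-87831) until the matching XID_EVENT is read,
+    /// as checked right after the UPDATE_ROWS/XID pair above.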
gtid_expected.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:87832"); + gtid_actual = {}; + gtid_actual.update(gtid_event->gtid); + ASSERT_EQ(gtid_actual.toString(), gtid_expected.toString()); + + ASSERT_EQ(binlog.getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:87828-87831"); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(event->header.timestamp, 1651483336); + ASSERT_EQ(event->header.event_size, 82); + ASSERT_EQ(event->header.log_pos, 1571); + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, TABLE_MAP_EVENT); + ASSERT_EQ(event->header.timestamp, 1651483336); + ASSERT_EQ(event->header.event_size, 48); + ASSERT_EQ(event->header.log_pos, 1619); + } + + int total_count = filtered ? 37 : 48; + for (; count < total_count; ++count) + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_EQ(event->header.timestamp, 1651528821); + ASSERT_EQ(event->header.event_size, 44); + ASSERT_EQ(event->header.log_pos, 3091); + ASSERT_EQ(count, total_count); + ASSERT_FALSE(binlog.tryReadEvent(event, 10)); + + auto position = binlog.getPosition(); + ASSERT_EQ(position.binlog_pos, 4); + ASSERT_EQ(position.binlog_name, "binlog.001391"); + ASSERT_EQ(position.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:87828-87836"); +} + +TEST_F(MySQLBinlog, binlogFromFile1) +{ + BinlogFromFile binlog; + binlog.open(getTestDataPath("binlog.001390")); + testFile1(binlog, timeout); +} + +TEST_F(MySQLBinlog, binlogFromFactory1) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto binlog = f->createBinlog(""); + + testFile1(*binlog, timeout); +} + +TEST_F(MySQLBinlog, binlogFromFactory1ExecutedGtidSet) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + BinlogEventPtr event; + + auto binlog = f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87828"); + + ASSERT_TRUE(binlog->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.log_pos, 559); + + auto gtid_event = std::static_pointer_cast(event); + ASSERT_TRUE(gtid_event); + GTIDSets gtid_expected; + gtid_expected.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:87829"); + GTIDSets gtid_actual; + gtid_actual.update(gtid_event->gtid); + ASSERT_EQ(gtid_actual.toString(), gtid_expected.toString()); + + for (int count = 8; count < 48; ++count) + ASSERT_TRUE(binlog->tryReadEvent(event, timeout)); + + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + auto position = binlog->getPosition(); + ASSERT_EQ(position.binlog_pos, 4); + ASSERT_EQ(position.binlog_name, "binlog.001391"); + ASSERT_EQ(position.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87836"); + ASSERT_FALSE(binlog->tryReadEvent(event, 10)); + + binlog = f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87829"); + + ASSERT_TRUE(binlog->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.log_pos, 869); + + gtid_event = std::static_pointer_cast(event); + ASSERT_TRUE(gtid_event); + gtid_expected = {}; + gtid_expected.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:87830"); + gtid_actual = {}; + gtid_actual.update(gtid_event->gtid); + ASSERT_EQ(gtid_actual.toString(), gtid_expected.toString()); + + for (int count = 13; count < 48; 
++count) + ASSERT_TRUE(binlog->tryReadEvent(event, timeout)); + + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + position = binlog->getPosition(); + ASSERT_EQ(position.binlog_pos, 4); + ASSERT_EQ(position.binlog_name, "binlog.001391"); + ASSERT_EQ(position.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87836"); + ASSERT_FALSE(binlog->tryReadEvent(event, 10)); + + binlog = f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87834"); + + ASSERT_TRUE(binlog->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.log_pos, 2443); + + gtid_event = std::static_pointer_cast(event); + ASSERT_TRUE(gtid_event); + gtid_expected = {}; + gtid_expected.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:87835"); + gtid_actual = {}; + gtid_actual.update(gtid_event->gtid); + ASSERT_EQ(gtid_actual.toString(), gtid_expected.toString()); + + for (int count = 38; count < 48; ++count) + ASSERT_TRUE(binlog->tryReadEvent(event, timeout)); + + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + position = binlog->getPosition(); + ASSERT_EQ(position.binlog_pos, 4); + ASSERT_EQ(position.binlog_name, "binlog.001391"); + ASSERT_EQ(position.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87836"); + ASSERT_FALSE(binlog->tryReadEvent(event, 10)); +} + +TEST_F(MySQLBinlog, binlogFromDispatcher1) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + BinlogEventsDispatcher d; + auto b = d.start(f->createBinlog("")); + testFile1(*b, timeout, true); + ASSERT_EQ(d.getPosition().gtid_sets.toString(), b->getPosition().gtid_sets.toString()); +} + +static void testFile2(IBinlog & binlog, UInt64 timeout, bool filtered = false) +{ + BinlogEventPtr event; + int count = 0; + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, FORMAT_DESCRIPTION_EVENT); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, PREVIOUS_GTIDS_EVENT); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + + if (!filtered) + { + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, TABLE_MAP_EVENT); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, UPDATE_ROWS_EVENT_V2); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, XID_EVENT); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.log_pos, 608); + + auto gtid_event = std::static_pointer_cast(event); + ASSERT_TRUE(gtid_event); + ASSERT_EQ(gtid_event->commit_flag, 0); + GTIDSets gtid_expected; + gtid_expected.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:1059"); + GTIDSets gtid_actual; + gtid_actual.update(gtid_event->gtid); + ASSERT_EQ(gtid_actual.toString(), gtid_expected.toString()); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(event->header.log_pos, 701); + + if (!filtered) + { + 
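+        /// TABLE_MAP_EVENT (like FORMAT_DESCRIPTION_EVENT and PREVIOUS_GTIDS_EVENT above) is only
+        /// seen when reading the file directly: the dispatcher drops it in isDispatcherEventIgnored(),
+        /// which is why the dispatcher-based tests call testFile2() with filtered = true.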
ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, TABLE_MAP_EVENT); + ASSERT_EQ(event->header.log_pos, 760); + } + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, UPDATE_ROWS_EVENT_V2); + ASSERT_EQ(event->header.log_pos, 830); + + ASSERT_EQ(event->type(), MYSQL_UNPARSED_ROWS_EVENT); + event = std::static_pointer_cast(event)->parse(); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, XID_EVENT); + ASSERT_EQ(event->header.log_pos, 861); + + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + ++count; + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.log_pos, 940); + gtid_event = std::static_pointer_cast(event); + ASSERT_TRUE(gtid_event); + ASSERT_EQ(gtid_event->commit_flag, 0); + gtid_expected = {}; + gtid_expected.parse("a9d88f83-c14e-11ec-bb36-244bfedf7766:1060"); + gtid_actual = {}; + gtid_actual.update(gtid_event->gtid); + ASSERT_EQ(gtid_actual.toString(), gtid_expected.toString()); + + int total_count = filtered ? 13 : 18; + for (; count < total_count; ++count) + ASSERT_TRUE(binlog.tryReadEvent(event, timeout)); + + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_EQ(event->header.log_pos, 1237); + ASSERT_EQ(count, total_count); + ASSERT_FALSE(binlog.tryReadEvent(event, 10)); + + auto position = binlog.getPosition(); + ASSERT_EQ(position.binlog_pos, 4); + ASSERT_EQ(position.binlog_name, "binlog.000017"); + ASSERT_EQ(binlog.getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); +} + +TEST_F(MySQLBinlog, binlogFromFile2) +{ + BinlogFromFile binlog; + binlog.open(getTestDataPath("binlog.000016")); + testFile2(binlog, timeout); +} + +TEST_F(MySQLBinlog, binlogFromDispatcher2) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + BinlogEventsDispatcher d; + auto b = d.start(f->createBinlog("")); + testFile2(*b, timeout, true); + ASSERT_EQ(d.getPosition().gtid_sets.toString(), b->getPosition().gtid_sets.toString()); +} + +TEST_F(MySQLBinlog, binlogsFromOneFile) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + auto b1 = d1->start(f->createBinlog("")); + auto b2 = d2->start(f->createBinlog("")); + + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), b2->getPosition().gtid_sets.toString()); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(b1->getPosition().binlog_pos, b2->getPosition().binlog_pos); + ASSERT_EQ(b1->getPosition().binlog_pos, 4); +} + +TEST_F(MySQLBinlog, empty) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + ASSERT_TRUE(d1->getDispatcherMetadata().binlogs.empty()); +} + +TEST_F(MySQLBinlog, binlogsAfterStart) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + + auto b1 = d1->start(f->createBinlog("")); + auto b2 = d1->start(f->createBinlog("")); + ASSERT_FALSE(b2); + + testFile2(*b1, timeout, true); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); +} + +TEST_F(MySQLBinlog, metadata) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + 
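+    /// getDispatcherMetadata() is an introspection snapshot: dispatcher name, current position,
+    /// read/flush rates, and one BinlogMetadata entry per attached binlog with its read/write
+    /// positions and buffered events/bytes. Until start() is called the binlog list stays empty.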
ASSERT_TRUE(d1->getDispatcherMetadata().binlogs.empty()); + ASSERT_EQ(d1->getDispatcherMetadata().name, "d1"); + ASSERT_TRUE(d1->getDispatcherMetadata().position.gtid_sets.sets.empty()); + + auto b1 = d1->start(f->createBinlog("")); + ASSERT_TRUE(b1); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 1); + ASSERT_FALSE(d1->start(f->createBinlog(""))); + + TRY_ASSERT_TRUE(!d1->getDispatcherMetadata().position.gtid_sets.sets.empty(), timeout); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 1); + + testFile2(*b1, timeout, true); + + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 1); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_write.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].size, 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].bytes, 0); +} + +TEST_F(MySQLBinlog, catchingUp) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + ASSERT_TRUE(d1->getDispatcherMetadata().binlogs.empty()); + ASSERT_TRUE(d2->getDispatcherMetadata().binlogs.empty()); + + d2->syncTo(d1); + + auto b1 = d1->start(f->createBinlog("")); + auto b2 = d2->start(f->createBinlog("")); + ASSERT_TRUE(b1); + ASSERT_TRUE(b2); + TRY_ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2, timeout); + ASSERT_FALSE(d1->getDispatcherMetadata().position.gtid_sets.sets.empty()); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs.size(), 0); + ASSERT_FALSE(d2->getDispatcherMetadata().position.gtid_sets.sets.empty()); + ASSERT_FALSE(d2->start(f->createBinlog(""))); + + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), b2->getPosition().gtid_sets.toString()); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(b1->getPosition().binlog_pos, b2->getPosition().binlog_pos); + ASSERT_EQ(b1->getPosition().binlog_pos, 4); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs.size(), 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_write.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_read.binlog_pos, 4); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_write.binlog_pos, 4); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].size, 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].bytes, 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_write.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_read.binlog_pos, 4); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_write.binlog_pos, 4); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].size, 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].bytes, 0); +} + +TEST_F(MySQLBinlog, 
catchingUpFastMaster) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + + d2->syncTo(d1); + + auto b1 = d1->start(f->createBinlog("")); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + auto b2 = d2->start(f->createBinlog("")); + + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), b2->getPosition().gtid_sets.toString()); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(b1->getPosition().binlog_pos, b2->getPosition().binlog_pos); + ASSERT_EQ(b1->getPosition().binlog_pos, 4); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs.size(), 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2); +} + +TEST_F(MySQLBinlog, catchingUpFastSlave) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + + d2->syncTo(d1); + + auto b2 = d2->start(f->createBinlog("")); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + auto b1 = d1->start(f->createBinlog("")); + + TRY_ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2, timeout); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs.size(), 0); + ASSERT_FALSE(d1->getDispatcherMetadata().position.gtid_sets.sets.empty()); + ASSERT_FALSE(d2->getDispatcherMetadata().position.gtid_sets.sets.empty()); + + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), b2->getPosition().gtid_sets.toString()); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(b1->getPosition().binlog_pos, b2->getPosition().binlog_pos); + ASSERT_EQ(b1->getPosition().binlog_pos, 4); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs.size(), 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2); +} + +TEST_F(MySQLBinlog, catchingUpWithoutWaiting) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + + d2->syncTo(d1); + + auto b1 = d1->start(f->createBinlog("")); + auto b2 = d2->start(f->createBinlog("")); + + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), b2->getPosition().gtid_sets.toString()); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(b1->getPosition().binlog_pos, b2->getPosition().binlog_pos); + ASSERT_EQ(b1->getPosition().binlog_pos, 4); + TRY_ASSERT_EQ(d2->getDispatcherMetadata().binlogs.size(), 0, timeout); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_read.binlog_pos, 4); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_read.binlog_pos, 4); +} + +TEST_F(MySQLBinlog, catchingUpManyToOne) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d0 = std::make_shared("d0"); + std::vector ds; + int n = 10; + for (int i = 0; i < n; ++i) + { + auto d = std::make_shared("r" + std::to_string(i)); + d->syncTo(d0); + ds.push_back(d); + } + 
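+    /// Every replica dispatcher syncs to d0: once a replica reaches the same position as d0,
+    /// trySyncLocked() moves its binlog over to d0 and the replica stops reading, so in the end
+    /// d0 is expected to own all n + 1 binlogs and keep the only source connection.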
+ for (int i = 0; i < n; ++i) + ASSERT_TRUE(ds[i]->getDispatcherMetadata().binlogs.empty()); + + auto b0 = d0->start(f->createBinlog(""), "b"); + ASSERT_EQ(d0->getDispatcherMetadata().binlogs.size(), 1); + ASSERT_EQ(d0->getDispatcherMetadata().binlogs[0].position_read.binlog_pos, 0); + std::vector bs; + bs.resize(n); + for (int i = 0; i < n; ++i) + bs[i] = ds[i]->start(f->createBinlog(""), "b" + std::to_string(i)); + + TRY_ASSERT_EQ(d0->getDispatcherMetadata().binlogs.size(), n + 1, timeout); + ASSERT_FALSE(d0->getDispatcherMetadata().position.gtid_sets.sets.empty()); + for (int i = 0; i < n; ++i) + { + ASSERT_EQ(ds[i]->getDispatcherMetadata().binlogs.size(), 0); + ASSERT_FALSE(ds[i]->getDispatcherMetadata().position.gtid_sets.sets.empty()); + } + + testFile2(*b0, timeout, true); + for (int i = 0; i < n; ++i) + testFile2(*bs[i], timeout, true); + + ASSERT_EQ(b0->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(b0->getPosition().binlog_pos, 4); + + for (int i = 0; i < n; ++i) + { + ASSERT_EQ(bs[i]->getPosition().gtid_sets.toString(), b0->getPosition().gtid_sets.toString()); + ASSERT_EQ(bs[i]->getPosition().binlog_pos, b0->getPosition().binlog_pos); + } + + for (int i = 0; i < n; ++i) + ASSERT_EQ(ds[i]->getDispatcherMetadata().binlogs.size(), 0); + + ASSERT_EQ(d0->getDispatcherMetadata().binlogs.size(), n + 1); + for (int i = 0; i < n + 1; ++i) + { + ASSERT_EQ(d0->getDispatcherMetadata().binlogs[i].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d0->getDispatcherMetadata().binlogs[i].position_write.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(d0->getDispatcherMetadata().binlogs[i].position_read.binlog_pos, 4); + ASSERT_EQ(d0->getDispatcherMetadata().binlogs[i].position_write.binlog_pos, 4); + ASSERT_EQ(d0->getDispatcherMetadata().binlogs[i].size, 0); + ASSERT_EQ(d0->getDispatcherMetadata().binlogs[i].bytes, 0); + } +} + +TEST_F(MySQLBinlog, catchingUpStopApplier) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + + d2->syncTo(d1); + + auto b1 = d1->start(f->createBinlog("")); + ASSERT_TRUE(b1); + d1 = nullptr; + + auto b2 = d2->start(f->createBinlog("")); + ASSERT_TRUE(b2); + testFile2(*b2, timeout, true); + ASSERT_EQ(b2->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); +} + +TEST_F(MySQLBinlog, catchingUpOneToAllPrevious) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + std::vector ds; + int n = 10; + for (int i = 0; i < n; ++i) + { + auto d = std::make_shared("d" + std::to_string(i)); + for (int j = 0; j < i; ++j) + d->syncTo(ds[j]); + ds.push_back(d); + } + + for (int i = 0; i < n; ++i) + ASSERT_TRUE(ds[i]->getDispatcherMetadata().binlogs.empty()); + + std::vector bs; + bs.resize(n); + for (int i = 0; i < n; ++i) + bs[i] = ds[i]->start(f->createBinlog(""), "b" + std::to_string(i)); + + auto check_dispatchers = [&] + { + int not_empty_count = 0; + int ii = 0; + for (int i = 0; i < n; ++i) + { + if (!ds[i]->getDispatcherMetadata().binlogs.empty()) + { + ++not_empty_count; + ii = i; + } + } + return not_empty_count == 1 && ds[ii]->getDispatcherMetadata().binlogs.size() == n; + }; + + for (int i = 0; i < n; ++i) + testFile2(*bs[i], timeout, true); + + TRY_ASSERT_TRUE(check_dispatchers(), timeout); + + for (int i = 1; i < n; ++i) + { + ASSERT_EQ(bs[i]->getPosition().gtid_sets.toString(), 
bs[0]->getPosition().gtid_sets.toString()); + ASSERT_EQ(bs[i]->getPosition().binlog_pos, bs[0]->getPosition().binlog_pos); + } + + int i = 0; + for (int j = 0; j < n; ++j) + { + auto bs_ = ds[j]->getDispatcherMetadata().binlogs; + for (; i < bs_.size(); ++i) + { + ASSERT_EQ(bs_[i].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + ASSERT_EQ(bs_[i].position_write.gtid_sets.toString(), bs_[i].position_write.gtid_sets.toString()); + ASSERT_EQ(bs_[i].position_read.binlog_pos, 4); + ASSERT_EQ(bs_[i].position_write.binlog_pos, 4); + ASSERT_EQ(bs_[i].size, 0); + ASSERT_EQ(bs_[i].bytes, 0); + } + } + ASSERT_EQ(i, n); +} + +TEST_F(MySQLBinlog, catchingUpMaxBytes) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + + d2->syncTo(d1); + + auto b1 = d1->start(f->createBinlog(""), "big"); + auto b2 = d2->start(f->createBinlog(""), "small", {}, 1, 10000); + + testFile2(*b2, timeout, true); + TRY_ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2, timeout); + ASSERT_EQ(d1->getDispatcherMetadata().position.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1058-1060"); + testFile2(*b1, timeout, true); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_write.gtid_sets.toString(), d1->getDispatcherMetadata().position.gtid_sets.toString()); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].position_read.gtid_sets.toString(), d1->getDispatcherMetadata().position.gtid_sets.toString()); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_write.gtid_sets.toString(), d1->getDispatcherMetadata().position.gtid_sets.toString()); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].position_read.gtid_sets.toString(), d1->getDispatcherMetadata().position.gtid_sets.toString()); +} + +TEST_F(MySQLBinlog, filterEvents) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + auto b1 = d1->start(f->createBinlog(""), "b1", {"db"}); + auto b2 = d2->start(f->createBinlog(""), "b2", {"unknown_database"}); + + BinlogEventPtr event; + for (int i = 0; i < 37; ++i) + { + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + switch (event->header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + if (event->type() == MYSQL_UNPARSED_ROWS_EVENT) + { + ASSERT_EQ(std::static_pointer_cast(event)->schema, "db"); + } + break; + default: + break; + } + } + + ASSERT_FALSE(b1->tryReadEvent(event, 0)); + ASSERT_FALSE(event); + + for (int i = 0; i < 37; ++i) + { + ASSERT_TRUE(b2->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + switch (event->header.type) + { + case ROTATE_EVENT: + case XID_EVENT: + case QUERY_EVENT: + case GTID_EVENT: + break; + default: + if (event->type() != MYSQL_UNHANDLED_EVENT) + FAIL() << "Unexpected event: " << magic_enum::enum_name(event->header.type); + break; + } + } + + ASSERT_FALSE(b2->tryReadEvent(event, 0)); + ASSERT_FALSE(event); + + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:87828-87836"); + ASSERT_EQ(b1->getPosition().binlog_pos, 4); + ASSERT_EQ(b2->getPosition().gtid_sets.toString(), b1->getPosition().gtid_sets.toString()); + ASSERT_EQ(b2->getPosition().binlog_pos, b1->getPosition().binlog_pos); + ASSERT_FALSE(b2->tryReadEvent(event, 0)); +} + +TEST_F(MySQLBinlog, 
filterEventsMultipleDatabases) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + auto d3 = std::make_shared("d3"); + auto d4 = std::make_shared("d4"); + auto d5 = std::make_shared("d5"); + auto all_dbs = d1->start(f->createBinlog(""), "all_dbs"); + auto db = d2->start(f->createBinlog(""), "db", {"db"}); + auto aborted = d3->start(f->createBinlog(""), "aborted_full_sync", {"aborted_full_sync"}); + auto db_and_aborted = d4->start(f->createBinlog(""), "db_and_aborted", {"db", "aborted_full_sync"}); + auto unknown = d5->start(f->createBinlog(""), "unknown", {"unknown1", "unknown2"}); + + BinlogEventPtr event; + for (int i = 0; i < 37; ++i) + { + ASSERT_TRUE(all_dbs->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + switch (event->header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + ASSERT_EQ(event->type(), MYSQL_UNPARSED_ROWS_EVENT); + break; + default: + break; + } + } + + ASSERT_FALSE(all_dbs->tryReadEvent(event, 0)); + ASSERT_FALSE(event); + + for (int i = 0; i < 37; ++i) + { + ASSERT_TRUE(db->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + switch (event->header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + if (event->type() == MYSQL_UNPARSED_ROWS_EVENT) + { + ASSERT_EQ(std::static_pointer_cast(event)->schema, "db"); + } + break; + default: + break; + } + } + + ASSERT_FALSE(db->tryReadEvent(event, 0)); + ASSERT_FALSE(event); + + for (int i = 0; i < 37; ++i) + { + ASSERT_TRUE(aborted->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + switch (event->header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + if (event->type() == MYSQL_UNPARSED_ROWS_EVENT) + { + ASSERT_EQ(std::static_pointer_cast(event)->schema, "aborted_full_sync"); + } + break; + default: + break; + } + } + + ASSERT_FALSE(aborted->tryReadEvent(event, 0)); + ASSERT_FALSE(event); + + for (int i = 0; i < 37; ++i) + { + ASSERT_TRUE(db_and_aborted->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + switch (event->header.type) + { + case WRITE_ROWS_EVENT_V1: + case WRITE_ROWS_EVENT_V2: + case DELETE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT_V2: + case UPDATE_ROWS_EVENT_V1: + case UPDATE_ROWS_EVENT_V2: + { + ASSERT_EQ(event->type(), MYSQL_UNPARSED_ROWS_EVENT); + auto schema = std::static_pointer_cast(event)->schema; + ASSERT_TRUE(schema == "db" || schema == "aborted_full_sync"); + } break; + default: + break; + } + } + + ASSERT_FALSE(db_and_aborted->tryReadEvent(event, 0)); + ASSERT_FALSE(event); + + for (int i = 0; i < 37; ++i) + { + ASSERT_TRUE(unknown->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + switch (event->header.type) + { + case ROTATE_EVENT: + case XID_EVENT: + case QUERY_EVENT: + case GTID_EVENT: + break; + default: + ASSERT_EQ(event->type(), MYSQL_UNHANDLED_EVENT); + break; + } + } + + ASSERT_FALSE(unknown->tryReadEvent(event, 0)); + ASSERT_FALSE(event); +} + +TEST_F(MySQLBinlog, dispatcherStop) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto b1 = d1->start(f->createBinlog("")); + ASSERT_TRUE(b1); + d1 = nullptr; + BinlogEventPtr event; + 
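+    /// Destroying the dispatcher sets a "has been already destroyed" exception on every attached
+    /// binlog (see ~BinlogEventsDispatcher), so draining b1 must eventually rethrow DB::Exception.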
EXPECT_THROW(for (int i = 0; i < 18 + 1; ++i) b1->tryReadEvent(event, timeout), DB::Exception); +} + +TEST_F(MySQLBinlog, executedGTIDSet) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto d1 = std::make_shared("d1"); + auto b1 = d1->start(f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-1058"), "b1"); + + BinlogEventPtr event; + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(event->header.log_pos, 608); + + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(event->header.log_pos, 701); + + for (int i = 0; i < 7; ++i) + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_EQ(event->header.log_pos, 1237); + ASSERT_EQ(d1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-1060"); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-1060"); + ASSERT_FALSE(b1->tryReadEvent(event, 0)); +} + +TEST_F(MySQLBinlog, client) +{ + auto f = std::make_shared(getTestDataPath("binlog.000016")); + auto e = std::make_shared(f); + + auto b1 = e->createBinlog("", "b1"); + auto b2 = e->createBinlog("", "b2"); + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + + auto b3 = e->createBinlog("", "b3"); + + testFile2(*b3, timeout, true); + + b1 = nullptr; + b2 = nullptr; + + auto b4 = e->createBinlog("", "b4"); + testFile2(*b4, timeout, true); + + b3 = nullptr; + b4 = e->createBinlog("", "b4 2"); + testFile2(*b4, timeout, true); + + b1 = e->createBinlog("", "b1 2"); + b2 = e->createBinlog("", "b2 2"); + testFile2(*b1, timeout, true); + + b3 = e->createBinlog("", "b3 2"); + testFile2(*b2, timeout, true); + + b4 = e->createBinlog("", "b4 3"); + testFile2(*b3, timeout, true); + testFile2(*b4, timeout, true); + + b1 = nullptr; + b2 = nullptr; + b3 = nullptr; + b4 = nullptr; + b1 = e->createBinlog("", "b1 3"); + b2 = e->createBinlog("", "b2 3"); + b3 = e->createBinlog("", "b3 3"); + b4 = e->createBinlog("", "b4 4"); + testFile2(*b4, timeout, true); + testFile2(*b3, timeout, true); + testFile2(*b2, timeout, true); + testFile2(*b1, timeout, true); + + f = std::make_shared(getTestDataPath("binlog.000016")); + e = std::make_shared(f); + + b4 = e->createBinlog("", "b4 5"); + b3 = e->createBinlog("", "b3 4"); + testFile2(*b4, timeout, true); + b2 = e->createBinlog("", "b2 4"); + b1 = e->createBinlog("", "b1 4"); + testFile2(*b3, timeout, true); + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + + b1 = e->createBinlog("", "b1 5"); + b2 = e->createBinlog("", "b2 5"); + testFile2(*b1, timeout, true); + testFile2(*b2, timeout, true); + b1 = e->createBinlog("", "b1 6"); + testFile2(*b1, timeout, true); + b1 = e->createBinlog("", "b1 7"); + testFile2(*b1, timeout, true); + + b3 = nullptr; + b4 = nullptr; + b1 = e->createBinlog("", "b1 8"); + b4 = e->createBinlog("", "b4 6"); + b3 = e->createBinlog("", "b3 5"); + testFile2(*b4, timeout, true); + testFile2(*b3, timeout, true); + testFile2(*b1, timeout, true); + + b2 = nullptr; + b3 = nullptr; + b4 = nullptr; + b1 = nullptr; + b1 = e->createBinlog("", "b1 9"); + testFile2(*b1, timeout, true); +} + +TEST_F(MySQLBinlog, createBinlog) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto d1 = std::make_shared("d1"); + auto b1 = d1->start(f->createBinlog(""), "b1"); + ASSERT_TRUE(b1); + 
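+ // A dispatcher accepts only one start(): repeated start() calls are expected to return nullptr, even after the first binlog has been released.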
ASSERT_FALSE(d1->start(f->createBinlog(""))); + testFile1(*b1, timeout, true); + ASSERT_FALSE(d1->start(f->createBinlog(""))); + b1 = nullptr; + ASSERT_FALSE(d1->start(f->createBinlog(""))); +} + +TEST_F(MySQLBinlog, createBinlogAttach1) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto d1 = std::make_shared("d1"); + auto b1_ = d1->start(f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87828"), "b1_"); + ASSERT_TRUE(b1_); + auto b1 = d1->attach("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87831", "b1"); + if (b1) + { + BinlogEventPtr event; + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87831"); + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87831"); + + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87831"); + + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, UPDATE_ROWS_EVENT_V2); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87831"); + + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, XID_EVENT); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87832"); + for (int i = 0; i < 17; ++i) + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_FALSE(b1->tryReadEvent(event, 10)); + ASSERT_EQ(b1->getPosition().binlog_pos, 4); + ASSERT_EQ(b1->getPosition().binlog_name, "binlog.001391"); + ASSERT_EQ(b1->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87836"); + for (int i = 0; i < 33; ++i) + ASSERT_TRUE(b1_->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs.size(), 2); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[0].bytes, 0); + ASSERT_EQ(d1->getDispatcherMetadata().binlogs[1].bytes, 0); + } +} + +TEST_F(MySQLBinlog, createBinlogAttach2) +{ + BinlogEventPtr event; + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto d1 = std::make_shared("d1"); + auto d2 = std::make_shared("d2"); + auto d3 = std::make_shared("d3"); + auto d4 = std::make_shared("d4"); + + auto b1 = d1->start(f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87828"), "b1"); + ASSERT_TRUE(b1); + ASSERT_TRUE(b1->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + + auto b2_ = d2->start(f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87828"), "b2_"); + ASSERT_TRUE(b2_); + auto b2 = d2->attach("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87831", "b2"); + + auto b3_ = d3->start(f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87828"), "b3_"); + ASSERT_TRUE(b3_); + auto b3 = d3->attach("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87835", "b3"); + + auto b4_ = d4->start(f->createBinlog("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87828"), "b4_"); + ASSERT_TRUE(b4_); + auto b4 = d4->attach("a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87828", "b4"); + + /// There is a race with dispatcher thread + if (b2) + { + ASSERT_TRUE(b2->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + 
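+ // b2 was attached at :1-87831, so its stream should resume with the GTID event that opens the next transaction.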
ASSERT_EQ(event->header.type, GTID_EVENT); + + ASSERT_TRUE(b2->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + + ASSERT_TRUE(b2->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, UPDATE_ROWS_EVENT_V2); + for (int i = 0; i < 18; ++i) + ASSERT_TRUE(b2->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_FALSE(b2->tryReadEvent(event, 10)); + ASSERT_EQ(b2->getPosition().binlog_pos, 4); + ASSERT_EQ(b2->getPosition().binlog_name, "binlog.001391"); + ASSERT_EQ(b2->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87836"); + for (int i = 0; i < 33; ++i) + ASSERT_TRUE(b2_->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs.size(), 2); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs[0].bytes, 0); + ASSERT_EQ(d2->getDispatcherMetadata().binlogs[1].bytes, 0); + } + + if (b4) + { + ASSERT_TRUE(b4->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + + ASSERT_TRUE(b4->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + + ASSERT_TRUE(b4->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, WRITE_ROWS_EVENT_V2); + for (int i = 0; i < 10; ++i) + ASSERT_TRUE(b4->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_FALSE(b2->tryReadEvent(event, 10)); + ASSERT_EQ(b4->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87836"); + for (int i = 0; i < 33; ++i) + ASSERT_TRUE(b4_->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_EQ(d4->getDispatcherMetadata().binlogs.size(), 2); + ASSERT_EQ(d4->getDispatcherMetadata().binlogs[0].bytes, 0); + ASSERT_EQ(d4->getDispatcherMetadata().binlogs[1].bytes, 0); + } + + if (b3) + { + ASSERT_TRUE(b3->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, GTID_EVENT); + + ASSERT_TRUE(b3->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, QUERY_EVENT); + for (int i = 0; i < 3; ++i) + ASSERT_TRUE(b3->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_FALSE(b3->tryReadEvent(event, 10)); + ASSERT_EQ(b3->getPosition().gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:1-87836"); + for (int i = 0; i < 33; ++i) + ASSERT_TRUE(b3_->tryReadEvent(event, timeout)); + ASSERT_TRUE(event); + ASSERT_EQ(event->header.type, ROTATE_EVENT); + ASSERT_EQ(d3->getDispatcherMetadata().binlogs.size(), 2); + ASSERT_EQ(d3->getDispatcherMetadata().binlogs[0].bytes, 0); + ASSERT_EQ(d3->getDispatcherMetadata().binlogs[1].bytes, 0); + } +} + +TEST_F(MySQLBinlog, factoryThreads) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto func1 = [&] + { + auto b1 = f->createBinlog(""); + auto b2 = f->createBinlog(""); + auto b3 = f->createBinlog(""); + testFile1(*b1, timeout); + testFile1(*b2, timeout); + b2 = f->createBinlog(""); + testFile1(*b2, timeout); + b1 = f->createBinlog(""); + testFile1(*b1, timeout); + b1 = nullptr; + b2 = f->createBinlog(""); + testFile1(*b2, timeout); + b1 = f->createBinlog(""); + testFile1(*b1, timeout); + testFile1(*b3, timeout); + }; + + auto func2 = [&] + { + auto b1 = f->createBinlog(""); + auto b2 = 
f->createBinlog(""); + testFile1(*b2, timeout); + testFile1(*b1, timeout); + b1 = f->createBinlog(""); + testFile1(*b1, timeout); + b2 = f->createBinlog(""); + testFile1(*b2, timeout); + b1 = f->createBinlog(""); + b2 = f->createBinlog(""); + testFile1(*b1, timeout); + b2 = nullptr; + b1 = f->createBinlog(""); + testFile1(*b1, timeout); + b1 = nullptr; + }; + + int n = 4; + std::vector ts1, ts2; + for (int i = 0; i < n; ++i) + { + ts1.emplace_back(std::thread(func1)); + ts2.emplace_back(std::thread(func2)); + } + for (int i = 0; i < n; ++i) + { + ts1[i].join(); + ts2[i].join(); + } +} + +TEST_F(MySQLBinlog, clientThreads) +{ + auto f = std::make_shared(getTestDataPath("binlog.001390")); + auto e = std::make_shared(f); + auto func1 = [&] + { + auto b1 = e->createBinlog(""); + auto b2 = e->createBinlog(""); + testFile1(*b1, timeout, true); + testFile1(*b2, timeout, true); + b1 = nullptr; + b2 = nullptr; + b2 = e->createBinlog(""); + testFile1(*b2, timeout, true); + b1 = e->createBinlog(""); + testFile1(*b1, timeout, true); + b1 = nullptr; + b2 = e->createBinlog(""); + testFile1(*b2, timeout, true); + b2 = nullptr; + b1 = e->createBinlog(""); + testFile1(*b1, timeout, true); + }; + + auto func2 = [&] + { + auto b1 = e->createBinlog(""); + testFile1(*b1, timeout, true); + auto b2 = e->createBinlog(""); + testFile1(*b2, timeout, true); + b2 = e->createBinlog(""); + b1 = e->createBinlog(""); + testFile1(*b1, timeout, true); + testFile1(*b2, timeout, true); + b1 = nullptr; + b2 = nullptr; + b1 = e->createBinlog(""); + testFile1(*b1, timeout, true); + b2 = e->createBinlog(""); + testFile1(*b2, timeout, true); + }; + + int n = 4; + std::vector ts1, ts2; + for (int i = 0; i < n; ++i) + { + ts1.emplace_back(std::thread(func1)); + ts2.emplace_back(std::thread(func2)); + } + for (int i = 0; i < n; ++i) + { + ts1[i].join(); + ts2[i].join(); + } + + // All dispatchers synced and finished + // No dispatchers and no binlogs are alive here + ASSERT_EQ(e->getMetadata().dispatchers.size(), 0); + + // Creates new dispatcher + auto b1 = e->createBinlog("", "b1 1"); + testFile1(*b1, timeout, true); + + auto md = e->getMetadata().dispatchers; + ASSERT_EQ(md.size(), 1); + ASSERT_EQ(md[0].position.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:87828-87836"); + ASSERT_EQ(md[0].binlogs.size(), 1); + ASSERT_EQ(md[0].binlogs[0].position_read.gtid_sets.toString(), "a9d88f83-c14e-11ec-bb36-244bfedf7766:87828-87836"); + ASSERT_EQ(md[0].binlogs[0].size, 0); + ASSERT_EQ(md[0].binlogs[0].bytes, 0); + + // Creates new dispatcher + auto b1_2 = e->createBinlog("", "b1 2"); + + // Should sync to the first dispatcher + TRY_ASSERT_EQ(e->getMetadata().dispatchers.size(), 1, timeout); + // If there is no CPU available, + // it possible to catch in the middle of the transform between dispatchers. + // Checking again to make sure that catching up is finished. 
+ TRY_ASSERT_EQ(e->getMetadata().dispatchers.size(), 1, timeout); + b1 = nullptr; + md = e->getMetadata().dispatchers; + ASSERT_EQ(md.size(), 1); + ASSERT_EQ(md[0].binlogs.size(), 1); + // Did not read any events yet + ASSERT_EQ(md[0].binlogs[0].position_read.gtid_sets.toString(), ""); + ASSERT_EQ(md[0].binlogs[0].position_read.binlog_pos, 0); + + auto b2 = e->createBinlog("", "b2"); + + BinlogEventPtr event; + // Read only one event + ASSERT_TRUE(b2->tryReadEvent(event, timeout)); + // Waits before all binlogs are moved to main dispatcher + TRY_ASSERT_EQ(e->getMetadata().dispatchers[0].binlogs.size(), 2, timeout); + + // One dispatcher is alive + md = e->getMetadata().dispatchers; + ASSERT_EQ(md.size(), 1); + ASSERT_EQ(md[0].binlogs.size(), 2); + ASSERT_EQ(md[0].binlogs[0].position_read.gtid_sets.toString(), ""); + ASSERT_EQ(md[0].binlogs[1].position_read.gtid_sets.toString(), ""); + ASSERT_EQ(md[0].binlogs[0].position_read.binlog_pos, md[0].binlogs[0].name == "b2" ? 276 : 0); // Read one event + ASSERT_EQ(md[0].binlogs[1].position_read.binlog_pos, md[0].binlogs[0].name == "b2" ? 0 : 276); +} diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index c3baf3fdbda..6962248c7e1 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -258,12 +258,6 @@ String DiskObjectStorage::getUniqueId(const String & path) const bool DiskObjectStorage::checkUniqueId(const String & id) const { - if (!id.starts_with(object_key_prefix)) - { - LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}, Stack {}", id, object_key_prefix, StackTrace().toString()); - return false; - } - auto object = StoredObject(id); return object_storage->exists(object); } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index 3271a190193..881f7a46c16 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) { readIntText(version, buf); + assertChar('\n', buf); if (version < VERSION_ABSOLUTE_PATHS || version > VERSION_FULL_OBJECT_KEY) throw Exception( @@ -27,8 +28,6 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) "Unknown metadata file version. Path: {}. Version: {}. 
Maximum expected version: {}", metadata_file_path, toString(version), toString(VERSION_FULL_OBJECT_KEY)); - assertChar('\n', buf); - UInt32 keys_count; readIntText(keys_count, buf); assertChar('\t', buf); @@ -122,6 +121,7 @@ void DiskObjectStorageMetadata::serialize(WriteBuffer & buf, bool sync) const chassert(write_version >= VERSION_ABSOLUTE_PATHS && write_version <= VERSION_FULL_OBJECT_KEY); writeIntText(write_version, buf); + writeChar('\n', buf); writeIntText(keys_with_meta.size(), buf); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index beb8a400632..6a091471888 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -556,27 +555,12 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( return std::make_unique( std::move(new_client), std::move(new_s3_settings), version_id, s3_capabilities, new_namespace, - endpoint, object_key_prefix, disk_name); + endpoint, key_generator, disk_name); } -ObjectStorageKey S3ObjectStorage::generateObjectKeyForPath(const std::string &) const +ObjectStorageKey S3ObjectStorage::generateObjectKeyForPath(const std::string & path) const { - /// Path to store the new S3 object. - - /// Total length is 32 a-z characters for enough randomness. - /// First 3 characters are used as a prefix for - /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/ - - constexpr size_t key_name_total_size = 32; - constexpr size_t key_name_prefix_size = 3; - - /// Path to store new S3 object. - String key = fmt::format("{}/{}", - getRandomASCIIString(key_name_prefix_size), - getRandomASCIIString(key_name_total_size - key_name_prefix_size)); - - /// what ever key_prefix value is, consider that key as relative - return ObjectStorageKey::createAsRelative(object_key_prefix, key); + return key_generator->generate(path); } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index c8b3aeaca28..caa4beaba3b 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace DB @@ -39,7 +40,6 @@ struct S3ObjectStorageSettings bool read_only; }; - class S3ObjectStorage : public IObjectStorage { private: @@ -53,10 +53,10 @@ private: const S3Capabilities & s3_capabilities_, String bucket_, String connection_string, - String object_key_prefix_, + ObjectStorageKeysGeneratorPtr key_generator_, const String & disk_name_) : bucket(std::move(bucket_)) - , object_key_prefix(std::move(object_key_prefix_)) + , key_generator(std::move(key_generator_)) , disk_name(disk_name_) , client(std::move(client_)) , s3_settings(std::move(s3_settings_)) @@ -179,7 +179,7 @@ private: private: std::string bucket; - String object_key_prefix; + ObjectStorageKeysGeneratorPtr key_generator; std::string disk_name; MultiVersion client; @@ -199,11 +199,6 @@ private: class S3PlainObjectStorage : public S3ObjectStorage { public: - ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override - { - return ObjectStorageKey::createAsRelative(object_key_prefix, path); - } - std::string getName() const override { return "S3PlainObjectStorage"; } template diff --git a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp index 7543fb94331..a35a1eb2a82 100644 --- 
a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp +++ b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp @@ -91,6 +91,60 @@ private: } }; +std::pair getPrefixAndKeyGenerator( + String type, const S3::URI & uri, const Poco::Util::AbstractConfiguration & config, const String & config_prefix) +{ + if (type == "s3_plain") + return {uri.key, createObjectStorageKeysGeneratorAsIsWithPrefix(uri.key)}; + + chassert(type == "s3"); + + bool storage_metadata_write_full_object_key = DiskObjectStorageMetadata::getWriteFullObjectKeySetting(); + bool send_metadata = config.getBool(config_prefix + ".send_metadata", false); + + if (send_metadata && storage_metadata_write_full_object_key) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Wrong configuration in {}. " + "s3 does not supports feature 'send_metadata' with feature 'storage_metadata_write_full_object_key'.", + config_prefix); + + String object_key_compatibility_prefix = config.getString(config_prefix + ".key_compatibility_prefix", String()); + String object_key_template = config.getString(config_prefix + ".key_template", String()); + + if (object_key_template.empty()) + { + if (!object_key_compatibility_prefix.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Wrong configuration in {}. " + "Setting 'key_compatibility_prefix' can be defined only with setting 'key_template'.", + config_prefix); + + return {uri.key, createObjectStorageKeysGeneratorByPrefix(uri.key)}; + } + + if (send_metadata) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Wrong configuration in {}. " + "s3 does not supports send_metadata with setting 'key_template'.", + config_prefix); + + if (!storage_metadata_write_full_object_key) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Wrong configuration in {}. " + "Feature 'storage_metadata_write_full_object_key' has to be enabled in order to use setting 'key_template'.", + config_prefix); + + if (!uri.key.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Wrong configuration in {}. " + "URI.key is forbidden with settings 'key_template', use setting 'key_compatibility_prefix' instead'. " + "URI.key: '{}', bucket: '{}'. 
", + config_prefix, + uri.key, uri.bucket); + + return {object_key_compatibility_prefix, createObjectStorageKeysGeneratorByTemplate(object_key_template)}; +} + } void registerDiskS3(DiskFactory & factory, bool global_skip_access_check) @@ -104,7 +158,8 @@ void registerDiskS3(DiskFactory & factory, bool global_skip_access_check) { String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); S3::URI uri(endpoint); - if (!uri.key.ends_with('/')) + // an empty key remains empty + if (!uri.key.empty() && !uri.key.ends_with('/')) uri.key.push_back('/'); S3Capabilities s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); @@ -113,6 +168,8 @@ void registerDiskS3(DiskFactory & factory, bool global_skip_access_check) String type = config.getString(config_prefix + ".type"); chassert(type == "s3" || type == "s3_plain"); + auto [object_key_compatibility_prefix, object_key_generator] = getPrefixAndKeyGenerator(type, uri, config, config_prefix); + MetadataStoragePtr metadata_storage; auto settings = getSettings(config, config_prefix, context); auto client = getClient(config, config_prefix, context, *settings); @@ -128,20 +185,18 @@ void registerDiskS3(DiskFactory & factory, bool global_skip_access_check) throw Exception(ErrorCodes::BAD_ARGUMENTS, "s3_plain does not supports send_metadata"); s3_storage = std::make_shared( - std::move(client), std::move(settings), - uri.version_id, s3_capabilities, - uri.bucket, uri.endpoint, uri.key, name); - metadata_storage = std::make_shared(s3_storage, uri.key); + std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint, object_key_generator, name); + + metadata_storage = std::make_shared(s3_storage, object_key_compatibility_prefix); } else { s3_storage = std::make_shared( - std::move(client), std::move(settings), - uri.version_id, s3_capabilities, - uri.bucket, uri.endpoint, uri.key, name); + std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint, object_key_generator, name); auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); - metadata_storage = std::make_shared(metadata_disk, uri.key); + + metadata_storage = std::make_shared(metadata_disk, object_key_compatibility_prefix); } /// NOTE: should we still perform this check for clickhouse-disks? @@ -164,7 +219,7 @@ void registerDiskS3(DiskFactory & factory, bool global_skip_access_check) DiskObjectStoragePtr s3disk = std::make_shared( name, - uri.key, + uri.key, /// might be empty type == "s3" ? 
"DiskS3" : "DiskS3Plain", std::move(metadata_storage), std::move(s3_storage), diff --git a/src/Functions/FunctionsStringDistance.cpp b/src/Functions/FunctionsStringDistance.cpp index a5e819179d6..6cb23bbea9f 100644 --- a/src/Functions/FunctionsStringDistance.cpp +++ b/src/Functions/FunctionsStringDistance.cpp @@ -8,6 +8,8 @@ #include #include +#include + #ifdef __SSE4_2__ # include #endif @@ -25,7 +27,7 @@ struct FunctionStringDistanceImpl { using ResultType = typename Op::ResultType; - static void constantConstant(const std::string & haystack, const std::string & needle, ResultType & res) + static void constantConstant(const String & haystack, const String & needle, ResultType & res) { res = Op::process(haystack.data(), haystack.size(), needle.data(), needle.size()); } @@ -51,7 +53,7 @@ struct FunctionStringDistanceImpl } static void constantVector( - const std::string & haystack, + const String & haystack, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offsets, PaddedPODArray & res) @@ -70,7 +72,7 @@ struct FunctionStringDistanceImpl static void vectorConstant( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, - const std::string & needle, + const String & needle, PaddedPODArray & res) { constantVector(needle, data, offsets, res); @@ -81,7 +83,7 @@ struct FunctionStringDistanceImpl struct ByteHammingDistanceImpl { using ResultType = UInt64; - static ResultType inline process( + static ResultType process( const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) { UInt64 res = 0; @@ -115,7 +117,7 @@ template struct ByteJaccardIndexImpl { using ResultType = Float64; - static ResultType inline process( + static ResultType process( const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) { if (haystack_size == 0 || needle_size == 0) @@ -222,23 +224,23 @@ struct ByteJaccardIndexImpl } }; +static constexpr size_t max_string_size = 1u << 16; + struct ByteEditDistanceImpl { using ResultType = UInt64; - static constexpr size_t max_string_size = 1u << 16; - static ResultType inline process( + static ResultType process( const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) { if (haystack_size == 0 || needle_size == 0) return haystack_size + needle_size; - /// Safety threshold against DoS, since we use two array to calculate the distance. + /// Safety threshold against DoS, since we use two arrays to calculate the distance. 
if (haystack_size > max_string_size || needle_size > max_string_size) throw Exception( ErrorCodes::TOO_LARGE_STRING_SIZE, - "The string size is too big for function editDistance, " - "should be at most {}", max_string_size); + "The string size is too big for function editDistance, should be at most {}", max_string_size); PaddedPODArray distances0(haystack_size + 1, 0); PaddedPODArray distances1(haystack_size + 1, 0); @@ -271,6 +273,180 @@ struct ByteEditDistanceImpl } }; +struct ByteDamerauLevenshteinDistanceImpl +{ + using ResultType = UInt64; + + static ResultType process( + const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) + { + /// Safety threshold against DoS + if (haystack_size > max_string_size || needle_size > max_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "The string size is too big for function damerauLevenshteinDistance, should be at most {}", max_string_size); + + /// Shortcuts: + + if (haystack_size == 0) + return needle_size; + + if (needle_size == 0) + return haystack_size; + + if (haystack_size == needle_size && memcmp(haystack, needle, haystack_size) == 0) + return 0; + + /// Implements the algorithm for optimal string alignment distance from + /// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance + + /// Dynamically allocate memory for the 2D array + /// Allocating a 2D array, for convenience starts is an array of pointers to the start of the rows. + std::vector d((needle_size + 1) * (haystack_size + 1)); + std::vector starts(haystack_size + 1); + + /// Setting the pointers in starts to the beginning of (needle_size + 1)-long intervals. + /// Also initialize the row values based on the mentioned algorithm. + for (size_t i = 0; i <= haystack_size; ++i) + { + starts[i] = d.data() + (needle_size + 1) * i; + starts[i][0] = static_cast(i); + } + + for (size_t j = 0; j <= needle_size; ++j) + { + starts[0][j] = static_cast(j); + } + + for (size_t i = 1; i <= haystack_size; ++i) + { + for (size_t j = 1; j <= needle_size; ++j) + { + int cost = (haystack[i - 1] == needle[j - 1]) ? 
0 : 1; + starts[i][j] = std::min(starts[i - 1][j] + 1, /// deletion + std::min(starts[i][j - 1] + 1, /// insertion + starts[i - 1][j - 1] + cost) /// substitution + ); + if (i > 1 && j > 1 && haystack[i - 1] == needle[j - 2] && haystack[i - 2] == needle[j - 1]) + starts[i][j] = std::min(starts[i][j], starts[i - 2][j - 2] + 1); /// transposition + } + } + + return starts[haystack_size][needle_size]; + } +}; + +struct ByteJaroSimilarityImpl +{ + using ResultType = Float64; + + static ResultType process( + const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) + { + /// Safety threshold against DoS + if (haystack_size > max_string_size || needle_size > max_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "The string size is too big for function jaroSimilarity, should be at most {}", max_string_size); + + /// Shortcuts: + + if (haystack_size == 0) + return needle_size; + + if (needle_size == 0) + return haystack_size; + + if (haystack_size == needle_size && memcmp(haystack, needle, haystack_size) == 0) + return 1.0; + + const int s1len = static_cast(haystack_size); + const int s2len = static_cast(needle_size); + + /// Window size to search for matches in the other string + const int max_range = std::max(0, std::max(s1len, s2len) / 2 - 1); + std::vector s1_matching(s1len, -1); + std::vector s2_matching(s2len, -1); + + /// Calculate matching characters + size_t matching_characters = 0; + for (int i = 0; i < s1len; i++) + { + /// Matching window + const int min_index = std::max(i - max_range, 0); + const int max_index = std::min(i + max_range + 1, s2len); + for (int j = min_index; j < max_index; j++) + { + if (s2_matching[j] == -1 && haystack[i] == needle[j]) + { + s1_matching[i] = i; + s2_matching[j] = j; + matching_characters++; + break; + } + } + } + + if (matching_characters == 0) + return 0.0; + + /// Transpositions (one-way only) + double transpositions = 0.0; + for (size_t i = 0, s1i = 0, s2i = 0; i < matching_characters; i++) + { + while (s1_matching[s1i] == -1) + s1i++; + while (s2_matching[s2i] == -1) + s2i++; + if (haystack[s1i] != needle[s2i]) + transpositions += 0.5; + s1i++; + s2i++; + } + + double m = static_cast(matching_characters); + double jaro_similarity = 1.0 / 3.0 * (m / static_cast(s1len) + + m / static_cast(s2len) + + (m - transpositions) / m); + return jaro_similarity; + } +}; + +struct ByteJaroWinklerSimilarityImpl +{ + using ResultType = Float64; + + static ResultType process( + const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) + { + static constexpr int max_prefix_length = 4; + static constexpr double scaling_factor = 0.1; + static constexpr double boost_threshold = 0.7; + + /// Safety threshold against DoS + if (haystack_size > max_string_size || needle_size > max_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "The string size is too big for function jaroWinklerSimilarity, should be at most {}", max_string_size); + + const int s1len = static_cast(haystack_size); + const int s2len = static_cast(needle_size); + + ResultType jaro_winkler_similarity = ByteJaroSimilarityImpl::process(haystack, haystack_size, needle, needle_size); + + if (jaro_winkler_similarity > boost_threshold) + { + const int common_length = std::min(max_prefix_length, std::min(s1len, s2len)); + int common_prefix = 0; + while (common_prefix < common_length && haystack[common_prefix] == needle[common_prefix]) + common_prefix++; + + 
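+ /// Winkler boost: up to max_prefix_length (4) common leading bytes raise the similarity, scaled by scaling_factor (0.1).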
jaro_winkler_similarity += common_prefix * scaling_factor * (1.0 - jaro_winkler_similarity); + } + return jaro_winkler_similarity; + } +}; + struct NameByteHammingDistance { static constexpr auto name = "byteHammingDistance"; @@ -283,6 +459,12 @@ struct NameEditDistance }; using FunctionEditDistance = FunctionsStringSimilarity, NameEditDistance>; +struct NameDamerauLevenshteinDistance +{ + static constexpr auto name = "damerauLevenshteinDistance"; +}; +using FunctionDamerauLevenshteinDistance = FunctionsStringSimilarity, NameDamerauLevenshteinDistance>; + struct NameJaccardIndex { static constexpr auto name = "stringJaccardIndex"; @@ -295,6 +477,18 @@ struct NameJaccardIndexUTF8 }; using FunctionStringJaccardIndexUTF8 = FunctionsStringSimilarity>, NameJaccardIndexUTF8>; +struct NameJaroSimilarity +{ + static constexpr auto name = "jaroSimilarity"; +}; +using FunctionJaroSimilarity = FunctionsStringSimilarity, NameJaroSimilarity>; + +struct NameJaroWinklerSimilarity +{ + static constexpr auto name = "jaroWinklerSimilarity"; +}; +using FunctionJaroWinklerSimilarity = FunctionsStringSimilarity, NameJaroWinklerSimilarity>; + REGISTER_FUNCTION(StringDistance) { factory.registerFunction( @@ -305,9 +499,18 @@ REGISTER_FUNCTION(StringDistance) FunctionDocumentation{.description = R"(Calculates the edit distance between two byte-strings.)"}); factory.registerAlias("levenshteinDistance", NameEditDistance::name); + factory.registerFunction( + FunctionDocumentation{.description = R"(Calculates the Damerau-Levenshtein distance two between two byte-string.)"}); + factory.registerFunction( - FunctionDocumentation{.description = R"(Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings.)"}); + FunctionDocumentation{.description = R"(Calculates the Jaccard similarity index between two byte strings.)"}); factory.registerFunction( - FunctionDocumentation{.description = R"(Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two UTF8 strings.)"}); + FunctionDocumentation{.description = R"(Calculates the Jaccard similarity index between two UTF8 strings.)"}); + + factory.registerFunction( + FunctionDocumentation{.description = R"(Calculates the Jaro similarity between two byte-string.)"}); + + factory.registerFunction( + FunctionDocumentation{.description = R"(Calculates the Jaro-Winkler similarity between two byte-string.)"}); } } diff --git a/src/Functions/makeDate.cpp b/src/Functions/makeDate.cpp index 1381e9f2828..987cf4eb1a9 100644 --- a/src/Functions/makeDate.cpp +++ b/src/Functions/makeDate.cpp @@ -434,7 +434,7 @@ public: }; FunctionArgumentDescriptors optional_args{ - {optional_argument_names[0], &isNumber, isColumnConst, "const Number"}, + {optional_argument_names[0], &isNumber, nullptr, "const Number"}, {optional_argument_names[1], &isNumber, isColumnConst, "const Number"}, {optional_argument_names[2], &isString, isColumnConst, "const String"} }; diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index cdb9ca061c3..d0f5a1ce439 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -143,7 +143,6 @@ public: * depending on values of conditions. 
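* The instructions are then executed either row by row or, for (possibly Nullable) numeric result types, in a single columnar pass (see executeInstructionsColumnar below).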
*/ - std::vector instructions; instructions.reserve(arguments.size() / 2 + 1); @@ -238,7 +237,7 @@ public: } const auto & settings = context->getSettingsRef(); - const WhichDataType which(result_type); + const WhichDataType which(removeNullable(result_type)); bool execute_multiif_columnar = settings.allow_execute_multiif_columnar && !contains_short && (which.isInt() || which.isUInt() || which.isFloat()); @@ -254,8 +253,12 @@ public: if (which.is##TYPE()) \ { \ MutableColumnPtr res = ColumnVector::create(rows); \ - executeInstructionsColumnar(instructions, rows, res); \ - return std::move(res); \ + MutableColumnPtr null_map = result_type->isNullable() ? ColumnUInt8::create(rows) : nullptr; \ + executeInstructionsColumnar(instructions, rows, res, null_map, result_type->isNullable()); \ + if (!result_type->isNullable()) \ + return std::move(res); \ + else \ + return ColumnNullable::create(std::move(res), std::move(null_map)); \ } #define ENUMERATE_NUMERIC_TYPES(M, INDEX) \ @@ -295,6 +298,7 @@ public: } private: + static void executeInstructions(std::vector & instructions, size_t rows, const MutableColumnPtr & res) { for (size_t i = 0; i < rows; ++i) @@ -374,17 +378,59 @@ private: } template - static void executeInstructionsColumnar(std::vector & instructions, size_t rows, const MutableColumnPtr & res) + static void executeInstructionsColumnar(std::vector & instructions, size_t rows, const MutableColumnPtr & res, const MutableColumnPtr & null_map, bool nullable) { PaddedPODArray inserts(rows, static_cast(instructions.size())); calculateInserts(instructions, rows, inserts); PaddedPODArray & res_data = assert_cast &>(*res).getData(); - for (size_t row_i = 0; row_i < rows; ++row_i) + if (!nullable) { - auto & instruction = instructions[inserts[row_i]]; - auto ref = instruction.source->getDataAt(row_i); - res_data[row_i] = *reinterpret_cast(ref.data); + for (size_t row_i = 0; row_i < rows; ++row_i) + { + auto & instruction = instructions[inserts[row_i]]; + auto ref = instruction.source->getDataAt(row_i); + res_data[row_i] = *reinterpret_cast(ref.data); + } + } + else + { + PaddedPODArray & null_map_data = assert_cast(*null_map).getData(); + std::vector data_cols(instructions.size()); + std::vector null_map_cols(instructions.size()); + ColumnPtr shared_null_map_col = nullptr; + for (size_t i = 0; i < instructions.size(); ++i) + { + if (instructions[i].source->isNullable()) + { + const ColumnNullable * nullable_col; + if (!instructions[i].source_is_constant) + nullable_col = assert_cast(instructions[i].source.get()); + else + { + const ColumnPtr data_column = assert_cast(*instructions[i].source).getDataColumnPtr(); + nullable_col = assert_cast(data_column.get()); + } + null_map_cols[i] = assert_cast(*nullable_col->getNullMapColumnPtr()).getData().data(); + data_cols[i] = assert_cast &>(*nullable_col->getNestedColumnPtr()).getData().data(); + } + else + { + if (!shared_null_map_col) + { + shared_null_map_col = ColumnUInt8::create(rows, 0); + } + null_map_cols[i] = assert_cast(*shared_null_map_col).getData().data(); + data_cols[i] = assert_cast &>(*instructions[i].source).getData().data(); + } + } + for (size_t row_i = 0; row_i < rows; ++row_i) + { + auto & instruction = instructions[inserts[row_i]]; + size_t index = instruction.source_is_constant ? 
0 : row_i; + res_data[row_i] = *(data_cols[inserts[row_i]] + index); + null_map_data[row_i] = *(null_map_cols[inserts[row_i]] + index); + } } } diff --git a/src/IO/CompressionMethod.cpp b/src/IO/CompressionMethod.cpp index 13e1adbb702..fc415b73ec1 100644 --- a/src/IO/CompressionMethod.cpp +++ b/src/IO/CompressionMethod.cpp @@ -170,7 +170,7 @@ std::unique_ptr wrapReadBufferWithCompressionMethod( } std::unique_ptr wrapWriteBufferWithCompressionMethod( - std::unique_ptr nested, CompressionMethod method, int level, size_t buf_size, char * existing_memory, size_t alignment) + std::unique_ptr nested, CompressionMethod method, int level, int zstd_window_log, size_t buf_size, char * existing_memory, size_t alignment) { if (method == DB::CompressionMethod::Gzip || method == CompressionMethod::Zlib) return std::make_unique(std::move(nested), method, level, buf_size, existing_memory, alignment); @@ -183,7 +183,7 @@ std::unique_ptr wrapWriteBufferWithCompressionMethod( return std::make_unique(std::move(nested), level, buf_size, existing_memory, alignment); if (method == CompressionMethod::Zstd) - return std::make_unique(std::move(nested), level, buf_size, existing_memory, alignment); + return std::make_unique(std::move(nested), level, zstd_window_log, buf_size, existing_memory, alignment); if (method == CompressionMethod::Lz4) return std::make_unique(std::move(nested), level, buf_size, existing_memory, alignment); diff --git a/src/IO/CompressionMethod.h b/src/IO/CompressionMethod.h index c142531cd05..511704059ec 100644 --- a/src/IO/CompressionMethod.h +++ b/src/IO/CompressionMethod.h @@ -66,6 +66,7 @@ std::unique_ptr wrapWriteBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, int level, + int zstd_window_log = 0, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0); diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index a65a82d9b40..b65de8d34a7 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -3,7 +3,6 @@ #if USE_AWS_S3 #include -#include #include #include #include @@ -15,7 +14,6 @@ #include -#include #include #include #include @@ -37,6 +35,9 @@ namespace ProfileEvents extern const Event DiskS3WriteRequestsErrors; extern const Event DiskS3ReadRequestsErrors; + + extern const Event S3Clients; + extern const Event TinyS3Clients; } namespace DB @@ -199,6 +200,8 @@ Client::Client( cache = std::make_shared(); ClientCacheRegistry::instance().registerClient(cache); + + ProfileEvents::increment(ProfileEvents::S3Clients); } Client::Client( @@ -219,6 +222,22 @@ Client::Client( { cache = std::make_shared(*other.cache); ClientCacheRegistry::instance().registerClient(cache); + + ProfileEvents::increment(ProfileEvents::TinyS3Clients); +} + + +Client::~Client() +{ + try + { + ClientCacheRegistry::instance().unregisterClient(cache.get()); + } + catch (...) + { + tryLogCurrentException(log); + throw; + } } Aws::Auth::AWSCredentials Client::getCredentials() const diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index b137f0605dc..677b739fd39 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -142,18 +142,7 @@ public: Client(Client && other) = delete; Client & operator=(Client &&) = delete; - ~Client() override - { - try - { - ClientCacheRegistry::instance().unregisterClient(cache.get()); - } - catch (...) - { - tryLogCurrentException(log); - throw; - } - } + ~Client() override; /// Returns the initial endpoint. 
const String & getInitialEndpoint() const { return initial_endpoint; } @@ -170,7 +159,7 @@ public: class RetryStrategy : public Aws::Client::RetryStrategy { public: - RetryStrategy(uint32_t maxRetries_ = 10, uint32_t scaleFactor_ = 25, uint32_t maxDelayMs_ = 90000); + explicit RetryStrategy(uint32_t maxRetries_ = 10, uint32_t scaleFactor_ = 25, uint32_t maxDelayMs_ = 90000); /// NOLINTNEXTLINE(google-runtime-int) bool ShouldRetry(const Aws::Client::AWSError& error, long attemptedRetries) const override; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 96ad6413ef5..5039059f522 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -6,21 +6,12 @@ #if USE_AWS_S3 -# include - -# include # include -# include - -# include -# include # include -# include # include -# include +# include # include -# include namespace ProfileEvents { @@ -147,6 +138,12 @@ AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const }; } +bool AuthSettings::hasUpdates(const AuthSettings & other) const +{ + AuthSettings copy = *this; + copy.updateFrom(other); + return *this != copy; +} void AuthSettings::updateFrom(const AuthSettings & from) { @@ -175,7 +172,7 @@ void AuthSettings::updateFrom(const AuthSettings & from) expiration_window_seconds = from.expiration_window_seconds; if (from.no_sign_request.has_value()) - no_sign_request = *from.no_sign_request; + no_sign_request = from.no_sign_request; } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index ebfc07a3976..6ee8d96ed09 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -92,9 +92,11 @@ struct AuthSettings std::optional expiration_window_seconds; std::optional no_sign_request; - bool operator==(const AuthSettings & other) const = default; - + bool hasUpdates(const AuthSettings & other) const; void updateFrom(const AuthSettings & from); + +private: + bool operator==(const AuthSettings & other) const = default; }; } diff --git a/src/IO/ZstdDeflatingWriteBuffer.cpp b/src/IO/ZstdDeflatingWriteBuffer.cpp index 949d65926b3..3b474a4de74 100644 --- a/src/IO/ZstdDeflatingWriteBuffer.cpp +++ b/src/IO/ZstdDeflatingWriteBuffer.cpp @@ -1,30 +1,51 @@ #include #include +#include namespace DB { namespace ErrorCodes { extern const int ZSTD_ENCODER_FAILED; + extern const int ILLEGAL_CODEC_PARAMETER; +} + +static void setZstdParameter(ZSTD_CCtx * cctx, ZSTD_cParameter param, int value) +{ + auto ret = ZSTD_CCtx_setParameter(cctx, param, value); + if (ZSTD_isError(ret)) + throw Exception( + ErrorCodes::ZSTD_ENCODER_FAILED, + "zstd stream encoder option setting failed: error code: {}; zstd version: {}", + ret, + ZSTD_VERSION_STRING); } ZstdDeflatingWriteBuffer::ZstdDeflatingWriteBuffer( - std::unique_ptr out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment) + std::unique_ptr out_, int compression_level, int window_log, size_t buf_size, char * existing_memory, size_t alignment) : WriteBufferWithOwnMemoryDecorator(std::move(out_), buf_size, existing_memory, alignment) { cctx = ZSTD_createCCtx(); if (cctx == nullptr) throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED, "zstd stream encoder init failed: zstd version: {}", ZSTD_VERSION_STRING); - size_t ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, compression_level); - if (ZSTD_isError(ret)) - throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED, - "zstd stream encoder option setting failed: error code: {}; zstd version: {}", - ret, ZSTD_VERSION_STRING); - ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1); - if (ZSTD_isError(ret)) - throw 
Exception(ErrorCodes::ZSTD_ENCODER_FAILED, - "zstd stream encoder option setting failed: error code: {}; zstd version: {}", - ret, ZSTD_VERSION_STRING); + setZstdParameter(cctx, ZSTD_c_compressionLevel, compression_level); + + if (window_log > 0) + { + ZSTD_bounds window_log_bounds = ZSTD_cParam_getBounds(ZSTD_c_windowLog); + if (ZSTD_isError(window_log_bounds.error)) + throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "ZSTD windowLog parameter is not supported {}", + std::string(ZSTD_getErrorName(window_log_bounds.error))); + if (window_log > window_log_bounds.upperBound || window_log < window_log_bounds.lowerBound) + throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, + "ZSTD codec can't have window log more than {} and lower than {}, given {}", + toString(window_log_bounds.upperBound), + toString(window_log_bounds.lowerBound), toString(window_log)); + setZstdParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1); + setZstdParameter(cctx, ZSTD_c_windowLog, window_log); + } + + setZstdParameter(cctx, ZSTD_c_checksumFlag, 1); input = {nullptr, 0, 0}; output = {nullptr, 0, 0}; diff --git a/src/IO/ZstdDeflatingWriteBuffer.h b/src/IO/ZstdDeflatingWriteBuffer.h index a66d6085a74..8c129b1bfbb 100644 --- a/src/IO/ZstdDeflatingWriteBuffer.h +++ b/src/IO/ZstdDeflatingWriteBuffer.h @@ -17,6 +17,7 @@ public: ZstdDeflatingWriteBuffer( std::unique_ptr out_, int compression_level, + int window_log = 0, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0); diff --git a/src/Interpreters/ConcurrentHashJoin.cpp b/src/Interpreters/ConcurrentHashJoin.cpp index 8e73bc8b484..96be70c5527 100644 --- a/src/Interpreters/ConcurrentHashJoin.cpp +++ b/src/Interpreters/ConcurrentHashJoin.cpp @@ -46,6 +46,9 @@ ConcurrentHashJoin::ConcurrentHashJoin(ContextPtr context_, std::shared_ptr(); inner_hash_join->data = std::make_unique(table_join_, right_sample_block, any_take_last_row_, 0, fmt::format("concurrent{}", i)); + /// Non zero `max_joined_block_rows` allows to process block partially and return not processed part. + /// TODO: It's not handled properly in ConcurrentHashJoin case, so we set it to 0 to disable this feature. + inner_hash_join->data->setMaxJoinedBlockRows(0); hash_joins.emplace_back(std::move(inner_hash_join)); } } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 38944b21c49..57dda316edb 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1583,9 +1583,7 @@ bool Context::hasScalar(const String & name) const void Context::addQueryAccessInfo( const String & quoted_database_name, const String & full_quoted_table_name, - const Names & column_names, - const String & projection_name, - const String & view_name) + const Names & column_names) { if (isGlobalContext()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have query access info"); @@ -1593,12 +1591,9 @@ void Context::addQueryAccessInfo( std::lock_guard lock(query_access_info.mutex); query_access_info.databases.emplace(quoted_database_name); query_access_info.tables.emplace(full_quoted_table_name); + for (const auto & column_name : column_names) query_access_info.columns.emplace(full_quoted_table_name + "." + backQuoteIfNeed(column_name)); - if (!projection_name.empty()) - query_access_info.projections.emplace(full_quoted_table_name + "." 
+ backQuoteIfNeed(projection_name)); - if (!view_name.empty()) - query_access_info.views.emplace(view_name); } void Context::addQueryAccessInfo(const Names & partition_names) @@ -1611,6 +1606,15 @@ void Context::addQueryAccessInfo(const Names & partition_names) query_access_info.partitions.emplace(partition_name); } +void Context::addViewAccessInfo(const String & view_name) +{ + if (isGlobalContext()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have query access info"); + + std::lock_guard lock(query_access_info.mutex); + query_access_info.views.emplace(view_name); +} + void Context::addQueryAccessInfo(const QualifiedProjectionName & qualified_projection_name) { if (!qualified_projection_name) diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 640aeb0539c..e5163366f50 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -693,13 +693,14 @@ public: void addSpecialScalar(const String & name, const Block & block); const QueryAccessInfo & getQueryAccessInfo() const { return query_access_info; } + void addQueryAccessInfo( const String & quoted_database_name, const String & full_quoted_table_name, - const Names & column_names, - const String & projection_name = {}, - const String & view_name = {}); + const Names & column_names); + void addQueryAccessInfo(const Names & partition_names); + void addViewAccessInfo(const String & view_name); struct QualifiedProjectionName { @@ -707,8 +708,8 @@ public: String projection_name; explicit operator bool() const { return !projection_name.empty(); } }; - void addQueryAccessInfo(const QualifiedProjectionName & qualified_projection_name); + void addQueryAccessInfo(const QualifiedProjectionName & qualified_projection_name); /// Supported factories for records in query_log enum class QueryLogFactories diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 969c57535f9..5c628436d60 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1050,7 +1050,7 @@ static std::unique_ptr buildJoinedPlan( join_element.table_expression, context, original_right_column_names, - query_options.copy().setWithAllColumns().ignoreProjections(false).ignoreAlias(false)); + query_options.copy().setWithAllColumns().ignoreAlias(false)); auto joined_plan = std::make_unique(); interpreter->buildQueryPlan(*joined_plan); { diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index a84e1ec2175..b05b10ff25e 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -243,6 +243,7 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s , asof_inequality(table_join->getAsofInequality()) , data(std::make_shared()) , right_sample_block(right_sample_block_) + , max_joined_block_rows(table_join->maxJoinedBlockRows()) , instance_log_id(!instance_id_.empty() ? 
"(" + instance_id_ + ") " : "") , log(&Poco::Logger::get("HashJoin")) { @@ -1401,7 +1402,7 @@ NO_INLINE size_t joinRightColumns( { if constexpr (join_features.need_replication) { - if (unlikely(current_offset > max_joined_block_rows)) + if (unlikely(current_offset >= max_joined_block_rows)) { added_columns.offsets_to_replicate->resize_assume_reserved(i); added_columns.filter.resize_assume_reserved(i); @@ -1690,7 +1691,7 @@ Block HashJoin::joinBlockImpl( bool has_required_right_keys = (required_right_keys.columns() != 0); added_columns.need_filter = join_features.need_filter || has_required_right_keys; - added_columns.max_joined_block_rows = table_join->maxJoinedBlockRows(); + added_columns.max_joined_block_rows = max_joined_block_rows; if (!added_columns.max_joined_block_rows) added_columns.max_joined_block_rows = std::numeric_limits::max(); else @@ -1771,7 +1772,6 @@ Block HashJoin::joinBlockImpl( void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) const { - size_t max_joined_block_rows = table_join->maxJoinedBlockRows(); size_t start_left_row = 0; size_t start_right_block = 0; if (not_processed) diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 284cf5d0e7f..2be58b5fd2d 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -396,6 +396,8 @@ public: void shrinkStoredBlocksToFit(size_t & total_bytes_in_join); + void setMaxJoinedBlockRows(size_t value) { max_joined_block_rows = value; } + private: template friend class NotJoinedHash; @@ -433,6 +435,9 @@ private: /// Left table column names that are sources for required_right_keys columns std::vector required_right_keys_sources; + /// Maximum number of rows in result block. If it is 0, then no limits. + size_t max_joined_block_rows = 0; + /// When tracked memory consumption is more than a threshold, we will shrink to fit stored blocks. 
bool shrink_blocks = false; Int64 memory_usage_before_adding_blocks = 0; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 2bddb4935de..36e864ace26 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -596,6 +596,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( bool sanity_check_compression_codecs = !attach && !context_->getSettingsRef().allow_suspicious_codecs; bool allow_experimental_codecs = attach || context_->getSettingsRef().allow_experimental_codecs; bool enable_deflate_qpl_codec = attach || context_->getSettingsRef().enable_deflate_qpl_codec; + bool enable_zstd_qat_codec = attach || context_->getSettingsRef().enable_zstd_qat_codec; ColumnsDescription res; auto name_type_it = column_names_and_types.begin(); @@ -656,7 +657,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.default_specifier == "ALIAS") throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_deflate_qpl_codec); + col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_deflate_qpl_codec, enable_zstd_qat_codec); } if (col_decl.stat_type) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index e057b4fd7e0..d1bc66f47f1 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -390,8 +390,6 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (!prepared_sets) prepared_sets = std::make_shared(); - query_info.ignore_projections = options.ignore_projections; - query_info.is_projection_query = options.is_projection_query; query_info.is_internal = options.is_internal; initSettings(); @@ -417,7 +415,6 @@ InterpreterSelectQuery::InterpreterSelectQuery( } query_info.query = query_ptr->clone(); - query_info.original_query = query_ptr->clone(); if (settings.count_distinct_optimization) { @@ -856,9 +853,6 @@ InterpreterSelectQuery::InterpreterSelectQuery( analysis_result.required_columns = required_columns; } - if (query_info.projection) - storage_snapshot->addProjection(query_info.projection->desc); - /// Blocks used in expression analysis contains size 1 const columns for constant folding and /// null non-const columns to avoid useless memory allocations. However, a valid block sample /// requires all columns to be of size 0, thus we need to sanitize the block here. @@ -965,10 +959,7 @@ void InterpreterSelectQuery::buildQueryPlan(QueryPlan & query_plan) executeImpl(query_plan, std::move(input_pipe)); /// We must guarantee that result structure is the same as in getSampleBlock() - /// - /// But if it's a projection query, plan header does not match result_header. - /// TODO: add special stage for InterpreterSelectQuery? - if (!options.is_projection_query && !blocksHaveEqualStructure(query_plan.getCurrentDataStream().header, result_header)) + if (!blocksHaveEqualStructure(query_plan.getCurrentDataStream().header, result_header)) { auto convert_actions_dag = ActionsDAG::makeConvertingActions( query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), @@ -1476,12 +1467,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

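// Hedged sketch of the per-query gate that the new enable_zstd_qat_codec setting
// adds next to enable_deflate_qpl_codec in the InterpreterCreateQuery hunk above.
// The function name and messages are illustrative; this is not the real
// CompressionCodecFactory API.
#include <iostream>
#include <stdexcept>
#include <string>

void validateCodecSketch(const std::string & codec_name, bool enable_deflate_qpl_codec, bool enable_zstd_qat_codec)
{
    if (codec_name == "DEFLATE_QPL" && !enable_deflate_qpl_codec)
        throw std::invalid_argument("Codec DEFLATE_QPL is disabled; enable it via the enable_deflate_qpl_codec setting");
    if (codec_name == "ZSTD_QAT" && !enable_zstd_qat_codec)
        throw std::invalid_argument("Codec ZSTD_QAT is disabled; enable it via the enable_zstd_qat_codec setting");
    // Note how the hunk above computes `attach || settings.enable_zstd_qat_codec`:
    // ATTACH always passes so existing tables keep loading even if the setting is off.
}

int main()
{
    try
    {
        validateCodecSketch("ZSTD_QAT", /*enable_deflate_qpl_codec=*/ true, /*enable_zstd_qat_codec=*/ false);
    }
    catch (const std::exception & e)
    {
        std::cout << e.what() << '\n';
    }
}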
desc->type == ProjectionDescription::Type::Aggregate) - { - query_info.projection->aggregate_overflow_row = aggregate_overflow_row; - query_info.projection->aggregate_final = aggregate_final; - } - if (options.only_analyze) { auto read_nothing = std::make_unique(source_header); @@ -1550,11 +1535,9 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

{}", QueryProcessingStage::toString(from_stage), QueryProcessingStage::toString(options.to_stage)); } - if (query_info.projection && query_info.projection->input_order_info && query_info.input_order_info) - throw Exception(ErrorCodes::LOGICAL_ERROR, "InputOrderInfo is set for projection and for query"); InputOrderInfoPtr input_order_info_for_order; if (!expressions.need_aggregate) - input_order_info_for_order = query_info.projection ? query_info.projection->input_order_info : query_info.input_order_info; + input_order_info_for_order = query_info.input_order_info; if (options.to_stage > QueryProcessingStage::FetchColumns) { @@ -1615,7 +1598,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

( query_plan.getCurrentDataStream(), @@ -1789,7 +1772,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

(source_header)); - PrewhereInfoPtr prewhere_info_ptr = query_info.projection ? query_info.projection->prewhere_info : query_info.prewhere_info; - if (prewhere_info_ptr) + if (query_info.prewhere_info) { - auto & prewhere_info = *prewhere_info_ptr; + auto & prewhere_info = *query_info.prewhere_info; if (prewhere_info.row_level_filter) { @@ -2088,50 +2069,6 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan( auto read_from_pipe = std::make_unique(std::move(pipe)); read_from_pipe->setStepDescription("Read from NullSource"); query_plan.addStep(std::move(read_from_pipe)); - - if (query_info.projection) - { - if (query_info.projection->before_where) - { - auto where_step = std::make_unique( - query_plan.getCurrentDataStream(), - query_info.projection->before_where, - query_info.projection->where_column_name, - query_info.projection->remove_where_filter); - - where_step->setStepDescription("WHERE"); - query_plan.addStep(std::move(where_step)); - } - - if (query_info.projection->desc->type == ProjectionDescription::Type::Aggregate) - { - if (query_info.projection->before_aggregation) - { - auto expression_before_aggregation - = std::make_unique(query_plan.getCurrentDataStream(), query_info.projection->before_aggregation); - expression_before_aggregation->setStepDescription("Before GROUP BY"); - query_plan.addStep(std::move(expression_before_aggregation)); - } - - // Let's just choose the safe option since we don't know the value of `to_stage` here. - const bool should_produce_results_in_order_of_bucket_number = true; - - // It is used to determine if we should use memory bound merging strategy. Maybe it makes sense for projections, but so far this case is just left untouched. - SortDescription group_by_sort_description; - - executeMergeAggregatedImpl( - query_plan, - query_info.projection->aggregate_overflow_row, - query_info.projection->aggregate_final, - false, - false, - context_->getSettingsRef(), - query_info.projection->aggregation_keys, - query_info.projection->aggregate_descriptions, - should_produce_results_in_order_of_bucket_number, - std::move(group_by_sort_description)); - } - } } RowPolicyFilterPtr InterpreterSelectQuery::getRowPolicyFilter() const @@ -2575,80 +2512,47 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc /// Create optimizer with prepared actions. /// Maybe we will need to calc input_order_info later, e.g. while reading from StorageMerge. - if ((optimize_read_in_order || optimize_aggregation_in_order) - && (!query_info.projection || query_info.projection->complete)) + if (optimize_read_in_order) { - if (optimize_read_in_order) - { - if (query_info.projection) - { - query_info.projection->order_optimizer = std::make_shared( - // TODO Do we need a projection variant for this field? 
- query, - analysis_result.order_by_elements_actions, - getSortDescription(query, context), - query_info.syntax_analyzer_result); - } - else - { - query_info.order_optimizer = std::make_shared( - query, - analysis_result.order_by_elements_actions, - getSortDescription(query, context), - query_info.syntax_analyzer_result); - } - } - else if (optimize_aggregation_in_order) - { - if (query_info.projection) - { - query_info.projection->order_optimizer = std::make_shared( - query, - query_info.projection->group_by_elements_actions, - query_info.projection->group_by_elements_order_descr, - query_info.syntax_analyzer_result); - } - else - { - query_info.order_optimizer = std::make_shared( - query, - analysis_result.group_by_elements_actions, - getSortDescriptionFromGroupBy(query), - query_info.syntax_analyzer_result); - } - } + query_info.order_optimizer = std::make_shared( + query, + analysis_result.order_by_elements_actions, + getSortDescription(query, context), + query_info.syntax_analyzer_result); /// If we don't have filtration, we can pushdown limit to reading stage for optimizations. - UInt64 limit = (query.hasFiltration() || query.groupBy()) ? 0 : getLimitForSorting(query, context); - if (query_info.projection) - query_info.projection->input_order_info - = query_info.projection->order_optimizer->getInputOrder(query_info.projection->desc->metadata, context, limit); - else - query_info.input_order_info = query_info.order_optimizer->getInputOrder(metadata_snapshot, context, limit); + UInt64 limit = query.hasFiltration() ? 0 : getLimitForSorting(query, context); + query_info.input_order_info = query_info.order_optimizer->getInputOrder(metadata_snapshot, context, limit); + } + else if (optimize_aggregation_in_order) + { + query_info.order_optimizer = std::make_shared( + query, + analysis_result.group_by_elements_actions, + getSortDescriptionFromGroupBy(query), + query_info.syntax_analyzer_result); + + query_info.input_order_info = query_info.order_optimizer->getInputOrder(metadata_snapshot, context, /*limit=*/ 0); } query_info.storage_limits = std::make_shared(storage_limits); - query_info.settings_limit_offset_done = options.settings_limit_offset_done; storage->read(query_plan, required_columns, storage_snapshot, query_info, context, processing_stage, max_block_size, max_streams); if (context->hasQueryContext() && !options.is_internal) { - const String view_name{}; auto local_storage_id = storage->getStorageID(); context->getQueryContext()->addQueryAccessInfo( backQuoteIfNeed(local_storage_id.getDatabaseName()), local_storage_id.getFullTableName(), - required_columns, - query_info.projection ? query_info.projection->desc->name : "", - view_name); + required_columns); } /// Create step which reads from empty source if storage has no data. 
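// Hedged sketch of the LIMIT pushdown rule used for the read-in-order path above:
// the limit is only pushed down to the reading stage when no filtration can drop
// rows afterwards, otherwise 0 ("no pushdown") is used. The helper name and the
// length+offset formula are assumptions for illustration, not the real
// getLimitForSorting implementation.
#include <cstdint>
#include <iostream>

uint64_t limitForReadInOrderSketch(bool has_filtration, uint64_t limit_length, uint64_t limit_offset)
{
    if (has_filtration)
        return 0;                           // rows may still be filtered out, so read everything
    return limit_length + limit_offset;     // enough rows to cover both OFFSET and LIMIT
}

int main()
{
    std::cout << limitForReadInOrderSketch(false, 10, 5) << '\n';   // 15: reading can stop early
    std::cout << limitForReadInOrderSketch(true, 10, 5) << '\n';    // 0: a WHERE/PREWHERE is present
}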
if (!query_plan.isInitialized()) { auto header = storage_snapshot->getSampleBlockForColumns(required_columns); - addEmptySourceToQueryPlan(query_plan, header, query_info, context); + addEmptySourceToQueryPlan(query_plan, header, query_info); } } else @@ -2757,13 +2661,8 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac expression_before_aggregation->setStepDescription("Before GROUP BY"); query_plan.addStep(std::move(expression_before_aggregation)); - if (options.is_projection_query) - return; - AggregateDescriptions aggregates = query_analyzer->aggregates(); - const Settings & settings = context->getSettingsRef(); - const auto & keys = query_analyzer->aggregationKeys().getNames(); auto aggregator_params = getAggregatorParams( @@ -2827,13 +2726,6 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac void InterpreterSelectQuery::executeMergeAggregated(QueryPlan & query_plan, bool overflow_row, bool final, bool has_grouping_sets) { - /// If aggregate projection was chosen for table, avoid adding MergeAggregated. - /// It is already added by storage (because of performance issues). - /// TODO: We should probably add another one processing stage for storage? - /// WithMergeableStateAfterAggregation is not ok because, e.g., it skips sorting after aggregation. - if (query_info.projection && query_info.projection->desc->type == ProjectionDescription::Type::Aggregate) - return; - const Settings & settings = context->getSettingsRef(); /// Used to determine if we should use memory bound merging strategy. diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index ec9612ad248..fbb53d71755 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -117,7 +117,7 @@ public: bool hasAggregation() const { return query_analyzer->hasAggregation(); } static void addEmptySourceToQueryPlan( - QueryPlan & query_plan, const Block & source_header, const SelectQueryInfo & query_info, const ContextPtr & context_); + QueryPlan & query_plan, const Block & source_header, const SelectQueryInfo & query_info); Names getRequiredColumns() { return required_columns; } diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index a6ea03f8a03..b478382b10d 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -190,7 +190,7 @@ bool isStorageTouchedByMutations( if (context->getSettingsRef().allow_experimental_analyzer) { auto select_query_tree = prepareQueryAffectedQueryTree(commands, storage.shared_from_this(), context); - InterpreterSelectQueryAnalyzer interpreter(select_query_tree, context, SelectQueryOptions().ignoreLimits().ignoreProjections()); + InterpreterSelectQueryAnalyzer interpreter(select_query_tree, context, SelectQueryOptions().ignoreLimits()); io = interpreter.execute(); } else @@ -200,7 +200,7 @@ bool isStorageTouchedByMutations( /// For some reason it may copy context and give it into ExpressionTransform /// after that we will use context from destroyed stack frame in our stream. 
interpreter_select_query.emplace( - select_query, context, storage_from_part, metadata_snapshot, SelectQueryOptions().ignoreLimits().ignoreProjections()); + select_query, context, storage_from_part, metadata_snapshot, SelectQueryOptions().ignoreLimits()); io = interpreter_select_query->execute(); } @@ -404,7 +404,7 @@ MutationsInterpreter::MutationsInterpreter( , available_columns(std::move(available_columns_)) , context(Context::createCopy(context_)) , settings(std::move(settings_)) - , select_limits(SelectQueryOptions().analyze(!settings.can_execute).ignoreLimits().ignoreProjections()) + , select_limits(SelectQueryOptions().analyze(!settings.can_execute).ignoreLimits()) { prepare(!settings.can_execute); } diff --git a/src/Interpreters/SelectQueryOptions.h b/src/Interpreters/SelectQueryOptions.h index c91329c869c..1e08aec3813 100644 --- a/src/Interpreters/SelectQueryOptions.h +++ b/src/Interpreters/SelectQueryOptions.h @@ -33,14 +33,6 @@ struct SelectQueryOptions bool remove_duplicates = false; bool ignore_quota = false; bool ignore_limits = false; - /// This flag is needed to analyze query ignoring table projections. - /// It is needed because we build another one InterpreterSelectQuery while analyzing projections. - /// It helps to avoid infinite recursion. - bool ignore_projections = false; - /// This flag is also used for projection analysis. - /// It is needed because lazy normal projections require special planning in FetchColumns stage, such as adding WHERE transform. - /// It is also used to avoid adding aggregating step when aggregate projection is chosen. - bool is_projection_query = false; /// This flag is needed for projection description. /// Otherwise, keys for GROUP BY may be removed as constants. bool ignore_ast_optimizations = false; @@ -119,18 +111,6 @@ struct SelectQueryOptions return *this; } - SelectQueryOptions & ignoreProjections(bool value = true) - { - ignore_projections = value; - return *this; - } - - SelectQueryOptions & projectionQuery(bool value = true) - { - is_projection_query = value; - return *this; - } - SelectQueryOptions & ignoreAlias(bool value = true) { ignore_alias = value; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 76e71b24d42..4b5a6a84e17 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1435,11 +1435,12 @@ void executeQuery( const auto & compression_method_node = ast_query_with_output->compression->as(); compression_method = compression_method_node.value.safeGet(); } - + const auto & settings = context->getSettingsRef(); compressed_buffer = wrapWriteBufferWithCompressionMethod( std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), chooseCompressionMethod(out_file, compression_method), - /* compression level = */ 3 + /* compression level = */ static_cast(settings.output_format_compression_level), + /* zstd_window_log = */ static_cast(settings.output_format_compression_zstd_window_log) ); } diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index f6569d998f1..394cd8a0669 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -846,9 +846,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres query_context->getQueryContext()->addQueryAccessInfo( backQuoteIfNeed(local_storage_id.getDatabaseName()), local_storage_id.getFullTableName(), - columns_names, - {}, - {}); + columns_names); } } diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp index 
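// Hedged sketch of what the two new output_format_compression settings ultimately
// control: zstd's compression level and window log are ordinary compressor
// parameters. This uses the public zstd API directly; the write-buffer wrapper in
// the executeQuery hunk above is only a thin layer over parameters like these.
#include <zstd.h>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
    const std::string input(1000, 'a');
    std::vector<char> output(ZSTD_compressBound(input.size()));

    ZSTD_CCtx * cctx = ZSTD_createCCtx();
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);   // cf. output_format_compression_level
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, 24);         // cf. output_format_compression_zstd_window_log (0 keeps the library default)

    const size_t written = ZSTD_compress2(cctx, output.data(), output.size(), input.data(), input.size());
    if (ZSTD_isError(written))
        std::fprintf(stderr, "compression failed: %s\n", ZSTD_getErrorName(written));
    else
        std::printf("compressed %zu bytes into %zu bytes\n", input.size(), written);

    ZSTD_freeCCtx(cctx);
}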
ba29cab5956..44374a64a9a 100644 --- a/src/Planner/Utils.cpp +++ b/src/Planner/Utils.cpp @@ -455,8 +455,7 @@ QueryTreeNodePtr buildSubqueryToReadColumnsFromTableExpression(const NamesAndTyp SelectQueryInfo buildSelectQueryInfo(const QueryTreeNodePtr & query_tree, const PlannerContextPtr & planner_context) { SelectQueryInfo select_query_info; - select_query_info.original_query = queryNodeToSelectQuery(query_tree); - select_query_info.query = select_query_info.original_query; + select_query_info.query = queryNodeToSelectQuery(query_tree); select_query_info.query_tree = query_tree; select_query_info.planner_context = planner_context; return select_query_info; diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index 2602f8b881d..316a84fe94f 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -603,18 +603,21 @@ bool ConstantExpressionTemplate::parseLiteralAndAssertType( memcpy(buf, istr.position(), bytes_to_copy); buf[bytes_to_copy] = 0; - char * pos_double = buf; + /// Skip leading zeroes - we don't want any funny octal business + char * non_zero_buf = find_first_not_symbols<'0'>(buf, buf + bytes_to_copy); + + char * pos_double = non_zero_buf; errno = 0; - Float64 float_value = std::strtod(buf, &pos_double); - if (pos_double == buf || errno == ERANGE || float_value < 0) + Float64 float_value = std::strtod(non_zero_buf, &pos_double); + if (pos_double == non_zero_buf || errno == ERANGE || float_value < 0) return false; if (negative) float_value = -float_value; - char * pos_integer = buf; + char * pos_integer = non_zero_buf; errno = 0; - UInt64 uint_value = std::strtoull(buf, &pos_integer, 0); + UInt64 uint_value = std::strtoull(non_zero_buf, &pos_integer, 0); if (pos_integer == pos_double && errno != ERANGE && (!negative || uint_value <= (1ULL << 63))) { istr.position() += pos_integer - buf; diff --git a/src/Processors/Formats/Impl/Parquet/Write.cpp b/src/Processors/Formats/Impl/Parquet/Write.cpp index 6d8f1ab55cb..02ca2734ff8 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.cpp +++ b/src/Processors/Formats/Impl/Parquet/Write.cpp @@ -448,6 +448,7 @@ PODArray & compress(PODArray & source, PODArray & scratch, Com std::move(dest_buf), method, /*level*/ 3, + /*zstd_window_log*/ 0, source.size(), /*existing_memory*/ source.data()); chassert(compressed_buf->position() == source.data()); diff --git a/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp b/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp index 433422a7c30..bc1b3695d88 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp @@ -1080,10 +1080,7 @@ size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node, /// If we don't have filtration, we can pushdown limit to reading stage for optimizations. UInt64 limit = (select_query->hasFiltration() || select_query->groupBy()) ? 0 : InterpreterSelectQuery::getLimitForSorting(*select_query, context); - auto order_info = order_optimizer->getInputOrder( - query_info.projection ? 
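// Hedged, standalone illustration of the leading-zero fix in the
// ConstantExpressionTemplate hunk above: strtoull with base 0 auto-detects octal,
// so "010" parses as 8 while strtod reads it as 10, making the integer and float
// parse positions disagree. Skipping leading zeroes before parsing keeps both
// parsers looking at the same decimal digits.
#include <cstdio>
#include <cstdlib>

int main()
{
    const char * text = "010";

    char * end_int = nullptr;
    unsigned long long as_base0 = std::strtoull(text, &end_int, 0);      // 8 (treated as octal)

    char * end_float = nullptr;
    double as_float = std::strtod(text, &end_float);                     // 10.0

    const char * non_zero = text;
    while (*non_zero == '0')
        ++non_zero;                                                      // crude stand-in for find_first_not_symbols<'0'>
    unsigned long long after_skip = std::strtoull(non_zero, nullptr, 0); // 10 (decimal)

    std::printf("base-0: %llu, strtod: %g, after skipping zeroes: %llu\n", as_base0, as_float, after_skip);
}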
query_info.projection->desc->metadata : read_from_merge_tree->getStorageMetadata(), - context, - limit); + auto order_info = order_optimizer->getInputOrder(read_from_merge_tree->getStorageMetadata(), context, limit); if (order_info) { diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp index d1f0c1ebe5e..8fcc088baa9 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp @@ -642,6 +642,7 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & return false; } + Context::QualifiedProjectionName projection_name; chassert(best_candidate != nullptr); QueryPlanStepPtr projection_reading; @@ -654,23 +655,19 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & // candidates.minmax_projection->block.dumpStructure()); Pipe pipe(std::make_shared(std::move(candidates.minmax_projection->block))); - projection_reading = std::make_unique( - std::move(pipe), - context, - query_info.is_internal - ? Context::QualifiedProjectionName{} - : Context::QualifiedProjectionName - { - .storage_id = reading->getMergeTreeData().getStorageID(), - .projection_name = candidates.minmax_projection->candidate.projection->name, - }); + projection_reading = std::make_unique(std::move(pipe)); has_ordinary_parts = false; + + projection_name = Context::QualifiedProjectionName + { + .storage_id = reading->getMergeTreeData().getStorageID(), + .projection_name = candidates.minmax_projection->candidate.projection->name, + }; } else { auto storage_snapshot = reading->getStorageSnapshot(); - auto proj_snapshot = std::make_shared( - storage_snapshot->storage, storage_snapshot->metadata, storage_snapshot->object_columns); + auto proj_snapshot = std::make_shared(storage_snapshot->storage, storage_snapshot->metadata); proj_snapshot->addProjection(best_candidate->projection); auto query_info_copy = query_info; @@ -693,23 +690,29 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & { auto header = proj_snapshot->getSampleBlockForColumns(best_candidate->dag->getRequiredColumnsNames()); Pipe pipe(std::make_shared(std::move(header))); - projection_reading = std::make_unique( - std::move(pipe), - context, - query_info.is_internal - ? 
Context::QualifiedProjectionName{} - : Context::QualifiedProjectionName - { - .storage_id = reading->getMergeTreeData().getStorageID(), - .projection_name = best_candidate->projection->name, - }); + projection_reading = std::make_unique(std::move(pipe)); } + projection_name = Context::QualifiedProjectionName + { + .storage_id = reading->getMergeTreeData().getStorageID(), + .projection_name = best_candidate->projection->name, + }; + has_ordinary_parts = best_candidate->merge_tree_ordinary_select_result_ptr != nullptr; if (has_ordinary_parts) reading->setAnalyzedResult(std::move(best_candidate->merge_tree_ordinary_select_result_ptr)); } + if (!query_info.is_internal && context->hasQueryContext()) + { + context->getQueryContext()->addQueryAccessInfo(Context::QualifiedProjectionName + { + .storage_id = reading->getMergeTreeData().getStorageID(), + .projection_name = best_candidate->projection->name, + }); + } + // LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Projection reading header {}", // projection_reading->getOutputStream().header.dumpStructure()); diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp index e4b3e4f84ab..05afc80cba0 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp @@ -196,8 +196,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) } auto storage_snapshot = reading->getStorageSnapshot(); - auto proj_snapshot = std::make_shared( - storage_snapshot->storage, storage_snapshot->metadata, storage_snapshot->object_columns); //, storage_snapshot->data); + auto proj_snapshot = std::make_shared(storage_snapshot->storage, storage_snapshot->metadata); proj_snapshot->addProjection(best_candidate->projection); auto query_info_copy = query_info; @@ -219,16 +218,16 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) if (!projection_reading) { Pipe pipe(std::make_shared(proj_snapshot->getSampleBlockForColumns(required_columns))); - projection_reading = std::make_unique( - std::move(pipe), - context, - query_info.is_internal - ? Context::QualifiedProjectionName{} - : Context::QualifiedProjectionName - { - .storage_id = reading->getMergeTreeData().getStorageID(), - .projection_name = best_candidate->projection->name, - }); + projection_reading = std::make_unique(std::move(pipe)); + } + + if (!query_info.is_internal && context->hasQueryContext()) + { + context->getQueryContext()->addQueryAccessInfo(Context::QualifiedProjectionName + { + .storage_id = reading->getMergeTreeData().getStorageID(), + .projection_name = best_candidate->projection->name, + }); } bool has_ordinary_parts = best_candidate->merge_tree_ordinary_select_result_ptr != nullptr; diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 68786bdec6c..f14960bc8d1 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -124,12 +124,6 @@ static MergeTreeReaderSettings getMergeTreeReaderSettings( }; } -static const PrewhereInfoPtr & getPrewhereInfoFromQueryInfo(const SelectQueryInfo & query_info) -{ - return query_info.projection ? 
query_info.projection->prewhere_info - : query_info.prewhere_info; -} - static bool checkAllPartsOnRemoteFS(const RangesInDataParts & parts) { for (const auto & part : parts) @@ -256,7 +250,7 @@ ReadFromMergeTree::ReadFromMergeTree( bool enable_parallel_reading) : SourceStepWithFilter(DataStream{.header = MergeTreeSelectProcessor::transformHeader( storage_snapshot_->getSampleBlockForColumns(real_column_names_), - getPrewhereInfoFromQueryInfo(query_info_), + query_info_.prewhere_info, data_.getPartitionValueType(), virt_column_names_)}) , reader_settings(getMergeTreeReaderSettings(context_, query_info_)) @@ -266,7 +260,7 @@ ReadFromMergeTree::ReadFromMergeTree( , virt_column_names(std::move(virt_column_names_)) , data(data_) , query_info(query_info_) - , prewhere_info(getPrewhereInfoFromQueryInfo(query_info)) + , prewhere_info(query_info_.prewhere_info) , actions_settings(ExpressionActionsSettings::fromContext(context_)) , storage_snapshot(std::move(storage_snapshot_)) , metadata_for_reading(storage_snapshot->getMetadataForQuery()) @@ -321,7 +315,7 @@ ReadFromMergeTree::ReadFromMergeTree( *output_stream, storage_snapshot->getMetadataForQuery()->getSortingKeyColumns(), getSortDirection(), - query_info.getInputOrderInfo(), + query_info.input_order_info, prewhere_info); } @@ -1632,10 +1626,10 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl( result.total_marks_pk = total_marks_pk; result.selected_rows = sum_rows; - const auto & input_order_info = query_info.getInputOrderInfo(); - if (input_order_info) - result.read_type = (input_order_info->direction > 0) ? ReadType::InOrder - : ReadType::InReverseOrder; + if (query_info.input_order_info) + result.read_type = (query_info.input_order_info->direction > 0) + ? ReadType::InOrder + : ReadType::InReverseOrder; return std::make_shared(MergeTreeDataSelectAnalysisResult{.result = std::move(result)}); } @@ -1651,12 +1645,7 @@ bool ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction, if (direction != 1 && query_info.isFinal()) return false; - auto order_info = std::make_shared(SortDescription{}, prefix_size, direction, limit); - if (query_info.projection) - query_info.projection->input_order_info = order_info; - else - query_info.input_order_info = order_info; - + query_info.input_order_info = std::make_shared(SortDescription{}, prefix_size, direction, limit); reader_settings.read_in_order = true; /// In case or read-in-order, don't create too many reading streams. 
@@ -1678,7 +1667,7 @@ bool ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction, } if (!sort_description.empty()) { - const size_t used_prefix_of_sorting_key_size = order_info->used_prefix_of_sorting_key_size; + const size_t used_prefix_of_sorting_key_size = query_info.input_order_info->used_prefix_of_sorting_key_size; if (sort_description.size() > used_prefix_of_sorting_key_size) sort_description.resize(used_prefix_of_sorting_key_size); output_stream->sort_description = std::move(sort_description); @@ -1708,7 +1697,7 @@ void ReadFromMergeTree::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info *output_stream, storage_snapshot->getMetadataForQuery()->getSortingKeyColumns(), getSortDirection(), - query_info.getInputOrderInfo(), + query_info.input_order_info, prewhere_info); } @@ -1803,8 +1792,6 @@ Pipe ReadFromMergeTree::spreadMarkRanges( RangesInDataParts && parts_with_ranges, size_t num_streams, AnalysisResult & result, ActionsDAGPtr & result_projection) { const bool final = isQueryWithFinal(); - const auto & input_order_info = query_info.getInputOrderInfo(); - Names column_names_to_read = result.column_names_to_read; NameSet names(column_names_to_read.begin(), column_names_to_read.end()); @@ -1845,10 +1832,10 @@ Pipe ReadFromMergeTree::spreadMarkRanges( return spreadMarkRangesAmongStreamsFinal(std::move(parts_with_ranges), num_streams, result.column_names_to_read, column_names_to_read, result_projection); } - else if (input_order_info) + else if (query_info.input_order_info) { return spreadMarkRangesAmongStreamsWithOrder( - std::move(parts_with_ranges), num_streams, column_names_to_read, result_projection, input_order_info); + std::move(parts_with_ranges), num_streams, column_names_to_read, result_projection, query_info.input_order_info); } else { diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index e2c38ebb251..4f9406c4a85 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -226,9 +226,8 @@ private: int getSortDirection() const { - const InputOrderInfoPtr & order_info = query_info.getInputOrderInfo(); - if (order_info) - return order_info->direction; + if (query_info.input_order_info) + return query_info.input_order_info->direction; return 1; } diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp index e7b170f0f91..bf2e49727ed 100644 --- a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp +++ b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp @@ -6,30 +6,37 @@ namespace DB { -ReadFromPreparedSource::ReadFromPreparedSource(Pipe pipe_, ContextPtr context_, Context::QualifiedProjectionName qualified_projection_name_) +ReadFromPreparedSource::ReadFromPreparedSource(Pipe pipe_) : SourceStepWithFilter(DataStream{.header = pipe_.getHeader()}) , pipe(std::move(pipe_)) - , context(std::move(context_)) - , qualified_projection_name(std::move(qualified_projection_name_)) { } void ReadFromPreparedSource::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { - if (context && context->hasQueryContext()) - context->getQueryContext()->addQueryAccessInfo(qualified_projection_name); - for (const auto & processor : pipe.getProcessors()) processors.emplace_back(processor); pipeline.init(std::move(pipe)); } +ReadFromStorageStep::ReadFromStorageStep( + Pipe pipe_, + String storage_name, + ContextPtr context_, + const SelectQueryInfo & query_info_) + : 
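// Standalone illustration of the prefix trim in the requestReadingInOrder hunk
// above: when in-order reading only uses a prefix of the table's sorting key,
// the step must not advertise a stronger sort order than it actually provides,
// so the description is resized to the used prefix. Column names are made up.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> sort_description = {"date", "user_id", "event_time"};
    const size_t used_prefix_of_sorting_key_size = 2;   // e.g. the query orders only by (date, user_id)

    if (sort_description.size() > used_prefix_of_sorting_key_size)
        sort_description.resize(used_prefix_of_sorting_key_size);

    for (const auto & column : sort_description)
        std::cout << column << '\n';                     // prints: date, user_id
}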
ReadFromPreparedSource(std::move(pipe_)) + , context(std::move(context_)) + , query_info(query_info_) +{ + setStepDescription(storage_name); + + for (const auto & processor : pipe.getProcessors()) + processor->setStorageLimits(query_info.storage_limits); +} + void ReadFromStorageStep::applyFilters() { - if (!context) - return; - for (const auto & processor : pipe.getProcessors()) if (auto * source = dynamic_cast(processor.get())) source->setKeyCondition(filter_nodes.nodes, context); diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.h b/src/Processors/QueryPlan/ReadFromPreparedSource.h index 16e790273ea..2eea48553b3 100644 --- a/src/Processors/QueryPlan/ReadFromPreparedSource.h +++ b/src/Processors/QueryPlan/ReadFromPreparedSource.h @@ -13,36 +13,25 @@ namespace DB class ReadFromPreparedSource : public SourceStepWithFilter { public: - explicit ReadFromPreparedSource( - Pipe pipe_, ContextPtr context_ = nullptr, Context::QualifiedProjectionName qualified_projection_name_ = {}); + explicit ReadFromPreparedSource(Pipe pipe_); String getName() const override { return "ReadFromPreparedSource"; } - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; protected: Pipe pipe; - ContextPtr context; - Context::QualifiedProjectionName qualified_projection_name; }; class ReadFromStorageStep : public ReadFromPreparedSource { public: - ReadFromStorageStep(Pipe pipe_, String storage_name, const SelectQueryInfo & query_info_, ContextPtr context_) - : ReadFromPreparedSource(std::move(pipe_), std::move(context_)), query_info(query_info_) - { - setStepDescription(storage_name); - - for (const auto & processor : pipe.getProcessors()) - processor->setStorageLimits(query_info.storage_limits); - } + ReadFromStorageStep(Pipe pipe_, String storage_name, ContextPtr context_, const SelectQueryInfo & query_info_); String getName() const override { return "ReadFromStorage"; } - void applyFilters() override; private: + ContextPtr context; SelectQueryInfo query_info; }; diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index aec959233ea..5173b18c6bf 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -332,7 +332,7 @@ ReadFromSystemNumbersStep::ReadFromSystemNumbersStep( , storage{std::move(storage_)} , storage_snapshot{storage_snapshot_} , context{std::move(context_)} - , key_expression{KeyDescription::parse(column_names[0], storage_snapshot->getMetadataForQuery()->columns, context).expression} + , key_expression{KeyDescription::parse(column_names[0], storage_snapshot->metadata->columns, context).expression} , max_block_size{max_block_size_} , num_streams{num_streams_} , limit_length_and_offset(InterpreterSelectQuery::getLimitLengthAndOffset(query_info.query->as(), context)) diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index ab9b3a80f12..71d652e74d0 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -420,7 +420,11 @@ Chain buildPushingToViewsChain( if (!no_destination && context->hasQueryContext()) { context->getQueryContext()->addQueryAccessInfo( - backQuoteIfNeed(view_id.getDatabaseName()), views_data->views.back().runtime_stats->target_name, {}, "", view_id.getFullTableName()); + backQuoteIfNeed(view_id.getDatabaseName()), + 
views_data->views.back().runtime_stats->target_name, + /*column_names=*/ {}); + + context->getQueryContext()->addViewAccessInfo(view_id.getFullTableName()); } } diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index 1a12c09a8c7..c32da278e49 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -112,6 +112,7 @@ void WriteBufferFromHTTPServerResponse::nextImpl() std::make_unique(*response_body_ostr), compress ? compression_method : CompressionMethod::None, compression_level, + 0, working_buffer.size(), working_buffer.begin()); else diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index a563e0e0004..600ca7ebbbb 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -2023,7 +2023,7 @@ void TCPHandler::initBlockOutput(const Block & block) if (state.compression == Protocol::Compression::Enable) { - CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_deflate_qpl_codec); + CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_deflate_qpl_codec, query_settings.enable_zstd_qat_codec); state.maybe_compressed_out = std::make_shared( *out, CompressionCodecFactory::instance().get(method, level)); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 9fc785373b3..1fb53475801 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -439,7 +439,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) column.comment = *comment; if (codec) - column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type, false, true, true); + column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type, false, true, true, true); column.ttl = ttl; @@ -504,7 +504,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) else { if (codec) - column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type ? data_type : column.type, false, true, true); + column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type ? 
data_type : column.type, false, true, true, true); if (comment) column.comment = *comment; @@ -1249,7 +1249,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const "this column name is reserved for _block_number persisting feature", backQuote(column_name)); if (command.codec) - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec, context->getSettingsRef().enable_zstd_qat_codec); all_columns.add(ColumnDescription(column_name, command.data_type)); } @@ -1274,7 +1274,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const { if (all_columns.hasAlias(column_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec, context->getSettingsRef().enable_zstd_qat_codec); } auto column_default = all_columns.getDefault(column_name); if (column_default) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 697350faf09..72047b3033a 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -140,7 +140,7 @@ void ColumnDescription::readText(ReadBuffer & buf) comment = col_ast->comment->as().value.get(); if (col_ast->codec) - codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(col_ast->codec, type, false, true, true); + codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(col_ast->codec, type, false, true, true, true); if (col_ast->ttl) ttl = col_ast->ttl; diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index acdc6f142a4..650539ef1e9 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -740,7 +740,7 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const if (compression_method == "ZSTD") compression_level = settings.network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_deflate_qpl_codec); + CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_deflate_qpl_codec, settings.enable_zstd_qat_codec); CompressionCodecPtr compression_codec = CompressionCodecFactory::instance().get(compression_method, compression_level); /// tmp directory is used to ensure atomicity of transactions diff --git 
a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 974b2bb68cf..a25004b83f4 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -724,13 +724,13 @@ public: const CompressionMethod compression_method) : SinkToStorage(sample_block) { + const auto & settings = context->getSettingsRef(); write_buf = wrapWriteBufferWithCompressionMethod( std::make_unique( - uri, - context->getGlobalContext()->getConfigRef(), - context->getSettingsRef().hdfs_replication, - context->getWriteSettings()), - compression_method, 3); + uri, context->getGlobalContext()->getConfigRef(), context->getSettingsRef().hdfs_replication, context->getWriteSettings()), + compression_method, + static_cast(settings.output_format_compression_level), + static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context); } diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 88603d56ebb..85ef6a0bb35 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -165,11 +165,11 @@ void IStorage::readFromPipe( if (pipe.empty()) { auto header = storage_snapshot->getSampleBlockForColumns(column_names); - InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, context); + InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info); } else { - auto read_step = std::make_unique(std::move(pipe), storage_name, query_info, context); + auto read_step = std::make_unique(std::move(pipe), storage_name, context, query_info); query_plan.addStep(std::move(read_step)); } } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 4ad6c564a18..c8bfebc4919 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6888,7 +6888,7 @@ QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( ContextPtr query_context, QueryProcessingStage::Enum to_stage, const StorageSnapshotPtr &, - SelectQueryInfo & query_info) const + SelectQueryInfo &) const { if (query_context->getClientInfo().collaborate_with_initiator) return QueryProcessingStage::Enum::FetchColumns; @@ -6905,11 +6905,6 @@ QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( return QueryProcessingStage::Enum::WithMergeableState; } - if (to_stage >= QueryProcessingStage::Enum::WithMergeableState) - { - query_info.projection = std::nullopt; - } - return QueryProcessingStage::Enum::FetchColumns; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index d5b9b4423a9..91519d00cb6 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -125,22 +125,6 @@ static RelativeSize convertAbsoluteSampleSizeToRelative(const ASTSampleRatio::Ra return std::min(RelativeSize(1), RelativeSize(absolute_sample_size) / RelativeSize(approx_total_rows)); } -static SortDescription getSortDescriptionFromGroupBy(const ASTSelectQuery & query) -{ - SortDescription order_descr; - order_descr.reserve(query.groupBy()->children.size()); - - for (const auto & elem : query.groupBy()->children) - { - /// Note, here aliases should not be used, since there will be no such column in a block. 
- String name = elem->getColumnNameWithoutAlias(); - order_descr.emplace_back(name, 1, 1); - } - - return order_descr; -} - - QueryPlanPtr MergeTreeDataSelectExecutor::read( const Names & column_names_to_return, const StorageSnapshotPtr & storage_snapshot, @@ -148,339 +132,32 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( ContextPtr context, const UInt64 max_block_size, const size_t num_streams, - QueryProcessingStage::Enum processed_stage, std::shared_ptr max_block_numbers_to_read, bool enable_parallel_reading) const { if (query_info.merge_tree_empty_result) return std::make_unique(); - const auto & settings = context->getSettingsRef(); - - const auto & metadata_for_reading = storage_snapshot->getMetadataForQuery(); - const auto & snapshot_data = assert_cast(*storage_snapshot->data); - const auto & parts = snapshot_data.parts; const auto & alter_conversions = snapshot_data.alter_conversions; - if (!query_info.projection) - { - auto step = readFromParts( - query_info.merge_tree_select_result_ptr ? MergeTreeData::DataPartsVector{} : parts, - query_info.merge_tree_select_result_ptr ? std::vector{} : alter_conversions, - column_names_to_return, - storage_snapshot, - query_info, - context, - max_block_size, - num_streams, - max_block_numbers_to_read, - query_info.merge_tree_select_result_ptr, - enable_parallel_reading); - - auto plan = std::make_unique(); - if (step) - plan->addStep(std::move(step)); - return plan; - } - - LOG_DEBUG( - log, - "Choose {} {} projection {}", - query_info.projection->complete ? "complete" : "incomplete", - query_info.projection->desc->type, - query_info.projection->desc->name); - - const ASTSelectQuery & select_query = query_info.query->as(); - QueryPlanResourceHolder resources; - - auto projection_plan = std::make_unique(); - if (query_info.projection->desc->is_minmax_count_projection) - { - Pipe pipe(std::make_shared(query_info.minmax_count_projection_block)); - auto read_from_pipe = std::make_unique(std::move(pipe)); - projection_plan->addStep(std::move(read_from_pipe)); - } - else if (query_info.projection->merge_tree_projection_select_result_ptr) - { - LOG_DEBUG(log, "projection required columns: {}", fmt::join(query_info.projection->required_columns, ", ")); - projection_plan->addStep(readFromParts( - /*parts=*/ {}, - /*alter_conversions=*/ {}, - query_info.projection->required_columns, - storage_snapshot, - query_info, - context, - max_block_size, - num_streams, - max_block_numbers_to_read, - query_info.projection->merge_tree_projection_select_result_ptr, - enable_parallel_reading)); - } - - if (projection_plan->isInitialized()) - { - if (query_info.projection->before_where) - { - auto where_step = std::make_unique( - projection_plan->getCurrentDataStream(), - query_info.projection->before_where, - query_info.projection->where_column_name, - query_info.projection->remove_where_filter); - - where_step->setStepDescription("WHERE"); - projection_plan->addStep(std::move(where_step)); - } - - if (query_info.projection->before_aggregation) - { - auto expression_before_aggregation - = std::make_unique(projection_plan->getCurrentDataStream(), query_info.projection->before_aggregation); - expression_before_aggregation->setStepDescription("Before GROUP BY"); - projection_plan->addStep(std::move(expression_before_aggregation)); - } - - /// NOTE: input_order_info (for projection and not) is set only if projection is complete - if (query_info.has_order_by && !query_info.need_aggregate && query_info.projection->input_order_info) - { - 
chassert(query_info.projection->complete); - - SortDescription output_order_descr = InterpreterSelectQuery::getSortDescription(select_query, context); - UInt64 limit = InterpreterSelectQuery::getLimitForSorting(select_query, context); - - auto sorting_step = std::make_unique( - projection_plan->getCurrentDataStream(), - query_info.projection->input_order_info->sort_description_for_merging, - output_order_descr, - settings.max_block_size, - limit); - - sorting_step->setStepDescription("ORDER BY for projections"); - projection_plan->addStep(std::move(sorting_step)); - } - } - - auto ordinary_query_plan = std::make_unique(); - if (query_info.projection->merge_tree_normal_select_result_ptr) - { - auto storage_from_base_parts_of_projection - = std::make_shared(data, query_info.projection->merge_tree_normal_select_result_ptr); - auto interpreter = InterpreterSelectQuery( - query_info.query, - context, - storage_from_base_parts_of_projection, - nullptr, - SelectQueryOptions{processed_stage}.projectionQuery()); - - interpreter.buildQueryPlan(*ordinary_query_plan); - - const auto & expressions = interpreter.getAnalysisResult(); - if (processed_stage == QueryProcessingStage::Enum::FetchColumns && expressions.before_where) - { - auto where_step = std::make_unique( - ordinary_query_plan->getCurrentDataStream(), - expressions.before_where, - expressions.where_column_name, - expressions.remove_where_filter); - where_step->setStepDescription("WHERE"); - ordinary_query_plan->addStep(std::move(where_step)); - } - } - - Pipe projection_pipe; - Pipe ordinary_pipe; - if (query_info.projection->desc->type == ProjectionDescription::Type::Aggregate) - { - auto make_aggregator_params = [&](bool projection) - { - const auto & keys = query_info.projection->aggregation_keys.getNames(); - - AggregateDescriptions aggregates = query_info.projection->aggregate_descriptions; - - /// This part is hacky. - /// We want AggregatingTransform to work with aggregate states instead of normal columns. - /// It is almost the same, just instead of adding new data to aggregation state we merge it with existing. 
- /// - /// It is needed because data in projection: - /// * is not merged completely (we may have states with the same key in different parts) - /// * is not split into buckets (so if we just use MergingAggregated, it will use single thread) - const bool only_merge = projection; - - Aggregator::Params params( - keys, - aggregates, - query_info.projection->aggregate_overflow_row, - settings.max_rows_to_group_by, - settings.group_by_overflow_mode, - settings.group_by_two_level_threshold, - settings.group_by_two_level_threshold_bytes, - settings.max_bytes_before_external_group_by, - settings.empty_result_for_aggregation_by_empty_set, - context->getTempDataOnDisk(), - settings.max_threads, - settings.min_free_disk_space_for_temporary_data, - settings.compile_aggregate_expressions, - settings.min_count_to_compile_aggregate_expression, - settings.max_block_size, - settings.enable_software_prefetch_in_aggregation, - only_merge, - settings.optimize_group_by_constant_keys, - settings.min_hit_rate_to_use_consecutive_keys_optimization, - /*stats_collecting_params=*/ {}); - - return std::make_pair(params, only_merge); - }; - - if (ordinary_query_plan->isInitialized() && projection_plan->isInitialized()) - { - auto projection_builder = projection_plan->buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - projection_pipe = QueryPipelineBuilder::getPipe(std::move(*projection_builder), resources); - - auto ordinary_builder = ordinary_query_plan->buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - ordinary_pipe = QueryPipelineBuilder::getPipe(std::move(*ordinary_builder), resources); - - /// Here we create shared ManyAggregatedData for both projection and ordinary data. - /// For ordinary data, AggregatedData is filled in a usual way. - /// For projection data, AggregatedData is filled by merging aggregation states. - /// When all AggregatedData is filled, we merge aggregation states together in a usual way. - /// Pipeline will look like: - /// ReadFromProjection -> Aggregating (only merge states) -> - /// ReadFromProjection -> Aggregating (only merge states) -> - /// ... -> Resize -> ConvertingAggregatedToChunks - /// ReadFromOrdinaryPart -> Aggregating (usual) -> (added by last Aggregating) - /// ReadFromOrdinaryPart -> Aggregating (usual) -> - /// ... - auto many_data = std::make_shared(projection_pipe.numOutputPorts() + ordinary_pipe.numOutputPorts()); - size_t counter = 0; - - AggregatorListPtr aggregator_list_ptr = std::make_shared(); - - /// TODO apply optimize_aggregation_in_order here too (like below) - auto build_aggregate_pipe = [&](Pipe & pipe, bool projection) - { - auto [params, only_merge] = make_aggregator_params(projection); - - AggregatingTransformParamsPtr transform_params = std::make_shared( - pipe.getHeader(), std::move(params), aggregator_list_ptr, query_info.projection->aggregate_final); - - pipe.resize(pipe.numOutputPorts(), true, true); - - auto merge_threads = num_streams; - auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads - ? 
static_cast(settings.aggregation_memory_efficient_merge_threads) - : static_cast(settings.max_threads); - - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, transform_params, many_data, counter++, merge_threads, temporary_data_merge_threads); - }); - }; - - if (!projection_pipe.empty()) - build_aggregate_pipe(projection_pipe, true); - if (!ordinary_pipe.empty()) - build_aggregate_pipe(ordinary_pipe, false); - } - else - { - auto add_aggregating_step = [&](QueryPlanPtr & query_plan, bool projection) - { - auto [params, only_merge] = make_aggregator_params(projection); - - auto merge_threads = num_streams; - auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads - ? static_cast(settings.aggregation_memory_efficient_merge_threads) - : static_cast(settings.max_threads); - - InputOrderInfoPtr group_by_info = query_info.projection->input_order_info; - SortDescription sort_description_for_merging; - SortDescription group_by_sort_description; - if (group_by_info && settings.optimize_aggregation_in_order) - { - group_by_sort_description = getSortDescriptionFromGroupBy(select_query); - sort_description_for_merging = group_by_info->sort_description_for_merging; - } - else - group_by_info = nullptr; - - // We don't have information regarding the `to_stage` of the query processing, only about `from_stage` (which is passed through `processed_stage` argument). - // Thus we cannot assign false here since it may be a query over distributed table. - const bool should_produce_results_in_order_of_bucket_number = true; - - auto aggregating_step = std::make_unique( - query_plan->getCurrentDataStream(), - std::move(params), - /* grouping_sets_params_= */ GroupingSetsParamsList{}, - query_info.projection->aggregate_final, - settings.max_block_size, - settings.aggregation_in_order_max_block_bytes, - merge_threads, - temporary_data_merge_threads, - /* storage_has_evenly_distributed_read_= */ false, - /* group_by_use_nulls */ false, - std::move(sort_description_for_merging), - std::move(group_by_sort_description), - should_produce_results_in_order_of_bucket_number, - settings.enable_memory_bound_merging_of_aggregation_results, - !group_by_info && settings.force_aggregation_in_order); - query_plan->addStep(std::move(aggregating_step)); - }; - - if (projection_plan->isInitialized()) - { - add_aggregating_step(projection_plan, true); - - auto projection_builder = projection_plan->buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - projection_pipe = QueryPipelineBuilder::getPipe(std::move(*projection_builder), resources); - } - if (ordinary_query_plan->isInitialized()) - { - add_aggregating_step(ordinary_query_plan, false); - - auto ordinary_builder = ordinary_query_plan->buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - ordinary_pipe = QueryPipelineBuilder::getPipe(std::move(*ordinary_builder), resources); - } - } - } - else - { - if (projection_plan->isInitialized()) - { - auto projection_builder = projection_plan->buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - projection_pipe = QueryPipelineBuilder::getPipe(std::move(*projection_builder), resources); - } - - if (ordinary_query_plan->isInitialized()) - { - auto ordinary_builder = ordinary_query_plan->buildQueryPipeline( - 
QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - ordinary_pipe = QueryPipelineBuilder::getPipe(std::move(*ordinary_builder), resources); - } - } - - Pipes pipes; - pipes.emplace_back(std::move(projection_pipe)); - pipes.emplace_back(std::move(ordinary_pipe)); - auto pipe = Pipe::unitePipes(std::move(pipes)); - auto plan = std::make_unique(); - if (pipe.empty()) - return plan; - - pipe.resize(1); - auto step = std::make_unique( - std::move(pipe), - fmt::format("MergeTree(with {} projection {})", query_info.projection->desc->type, query_info.projection->desc->name), + auto step = readFromParts( + parts, + alter_conversions, + column_names_to_return, + storage_snapshot, query_info, - context); - plan->addStep(std::move(step)); - plan->addInterpreterContext(query_info.projection->context); + context, + max_block_size, + num_streams, + max_block_numbers_to_read, + /*merge_tree_select_result_ptr=*/ nullptr, + enable_parallel_reading); + + auto plan = std::make_unique(); + if (step) + plan->addStep(std::move(step)); return plan; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index 4c6e1086cbc..9d56100a10c 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -34,7 +34,6 @@ public: ContextPtr context, UInt64 max_block_size, size_t num_streams, - QueryProcessingStage::Enum processed_stage, std::shared_ptr max_block_numbers_to_read = nullptr, bool enable_parallel_reading = false) const; diff --git a/src/Storages/NATS/StorageNATS.cpp b/src/Storages/NATS/StorageNATS.cpp index cd7e99a6d18..9cb1fbd8506 100644 --- a/src/Storages/NATS/StorageNATS.cpp +++ b/src/Storages/NATS/StorageNATS.cpp @@ -347,11 +347,11 @@ void StorageNATS::read( if (pipe.empty()) { auto header = storage_snapshot->getSampleBlockForColumns(column_names); - InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, local_context); + InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info); } else { - auto read_step = std::make_unique(std::move(pipe), getName(), query_info, local_context); + auto read_step = std::make_unique(std::move(pipe), getName(), local_context, query_info); query_plan.addStep(std::move(read_step)); query_plan.addInterpreterContext(modified_context); } diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index ec2e002b285..fce2d775b15 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -700,7 +700,7 @@ void StorageRabbitMQ::read( if (num_created_consumers == 0) { auto header = storage_snapshot->getSampleBlockForColumns(column_names); - InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, local_context); + InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info); return; } @@ -758,11 +758,11 @@ void StorageRabbitMQ::read( if (pipe.empty()) { auto header = storage_snapshot->getSampleBlockForColumns(column_names); - InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, local_context); + InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info); } else { - auto read_step = std::make_unique(std::move(pipe), getName(), query_info, local_context); + auto read_step = std::make_unique(std::move(pipe), getName(), local_context, query_info); 
query_plan.addStep(std::move(read_step)); query_plan.addInterpreterContext(modified_context); } diff --git a/src/Storages/ReadFinalForExternalReplicaStorage.cpp b/src/Storages/ReadFinalForExternalReplicaStorage.cpp index 28053c84e20..e1d52eefc20 100644 --- a/src/Storages/ReadFinalForExternalReplicaStorage.cpp +++ b/src/Storages/ReadFinalForExternalReplicaStorage.cpp @@ -64,7 +64,7 @@ void readFinalFromNestedStorage( if (!query_plan.isInitialized()) { - InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, nested_header, query_info, context); + InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, nested_header, query_info); return; } diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 0145b7074c6..69dbb64db38 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -9,7 +9,6 @@ #include #include #include -#include #include @@ -142,32 +141,6 @@ class IMergeTreeDataPart; using ManyExpressionActions = std::vector; -// The projection selected to execute current query -struct ProjectionCandidate -{ - ProjectionDescriptionRawPtr desc{}; - PrewhereInfoPtr prewhere_info; - ActionsDAGPtr before_where; - String where_column_name; - bool remove_where_filter = false; - ActionsDAGPtr before_aggregation; - Names required_columns; - NamesAndTypesList aggregation_keys; - AggregateDescriptions aggregate_descriptions; - bool aggregate_overflow_row = false; - bool aggregate_final = false; - bool complete = false; - ReadInOrderOptimizerPtr order_optimizer; - InputOrderInfoPtr input_order_info; - ManyExpressionActions group_by_elements_actions; - SortDescription group_by_elements_order_descr; - MergeTreeDataSelectAnalysisResultPtr merge_tree_projection_select_result_ptr; - MergeTreeDataSelectAnalysisResultPtr merge_tree_normal_select_result_ptr; - - /// Because projection analysis uses a separate interpreter. - ContextPtr context; -}; - /** Query along with some additional data, * that can be used during query processing * inside storage engines. @@ -180,7 +153,6 @@ struct SelectQueryInfo ASTPtr query; ASTPtr view_query; /// Optimized VIEW query - ASTPtr original_query; /// Unmodified query for projection analysis /// Query tree QueryTreeNodePtr query_tree; @@ -242,18 +214,11 @@ struct SelectQueryInfo ClusterPtr getCluster() const { return !optimized_cluster ? cluster : optimized_cluster; } - /// If not null, it means we choose a projection to execute current query. - std::optional projection; - bool ignore_projections = false; - bool is_projection_query = false; bool merge_tree_empty_result = false; bool settings_limit_offset_done = false; bool is_internal = false; - Block minmax_count_projection_block; - MergeTreeDataSelectAnalysisResultPtr merge_tree_select_result_ptr; - + bool parallel_replicas_disabled = false; bool is_parameterized_view = false; - bool optimize_trivial_count = false; // If limit is not 0, that means it's a trivial limit query. @@ -262,11 +227,6 @@ struct SelectQueryInfo /// For IStorageSystemOneBlock std::vector columns_mask; - InputOrderInfoPtr getInputOrderInfo() const - { - return input_order_info ? input_order_info : (projection ? 
projection->input_order_info : nullptr); - } - bool isFinal() const; }; } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 4f63b144f43..df482f5ebf2 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -535,7 +535,12 @@ public: , format_settings(format_settings_) { StoredObject object(blob_path); - write_buf = wrapWriteBufferWithCompressionMethod(object_storage->writeObject(object, WriteMode::Rewrite), compression_method, 3); + const auto & settings = context->getSettingsRef(); + write_buf = wrapWriteBufferWithCompressionMethod( + object_storage->writeObject(object, WriteMode::Rewrite), + compression_method, + static_cast(settings.output_format_compression_level), + static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, format_settings); } diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index ba5d922dc86..6f4b1563a46 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -212,8 +212,6 @@ QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( { if (auto destination = getDestinationTable()) { - /// TODO: Find a way to support projections for StorageBuffer - query_info.ignore_projections = true; const auto & destination_metadata = destination->getInMemoryMetadataPtr(); return destination->getQueryProcessingStage(local_context, to_stage, destination->getStorageSnapshot(destination_metadata, local_context), query_info); } @@ -337,12 +335,12 @@ void StorageBuffer::read( pipes_from_buffers.emplace_back(std::make_shared(column_names, buf, storage_snapshot)); pipe_from_buffers = Pipe::unitePipes(std::move(pipes_from_buffers)); - if (query_info.getInputOrderInfo()) + if (query_info.input_order_info) { /// Each buffer has one block, and it not guaranteed that rows in each block are sorted by order keys pipe_from_buffers.addSimpleTransform([&](const Block & header) { - return std::make_shared(header, query_info.getInputOrderInfo()->sort_description_for_merging, 0); + return std::make_shared(header, query_info.input_order_info->sort_description_for_merging, 0); }); } } @@ -360,7 +358,7 @@ void StorageBuffer::read( /// TODO: Find a way to support projections for StorageBuffer auto interpreter = InterpreterSelectQuery( query_info.query, local_context, std::move(pipe_from_buffers), - SelectQueryOptions(processed_stage).ignoreProjections()); + SelectQueryOptions(processed_stage)); interpreter.addStorageLimits(*query_info.storage_limits); interpreter.buildQueryPlan(buffers_plan); } diff --git a/src/Storages/StorageExternalDistributed.cpp b/src/Storages/StorageExternalDistributed.cpp index d493fead993..beb93afc972 100644 --- a/src/Storages/StorageExternalDistributed.cpp +++ b/src/Storages/StorageExternalDistributed.cpp @@ -73,7 +73,7 @@ void StorageExternalDistributed::read( if (plans.empty()) { auto header = storage_snapshot->getSampleBlockForColumns(column_names); - InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, context); + InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info); } if (plans.size() == 1) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 3c1e13679b5..cfff167555a 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1397,7 +1397,7 @@ void StorageFile::read( throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File {} 
doesn't exist", p->at(0)); auto header = storage_snapshot->getSampleBlockForColumns(column_names); - InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, context); + InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info); return; } } @@ -1575,8 +1575,12 @@ public: /// In case of formats with prefixes if file is not empty we have already written prefix. bool do_not_write_prefix = naked_buffer->size(); - - write_buf = wrapWriteBufferWithCompressionMethod(std::move(naked_buffer), compression_method, 3); + const auto & settings = context->getSettingsRef(); + write_buf = wrapWriteBufferWithCompressionMethod( + std::move(naked_buffer), + compression_method, + static_cast(settings.output_format_compression_level), + static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format_name, *write_buf, metadata_snapshot->getSampleBlock(), context, format_settings); diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index b06fe00f5b2..ae616b1df04 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -149,10 +149,6 @@ QueryProcessingStage::Enum StorageMaterializedView::getQueryProcessingStage( const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { - /// TODO: Find a way to support projections for StorageMaterializedView. Why do we use different - /// metadata for materialized view and target table? If they are the same, we can get rid of all - /// converting and use it just like a normal view. - query_info.ignore_projections = true; const auto & target_metadata = getTargetTable()->getInMemoryMetadataPtr(); return getTargetTable()->getQueryProcessingStage(local_context, to_stage, getTargetTable()->getStorageSnapshot(target_metadata, local_context), query_info); } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 5d4f50baa53..0d67403fa2f 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -278,8 +278,6 @@ QueryProcessingStage::Enum StorageMerge::getQueryProcessingStage( size_t selected_table_size = 0; - /// TODO: Find a way to support projections for StorageMerge - query_info.ignore_projections = true; for (const auto & iterator : database_table_iterators) { while (iterator->isValid()) @@ -854,7 +852,8 @@ QueryPlan ReadFromMerge::createPlanForTable( { InterpreterSelectQueryAnalyzer interpreter(modified_query_info.query_tree, modified_context, - SelectQueryOptions(processed_stage).ignoreProjections()); + SelectQueryOptions(processed_stage)); + auto & planner = interpreter.getPlanner(); planner.buildQueryPlanIfNeeded(); plan = std::move(planner).extractQueryPlan(); @@ -865,7 +864,8 @@ QueryPlan ReadFromMerge::createPlanForTable( /// TODO: Find a way to support projections for StorageMerge InterpreterSelectQuery interpreter{modified_query_info.query, modified_context, - SelectQueryOptions(processed_stage).ignoreProjections()}; + SelectQueryOptions(processed_stage)}; + interpreter.buildQueryPlan(plan); } } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index b8804ad3c6d..4761ccd8b58 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -250,7 +250,6 @@ void StorageMergeTree::read( local_context, max_block_size, num_streams, - processed_stage, nullptr, enable_parallel_reading)) query_plan = std::move(*plan); diff --git 
a/src/Storages/StorageProxy.h b/src/Storages/StorageProxy.h index 269ddf57fa2..18a1f9086ae 100644 --- a/src/Storages/StorageProxy.h +++ b/src/Storages/StorageProxy.h @@ -38,8 +38,6 @@ public: const StorageSnapshotPtr &, SelectQueryInfo & info) const override { - /// TODO: Find a way to support projections for StorageProxy - info.ignore_projections = true; const auto & nested_metadata = getNested()->getInMemoryMetadataPtr(); return getNested()->getQueryProcessingStage(context, to_stage, getNested()->getStorageSnapshot(nested_metadata, context), info); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a8404052c59..1d9a50b18b7 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5345,12 +5345,12 @@ void StorageReplicatedMergeTree::read( /// 2. Do not read parts that have not yet been written to the quorum of the replicas. /// For this you have to synchronously go to ZooKeeper. if (settings.select_sequential_consistency) - return readLocalSequentialConsistencyImpl(query_plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + return readLocalSequentialConsistencyImpl(query_plan, column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams); if (local_context->canUseParallelReplicasOnInitiator()) return readParallelReplicasImpl(query_plan, column_names, storage_snapshot, query_info, local_context, processed_stage); - readLocalImpl(query_plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + readLocalImpl(query_plan, column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams); } void StorageReplicatedMergeTree::readLocalSequentialConsistencyImpl( @@ -5359,14 +5359,15 @@ void StorageReplicatedMergeTree::readLocalSequentialConsistencyImpl( const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, - QueryProcessingStage::Enum processed_stage, size_t max_block_size, size_t num_streams) { auto max_added_blocks = std::make_shared(getMaxAddedBlocks()); - auto plan = reader.read(column_names, storage_snapshot, query_info, local_context, - max_block_size, num_streams, processed_stage, std::move(max_added_blocks), - /* enable_parallel_reading= */false); + auto plan = reader.read( + column_names, storage_snapshot, query_info, local_context, + max_block_size, num_streams, std::move(max_added_blocks), + /* enable_parallel_reading=*/ false); + if (plan) query_plan = std::move(*plan); } @@ -5420,16 +5421,15 @@ void StorageReplicatedMergeTree::readLocalImpl( const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, - QueryProcessingStage::Enum processed_stage, const size_t max_block_size, const size_t num_streams) { auto plan = reader.read( column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, - processed_stage, /* max_block_numbers_to_read= */ nullptr, /* enable_parallel_reading= */ local_context->canUseParallelReplicasOnFollower()); + if (plan) query_plan = std::move(*plan); } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 2bd1fcbc693..fb74097d768 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -561,7 +561,6 @@ private: const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, 
ContextPtr local_context, - QueryProcessingStage::Enum processed_stage, size_t max_block_size, size_t num_streams); @@ -571,7 +570,6 @@ private: const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, - QueryProcessingStage::Enum processed_stage, size_t max_block_size, size_t num_streams); diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index d7cc86ed321..3ddbfe8d894 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include @@ -856,6 +858,7 @@ public: blob_log->query_id = context->getCurrentQueryId(); } + const auto & settings = context->getSettingsRef(); write_buf = wrapWriteBufferWithCompressionMethod( std::make_unique( configuration_.client, @@ -868,7 +871,8 @@ public: threadPoolCallbackRunner(getIOThreadPool().get(), "S3ParallelWrite"), context->getWriteSettings()), compression_method, - 3); + static_cast(settings.output_format_compression_level), + static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, format_settings); } @@ -1377,7 +1381,7 @@ bool StorageS3::Configuration::update(ContextPtr context) request_settings = s3_settings.request_settings; request_settings.updateFromSettings(context->getSettings()); - if (client && (static_configuration || s3_settings.auth_settings == auth_settings)) + if (client && (static_configuration || !auth_settings.hasUpdates(s3_settings.auth_settings))) return false; auth_settings.updateFrom(s3_settings.auth_settings); @@ -1600,11 +1604,11 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context if (engine_args_to_idx.contains("session_token")) configuration.auth_settings.session_token = checkAndGetLiteralArgument(engine_args[engine_args_to_idx["session_token"]], "session_token"); - - configuration.auth_settings.no_sign_request = no_sign_request; + if (no_sign_request) + configuration.auth_settings.no_sign_request = no_sign_request; } - configuration.static_configuration = !configuration.auth_settings.access_key_id.empty(); + configuration.static_configuration = !configuration.auth_settings.access_key_id.empty() || configuration.auth_settings.no_sign_request.has_value(); configuration.keys = {configuration.url.key}; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index c0e4be36202..a31b131bf1d 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -541,11 +541,12 @@ StorageURLSink::StorageURLSink( Poco::URI(uri), http_method, content_type, content_encoding, headers, timeouts, DBMS_DEFAULT_BUFFER_SIZE, proxy_config ); + const auto & settings = context->getSettingsRef(); write_buf = wrapWriteBufferWithCompressionMethod( std::move(write_buffer), compression_method, - 3 - ); + static_cast(settings.output_format_compression_level), + static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormat(format, *write_buf, sample_block, context, format_settings); } diff --git a/src/Storages/System/StorageSystemBuildOptions.cpp.in b/src/Storages/System/StorageSystemBuildOptions.cpp.in index 796b134ba56..a81bcb08bfc 100644 --- a/src/Storages/System/StorageSystemBuildOptions.cpp.in +++ b/src/Storages/System/StorageSystemBuildOptions.cpp.in @@ -63,6 +63,7 @@ const char * auto_config_build[] "USE_ORC", "@USE_ORC@", "USE_MSGPACK", "@USE_MSGPACK@", "USE_QPL", 
"@ENABLE_QPL@", + "USE_QAT", "@ENABLE_QATLIB@", "GIT_HASH", "@GIT_HASH@", "GIT_BRANCH", R"IRjaNsZIL9Yh7FQ4(@GIT_BRANCH@)IRjaNsZIL9Yh7FQ4", "GIT_DATE", "@GIT_DATE@", diff --git a/src/Storages/System/StorageSystemMySQLBinlogs.cpp b/src/Storages/System/StorageSystemMySQLBinlogs.cpp new file mode 100644 index 00000000000..32648d22ee8 --- /dev/null +++ b/src/Storages/System/StorageSystemMySQLBinlogs.cpp @@ -0,0 +1,164 @@ +#include + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +NamesAndTypesList StorageSystemMySQLBinlogs::getNamesAndTypes() +{ + return { + {"binlog_client_name", std::make_shared()}, + {"name", std::make_shared()}, + {"mysql_binlog_name", std::make_shared()}, + {"mysql_binlog_pos", std::make_shared()}, + {"mysql_binlog_timestamp", std::make_shared()}, + {"mysql_binlog_executed_gtid_set", std::make_shared()}, + {"dispatcher_name", std::make_shared()}, + {"dispatcher_mysql_binlog_name", std::make_shared()}, + {"dispatcher_mysql_binlog_pos", std::make_shared()}, + {"dispatcher_mysql_binlog_timestamp", std::make_shared()}, + {"dispatcher_mysql_binlog_executed_gtid_set", std::make_shared()}, + {"size", std::make_shared()}, + {"bytes", std::make_shared()}, + {"max_bytes", std::make_shared()}, + {"max_waiting_ms", std::make_shared()}, + {"dispatcher_events_read_per_sec", std::make_shared()}, + {"dispatcher_bytes_read_per_sec", std::make_shared()}, + {"dispatcher_events_flush_per_sec", std::make_shared()}, + {"dispatcher_bytes_flush_per_sec", std::make_shared()}, + }; +} + +StorageSystemMySQLBinlogs::StorageSystemMySQLBinlogs(const StorageID & storage_id_) + : IStorage(storage_id_) +{ + StorageInMemoryMetadata storage_metadata; + ColumnsDescription columns(getNamesAndTypes()); + storage_metadata.setColumns(columns); + setInMemoryMetadata(storage_metadata); +} + +class MetadataSource : public ISource +{ +public: + using DispatcherMetadata = MySQLReplication::BinlogEventsDispatcher::DispatcherMetadata; + using BinlogMetadata = MySQLReplication::BinlogEventsDispatcher::BinlogMetadata; + + MetadataSource(Block block_header_, const std::vector & clients_) + : ISource(block_header_) + , block_to_fill(std::move(block_header_)) + , clients(clients_) + {} + + String getName() const override { return "MySQLBinlogClient"; } + +protected: + Chunk generate() override + { + if (clients.empty()) + return {}; + + Columns columns; + columns.reserve(block_to_fill.columns()); + + size_t total_size = 0; + auto create_column = [&](auto && column, const std::function & field) + { + size_t size = 0; + for (const auto & client : clients) + { + for (const auto & d : client.dispatchers) + { + for (const auto & b : d.binlogs) + { + column->insert(field(client.binlog_client_name, d, b)); + ++size; + } + } + } + if (!total_size) + total_size = size; + return std::forward(column); + }; + + for (const auto & elem : block_to_fill) + { + if (elem.name == "binlog_client_name") + columns.emplace_back(create_column(ColumnString::create(), [](auto n, auto, auto) { return Field(n); })); + else if (elem.name == "name") + columns.emplace_back(create_column(ColumnString::create(), [](auto, auto, auto b) { return Field(b.name); })); + else if (elem.name == "mysql_binlog_name") + columns.emplace_back(create_column(ColumnString::create(), [](auto, auto, auto b) { return Field(b.position_read.binlog_name); })); + else if (elem.name == "mysql_binlog_pos") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto, auto b) { return Field(b.position_read.binlog_pos); })); + else 
if (elem.name == "mysql_binlog_timestamp") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto, auto b) { return Field(b.position_read.timestamp); })); + else if (elem.name == "mysql_binlog_executed_gtid_set") + columns.emplace_back(create_column(ColumnString::create(), [](auto, auto, auto b) { return Field(b.position_read.gtid_sets.toString()); })); + else if (elem.name == "dispatcher_name") + columns.emplace_back(create_column(ColumnString::create(), [](auto, auto d, auto) { return Field(d.name); })); + else if (elem.name == "dispatcher_mysql_binlog_name") + columns.emplace_back(create_column(ColumnString::create(), [](auto, auto d, auto) { return Field(d.position.binlog_name); })); + else if (elem.name == "dispatcher_mysql_binlog_pos") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto d, auto) { return Field(d.position.binlog_pos); })); + else if (elem.name == "dispatcher_mysql_binlog_timestamp") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto d, auto) { return Field(d.position.timestamp); })); + else if (elem.name == "dispatcher_mysql_binlog_executed_gtid_set") + columns.emplace_back(create_column(ColumnString::create(), [](auto, auto d, auto) { return Field(d.position.gtid_sets.toString()); })); + else if (elem.name == "size") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto, auto b) { return Field(b.size); })); + else if (elem.name == "bytes") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto, auto b) { return Field(b.bytes); })); + else if (elem.name == "max_bytes") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto, auto b) { return Field(b.max_bytes); })); + else if (elem.name == "max_waiting_ms") + columns.emplace_back(create_column(ColumnUInt64::create(), [](auto, auto, auto b) { return Field(b.max_waiting_ms); })); + else if (elem.name == "dispatcher_events_read_per_sec") + columns.emplace_back(create_column(ColumnFloat32::create(), [](auto, auto d, auto) { return Field(d.events_read_per_sec); })); + else if (elem.name == "dispatcher_bytes_read_per_sec") + columns.emplace_back(create_column(ColumnFloat32::create(), [](auto, auto d, auto) { return Field(d.bytes_read_per_sec); })); + else if (elem.name == "dispatcher_events_flush_per_sec") + columns.emplace_back(create_column(ColumnFloat32::create(), [](auto, auto d, auto) { return Field(d.events_flush_per_sec); })); + else if (elem.name == "dispatcher_bytes_flush_per_sec") + columns.emplace_back(create_column(ColumnFloat32::create(), [](auto, auto d, auto) { return Field(d.bytes_flush_per_sec); })); + } + + clients.clear(); + return {std::move(columns), total_size}; + } + +private: + Block block_to_fill; + std::vector clients; +}; + +Pipe StorageSystemMySQLBinlogs::read( + const Names & column_names_, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & /* query_info_ */, + ContextPtr /*context_ */, + QueryProcessingStage::Enum /*processed_stage*/, + size_t /*max_block_size*/, + size_t /*num_streams*/) +{ + storage_snapshot->check(column_names_); + const ColumnsDescription & our_columns = storage_snapshot->getDescriptionForColumns(column_names_); + Block block_header; + for (const auto & name : column_names_) + { + const auto & name_type = our_columns.get(name); + MutableColumnPtr column = name_type.type->createColumn(); + block_header.insert({std::move(column), name_type.type, name_type.name}); + } + + return Pipe{std::make_shared(block_header, 
MySQLReplication::BinlogClientFactory::instance().getMetadata())}; +} + +} diff --git a/src/Storages/System/StorageSystemMySQLBinlogs.h b/src/Storages/System/StorageSystemMySQLBinlogs.h new file mode 100644 index 00000000000..a627137f495 --- /dev/null +++ b/src/Storages/System/StorageSystemMySQLBinlogs.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +namespace DB +{ + +class StorageSystemMySQLBinlogs final : public IStorage +{ +public: + explicit StorageSystemMySQLBinlogs(const StorageID & storage_id_); + + std::string getName() const override { return "MySQLBinlogs"; } + + Pipe read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; + + bool isSystemStorage() const override { return true; } + + static NamesAndTypesList getNamesAndTypes(); +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index 2ecddd4b5ed..5180cac609b 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -107,6 +107,10 @@ #include #endif +#if USE_MYSQL +#include +#endif + namespace DB { @@ -174,6 +178,9 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b #if USE_ROCKSDB attach(context, system_database, "rocksdb"); #endif +#if USE_MYSQL + attach(context, system_database, "mysql_binlogs"); +#endif attach(context, system_database, "parts"); attach(context, system_database, "projection_parts"); diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index 41a222525bf..b3f5d181d5d 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -293,7 +293,7 @@ TTLDescription TTLDescription::getTTLFromAST( { result.recompression_codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec); + ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec, context->getSettingsRef().enable_zstd_qat_codec); } } diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 895a12313da..e3319fe4a72 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -from enum import Enum import logging from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from dataclasses import dataclass, field +from enum import Enum from pathlib import Path from typing import Callable, Dict, Iterable, List, Literal, Optional, Union @@ -47,7 +47,7 @@ class JobConfig: @num_batches - sets number of batches for multi-batch job """ - digest: DigestConfig = DigestConfig() + digest: DigestConfig = field(default_factory=DigestConfig) run_command: str = "" timeout: Optional[int] = None num_batches: int = 1 @@ -67,30 +67,32 @@ class BuildConfig: sparse_checkout: bool = False comment: str = "" static_binary_name: str = "" - job_config: JobConfig = JobConfig( - digest=DigestConfig( - include_paths=[ - "./src", - "./contrib/*-cmake", - "./contrib/consistent-hashing", - "./contrib/murmurhash", - "./contrib/libfarmhash", - "./contrib/pdqsort", - "./contrib/cityhash102", - "./contrib/sparse-checkout", - 
"./contrib/libmetrohash", - "./contrib/update-submodules.sh", - "./contrib/CMakeLists.txt", - "./cmake", - "./base", - "./programs", - "./packages", - "./docker/packager/packager", - ], - exclude_files=[".md"], - docker=["clickhouse/binary-builder"], - git_submodules=True, - ), + job_config: JobConfig = field( + default_factory=lambda: JobConfig( + digest=DigestConfig( + include_paths=[ + "./src", + "./contrib/*-cmake", + "./contrib/consistent-hashing", + "./contrib/murmurhash", + "./contrib/libfarmhash", + "./contrib/pdqsort", + "./contrib/cityhash102", + "./contrib/sparse-checkout", + "./contrib/libmetrohash", + "./contrib/update-submodules.sh", + "./contrib/CMakeLists.txt", + "./cmake", + "./base", + "./programs", + "./packages", + "./docker/packager/packager", + ], + exclude_files=[".md"], + docker=["clickhouse/binary-builder"], + git_submodules=True, + ), + ) ) def export_env(self, export: bool = False) -> str: @@ -107,14 +109,14 @@ class BuildConfig: @dataclass class BuildReportConfig: builds: List[str] - job_config: JobConfig = JobConfig() + job_config: JobConfig = field(default_factory=JobConfig) @dataclass class TestConfig: required_build: str force_tests: bool = False - job_config: JobConfig = JobConfig() + job_config: JobConfig = field(default_factory=JobConfig) BuildConfigs = Dict[str, BuildConfig] diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 4d81161b6de..89fcb9ce350 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -55,6 +55,7 @@ def get_additional_envs( result.append("USE_PARALLEL_REPLICAS=1") if "s3 storage" in check_name: result.append("USE_S3_STORAGE_FOR_MERGE_TREE=1") + result.append("RANDOMIZE_OBJECT_KEY_TYPE=1") if "analyzer" in check_name: result.append("USE_NEW_ANALYZER=1") diff --git a/tests/ci/libfuzzer_test_check.py b/tests/ci/libfuzzer_test_check.py index 6de0614541a..49699b7d2fd 100644 --- a/tests/ci/libfuzzer_test_check.py +++ b/tests/ci/libfuzzer_test_check.py @@ -47,6 +47,7 @@ def get_additional_envs(check_name, run_by_hash_num, run_by_hash_total): result.append("USE_PARALLEL_REPLICAS=1") if "s3 storage" in check_name: result.append("USE_S3_STORAGE_FOR_MERGE_TREE=1") + result.append("RANDOMIZE_OBJECT_KEY_TYPE=1") if "analyzer" in check_name: result.append("USE_NEW_ANALYZER=1") diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index bc403aa5015..616d645b5a6 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -5,20 +5,19 @@ import shutil import time from multiprocessing.dummy import Pool from pathlib import Path -from typing import List, Union +from typing import Any, List, Union import boto3 # type: ignore import botocore # type: ignore - -from env_helper import ( - S3_TEST_REPORTS_BUCKET, - S3_BUILDS_BUCKET, - RUNNER_TEMP, - CI, - S3_URL, - S3_DOWNLOAD, -) from compress_files import compress_file_fast +from env_helper import ( + CI, + RUNNER_TEMP, + S3_BUILDS_BUCKET, + S3_DOWNLOAD, + S3_TEST_REPORTS_BUCKET, + S3_URL, +) def _flatten_list(lst): @@ -34,11 +33,14 @@ def _flatten_list(lst): class S3Helper: max_pool_size = 100 - def __init__(self): + def __init__(self, client: Any = None, endpoint: str = S3_URL): + self.host = endpoint + if client is not None: + self.client = client + return config = botocore.config.Config(max_pool_connections=self.max_pool_size) - self.session = boto3.session.Session(region_name="us-east-1") - self.client = self.session.client("s3", endpoint_url=S3_URL, config=config) - self.host = S3_URL + session = 
boto3.session.Session(region_name="us-east-1") + self.client = session.client("s3", endpoint_url=endpoint, config=config) def _upload_file_to_s3( self, bucket_name: str, file_path: Path, s3_path: str @@ -199,6 +201,7 @@ class S3Helper: t = time.time() except Exception as ex: logging.critical("Failed to upload file, expcetion %s", ex) + return "" return self.s3_url(bucket_name, s3_path) p = Pool(self.max_pool_size) diff --git a/tests/ci/stress.py b/tests/ci/stress.py index 49a53c9048c..7d582e683e0 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -21,6 +21,7 @@ def get_options(i: int, upgrade_check: bool) -> str: options.append(f'''--db-engine="Replicated('/test/db/test_{i}', 's1', 'r1')"''') client_options.append("allow_experimental_database_replicated=1") client_options.append("enable_deflate_qpl_codec=1") + client_options.append("enable_zstd_qat_codec=1") # If database name is not specified, new database is created for each functional test. # Run some threads with one database for all tests. diff --git a/tests/config/config.d/s3_storage_policy_with_template_object_key.xml b/tests/config/config.d/s3_storage_policy_with_template_object_key.xml new file mode 100644 index 00000000000..834f5102da1 --- /dev/null +++ b/tests/config/config.d/s3_storage_policy_with_template_object_key.xml @@ -0,0 +1,32 @@ + + + + + s3 + http://localhost:11111/test/ + clickhouse + clickhouse + test + + [a-z]{3}-first-random-part/new-style-prefix/[a-z]{3}/[a-z]{29} + + + cache + 1Gi + cached_s3/ + s3 + + + + + +
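The S3Helper constructor above now accepts an optional pre-built client and endpoint instead of always constructing its own boto3 session. A rough sketch of that dependency-injection idea, using a stub client so it runs without boto3 or network access; the stub, bucket, and method names below are made up for illustration and do not mirror the real helper's API.

```python
from typing import Any, Optional


class StubS3Client:
    """Stands in for a boto3 S3 client in tests: records uploads instead of performing them."""

    def __init__(self) -> None:
        self.uploaded = []

    def upload_file(self, Filename: str, Bucket: str, Key: str) -> None:
        self.uploaded.append((Filename, Bucket, Key))


class Helper:
    """Mirrors the injectable-client shape: use the provided client if given;
    otherwise a real boto3 session/client would be built (omitted in this sketch)."""

    def __init__(self, client: Optional[Any] = None, endpoint: str = "http://localhost:11111"):
        self.host = endpoint
        if client is None:
            raise NotImplementedError("real boto3 client construction omitted here")
        self.client = client

    def upload(self, path: str, bucket: str, key: str) -> str:
        self.client.upload_file(Filename=path, Bucket=bucket, Key=key)
        return f"{self.host}/{bucket}/{key}"


stub = StubS3Client()
print(Helper(client=stub).upload("report.html", "test-reports", "pr/1/report.html"))
assert stub.uploaded  # the upload was observed locally, no network needed
```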

cached_s3
+ + + + + + s3 + + cached_s3 + diff --git a/tests/config/config.d/storage_metadata_with_full_object_key.xml b/tests/config/config.d/storage_metadata_with_full_object_key.xml new file mode 100644 index 00000000000..2bb8d49ec4b --- /dev/null +++ b/tests/config/config.d/storage_metadata_with_full_object_key.xml @@ -0,0 +1,5 @@ + + + + 1 + diff --git a/tests/config/install.sh b/tests/config/install.sh index 2f9fd44c9b0..3a0744a298d 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -141,7 +141,25 @@ if [[ -n "$USE_DATABASE_ORDINARY" ]] && [[ "$USE_DATABASE_ORDINARY" -eq 1 ]]; th fi if [[ -n "$USE_S3_STORAGE_FOR_MERGE_TREE" ]] && [[ "$USE_S3_STORAGE_FOR_MERGE_TREE" -eq 1 ]]; then - ln -sf $SRC_PATH/config.d/s3_storage_policy_by_default.xml $DEST_SERVER_PATH/config.d/ + object_key_types_options=("generate-suffix" "generate-full-key" "generate-template-key") + object_key_type="${object_key_types_options[0]}" + + if [[ -n "$RANDOMIZE_OBJECT_KEY_TYPE" ]] && [[ "$RANDOMIZE_OBJECT_KEY_TYPE" -eq 1 ]]; then + object_key_type="${object_key_types_options[$(($RANDOM % ${#object_key_types_options[@]}))]}" + fi + + case $object_key_type in + "generate-full-key") + ln -sf $SRC_PATH/config.d/storage_metadata_with_full_object_key.xml $DEST_SERVER_PATH/config.d/ + ;; + "generate-template-key") + ln -sf $SRC_PATH/config.d/storage_metadata_with_full_object_key.xml $DEST_SERVER_PATH/config.d/ + ln -sf $SRC_PATH/config.d/s3_storage_policy_with_template_object_key.xml $DEST_SERVER_PATH/config.d/ + ;; + "generate-suffix"|*) + ln -sf $SRC_PATH/config.d/s3_storage_policy_by_default.xml $DEST_SERVER_PATH/config.d/ + ;; + esac fi ARM="aarch64" diff --git a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py index 286a2d29541..97c8b65f15d 100644 --- a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py @@ -2714,3 +2714,698 @@ def table_with_indexes(clickhouse_node, mysql_node, service_name): mysql_node.query(f"DROP DATABASE IF EXISTS {db}") clickhouse_node.query(f"DROP DATABASE IF EXISTS {db}") + + +def binlog_client_test(clickhouse_node, mysql_node, replication): + db = "binlog_client_test" + replication.create_db_mysql(db) + + mysql_node.query( + f"CREATE TABLE {db}.t(id INT PRIMARY KEY AUTO_INCREMENT, score int, create_time DATETIME DEFAULT NOW())" + ) + replication.insert_data(db, "t", 100000, column="score") + replication.create_db_ch(f"{db}1", from_mysql_db=db, settings="use_binlog_client=1") + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db}1 FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 100000, column="score") + + num_rows = replication.inserted_rows + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}1.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + + replication.create_db_ch(f"{db}2", from_mysql_db=db, settings="use_binlog_client=1") + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db}2 FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 100000, column="score") + num_rows = replication.inserted_rows + + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT count() FROM system.mysql_binlogs WHERE name = '{db}1'", + "1\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT count() FROM system.mysql_binlogs WHERE name = '{db}2'", + 
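The assertions in these integration tests go through `check_query(node, query, expected, interval_seconds=..., retry_count=...)`, which keeps polling ClickHouse until replication catches up with MySQL. The real helper lives in materialized_with_ddl.py and is not shown in this diff; the following is only a hypothetical sketch of that retry pattern, assumed from the call sites.

```python
import time


def check_query(node, query, expected, interval_seconds=1, retry_count=30):
    """Poll `node` until `query` returns `expected`, or fail after retry_count attempts.

    Hypothetical sketch of the helper used by these tests; the real implementation
    may differ in details such as logging and result normalization.
    """
    last = None
    for _ in range(retry_count):
        last = node.query(query)
        if last == expected:
            return
        time.sleep(interval_seconds)
    raise AssertionError(f"query {query!r} returned {last!r}, expected {expected!r}")
```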
"1\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}1.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=60, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}2.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=60, + ) + # Catch up + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT COUNT(DISTINCT(dispatcher_name)) FROM system.mysql_binlogs WHERE name LIKE '{db}%'", + "1\n", + interval_seconds=1, + retry_count=30, + ) + + replication.drop_dbs_ch() + replication.create_db_ch( + f"{db}1", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=10", + ) + replication.create_db_ch( + f"{db}2", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=10", + ) + replication.insert_data(db, "t", 10000, column="score") + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db}1 FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 100000, column="score") + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db}2 FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 10000, column="score") + + num_rows = replication.inserted_rows + + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT count() FROM system.mysql_binlogs WHERE name = '{db}1'", + "1\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT count() FROM system.mysql_binlogs WHERE name = '{db}2'", + "1\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}1.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=60, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}2.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=60, + ) + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT COUNT(DISTINCT(dispatcher_name)) FROM system.mysql_binlogs WHERE name LIKE '{db}%'", + "1\n", + interval_seconds=1, + retry_count=30, + ) + + replication.create_db_ch( + f"{db}3", + from_mysql_db=db, + settings="use_binlog_client=1", + ) + + mysql_node.query(f"UPDATE {db}.t SET score = score + 1") + + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT COUNT(DISTINCT(dispatcher_name)) FROM system.mysql_binlogs WHERE name LIKE '{db}%'", + "1\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT size FROM system.mysql_binlogs WHERE name = '{db}1'", + "0\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT size FROM system.mysql_binlogs WHERE name = '{db}2'", + "0\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT size FROM system.mysql_binlogs WHERE name = '{db}3'", + "0\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}1.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}2.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}3.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + + mysql_crc32 = mysql_node.query_and_get_data( + f"SELECT bit_xor(cast(crc32(concat(id, score, create_time)) 
AS unsigned)) AS checksum FROM {db}.t" + )[0][0] + column = "bit_xor(cast(crc32(concat(toString(assumeNotNull(id)), toString(assumeNotNull(score)), toString(assumeNotNull(create_time)))) AS UInt32)) AS checksum" + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}1.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}2.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}3.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + + clickhouse_node.query(f"DROP DATABASE IF EXISTS {db}1") + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM system.mysql_binlogs WHERE name = '{db}1'", + "0\n", + interval_seconds=1, + retry_count=10, + ) + + +def binlog_client_timeout_test(clickhouse_node, mysql_node, replication): + db = "binlog_client_timeout_test" + replication.create_db_mysql(db) + mysql_node.query( + f"CREATE TABLE {db}.t(id INT PRIMARY KEY AUTO_INCREMENT, score int, create_time DATETIME DEFAULT NOW())" + ) + replication.insert_data(db, "t", 10000, column="score") + num_rows = replication.inserted_rows + + replication.create_db_ch( + f"{db}1", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=100000000, max_milliseconds_to_wait_in_binlog_queue=60000", + ) + replication.create_db_ch( + f"{db}2", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=10", + ) + replication.create_db_ch( + f"{db}3", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=10, max_milliseconds_to_wait_in_binlog_queue=100", + ) + replication.create_db_ch( + f"{db}4", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=10, max_milliseconds_to_wait_in_binlog_queue=10", + ) + + # After incremental sync + check_query( + clickhouse_node, + f"/* expect: 100000000, 60000 */ SELECT max_bytes, max_waiting_ms FROM system.mysql_binlogs WHERE name = '{db}1'", + f"100000000\t60000\n", + interval_seconds=1, + retry_count=10, + ) + check_query( + clickhouse_node, + f"/* expect: 10 */ SELECT max_bytes FROM system.mysql_binlogs WHERE name = '{db}2'", + f"10\n", + interval_seconds=2, + retry_count=10, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}1.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}2.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}3.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}4.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + + clickhouse_node.query(f"DROP DATABASE {db}3") + replication.create_db_ch( + f"{db}3", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=10, max_milliseconds_to_wait_in_binlog_queue=10", + ) + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db}3 FORMAT TSV", + "t\n", + ) + + clickhouse_node.query(f"DROP DATABASE {db}4") + replication.create_db_ch( + f"{db}4", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_queue=10, max_milliseconds_to_wait_in_binlog_queue=50", + ) + 
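Several of these assertions compare the two systems with a per-table checksum: `bit_xor(crc32(concat(...)))` on the MySQL side against the equivalent expression in ClickHouse. A small stand-alone sketch of the same idea (the rows and column values below are just an example) shows why XOR-of-CRC32 works as an order-independent table digest.

```python
import zlib
from functools import reduce

# Toy rows standing in for (id, score, create_time); real values come from the table.
rows = [
    (1, 42, "2024-01-01 00:00:00"),
    (2, 7, "2024-01-02 00:00:00"),
]


def row_crc(row) -> int:
    # concat(...) of the row's values followed by CRC32, as in the SQL expression.
    return zlib.crc32("".join(str(v) for v in row).encode())


def table_checksum(table_rows) -> int:
    # bit_xor aggregation of the per-row CRCs; XOR is commutative, so row order is irrelevant.
    return reduce(lambda acc, r: acc ^ row_crc(r), table_rows, 0)


assert table_checksum(rows) == table_checksum(list(reversed(rows)))
print(hex(table_checksum(rows)))
```

The string rendering of each value has to match on both sides, which is why the ClickHouse expression in the tests spells out `toString(assumeNotNull(...))` for every column.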
check_query( + clickhouse_node, + f"SHOW TABLES FROM {db}4 FORMAT TSV", + "t\n", + ) + + mysql_node.query( + f"UPDATE {db}.t SET create_time='2021-01-01' WHERE id > 1000 AND id < 100000" + ) + mysql_node.query(f"UPDATE {db}.t SET create_time='2021-11-11' WHERE score > 1000") + mysql_node.query( + f"UPDATE {db}.t SET create_time=now() WHERE create_time='2021-01-01'" + ) + + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM {db}1.t WHERE toDate(create_time)='2021-01-01'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM {db}2.t WHERE toDate(create_time)='2021-01-01'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM {db}3.t WHERE toDate(create_time)='2021-01-01'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM {db}4.t WHERE toDate(create_time)='2021-01-01'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT size FROM system.mysql_binlogs WHERE name = '{db}1'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT size FROM system.mysql_binlogs WHERE name = '{db}2'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT size FROM system.mysql_binlogs WHERE name = '{db}3'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT size FROM system.mysql_binlogs WHERE name = '{db}4'", + "0\n", + interval_seconds=1, + retry_count=300, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}1.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}2.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}3.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {num_rows} */ SELECT count() FROM {db}4.t", + f"{num_rows}\n", + interval_seconds=1, + retry_count=30, + ) + + mysql_crc32 = mysql_node.query_and_get_data( + f"SELECT bit_xor(cast(crc32(concat(id, score, create_time)) AS unsigned)) AS checksum FROM {db}.t" + )[0][0] + column = "bit_xor(cast(crc32(concat(toString(assumeNotNull(id)), toString(assumeNotNull(score)), toString(assumeNotNull(create_time)))) AS UInt32)) AS checksum" + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}1.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}2.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}3.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}4.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + + +def wrong_password_test(clickhouse_node, mysql_node, replication): + db = "wrong_password_test" + replication.create_db_mysql(db) + mysql_node.query( + f"CREATE TABLE {db}.t(id INT PRIMARY KEY 
AUTO_INCREMENT, score int, create_time DATETIME DEFAULT NOW())" + ) + replication.insert_data(db, "t", 100, column="score") + with pytest.raises(Exception) as exc: + clickhouse_node.query( + f"CREATE DATABASE {db} ENGINE = MaterializedMySQL('{replication.mysql_host}:3306', '{db}', 'root', 'wrong_password') SETTINGS use_binlog_client=1" + ) + + replication.create_db_ch(db, settings="use_binlog_client=1") + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db} FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 100, column="score") + check_query( + clickhouse_node, + f"/* expect: 200 */ SELECT COUNT() FROM {db}.t ", + "200\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: root@{replication.mysql_host}:3306 */ SELECT binlog_client_name FROM system.mysql_binlogs WHERE name = '{db}'", + f"root@{replication.mysql_host}:3306\n", + interval_seconds=1, + retry_count=30, + ) + + +def dispatcher_buffer_test(clickhouse_node, mysql_node, replication): + db = "dispatcher_buffer_test" + replication.create_db_mysql(db) + mysql_node.query( + f"CREATE TABLE {db}.t(id INT PRIMARY KEY AUTO_INCREMENT, score int, create_time DATETIME DEFAULT NOW())" + ) + replication.insert_data(db, "t", 100, column="score") + rows_count = 100 + replication.create_db_ch( + db, + settings="use_binlog_client=1, max_bytes_in_binlog_dispatcher_buffer=0, max_flush_milliseconds_in_binlog_dispatcher=0", + ) + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db} FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 100000, column="score") + rows_count += 100000 + + mysql_node.query( + f"UPDATE {db}.t SET create_time='2021-01-01' WHERE id > 10000 AND id < 50000" + ) + mysql_node.query( + f"UPDATE {db}.t SET create_time=now() WHERE create_time='2021-01-01'" + ) + + mysql_crc32 = mysql_node.query_and_get_data( + f"SELECT bit_xor(cast(crc32(concat(id, score, create_time)) AS unsigned)) AS checksum FROM {db}.t" + )[0][0] + column = "bit_xor(cast(crc32(concat(toString(assumeNotNull(id)), toString(assumeNotNull(score)), toString(assumeNotNull(create_time)))) AS UInt32)) AS checksum" + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {rows_count} */ SELECT COUNT() FROM {db}.t", + f"{rows_count}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM {db}.t WHERE toDate(create_time)='2021-01-01'", + "0\n", + interval_seconds=1, + retry_count=30, + ) + + clickhouse_node.query(f"DROP DATABASE {db}") + replication.create_db_ch( + f"{db}", + from_mysql_db=db, + settings="use_binlog_client=1, max_bytes_in_binlog_dispatcher_buffer=1000, max_flush_milliseconds_in_binlog_dispatcher=1000", + ) + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db} FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 10000, column="score") + rows_count += 10000 + + mysql_node.query(f"UPDATE {db}.t SET create_time='2021-11-11' WHERE score > 10000") + mysql_node.query( + f"UPDATE {db}.t SET create_time='2021-01-01' WHERE id > 10000 AND id < 50000" + ) + mysql_node.query( + f"UPDATE {db}.t SET create_time=now() WHERE create_time='2021-01-01'" + ) + mysql_node.query( + f"UPDATE {db}.t SET create_time=now() WHERE create_time='2021-11-01'" + ) + + mysql_crc32 = mysql_node.query_and_get_data( + f"SELECT bit_xor(cast(crc32(concat(id, score, create_time)) AS unsigned)) 
AS checksum FROM {db}.t" + )[0][0] + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {rows_count} */ SELECT COUNT() FROM {db}.t", + f"{rows_count}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM {db}.t WHERE toDate(create_time)='2021-11-01'", + "0\n", + interval_seconds=1, + retry_count=30, + ) + + replication.create_db_ch( + db, + settings="use_binlog_client=1, max_bytes_in_binlog_dispatcher_buffer=100000000, max_flush_milliseconds_in_binlog_dispatcher=1000", + ) + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db} FORMAT TSV", + "t\n", + ) + + replication.insert_data(db, "t", 100000, column="score") + rows_count += 100000 + + mysql_node.query(f"UPDATE {db}.t SET create_time='2021-11-11' WHERE score > 10000") + mysql_node.query( + f"UPDATE {db}.t SET create_time='2021-01-01' WHERE id > 10000 AND id < 50000" + ) + mysql_node.query( + f"UPDATE {db}.t SET create_time=now() WHERE create_time='2021-01-01'" + ) + mysql_node.query( + f"UPDATE {db}.t SET create_time=now() WHERE create_time='2021-11-01'" + ) + + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT COUNT() FROM system.mysql_binlogs WHERE name = '{db}' AND (dispatcher_events_read_per_sec > 0 OR dispatcher_bytes_read_per_sec > 0 OR dispatcher_events_flush_per_sec > 0 OR dispatcher_bytes_flush_per_sec > 0)", + f"1\n", + interval_seconds=1, + retry_count=30, + ) + + mysql_crc32 = mysql_node.query_and_get_data( + f"SELECT bit_xor(cast(crc32(concat(id, score, create_time)) AS unsigned)) AS checksum FROM {db}.t" + )[0][0] + check_query( + clickhouse_node, + f"/* expect: {mysql_crc32} */ SELECT {column} FROM {db}.t", + f"{mysql_crc32}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: {rows_count} */ SELECT COUNT() FROM {db}.t", + f"{rows_count}\n", + interval_seconds=1, + retry_count=30, + ) + check_query( + clickhouse_node, + f"/* expect: 0 */ SELECT COUNT() FROM {db}.t WHERE toDate(create_time)='2021-11-01'", + "0\n", + interval_seconds=1, + retry_count=30, + ) + + +def gtid_after_attach_test(clickhouse_node, mysql_node, replication): + db = "gtid_after_attach_test" + replication.create_db_mysql(db) + mysql_node.query( + f"CREATE TABLE {db}.t(id INT PRIMARY KEY AUTO_INCREMENT, score int, create_time DATETIME DEFAULT NOW())" + ) + + db_count = 6 + for i in range(db_count): + replication.create_db_ch( + f"{db}{i}", + from_mysql_db=db, + settings="use_binlog_client=1", + ) + check_query( + clickhouse_node, + f"SHOW TABLES FROM {db}0 FORMAT TSV", + "t\n", + ) + for i in range(int(db_count / 2)): + clickhouse_node.query(f"DETACH DATABASE {db}{i}") + + mysql_node.query(f"USE {db}") + rows = 10000 + for i in range(100): + mysql_node.query(f"ALTER TABLE t ADD COLUMN (e{i} INT)") + replication.insert_data(db, "t", rows, column="score") + + clickhouse_node.restart_clickhouse(stop_start_wait_sec=120) + + check_query( + clickhouse_node, + f"/* expect: 1 */ SELECT COUNT(DISTINCT(dispatcher_name)) FROM system.mysql_binlogs WHERE name LIKE '{db}%'", + "1\n", + interval_seconds=1, + retry_count=300, + ) diff --git a/tests/integration/test_materialized_mysql_database/test.py b/tests/integration/test_materialized_mysql_database/test.py index f3369e25d94..727188a4b86 100644 --- a/tests/integration/test_materialized_mysql_database/test.py +++ 
b/tests/integration/test_materialized_mysql_database/test.py @@ -1,3 +1,4 @@ +import os import time import pymysql.cursors import pytest @@ -142,6 +143,145 @@ def clickhouse_node(): yield node_db +class ReplicationHelper: + def __init__(self, clickhouse, mysql, mysql_host=None): + self.clickhouse = clickhouse + self.mysql = mysql + self.created_mysql_dbs = [] + self.created_clickhouse_dbs = [] + self.base_mysql_settings = os.getenv("TEST_BASE_MYSQL_SETTINGS", "") + self.base_ch_settings = os.getenv("TEST_BASE_CH_SETTINGS", "") + self.mysql_host = mysql_host if mysql_host is not None else cluster.mysql8_host + self.created_insert_procedures = {} + self.inserted_rows_per_sp = {} + self.inserted_rows = 0 + + def create_dbs(self, db_name, ch_settings="", mysql_settings=""): + self.create_db_mysql(db_name, settings=mysql_settings) + self.create_db_ch(db_name, settings=ch_settings) + + def create_db_mysql(self, db_name, settings=""): + self.mysql.query(f"DROP DATABASE IF EXISTS {db_name}") + self.mysql.query( + f"CREATE DATABASE {db_name} {self.base_mysql_settings} {settings}" + ) + self.created_mysql_dbs.append(db_name) + + def create_db_ch( + self, db_name, from_mysql_db=None, settings="", table_overrides="" + ): + if from_mysql_db is None: + from_mysql_db = db_name + self.clickhouse.query(f"DROP DATABASE IF EXISTS {db_name}") + all_settings = "" + create_query = f"CREATE DATABASE {db_name} ENGINE = MaterializedMySQL('{self.mysql_host}:3306', '{from_mysql_db}', 'root', 'clickhouse')" + if self.base_ch_settings or settings: + separator = ", " if self.base_ch_settings and settings else "" + create_query += f" SETTINGS {self.base_ch_settings}{separator}{settings}" + if table_overrides: + create_query += f" {table_overrides}" + self.clickhouse.query(create_query) + self.created_clickhouse_dbs.append(db_name) + + def drop_dbs_mysql(self): + for db_name in self.created_mysql_dbs: + self.mysql.query(f"DROP DATABASE IF EXISTS {db_name}") + self.created_mysql_dbs = [] + self.created_insert_procedures = {} + self.inserted_rows_per_sp = {} + self.inserted_rows = 0 + + def drop_dbs_ch(self): + for db_name in self.created_clickhouse_dbs: + self.clickhouse.query(f"DROP DATABASE IF EXISTS {db_name}") + self.created_clickhouse_dbs = [] + + def drop_dbs(self): + self.drop_dbs_mysql() + self.drop_dbs_ch() + + def create_stored_procedure(self, db, table, column): + sp_id = f"{db}_{table}_{column}" + if sp_id in self.created_insert_procedures: + return sp_id + self.mysql.query(f"DROP PROCEDURE IF EXISTS {db}.insert_test_data_{sp_id}") + self.mysql.query( + f""" +CREATE PROCEDURE {db}.insert_test_data_{sp_id}(IN num_rows INT, IN existing_rows INT) +BEGIN + DECLARE i INT; + SET i = existing_rows; + SET @insert = concat("INSERT INTO {table} ({column}) VALUES "); + SET @exedata = ""; + WHILE i < (num_rows + existing_rows) DO + SET @exedata=concat(@exedata, ",(", i , ")"); + SET i = i + 1; + IF i % 1000 = 0 + THEN + SET @exedata = SUBSTRING(@exedata, 2); + SET @exesql = concat(@insert, @exedata); + PREPARE stmt FROM @exesql; + EXECUTE stmt; + DEALLOCATE PREPARE stmt; + SET @exedata = ""; + END IF; + END WHILE; + IF length(@exedata) > 0 + THEN + SET @exedata = SUBSTRING(@exedata, 2); + SET @exesql = concat(@insert, @exedata); + PREPARE stmt FROM @exesql; + EXECUTE stmt; + DEALLOCATE PREPARE stmt; + END IF; +END""" + ) + self.created_insert_procedures[sp_id] = True + self.inserted_rows_per_sp[sp_id] = 0 + return sp_id + + def insert_data(self, db, table, num_rows, column="id"): + """Inserts num_rows into db.table, into 
the column `column` (which must be INT)""" + sp_id = self.create_stored_procedure(db, table, column) + self.mysql.query( + f"CALL {db}.insert_test_data_{sp_id}({num_rows}, {self.inserted_rows_per_sp[sp_id]})" + ) + self.inserted_rows_per_sp[sp_id] += num_rows + self.inserted_rows += num_rows + + def wait_for_sync_to_catch_up( + self, database: str = "", retry_count=30, interval_seconds=1 + ): + if database == "": + database = self.created_clickhouse_dbs[-1] + mysql_gtid = self.mysql.query_and_get_data("SELECT @@GLOBAL.gtid_executed")[0][ + 0 + ] + materialized_with_ddl.check_query( + self.clickhouse, + f"SELECT executed_gtid_set /* expect: {mysql_gtid} */ FROM system.materialized_mysql_databases WHERE name = '{database}'", + f"{mysql_gtid}\n", + retry_count=retry_count, + interval_seconds=interval_seconds, + ) + + +@pytest.fixture(scope="function") +def replication(started_mysql_8_0, request): + try: + replication = ReplicationHelper(node_db, started_mysql_8_0) + yield replication + finally: + if hasattr(request.session, "testsfailed") and request.session.testsfailed: + logging.warning(f"tests failed - not dropping databases") + else: + # drop databases only if the test succeeds - so we can inspect the database after failed tests + try: + replication.drop_dbs() + except Exception as e: + logging.warning(f"replication.drop_dbs() failed: {e}") + + def test_materialized_database_dml_with_mysql_5_7( started_cluster, started_mysql_5_7, clickhouse_node: ClickHouseInstance ): @@ -556,3 +696,21 @@ def test_table_with_indexes(started_cluster, started_mysql_8_0, clickhouse_node) materialized_with_ddl.table_with_indexes( clickhouse_node, started_mysql_8_0, "mysql80" ) + + +def test_binlog_client(started_cluster, started_mysql_8_0, replication): + materialized_with_ddl.binlog_client_test(node_db, started_mysql_8_0, replication) + replication.drop_dbs() + materialized_with_ddl.binlog_client_timeout_test( + node_db, started_mysql_8_0, replication + ) + replication.drop_dbs() + materialized_with_ddl.wrong_password_test(node_db, started_mysql_8_0, replication) + replication.drop_dbs() + materialized_with_ddl.dispatcher_buffer_test( + node_db, started_mysql_8_0, replication + ) + replication.drop_dbs() + materialized_with_ddl.gtid_after_attach_test( + node_db, started_mysql_8_0, replication + ) diff --git a/tests/integration/test_non_default_compression/configs/enable_zstdqat_codec.xml b/tests/integration/test_non_default_compression/configs/enable_zstdqat_codec.xml new file mode 100644 index 00000000000..c686b37a537 --- /dev/null +++ b/tests/integration/test_non_default_compression/configs/enable_zstdqat_codec.xml @@ -0,0 +1,7 @@ + + + + 1 + + + diff --git a/tests/integration/test_remote_blobs_naming/configs/setting.xml b/tests/integration/test_remote_blobs_naming/configs/setting.xml new file mode 100644 index 00000000000..408fa36fdd3 --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/configs/setting.xml @@ -0,0 +1,11 @@ + + + + + + 1 + 1 + + + + diff --git a/tests/integration/test_remote_blobs_naming/configs/storage_conf.xml b/tests/integration/test_remote_blobs_naming/configs/storage_conf.xml index 31c6a3bf968..e901f0df51d 100644 --- a/tests/integration/test_remote_blobs_naming/configs/storage_conf.xml +++ b/tests/integration/test_remote_blobs_naming/configs/storage_conf.xml @@ -9,17 +9,17 @@ s3 - http://minio1:9001/root/data/ + http://minio1:9001/root/old-style-prefix/with-several-section/ minio minio123 - s3_plain - http://minio1:9001/root/data/s3_pain_key_prefix - minio - minio123 - true 
- + s3_plain + http://minio1:9001/root/data/s3_pain_key_prefix + minio + minio123 + true + @@ -30,7 +30,6 @@ -
@@ -38,6 +37,13 @@
+ + +
+ s3 +
+
+
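Editor's note on the storage_conf.xml change above: the existing "s3" disk now stores its data under "old-style-prefix/with-several-section/" instead of "data/", which is what the backward-compatibility test further down relies on when it counts remote paths containing "old-style-prefix". A minimal sketch of inspecting those blobs directly in MinIO, assuming the same host, bucket and credentials as in this config (running it outside the integration-test harness is purely illustrative):

from minio import Minio

# Connect to the MinIO instance used by the integration tests
# (host and credentials are taken from the config above).
client = Minio("minio1:9001", access_key="minio", secret_key="minio123", secure=False)

# List everything the "s3" disk has written under the new multi-section prefix
# inside the "root" bucket from the endpoint above.
for obj in client.list_objects("root", prefix="old-style-prefix/with-several-section/", recursive=True):
    print(obj.object_name)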
diff --git a/tests/integration/test_remote_blobs_naming/configs/storage_conf_new.xml b/tests/integration/test_remote_blobs_naming/configs/storage_conf_new.xml new file mode 100644 index 00000000000..c3b515e8777 --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/configs/storage_conf_new.xml @@ -0,0 +1,61 @@ + + + + + test + + + + + + s3 + http://minio1:9001/root/old-style-prefix/with-several-section/ + minio + minio123 + + + s3_plain + http://minio1:9001/root/data/s3_pain_key_prefix + minio + minio123 + true + + + s3 + http://minio1:9001/root/ + minio + minio123 + old-style-prefix/with-several-section + [a-z]{3}-first-random-part/new-style-prefix/constant-part/[a-z]{3}/[a-z]{29} + + + + + + +
+ s3 +
+
+
+ + +
+ s3_plain +
+
+
+ + +
+ s3_template_key +
+
+
+
+
+ + + s3 + +
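Editor's note on storage_conf_new.xml: the "s3_template_key" disk generates object keys from the template "[a-z]{3}-first-random-part/new-style-prefix/constant-part/[a-z]{3}/[a-z]{29}" while keeping "old-style-prefix/with-several-section" as the prefix compatible with keys written by older replicas. A small sketch of what such a key looks like and how it relates to the regex used later in test_backward_compatibility.py; the helper below is hypothetical, and the assumption that slashes show up as underscores in the ZooKeeper zero-copy blob names is inferred from that regex, only the template and the pattern are taken from this diff:

import random
import re
import string

def rand_lower(n):
    # Stand-in for the random parts of the key template.
    return "".join(random.choices(string.ascii_lowercase, k=n))

# A key shaped like the template from storage_conf_new.xml.
key = f"{rand_lower(3)}-first-random-part/new-style-prefix/constant-part/{rand_lower(3)}/{rand_lower(29)}"

# The zero-copy check further down matches blob names in which '/' appears as '_'.
blob = key.replace("/", "_")
assert re.fullmatch(
    "(old-style-prefix_with-several-section|[a-z]{3}-first-random-part_new-style-prefix_constant-part)_[a-z]{3}_[a-z]{29}",
    blob,
)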
diff --git a/tests/integration/test_remote_blobs_naming/test_backward_compatibility.py b/tests/integration/test_remote_blobs_naming/test_backward_compatibility.py index 485bf73dad1..8c52b05dba2 100644 --- a/tests/integration/test_remote_blobs_naming/test_backward_compatibility.py +++ b/tests/integration/test_remote_blobs_naming/test_backward_compatibility.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 - +from contextlib import contextmanager +from difflib import unified_diff import logging +import re import pytest import os @@ -27,7 +29,7 @@ def cluster(): "new_node", main_configs=[ "configs/new_node.xml", - "configs/storage_conf.xml", + "configs/storage_conf_new.xml", ], user_configs=[ "configs/settings.xml", @@ -49,6 +51,7 @@ def cluster(): with_zookeeper=True, stay_alive=True, ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -200,8 +203,32 @@ def test_write_new_format(cluster): assert remote == object_key -@pytest.mark.parametrize("storage_policy", ["s3", "s3_plain"]) -def test_replicated_merge_tree(cluster, storage_policy): +@contextmanager +def drop_table_scope(nodes, tables, create_statements): + try: + for node in nodes: + for statement in create_statements: + node.query(statement) + yield + finally: + for node in nodes: + for table in tables: + node.query(f"DROP TABLE IF EXISTS {table} SYNC") + + +@pytest.mark.parametrize( + "test_case", + [ + ("s3_plain", False), + ("s3", False), + ("s3", True), + ("s3_template_key", False), + ("s3_template_key", True), + ], +) +def test_replicated_merge_tree(cluster, test_case): + storage_policy, zero_copy = test_case + if storage_policy == "s3_plain": # MergeTree table doesn't work on s3_plain. Rename operation is not implemented return @@ -209,35 +236,172 @@ def test_replicated_merge_tree(cluster, storage_policy): node_old = cluster.instances["node"] node_new = cluster.instances["new_node"] + zk_table_path = f"/clickhouse/tables/test_replicated_merge_tree_{storage_policy}{'_zero_copy' if zero_copy else ''}" create_table_statement = f""" - CREATE TABLE test_replicated_merge_tree ( - id Int64, - val String - ) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test_replicated_merge_tree_{storage_policy}', '{{replica}}') - PARTITION BY id - ORDER BY (id, val) - SETTINGS - storage_policy='{storage_policy}' - """ + CREATE TABLE test_replicated_merge_tree ( + id Int64, + val String + ) ENGINE=ReplicatedMergeTree('{zk_table_path}', '{{replica}}') + PARTITION BY id + ORDER BY (id, val) + SETTINGS + storage_policy='{storage_policy}', + allow_remote_fs_zero_copy_replication='{1 if zero_copy else 0}' + """ - node_old.query(create_table_statement) - node_new.query(create_table_statement) + with drop_table_scope( + [node_old, node_new], ["test_replicated_merge_tree"], [create_table_statement] + ): + node_old.query("INSERT INTO test_replicated_merge_tree VALUES (0, 'a')") + node_new.query("INSERT INTO test_replicated_merge_tree VALUES (1, 'b')") - node_old.query("INSERT INTO test_replicated_merge_tree VALUES (0, 'a')") - node_new.query("INSERT INTO test_replicated_merge_tree VALUES (1, 'b')") + # node_old have to fetch metadata from node_new and vice versa + node_old.query("SYSTEM SYNC REPLICA test_replicated_merge_tree") + node_new.query("SYSTEM SYNC REPLICA test_replicated_merge_tree") - # node_old have to fetch metadata from node_new and vice versa - node_old.query("SYSTEM SYNC REPLICA test_replicated_merge_tree") - node_new.query("SYSTEM SYNC REPLICA test_replicated_merge_tree") + count_old = node_old.query( + "SELECT 
count() FROM test_replicated_merge_tree" + ).strip() + count_new = node_new.query( + "SELECT count() FROM test_replicated_merge_tree" + ).strip() - count_old = node_old.query("SELECT count() FROM test_replicated_merge_tree").strip() - count_new = node_new.query("SELECT count() FROM test_replicated_merge_tree").strip() + assert count_old == "2" + assert count_new == "2" - assert count_old == "2" - assert count_new == "2" + if not zero_copy: + return - node_old.query("DROP TABLE test_replicated_merge_tree SYNC") - node_new.query("DROP TABLE test_replicated_merge_tree SYNC") + def get_remote_pathes(node, table_name, only_remote_path=True): + uuid = node.query( + f""" + SELECT uuid + FROM system.tables + WHERE name = '{table_name}' + """ + ).strip() + assert uuid + return node.query( + f""" + SELECT {"remote_path" if only_remote_path else "*"} + FROM system.remote_data_paths + WHERE + local_path LIKE '%{uuid}%' + AND local_path NOT LIKE '%format_version.txt%' + ORDER BY ALL + """ + ).strip() + + remote_pathes_old = get_remote_pathes(node_old, "test_replicated_merge_tree") + remote_pathes_new = get_remote_pathes(node_new, "test_replicated_merge_tree") + + assert len(remote_pathes_old) > 0 + assert remote_pathes_old == remote_pathes_new, ( + str(unified_diff(remote_pathes_old, remote_pathes_new)) + + "\n\nold:\n" + + get_remote_pathes(node_old, "test_replicated_merge_tree", False) + + "\n\nnew:\n" + + get_remote_pathes(node_new, "test_replicated_merge_tree", False) + ) + + def count_lines_with(lines, pattern): + return sum([1 for x in lines if pattern in x]) + + remore_pathes_with_old_format = count_lines_with( + remote_pathes_old.split(), "old-style-prefix" + ) + remore_pathes_with_new_format = count_lines_with( + remote_pathes_old.split(), "new-style-prefix" + ) + + if storage_policy == "s3_template_key": + assert remore_pathes_with_old_format == remore_pathes_with_new_format + assert remore_pathes_with_old_format == len(remote_pathes_old.split()) / 2 + else: + assert remore_pathes_with_old_format == len(remote_pathes_old.split()) + assert remore_pathes_with_new_format == 0 + + parts = ( + node_old.query( + """ + SELECT name + FROM system.parts + WHERE + table = 'test_replicated_merge_tree' + AND active + ORDER BY ALL + """ + ) + .strip() + .split() + ) + table_shared_uuid = node_old.query( + f"SELECT value FROM system.zookeeper WHERE path='{zk_table_path}' and name='table_shared_id'" + ).strip() + + part_blobs = {} + blobs_replicas = {} + + for part in parts: + blobs = ( + node_old.query( + f""" + SELECT name + FROM system.zookeeper + WHERE path='/clickhouse/zero_copy/zero_copy_s3/{table_shared_uuid}/{part}' + ORDER BY ALL + """ + ) + .strip() + .split() + ) + + for blob in blobs: + replicas = ( + node_old.query( + f""" + SELECT name + FROM system.zookeeper + WHERE path='/clickhouse/zero_copy/zero_copy_s3/{table_shared_uuid}/{part}/{blob}' + ORDER BY ALL + """ + ) + .strip() + .split() + ) + assert blob not in blobs_replicas + blobs_replicas[blob] = replicas + + assert part not in part_blobs + part_blobs[part] = blobs + + assert len(parts) == 2, "parts: " + str(parts) + assert len(part_blobs.keys()) == len(parts), ( + "part_blobs: " + str(part_blobs) + "; parts: " + str(parts) + ) + assert len(blobs_replicas.keys()) == len(parts), ( + "blobs_replicas: " + str(blobs_replicas) + "; parts: " + str(parts) + ) + + for replicas in blobs_replicas.values(): + assert len(replicas) == 2, "blobs_replicas: " + str(blobs_replicas) + + for blob in blobs_replicas.keys(): + assert re.match( + 
"(old-style-prefix_with-several-section|[a-z]{3}-first-random-part_new-style-prefix_constant-part)_[a-z]{3}_[a-z]{29}", + blob, + ), "blobs_replicas: " + str(blobs_replicas) + + old_style_count = sum( + [1 for x in blobs_replicas.keys() if "old-style-prefix" in x] + ) + new_style_count = sum( + [1 for x in blobs_replicas.keys() if "new-style-prefix" in x] + ) + + assert (new_style_count > 0 and old_style_count == new_style_count) or ( + new_style_count == 0 and old_style_count == len(blobs_replicas) + ) def switch_config_write_full_object_key(node, enable): diff --git a/tests/integration/test_storage_s3_queue/configs/defaultS3.xml b/tests/integration/test_storage_s3_queue/configs/defaultS3.xml deleted file mode 100644 index 7dac6d9fbb5..00000000000 --- a/tests/integration/test_storage_s3_queue/configs/defaultS3.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - http://resolver:8080 -
Authorization: Bearer TOKEN
-
- - http://resolver:8080/root-with-auth/restricteddirectory/ - -
-
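Editor's note: defaultS3.xml (above) and named_collections.xml (below) are dropped because the reworked s3queue test no longer needs preconfigured endpoints; it creates a dedicated public MinIO bucket and accesses it with NOSIGN. A condensed sketch of that pattern, in the spirit of prepare_public_s3_bucket() and create_table() further down in this diff; the bucket name, table name and SETTINGS clause here are illustrative, while the policy shape and the NOSIGN engine arguments come from the test code:

import json
from minio import Minio

client = Minio("minio1:9001", access_key="minio", secret_key="minio123", secure=False)

bucket = "root-public"  # illustrative; the test derives its name from minio_bucket
policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": "*",
            "Action": ["s3:GetBucketLocation", "s3:ListBucket"],
            "Resource": f"arn:aws:s3:::{bucket}",
        },
        {
            "Effect": "Allow",
            "Principal": "*",
            "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"],
            "Resource": f"arn:aws:s3:::{bucket}/*",
        },
    ],
}

# Create the bucket if needed and grant anonymous access via a bucket policy.
if not client.bucket_exists(bucket):
    client.make_bucket(bucket)
client.set_bucket_policy(bucket, json.dumps(policy))

# The table is then pointed at that bucket with NOSIGN instead of credentials,
# which is what create_table(..., auth=NO_AUTH, bucket=...) builds in the test.
url = f"http://minio1:9001/{bucket}/test_data/"
ddl = f"CREATE TABLE s3q (column1 UInt32) ENGINE = S3Queue('{url}', NOSIGN, CSV) SETTINGS mode = 'unordered'"
print(ddl)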
diff --git a/tests/integration/test_storage_s3_queue/configs/named_collections.xml b/tests/integration/test_storage_s3_queue/configs/named_collections.xml deleted file mode 100644 index 64674e2a3e3..00000000000 --- a/tests/integration/test_storage_s3_queue/configs/named_collections.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - http://minio1:9001/root/test_table - minio - minio123 - - - http://minio1:9001/root/test_parquet - minio - minio123 - - - http://minio1:9001/root/test_parquet_gz - minio - minio123 - - - http://minio1:9001/root/test_orc - minio - minio123 - - - http://minio1:9001/root/test_native - minio - minio123 - - - http://minio1:9001/root/test.arrow - minio - minio123 - - - http://minio1:9001/root/test.parquet - minio - minio123 - - - http://minio1:9001/root/test_cache4.jsonl - true - - - diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index b83c095a7a6..7d40060fec6 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -1,6 +1,5 @@ import io import logging -import os import random import time @@ -9,75 +8,57 @@ from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster, ClickHouseInstance import json -""" -export CLICKHOUSE_TESTS_SERVER_BIN_PATH=/home/sergey/vkr/ClickHouse/build/programs/clickhouse-server -export CLICKHOUSE_TESTS_CLIENT_BIN_PATH=/home/sergey/vkr/ClickHouse/build/programs/clickhouse-client -export CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH=/home/sergey/vkr/ClickHouse/build/programs/clickhouse-odbc-bridge -export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=/home/sergey/vkr/ClickHouse/programs/server -""" - -MINIO_INTERNAL_PORT = 9001 AVAILABLE_MODES = ["unordered", "ordered"] -AUTH = "'minio','minio123'," -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +DEFAULT_AUTH = ["'minio'", "'minio123'"] +NO_AUTH = ["NOSIGN"] -def prepare_s3_bucket(started_cluster): - # Allows read-write access for bucket without authorization. 
- bucket_read_write_policy = { - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "", - "Effect": "Allow", - "Principal": {"AWS": "*"}, - "Action": "s3:GetBucketLocation", - "Resource": "arn:aws:s3:::root", - }, - { - "Sid": "", - "Effect": "Allow", - "Principal": {"AWS": "*"}, - "Action": "s3:ListBucket", - "Resource": "arn:aws:s3:::root", - }, - { - "Sid": "", - "Effect": "Allow", - "Principal": {"AWS": "*"}, - "Action": "s3:GetObject", - "Resource": "arn:aws:s3:::root/*", - }, - { - "Sid": "", - "Effect": "Allow", - "Principal": {"AWS": "*"}, - "Action": "s3:PutObject", - "Resource": "arn:aws:s3:::root/*", - }, - { - "Sid": "", - "Effect": "Allow", - "Principal": {"AWS": "*"}, - "Action": "s3:DeleteObject", - "Resource": "arn:aws:s3:::root/*", - }, - ], - } +def prepare_public_s3_bucket(started_cluster): + def create_bucket(client, bucket_name, policy): + if client.bucket_exists(bucket_name): + client.remove_bucket(bucket_name) + + client.make_bucket(bucket_name) + + client.set_bucket_policy(bucket_name, json.dumps(policy)) + + def get_policy_with_public_access(bucket_name): + return { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "", + "Effect": "Allow", + "Principal": "*", + "Action": [ + "s3:GetBucketLocation", + "s3:ListBucket", + ], + "Resource": f"arn:aws:s3:::{bucket_name}", + }, + { + "Sid": "", + "Effect": "Allow", + "Principal": "*", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + ], + "Resource": f"arn:aws:s3:::{bucket_name}/*", + }, + ], + } minio_client = started_cluster.minio_client - minio_client.set_bucket_policy( - started_cluster.minio_bucket, json.dumps(bucket_read_write_policy) - ) - started_cluster.minio_restricted_bucket = "{}-with-auth".format( - started_cluster.minio_bucket + started_cluster.minio_public_bucket = f"{started_cluster.minio_bucket}-public" + create_bucket( + minio_client, + started_cluster.minio_public_bucket, + get_policy_with_public_access(started_cluster.minio_public_bucket), ) - if minio_client.bucket_exists(started_cluster.minio_restricted_bucket): - minio_client.remove_bucket(started_cluster.minio_restricted_bucket) - - minio_client.make_bucket(started_cluster.minio_restricted_bucket) @pytest.fixture(autouse=True) @@ -89,11 +70,9 @@ def s3_queue_setup_teardown(started_cluster): instance_2.query("DROP DATABASE IF EXISTS test; CREATE DATABASE test;") minio = started_cluster.minio_client - objects = list( - minio.list_objects(started_cluster.minio_restricted_bucket, recursive=True) - ) + objects = list(minio.list_objects(started_cluster.minio_bucket, recursive=True)) for obj in objects: - minio.remove_object(started_cluster.minio_restricted_bucket, obj.object_name) + minio.remove_object(started_cluster.minio_bucket, obj.object_name) yield # run test @@ -107,8 +86,6 @@ def started_cluster(): with_minio=True, with_zookeeper=True, main_configs=[ - "configs/defaultS3.xml", - "configs/named_collections.xml", "configs/zookeeper.xml", "configs/s3queue_log.xml", ], @@ -119,8 +96,6 @@ def started_cluster(): with_minio=True, with_zookeeper=True, main_configs=[ - "configs/defaultS3.xml", - "configs/named_collections.xml", "configs/s3queue_log.xml", ], ) @@ -129,7 +104,6 @@ def started_cluster(): cluster.start() logging.info("Cluster started") - prepare_s3_bucket(cluster) yield cluster finally: cluster.shutdown() @@ -146,7 +120,13 @@ def run_query(instance, query, stdin=None, settings=None): def generate_random_files( - started_cluster, files_path, count, column_num=3, row_num=10, start_ind=0 + started_cluster, + 
files_path, + count, + column_num=3, + row_num=10, + start_ind=0, + bucket=None, ): files = [ (f"{files_path}/test_{i}.csv", i) for i in range(start_ind, start_ind + count) @@ -164,28 +144,14 @@ def generate_random_files( values_csv = ( "\n".join((",".join(map(str, row)) for row in rand_values)) + "\n" ).encode() - put_s3_file_content(started_cluster, filename, values_csv) + put_s3_file_content(started_cluster, filename, values_csv, bucket) return total_values -def put_s3_file_content(started_cluster, filename, data): +def put_s3_file_content(started_cluster, filename, data, bucket=None): + bucket = started_cluster.minio_bucket if bucket is None else bucket buf = io.BytesIO(data) - started_cluster.minio_client.put_object( - started_cluster.minio_bucket, filename, buf, len(data) - ) - - -def get_s3_file_content(started_cluster, bucket, filename, decode=True): - # type: (ClickHouseCluster, str, str, bool) -> str - # Returns content of given S3 file as string. - - data = started_cluster.minio_client.get_object(bucket, filename) - data_str = b"" - for chunk in data.stream(): - data_str += chunk - if decode: - return data_str.decode() - return data_str + started_cluster.minio_client.put_object(bucket, filename, buf, len(data)) def create_table( @@ -197,7 +163,12 @@ def create_table( format="column1 UInt32, column2 UInt32, column3 UInt32", additional_settings={}, file_format="CSV", + auth=DEFAULT_AUTH, + bucket=None, ): + auth_params = ",".join(auth) + bucket = started_cluster.minio_bucket if bucket is None else bucket + settings = { "s3queue_loading_retries": 0, "after_processing": "keep", @@ -206,11 +177,11 @@ def create_table( } settings.update(additional_settings) - url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{started_cluster.minio_bucket}/{files_path}/" + url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{files_path}/" node.query(f"DROP TABLE IF EXISTS {table_name}") create_query = f""" CREATE TABLE {table_name} ({format}) - ENGINE = S3Queue('{url}', {AUTH}'{file_format}') + ENGINE = S3Queue('{url}', {auth_params}, {file_format}) SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))} """ node.query(create_query) @@ -922,3 +893,70 @@ def test_drop_table(started_cluster): ) or node.contains_in_log( f"StorageS3Queue ({table_name}): Shutdown was called, stopping sync" ) + + +def test_s3_client_reused(started_cluster): + node = started_cluster.instances["instance"] + table_name = f"test.test_s3_client_reused" + dst_table_name = f"{table_name}_dst" + files_path = f"{table_name}_data" + row_num = 10 + + def get_created_s3_clients_count(): + value = node.query( + f"SELECT value FROM system.events WHERE event='S3Clients'" + ).strip() + return int(value) if value != "" else 0 + + def wait_all_processed(files_num): + expected_count = files_num * row_num + for _ in range(100): + count = int(node.query(f"SELECT count() FROM {dst_table_name}")) + print(f"{count}/{expected_count}") + if count == expected_count: + break + time.sleep(1) + assert ( + int(node.query(f"SELECT count() FROM {dst_table_name}")) == expected_count + ) + + prepare_public_s3_bucket(started_cluster) + + s3_clients_before = get_created_s3_clients_count() + + create_table( + started_cluster, + node, + table_name, + "ordered", + files_path, + additional_settings={ + "after_processing": "delete", + "s3queue_processing_threads_num": 1, + }, + auth=NO_AUTH, + bucket=started_cluster.minio_public_bucket, + ) + + s3_clients_after = get_created_s3_clients_count() + assert 
s3_clients_before + 1 == s3_clients_after + + create_mv(node, table_name, dst_table_name) + + for i in range(0, 10): + s3_clients_before = get_created_s3_clients_count() + + generate_random_files( + started_cluster, + files_path, + count=1, + start_ind=i, + row_num=row_num, + bucket=started_cluster.minio_public_bucket, + ) + + wait_all_processed(i + 1) + + s3_clients_after = get_created_s3_clients_count() + + assert s3_clients_before == s3_clients_after diff --git a/tests/performance/multiif.xml b/tests/performance/multiif.xml new file mode 100644 index 00000000000..ad56ab3f5f2 --- /dev/null +++ b/tests/performance/multiif.xml @@ -0,0 +1,8 @@ + + CREATE TABLE test_multiif_t(d Nullable(Int64)) ENGINE Memory + INSERT INTO test_multiif_t SELECT * from numbers(300000000) + + select count(1) from test_multiif_t where multiIf(d > 2, d-2, d > 1, d-1, d >0, d, 0) > 1 SETTINGS max_threads=1 + + DROP TABLE IF EXISTS test_multiif_t + diff --git a/tests/queries/0_stateless/00804_test_zstd_qat_codec_compression.reference b/tests/queries/0_stateless/00804_test_zstd_qat_codec_compression.reference new file mode 100644 index 00000000000..31a4360469f --- /dev/null +++ b/tests/queries/0_stateless/00804_test_zstd_qat_codec_compression.reference @@ -0,0 +1,6 @@ +CREATE TABLE default.compression_codec\n(\n `id` UInt64 CODEC(ZSTD_QAT(1)),\n `data` String CODEC(ZSTD_QAT(1)),\n `ddd` Date CODEC(ZSTD_QAT(1)),\n `ddd32` Date32 CODEC(ZSTD_QAT(1)),\n `somenum` Float64 CODEC(ZSTD_QAT(1)),\n `somestr` FixedString(3) CODEC(ZSTD_QAT(1)),\n `othernum` Int64 CODEC(ZSTD_QAT(1)),\n `somearray` Array(UInt8) CODEC(ZSTD_QAT(1)),\n `somemap` Map(String, UInt32) CODEC(ZSTD_QAT(1)),\n `sometuple` Tuple(UInt16, UInt64) CODEC(ZSTD_QAT(1))\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +1 hello 2018-12-14 2018-12-14 1.1 aaa 5 [1,2,3] {'k1':1,'k2':2} (1,2) +2 world 2018-12-15 2018-12-15 2.2 bbb 6 [4,5,6] {'k3':3,'k4':4} (3,4) +3 ! 2018-12-16 2018-12-16 3.3 ccc 7 [7,8,9] {'k5':5,'k6':6} (5,6) +2 +10001 diff --git a/tests/queries/0_stateless/00804_test_zstd_qat_codec_compression.sql b/tests/queries/0_stateless/00804_test_zstd_qat_codec_compression.sql new file mode 100644 index 00000000000..92748efd2d1 --- /dev/null +++ b/tests/queries/0_stateless/00804_test_zstd_qat_codec_compression.sql @@ -0,0 +1,50 @@ +--Tags: no-fasttest, no-cpu-aarch64, no-cpu-s390x +-- no-fasttest because ZSTD_QAT isn't available in fasttest +-- no-cpu-aarch64 and no-cpu-s390x because ZSTD_QAT is x86-only + +SET enable_zstd_qat_codec = 1; + +-- Suppress test failures because stderr contains warning "Initialization of hardware-assisted ZSTD_QAT codec failed, falling back to software ZSTD coded." 
+SET send_logs_level = 'fatal'; + +DROP TABLE IF EXISTS compression_codec; + +-- negative test +CREATE TABLE compression_codec(id UInt64 CODEC(ZSTD_QAT(0))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError ILLEGAL_CODEC_PARAMETER } +CREATE TABLE compression_codec(id UInt64 CODEC(ZSTD_QAT(13))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError ILLEGAL_CODEC_PARAMETER } + +CREATE TABLE compression_codec( + id UInt64 CODEC(ZSTD_QAT), + data String CODEC(ZSTD_QAT), + ddd Date CODEC(ZSTD_QAT), + ddd32 Date32 CODEC(ZSTD_QAT), + somenum Float64 CODEC(ZSTD_QAT), + somestr FixedString(3) CODEC(ZSTD_QAT), + othernum Int64 CODEC(ZSTD_QAT), + somearray Array(UInt8) CODEC(ZSTD_QAT), + somemap Map(String, UInt32) CODEC(ZSTD_QAT), + sometuple Tuple(UInt16, UInt64) CODEC(ZSTD_QAT), +) ENGINE = MergeTree() ORDER BY tuple(); + +SHOW CREATE TABLE compression_codec; + +INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), toDate32('2018-12-14'), 1.1, 'aaa', 5, [1,2,3], map('k1',1,'k2',2), tuple(1,2)); +INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), toDate32('2018-12-15'), 2.2, 'bbb', 6, [4,5,6], map('k3',3,'k4',4), tuple(3,4)); +INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), toDate32('2018-12-16'), 3.3, 'ccc', 7, [7,8,9], map('k5',5,'k6',6), tuple(5,6)); + +SELECT * FROM compression_codec ORDER BY id; + +OPTIMIZE TABLE compression_codec FINAL; + +INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), toDate32('2018-12-13'), 4.4, 'ddd', 8, [10,11,12], map('k7',7,'k8',8), tuple(7,8)); + +DETACH TABLE compression_codec; +ATTACH TABLE compression_codec; + +SELECT count(*) FROM compression_codec WHERE id = 2 GROUP BY id; + +INSERT INTO compression_codec SELECT 3, '!', toDate('2018-12-16'), toDate32('2018-12-16'), 3.3, 'ccc', 7, [7,8,9], map('k5',5,'k6',6), tuple(5,6) FROM system.numbers LIMIT 10000; + +SELECT count(*) FROM compression_codec WHERE id = 3 GROUP BY id; + +DROP TABLE IF EXISTS compression_codec; diff --git a/tests/queries/0_stateless/01753_fix_clickhouse_format.reference b/tests/queries/0_stateless/01753_fix_clickhouse_format.reference index 0aad4d64c55..735d4099534 100644 --- a/tests/queries/0_stateless/01753_fix_clickhouse_format.reference +++ b/tests/queries/0_stateless/01753_fix_clickhouse_format.reference @@ -1,5 +1,4 @@ -SELECT 1 -; +SELECT 1; SELECT 1 UNION ALL @@ -10,8 +9,7 @@ UNION ALL ) ; -SELECT 1 -; +SELECT 1; SELECT 1 UNION ALL @@ -22,4 +20,6 @@ UNION ALL ) ; +INSERT INTO t VALUES (1); + OK diff --git a/tests/queries/0_stateless/01753_fix_clickhouse_format.sh b/tests/queries/0_stateless/01753_fix_clickhouse_format.sh index 5cdd53b2166..ba7fe949833 100755 --- a/tests/queries/0_stateless/01753_fix_clickhouse_format.sh +++ b/tests/queries/0_stateless/01753_fix_clickhouse_format.sh @@ -8,4 +8,6 @@ echo "select 1; select 1 union all (select 1 union distinct select 1); " | $CL echo "select 1; select 1 union all (select 1 union distinct select 1); -- comment " | $CLICKHOUSE_FORMAT -n; -echo "insert into t values (1); " | $CLICKHOUSE_FORMAT -n 2>&1 \ | grep -F -q "Code: 578" && echo 'OK' || echo 'FAIL' +echo "insert into t values (1); " | $CLICKHOUSE_FORMAT -n + +echo 'insert into t format JSONEachRow {"a":1};' | $CLICKHOUSE_FORMAT -n 2>&1 \ | grep -F -q "NOT_IMPLEMENTED" && echo 'OK' || echo 'FAIL' diff --git a/tests/queries/0_stateless/02245_make_datetime64.reference b/tests/queries/0_stateless/02245_make_datetime64.reference index 0ac672ae54d..1c7d31788e3 100644 --- 
a/tests/queries/0_stateless/02245_make_datetime64.reference +++ b/tests/queries/0_stateless/02245_make_datetime64.reference @@ -67,3 +67,4 @@ DateTime64(7, \'UTC\') 1900-01-01 00:00:00.000 1900-01-01 00:00:00.000 1900-01-01 00:00:00.000 +2024-01-08 11:12:13.014 diff --git a/tests/queries/0_stateless/02245_make_datetime64.sql b/tests/queries/0_stateless/02245_make_datetime64.sql index 62784cb9b75..71629ad8dff 100644 --- a/tests/queries/0_stateless/02245_make_datetime64.sql +++ b/tests/queries/0_stateless/02245_make_datetime64.sql @@ -82,6 +82,9 @@ select makeDateTime64(1991, 8, 24, 65537, 4, 0); select makeDateTime64(1991, 8, 24, 21, 65537, 0); select makeDateTime64(1991, 8, 24, 21, 4, 65537); +-- bug 58590 +select makeDateTime64(2024, 1, 8, 11, 12, 13, materialize(14)); + select makeDateTime64(year, 1, 1, 1, 0, 0, 0, precision, timezone) from ( select 1984 as year, 5 as precision, 'UTC' as timezone union all diff --git a/tests/queries/0_stateless/02263_format_insert_settings.reference b/tests/queries/0_stateless/02263_format_insert_settings.reference index e2d1ec3980e..2bba75f6788 100644 --- a/tests/queries/0_stateless/02263_format_insert_settings.reference +++ b/tests/queries/0_stateless/02263_format_insert_settings.reference @@ -1,7 +1,7 @@ [multi] insert into foo settings max_threads=1 Syntax error (query): failed at position 40 (end of query): [multi] insert into foo format tsv settings max_threads=1 -Can't format ASTInsertQuery with data, since data will be lost. +NOT_IMPLEMENTED [multi] insert into foo format tsv settings max_threads=1 INSERT INTO foo SETTINGS max_threads = 1 diff --git a/tests/queries/0_stateless/02263_format_insert_settings.sh b/tests/queries/0_stateless/02263_format_insert_settings.sh index 8b156ffec83..49aa56d6c0a 100755 --- a/tests/queries/0_stateless/02263_format_insert_settings.sh +++ b/tests/queries/0_stateless/02263_format_insert_settings.sh @@ -25,7 +25,7 @@ function run_format_both() run_format 'insert into foo settings max_threads=1' |& grep --max-count 2 --only-matching -e "Syntax error (query): failed at position .* (end of query):" -e '^\[.*$' # compatibility -run_format 'insert into foo format tsv settings max_threads=1' |& grep --max-count 2 --only-matching -e "Can't format ASTInsertQuery with data, since data will be lost." 
-e '^\[.*$' +run_format 'insert into foo format tsv settings max_threads=1' |& grep --max-count 2 --only-matching -e "NOT_IMPLEMENTED" -e '^\[.*$' run_format_both 'insert into foo format tsv settings max_threads=1' --allow_settings_after_format_in_insert run_format 'insert into foo settings max_threads=1 format tsv settings max_threads=1' --allow_settings_after_format_in_insert |& grep --max-count 2 --only-matching -e "You have SETTINGS before and after FORMAT" -e '^\[.*$' diff --git a/tests/queries/0_stateless/02366_kql_summarize.sql b/tests/queries/0_stateless/02366_kql_summarize.sql index 21a1b643d98..bb12d1f251f 100644 --- a/tests/queries/0_stateless/02366_kql_summarize.sql +++ b/tests/queries/0_stateless/02366_kql_summarize.sql @@ -1,23 +1,23 @@ -- datatable(FirstName:string, LastName:string, Occupation:string, Education:string, Age:int) [ --- 'Theodore', 'Diaz', 'Skilled Manual', 'Bachelors', 28, --- 'Stephanie', 'Cox', 'Management abcd defg', 'Bachelors', 33, --- 'Peter', 'Nara', 'Skilled Manual', 'Graduate Degree', 26, --- 'Latoya', 'Shen', 'Professional', 'Graduate Degree', 25, --- 'Joshua', 'Lee', 'Professional', 'Partial College', 26, --- 'Edward', 'Hernandez', 'Skilled Manual', 'High School', 36, --- 'Dalton', 'Wood', 'Professional', 'Partial College', 42, --- 'Christine', 'Nara', 'Skilled Manual', 'Partial College', 33, --- 'Cameron', 'Rodriguez', 'Professional', 'Partial College', 28, --- 'Angel', 'Stewart', 'Professional', 'Partial College', 46, --- 'Apple', '', 'Skilled Manual', 'Bachelors', 28, +-- 'Theodore', 'Diaz', 'Skilled Manual', 'Bachelors', 28, +-- 'Stephanie', 'Cox', 'Management abcd defg', 'Bachelors', 33, +-- 'Peter', 'Nara', 'Skilled Manual', 'Graduate Degree', 26, +-- 'Latoya', 'Shen', 'Professional', 'Graduate Degree', 25, +-- 'Joshua', 'Lee', 'Professional', 'Partial College', 26, +-- 'Edward', 'Hernandez', 'Skilled Manual', 'High School', 36, +-- 'Dalton', 'Wood', 'Professional', 'Partial College', 42, +-- 'Christine', 'Nara', 'Skilled Manual', 'Partial College', 33, +-- 'Cameron', 'Rodriguez', 'Professional', 'Partial College', 28, +-- 'Angel', 'Stewart', 'Professional', 'Partial College', 46, +-- 'Apple', '', 'Skilled Manual', 'Bachelors', 28, -- dynamic(null), 'why', 'Professional', 'Partial College', 38 -- ] DROP TABLE IF EXISTS Customers; CREATE TABLE Customers -( +( FirstName Nullable(String), - LastName String, + LastName String, Occupation String, Education String, Age Nullable(UInt8) @@ -89,9 +89,9 @@ print '-- Summarize following sort --'; Customers | sort by FirstName | summarize count() by Occupation | sort by Occupation; print '-- summarize with bin --'; -EventLog | summarize count=count() by bin(Created, 1000); -EventLog | summarize count=count() by bin(unixtime_seconds_todatetime(Created/1000), 1s); -EventLog | summarize count=count() by time_label=bin(Created/1000, 1s); +EventLog | summarize count=count() by bin(Created, 1000) | sort by count asc; +EventLog | summarize count=count() by bin(unixtime_seconds_todatetime(Created/1000), 1s) | sort by count asc; +EventLog | summarize count=count() by time_label=bin(Created/1000, 1s) | sort by count asc; Dates | project bin(datetime(EventTime), 1m); print '-- make_list_with_nulls --'; Customers | summarize t = make_list_with_nulls(FirstName); diff --git a/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference b/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference index a97879eaca8..babcecf7004 100644 --- 
a/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference +++ b/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference @@ -1,4 +1,5 @@ INSERT TO S3 + [ 0 ] S3Clients: 1 [ 0 ] S3CompleteMultipartUpload: 1 [ 0 ] S3CreateMultipartUpload: 1 [ 0 ] S3HeadObject: 2 diff --git a/tests/queries/0_stateless/02884_string_distance_function.reference b/tests/queries/0_stateless/02884_string_distance_function.reference index 3ac30825fd0..e8cce2017d9 100644 --- a/tests/queries/0_stateless/02884_string_distance_function.reference +++ b/tests/queries/0_stateless/02884_string_distance_function.reference @@ -1,59 +1,69 @@ -const arguments byteHammingDistance -0 -const arguments editDistance -6 -const arguments stringJaccardIndex -0.4 -byteHammingDistance -7 -1 -7 -10 -byteHammingDistance(const, non const) -3 -3 -6 -10 -byteHammingDistance(non const, const) -3 -6 -6 -10 -mismatches(alias) -7 -1 -7 -10 -3 -3 -6 -10 -3 -6 -6 -10 -stringJaccardIndex -0 -0.8571428571428571 -0.8571428571428571 -0.4 -0 -0.8571428571428571 -0.8571428571428571 -0.4 +-- const arguments +clickhouse mouse 10 +clickhouse mouse 6 +clickhouse mouse 6 +clickhouse mouse 0.4 +clickhouse mouse 0.4 +clickhouse mouse 0 +clickhouse mouse 0 +-- test aliases +clickhouse mouse 10 +clickhouse mouse 6 +-- Deny DoS using too large inputs +-- non-const arguments +byteHammingDistance 0 +byteHammingDistance abc 3 +byteHammingDistance abc 3 +byteHammingDistance abc ab 1 +byteHammingDistance abc abc 0 +byteHammingDistance abc bc 3 +byteHammingDistance clickhouse mouse 10 +editDistance 0 +editDistance abc 3 +editDistance abc 3 +editDistance abc ab 1 +editDistance abc abc 0 +editDistance abc bc 1 +editDistance clickhouse mouse 6 +damerauLevenshteinDistance 0 +damerauLevenshteinDistance abc 3 +damerauLevenshteinDistance abc 3 +damerauLevenshteinDistance abc ab 1 +damerauLevenshteinDistance abc abc 0 +damerauLevenshteinDistance abc bc 1 +damerauLevenshteinDistance clickhouse mouse 6 +stringJaccardIndex 0 +stringJaccardIndex abc 0 +stringJaccardIndex abc 0 +stringJaccardIndex abc ab 0.6666666666666666 +stringJaccardIndex abc abc 1 +stringJaccardIndex abc bc 0.6666666666666666 +stringJaccardIndex clickhouse mouse 0.4 +stringJaccardIndexUTF8 0 +stringJaccardIndexUTF8 abc 0 +stringJaccardIndexUTF8 abc 0 +stringJaccardIndexUTF8 abc ab 0.6666666666666666 +stringJaccardIndexUTF8 abc abc 1 +stringJaccardIndexUTF8 abc bc 0.6666666666666666 +stringJaccardIndexUTF8 clickhouse mouse 0.4 +jaroSimilarity 0 +jaroSimilarity abc 3 +jaroSimilarity abc 3 +jaroSimilarity abc ab 0.8888888888888888 +jaroSimilarity abc abc 1 +jaroSimilarity abc bc 0 +jaroSimilarity clickhouse mouse 0 +jaroWinklerSimilarity 0 +jaroWinklerSimilarity abc 3 +jaroWinklerSimilarity abc 3 +jaroWinklerSimilarity abc ab 0.9111111111111111 +jaroWinklerSimilarity abc abc 1 +jaroWinklerSimilarity abc bc 0 +jaroWinklerSimilarity clickhouse mouse 0 +-- Special UTF-8 tests 0.4 0 0 0 0 0.25 0.625 -editDistance -7 -1 -1 -6 -levenshteinDistance -7 -1 -1 -6 diff --git a/tests/queries/0_stateless/02884_string_distance_function.sql b/tests/queries/0_stateless/02884_string_distance_function.sql index e3d9051ce5b..fddbf41f0e5 100644 --- a/tests/queries/0_stateless/02884_string_distance_function.sql +++ b/tests/queries/0_stateless/02884_string_distance_function.sql @@ -1,36 +1,44 @@ -select 'const arguments byteHammingDistance'; -select byteHammingDistance('abcd', 'abcd'); -select 'const arguments editDistance'; -select editDistance('clickhouse', 'mouse'); +SELECT '-- 
const arguments'; +-- just to see it works +SELECT 'clickhouse' AS s1, 'mouse' AS s2, byteHammingDistance(s1, s2); +SELECT 'clickhouse' AS s1, 'mouse' AS s2, editDistance(s1, s2); +SELECT 'clickhouse' AS s1, 'mouse' AS s2, damerauLevenshteinDistance(s1, s2); +SELECT 'clickhouse' AS s1, 'mouse' AS s2, stringJaccardIndex(s1, s2); +SELECT 'clickhouse' AS s1, 'mouse' AS s2, stringJaccardIndexUTF8(s1, s2); +SELECT 'clickhouse' AS s1, 'mouse' AS s2, jaroSimilarity(s1, s2); +SELECT 'clickhouse' AS s1, 'mouse' AS s2, jaroWinklerSimilarity(s1, s2); -select 'const arguments stringJaccardIndex'; -select stringJaccardIndex('clickhouse', 'mouse'); +SELECT '-- test aliases'; +SELECT 'clickhouse' AS s1, 'mouse' AS s2, mismatches(s1, s2); +SELECT 'clickhouse' AS s1, 'mouse' AS s2, levenshteinDistance(s1, s2); -drop table if exists t; -create table t +SELECT '-- Deny DoS using too large inputs'; +SELECT editDistance(randomString(power(2, 17)), 'abc'); -- { serverError TOO_LARGE_STRING_SIZE} +SELECT damerauLevenshteinDistance(randomString(power(2, 17)), 'abc'); -- { serverError TOO_LARGE_STRING_SIZE} +SELECT jaroSimilarity(randomString(power(2, 17)), 'abc'); -- { serverError TOO_LARGE_STRING_SIZE} +SELECT jaroWinklerSimilarity(randomString(power(2, 17)), 'abc'); -- { serverError TOO_LARGE_STRING_SIZE} + +DROP TABLE IF EXISTS t; +CREATE TABLE t ( - s1 String, - s2 String -) engine = MergeTree order by s1; + s1 String, + s2 String +) ENGINE = MergeTree ORDER BY s1; -insert into t values ('abcdefg', 'abcdef') ('abcdefg', 'bcdefg') ('abcdefg', '') ('mouse', 'clickhouse'); -select 'byteHammingDistance'; -select byteHammingDistance(s1, s2) FROM t ORDER BY s1, s2; -select 'byteHammingDistance(const, non const)'; -select byteHammingDistance('abc', s2) FROM t ORDER BY s1, s2; -select 'byteHammingDistance(non const, const)'; -select byteHammingDistance(s2, 'def') FROM t ORDER BY s1, s2; +-- actual test cases +INSERT INTO t VALUES ('', '') ('abc', '') ('', 'abc') ('abc', 'abc') ('abc', 'ab') ('abc', 'bc') ('clickhouse', 'mouse'); -select 'mismatches(alias)'; -select mismatches(s1, s2) FROM t ORDER BY s1, s2; -select mismatches('abc', s2) FROM t ORDER BY s1, s2; -select mismatches(s2, 'def') FROM t ORDER BY s1, s2; +SELECT '-- non-const arguments'; +SELECT 'byteHammingDistance', s1, s2, byteHammingDistance(s1, s2) FROM t ORDER BY ALL; +SELECT 'editDistance', s1, s2, editDistance(s1, s2) FROM t ORDER BY ALL; +SELECT 'damerauLevenshteinDistance', s1, s2, damerauLevenshteinDistance(s1, s2) FROM t ORDER BY ALL; +SELECT 'stringJaccardIndex', s1, s2, stringJaccardIndex(s1, s2) FROM t ORDER BY ALL; +SELECT 'stringJaccardIndexUTF8', s1, s2, stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY ALL; +SELECT 'jaroSimilarity', s1, s2, jaroSimilarity(s1, s2) FROM t ORDER BY ALL; +SELECT 'jaroWinklerSimilarity', s1, s2, jaroWinklerSimilarity(s1, s2) FROM t ORDER BY ALL; -select 'stringJaccardIndex'; -select stringJaccardIndex(s1, s2) FROM t ORDER BY s1, s2; -select stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY s1, s2; - --- we do not perform full UTF8 validation, so sometimes it just returns some result +SELECT '-- Special UTF-8 tests'; +-- We do not perform full UTF8 validation, so sometimes it just returns some result SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\x48\x65\x6C')); SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF\xFF\xFF\xFF')); SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\x41\xE2\x82\xAC')); @@ -42,14 +50,6 @@ SELECT 
stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x4 SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError BAD_ARGUMENTS } SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError BAD_ARGUMENTS } SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError BAD_ARGUMENTS } - SELECT stringJaccardIndexUTF8('😃🌍', '🙃😃🌑'), stringJaccardIndex('😃🌍', '🙃😃🌑'); -select 'editDistance'; -select editDistance(s1, s2) FROM t ORDER BY s1, s2; -select 'levenshteinDistance'; -select levenshteinDistance(s1, s2) FROM t ORDER BY s1, s2; - -SELECT editDistance(randomString(power(2, 17)), 'abc'); -- { serverError TOO_LARGE_STRING_SIZE} - -drop table t; +DROP TABLE t; diff --git a/tests/queries/0_stateless/02896_leading_zeroes_no_octal.reference b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.reference new file mode 100644 index 00000000000..5b932f50824 --- /dev/null +++ b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.reference @@ -0,0 +1,35 @@ +Leading zeroes into INTEGER +1 1 00000 0 0 +1 2 0 0 0 +1 3 00 0 0 +1 4 01 1 1 +1 5 +01 1 1 +1 6 -01 -1 -1 +1 7 0001 1 1 +1 8 0005 5 5 +1 9 0008 8 8 +1 10 0017 17 17 +1 11 0021 21 21 +1 12 0051 51 51 +1 13 00000123 123 123 +1 14 0b10000 16 16 +1 15 0x0abcd 43981 43981 +1 16 0000.008 0 0 +1 17 1000.0008 1000 1000 +1 18 0008.0008 8 8 +Leading zeroes into Float32 +1 1 00000 0 0 +1 2 00009.00009 9.00009 9.00009 +1 3 00009e9 9000000000 9000000000 +1 4 00009e09 9000000000 9000000000 +1 5 00009e0009 9000000000 9000000000 +1 6 -00000 -0.1 -0.1 +1 7 -00009.00009 -9.00009 -9.00009 +1 8 -00009e9 -9000000000 -9000000000 +1 9 -00009e09 -9000000000 -9000000000 +1 10 -00009e0009 -9000000000 -9000000000 +1 11 +00000 0 0 +1 12 +00009.00009 9.00009 9.00009 +1 13 +00009e9 9000000000 9000000000 +1 14 +00009e09 9000000000 9000000000 +1 15 +00009e0009 9000000000 9000000000 diff --git a/tests/queries/0_stateless/02896_leading_zeroes_no_octal.sql b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.sql new file mode 100644 index 00000000000..69cc06a46f8 --- /dev/null +++ b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.sql @@ -0,0 +1,28 @@ +DROP TABLE IF EXISTS t_leading_zeroes; +DROP TABLE IF EXISTS t_leading_zeroes_f; + +CREATE TABLE t_leading_zeroes(id INTEGER, input String, val INTEGER, expected INTEGER) ENGINE=MergeTree ORDER BY id; + +INSERT INTO t_leading_zeroes VALUES (1, '00000', 00000, 0), (2, '0', 0, 0), (3, '00', 00, 0), (4, '01', 01, 1), (5, '+01', +01, 1); +INSERT INTO t_leading_zeroes VALUES (6, '-01', -01, -1), (7, '0001', 0001, 1), (8, '0005', 0005, 5), (9, '0008', 0008, 8); +INSERT INTO t_leading_zeroes VALUES (10, '0017', 0017, 17), (11, '0021', 0021, 21), (12, '0051', 0051, 51), (13, '00000123', 00000123, 123); +INSERT INTO t_leading_zeroes VALUES (14, '0b10000', 0b10000, 16), (15, '0x0abcd', 0x0abcd, 43981), (16, '0000.008', 0000.008, 0) +INSERT INTO t_leading_zeroes VALUES (17, '1000.0008', 1000.0008, 1000), (18, '0008.0008', 0008.0008, 8); + +SELECT 'Leading zeroes into INTEGER'; +SELECT t.val == t.expected AS ok, * FROM t_leading_zeroes t ORDER BY id; + +-- Floats don't go via the weird octal path: +CREATE TABLE t_leading_zeroes_f(id INTEGER, input String, val Float32, expected Float32) ENGINE=MergeTree ORDER BY id; +INSERT INTO t_leading_zeroes_f VALUES (1, '00000', 00000, 0), (2, '00009.00009', 00009.00009, 9.00009), (3, '00009e9', 00009e9, 9e9), (4, '00009e09', 00009e09, 9e9), (5, '00009e0009', 00009e0009, 
9e9); +INSERT INTO t_leading_zeroes_f VALUES (6, '-00000', -00000.1, -0.1), (7, '-00009.00009', -00009.00009, -9.00009), (8, '-00009e9', -00009e9, -9e9), (9, '-00009e09', -00009e09, -9e9), (10, '-00009e0009', -00009e0009, -9e9); +INSERT INTO t_leading_zeroes_f VALUES (11, '+00000', +00000., 0), (12, '+00009.00009', +00009.00009, 9.00009), (13, '+00009e9', +00009e9, 9e9), (14, '+00009e09', +00009e09, 9e9), (15, '+00009e0009', +00009e0009, 9e9); +-- Coincidentally, the following result in 9 rather than 9e9 because of readFloatTextFastImpl +-- using readUIntTextUpToNSignificantDigits<4>(exponent, in) +-- INSERT INTO t_leading_zeroes_f VALUES (100, '00009e00009', 00009e00009, 9e9), (101, '-00009e00009', -00009e00009, -9e9), (102, '+00009e00009', +00009e00009, 9e9) + +SELECT 'Leading zeroes into Float32'; +SELECT t.val == t.expected AS ok, * FROM t_leading_zeroes_f t ORDER BY id; + +DROP TABLE IF EXISTS t_leading_zeroes; +DROP TABLE IF EXISTS t_leading_zeroes_f; \ No newline at end of file diff --git a/tests/queries/0_stateless/02918_multif_for_nullable.reference b/tests/queries/0_stateless/02918_multif_for_nullable.reference new file mode 100644 index 00000000000..f58086cfee1 --- /dev/null +++ b/tests/queries/0_stateless/02918_multif_for_nullable.reference @@ -0,0 +1,5 @@ +-1 -1 -1 +1 -1 -1 +1 1 -1 +1 2 \N +1 3 \N diff --git a/tests/queries/0_stateless/02918_multif_for_nullable.sh b/tests/queries/0_stateless/02918_multif_for_nullable.sh new file mode 100755 index 00000000000..cd9ac8b904f --- /dev/null +++ b/tests/queries/0_stateless/02918_multif_for_nullable.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# NOTE: this sh wrapper is required because of shell_config + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl" +$CLICKHOUSE_CLIENT -q "create table test_tbl (d Nullable(Int64)) engine=Memory" +$CLICKHOUSE_CLIENT -q "insert into test_tbl select * from numbers(5)" +$CLICKHOUSE_CLIENT -q "select multiIf(d > 0, 1, -1), multiIf(d > 1, d-1, -1), multiIf(d > 2, null, -1) from test_tbl" +$CLICKHOUSE_CLIENT -q "drop table test_tbl" \ No newline at end of file diff --git a/tests/queries/0_stateless/02946_format_values.reference b/tests/queries/0_stateless/02946_format_values.reference new file mode 100644 index 00000000000..90b2a3cb8ef --- /dev/null +++ b/tests/queries/0_stateless/02946_format_values.reference @@ -0,0 +1,141 @@ +INSERT INTO table1 VALUES (1, [1,3], 'fd'), (2, [2,4], 'sd'), (3, [3,5], 'td') +====================================== +SELECT a +FROM table1 +; + +INSERT INTO table1 VALUES (1, [1,3], 'fd'), (2, [2,4], 'sd'), (3, [3,5], 'td'); + +SELECT b +FROM table1 +; + +====================================== +-- begin +SELECT a +FROM table1 +; + +-- some insert query +INSERT INTO table1 VALUES (1, [1,3], 'fd'), (2, [2,4], 'sd'), (3, [3,5], 'td'); + +-- more comments +-- in a row +SELECT b +FROM table1 +; + +-- end +====================================== +SELECT b FROM table1; + +SELECT b, c FROM table1; + +SELECT + b, + c, + d +FROM table1 +; + +SELECT + b, + c, + d, + e +FROM table1 +; + +SELECT + b, + c, + d, + e, + f +FROM table1 +; + +SELECT + b, + c +FROM +( + SELECT + b, + c + FROM table1 +) +; + +SELECT + b, + c, + d, + e, + f +FROM +( + SELECT + b, + c, + d, + e, + f + FROM table1 +) +; + +====================================== +SELECT b FROM table1; + +SELECT b, c FROM table1; + +SELECT b, c, d FROM table1; + +SELECT b, c, d, e FROM table1; + +SELECT b, c, d, e, f FROM table1; + +SELECT b, c FROM (SELECT b, c FROM table1); + +SELECT + b, + c, + d, + e, + f +FROM +( + SELECT + b, + c, + d, + e, + f + FROM table1 +) +; + +====================================== +SELECT + b, + c, + d, + e, + f +FROM +( + SELECT + b, + c, + d, + e, + f + FROM table1 +) +SELECT b, c, d, e, f FROM (SELECT b, c, d, e, f FROM table1) +====================================== +Option 'max_line_length' must be less than 256. +2 +Options 'oneline' and 'max_line_length' are mutually exclusive. +2 diff --git a/tests/queries/0_stateless/02946_format_values.sh b/tests/queries/0_stateless/02946_format_values.sh new file mode 100755 index 00000000000..36e32de42fa --- /dev/null +++ b/tests/queries/0_stateless/02946_format_values.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +echo "insert into table1 values (1, [1,3], 'fd'), (2, [2,4], 'sd'), (3, [3,5], 'td')" | ${CLICKHOUSE_FORMAT} + +echo "======================================" + +cat <&1; echo $?; } +{ echo "select 1" | ${CLICKHOUSE_FORMAT} --comments --max_line_length=120 --oneline 2>&1; echo $?; } diff --git a/tests/queries/0_stateless/02961_output_format_compress_params.reference b/tests/queries/0_stateless/02961_output_format_compress_params.reference new file mode 100644 index 00000000000..d0752a77fc7 --- /dev/null +++ b/tests/queries/0_stateless/02961_output_format_compress_params.reference @@ -0,0 +1,2 @@ +1 +1000000 diff --git a/tests/queries/0_stateless/02961_output_format_compress_params.sh b/tests/queries/0_stateless/02961_output_format_compress_params.sh new file mode 100755 index 00000000000..7275f9a0b2b --- /dev/null +++ b/tests/queries/0_stateless/02961_output_format_compress_params.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Tags: replica + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +file_with_random_postfix=test_02961_`date +%s%6N`.csv + +${CLICKHOUSE_CLIENT} --query "INSERT INTO FUNCTION file('${file_with_random_postfix}', 'CSV', 'x UInt64', 'zstd') SELECT number FROM numbers(1000000) SETTINGS output_format_compression_level = 10, output_format_compression_zstd_window_log = 30, engine_file_truncate_on_insert = 1;" +# Simple check that output_format_compression_zstd_window_log = 30 works +${CLICKHOUSE_CLIENT} --query "SELECT count() FROM file('${file_with_random_postfix}', 'CSV', 'x UInt64', 'zstd') SETTINGS zstd_window_log_max = 29;" 2>&1 | head -n 1 | grep -c "ZSTD_DECODER_FAILED" +${CLICKHOUSE_CLIENT} --query "SELECT count() FROM file('${file_with_random_postfix}', 'CSV', 'x UInt64', 'zstd') SETTINGS zstd_window_log_max = 30;" diff --git a/tests/queries/0_stateless/02962_max_joined_block_rows.reference b/tests/queries/0_stateless/02962_max_joined_block_rows.reference new file mode 100644 index 00000000000..8bc1bad225b --- /dev/null +++ b/tests/queries/0_stateless/02962_max_joined_block_rows.reference @@ -0,0 +1,32 @@ +1 0 +1 1 +1 2 +1 3 +1 4 +1 5 +1 6 +1 7 +1 8 +1 9 +-- +1 0 +1 1 +1 2 +1 3 +1 4 +1 5 +1 6 +1 7 +1 8 +1 9 +-- +1 0 +1 1 +1 2 +1 3 +1 4 +1 5 +1 6 +1 7 +1 8 +1 9 diff --git a/tests/queries/0_stateless/02962_max_joined_block_rows.sql b/tests/queries/0_stateless/02962_max_joined_block_rows.sql new file mode 100644 index 00000000000..c31ab5e1132 --- /dev/null +++ b/tests/queries/0_stateless/02962_max_joined_block_rows.sql @@ -0,0 +1,38 @@ +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE table t1 (a UInt64, b UInt64) ENGINE = Memory; +INSERT INTO t1 SELECT number % 2, number FROM numbers(10); + +CREATE table t2 (a UInt64) ENGINE = Memory; + +INSERT INTO t2 SELECT number % 2 FROM numbers(10); + +-- block size is always multiple of 5 because we have 5 rows for each key in right table +-- we do not split rows corresponding to the same key + +SELECT max(bs) <= 5, b FROM ( + SELECT blockSize() as bs, * FROM t1 JOIN t2 ON t1.a = t2.a +) GROUP BY b +ORDER BY b +SETTINGS max_joined_block_size_rows = 5; + +SELECT '--'; + +SELECT max(bs) <= 10, b FROM ( + SELECT blockSize() as bs, * FROM t1 JOIN t2 ON t1.a = t2.a +) GROUP BY b +ORDER BY b +SETTINGS max_joined_block_size_rows = 10; + +SELECT '--'; + +-- parallel_hash doen't support max_joined_block_size_rows + +SET join_algorithm = 'parallel_hash'; + +SELECT max(bs) > 10, b FROM ( + SELECT blockSize() as bs, * FROM 
t1 JOIN t2 ON t1.a = t2.a +) GROUP BY b +ORDER BY b +SETTINGS max_joined_block_size_rows = 10; diff --git a/utils/check-mysql-binlog/main.cpp b/utils/check-mysql-binlog/main.cpp index d1f868eba97..484dd46a90c 100644 --- a/utils/check-mysql-binlog/main.cpp +++ b/utils/check-mysql-binlog/main.cpp @@ -1,173 +1,98 @@ +#include +#include +#include #include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DB::MySQLCharsetPtr charset = std::make_shared(); -static DB::MySQLReplication::BinlogEventPtr parseSingleEventBody( - DB::MySQLReplication::EventHeader & header, DB::ReadBuffer & payload, - std::shared_ptr & last_table_map_event, bool exist_checksum) +bool quit = false; +void signal_handler(int) { + quit = true; +} + +static void processBinlogFromFile(const std::string & bin_path, bool disable_checksum) +{ + DB::MySQLReplication::BinlogFromFile binlog; + binlog.open(bin_path); + binlog.setChecksum(disable_checksum ? DB::MySQLReplication::IBinlog::NONE : DB::MySQLReplication::IBinlog::CRC32); + DB::MySQLReplication::BinlogEventPtr event; - DB::ReadBufferPtr limit_read_buffer = std::make_shared(payload, header.event_size - 19, - /* trow_exception */ false, /* exact_limit */ std::nullopt); - DB::ReadBufferPtr event_payload = std::make_shared(*limit_read_buffer, exist_checksum ? 4 : 0); - - switch (header.type) + while (binlog.tryReadEvent(event, /*timeout*/ 0) && !quit) { - case DB::MySQLReplication::FORMAT_DESCRIPTION_EVENT: - { - event = std::make_shared(std::move(header)); - event->parseEvent(*event_payload); - break; - } - case DB::MySQLReplication::ROTATE_EVENT: - { - event = std::make_shared(std::move(header)); - event->parseEvent(*event_payload); - break; - } - case DB::MySQLReplication::QUERY_EVENT: - { - event = std::make_shared(std::move(header)); - event->parseEvent(*event_payload); - - auto query = std::static_pointer_cast(event); - switch (query->typ) - { - case DB::MySQLReplication::QUERY_EVENT_MULTI_TXN_FLAG: - case DB::MySQLReplication::QUERY_EVENT_XA: - { - event = std::make_shared(std::move(query->header)); - break; - } - default: - break; - } - break; - } - case DB::MySQLReplication::XID_EVENT: - { - event = std::make_shared(std::move(header)); - event->parseEvent(*event_payload); - break; - } - case DB::MySQLReplication::TABLE_MAP_EVENT: - { - DB::MySQLReplication::TableMapEventHeader map_event_header; - map_event_header.parse(*event_payload); - event = std::make_shared(std::move(header), map_event_header, charset); - event->parseEvent(*event_payload); - last_table_map_event = std::static_pointer_cast(event); - break; - } - case DB::MySQLReplication::WRITE_ROWS_EVENT_V1: - case DB::MySQLReplication::WRITE_ROWS_EVENT_V2: - { - DB::MySQLReplication::RowsEventHeader rows_header(header.type); - rows_header.parse(*event_payload); - event = std::make_shared(last_table_map_event, std::move(header), rows_header); - event->parseEvent(*event_payload); - break; - } - case DB::MySQLReplication::DELETE_ROWS_EVENT_V1: - case DB::MySQLReplication::DELETE_ROWS_EVENT_V2: - { - DB::MySQLReplication::RowsEventHeader rows_header(header.type); - rows_header.parse(*event_payload); - event = std::make_shared(last_table_map_event, std::move(header), rows_header); - event->parseEvent(*event_payload); - break; - } - case DB::MySQLReplication::UPDATE_ROWS_EVENT_V1: - case DB::MySQLReplication::UPDATE_ROWS_EVENT_V2: - { - DB::MySQLReplication::RowsEventHeader rows_header(header.type); - rows_header.parse(*event_payload); - event = 
std::make_shared(last_table_map_event, std::move(header), rows_header); - event->parseEvent(*event_payload); - break; - } - case DB::MySQLReplication::GTID_EVENT: - { - event = std::make_shared(std::move(header)); - event->parseEvent(*event_payload); - break; - } - default: - { - event = std::make_shared(std::move(header)); - event->parseEvent(*event_payload); - break; - } + DB::WriteBufferFromOStream cout(std::cout); + event->dump(cout); + binlog.getPosition().dump(cout); + cout.finalize(); } - - return event; } -static int checkBinLogFile(const std::string & bin_path, bool exist_checksum) +static void processBinlogFromSocket(const std::string & host, int port, const std::string & user, const std::string & password, const std::string & executed_gtid_set, bool disable_checksum) { - DB::ReadBufferFromFile in(bin_path); - DB::assertString("\xfe\x62\x69\x6e", in); /// magic number + DB::MySQLReplication::BinlogFromSocket binlog; + binlog.setChecksum(disable_checksum ? DB::MySQLReplication::IBinlog::NONE : DB::MySQLReplication::IBinlog::CRC32); - DB::MySQLReplication::BinlogEventPtr last_event; - std::shared_ptr last_header; - std::shared_ptr table_map; + binlog.connect(host, port, user, password); + binlog.start(/*unique number*/ 42, executed_gtid_set); + DB::MySQLReplication::BinlogEventPtr event; - try + while (!quit) { - while (!in.eof()) + if (binlog.tryReadEvent(event, /*timeout*/ 100)) { - last_header = std::make_shared(); - last_header->parse(in); - last_event = parseSingleEventBody(*last_header, in, table_map, exist_checksum); + if (event->header.type != DB::MySQLReplication::HEARTBEAT_EVENT) + { + DB::WriteBufferFromOStream cout(std::cout); + event->dump(cout); + binlog.getPosition().dump(cout); + cout.finalize(); + } } } - catch (...) - { - DB::WriteBufferFromOStream cerr(std::cerr); - cerr << "Unable to parse MySQL binlog event. Code: " << DB::getCurrentExceptionCode() << ", Exception message: " - << DB::getCurrentExceptionMessage(false) << '\n' << ", Previous event: " << '\n'; - last_event->dump(cerr); - cerr << '\n' << ", Event header: " << '\n'; - last_header->dump(cerr); - cerr << '\n'; - return DB::getCurrentExceptionCode(); - } - - DB::WriteBufferFromOStream cout(std::cout); - cout << "Check passed. " << '\n' << "No exception was thrown." 
<< '\n' << "The last binlog event: " << '\n'; - last_event->dump(cout); - cout << '\n'; - return 0; } - int main(int argc, char ** argv) { + (void)signal(SIGINT, signal_handler); boost::program_options::options_description desc("Allowed options"); - desc.add_options()("help,h", "Produce help message"); - desc.add_options()("disable_checksum", "Disable checksums in binlog files."); - boost::program_options::variables_map options; - boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); + std::string host = "127.0.0.1"; + int port = 3306; + std::string user = "root"; + std::string password; + std::string gtid; - if (options.count("help") || argc < 2) + desc.add_options() + ("help", "Produce help message") + ("disable_checksum", "Disable checksums in binlog files.") + ("binlog", boost::program_options::value(), "Binlog file") + ("host", boost::program_options::value(&host)->default_value(host), "Host to connect") + ("port", boost::program_options::value(&port)->default_value(port), "Port number to connect") + ("user", boost::program_options::value(&user)->default_value(user), "User") + ("password", boost::program_options::value(&password), "Password") + ("gtid", boost::program_options::value(>id), "Executed gtid set"); + + try { - std::cout << "Usage: " << argv[0] << " mysql_binlog_file" << std::endl; - std::cout << desc << std::endl; - return 1; + boost::program_options::variables_map options; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); + boost::program_options::notify(options); + + if (options.count("help") || (!options.count("binlog") && !options.count("gtid"))) + { + std::cout << "Usage: " << argv[0] << std::endl; + std::cout << desc << std::endl; + return EXIT_FAILURE; + } + + if (options.count("binlog")) + processBinlogFromFile(options["binlog"].as(), options.count("disable_checksum")); + else + processBinlogFromSocket(host, port, user, password, gtid, options.count("disable_checksum")); + } + catch (std::exception & ex) + { + std::cerr << ex.what() << std::endl; + return EXIT_FAILURE; } - return checkBinLogFile(argv[argc - 1], !options.count("disable_checksum")); + return EXIT_SUCCESS; } diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 18566993870..3c774e11de4 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -198,6 +198,7 @@ DELETEs DESC DIEs DOGEFI +Damerau DataGrip DataLens DataTime @@ -414,6 +415,7 @@ JSONType JSONs Jaeger Jannis +Jaro JavaHash Jemalloc Jepsen @@ -716,6 +718,7 @@ ProxySQL Punycode PyArrow PyCharm +QATlib QEMU QTCreator Quantile @@ -725,6 +728,7 @@ QueryCacheHits QueryCacheMisses QueryPreempted QueryThread +QuickAssist QuoteMeta RBAC RClickHouse @@ -980,6 +984,7 @@ Werror Wether WikiStat WindowView +Winkler WithNames WithNamesAndTypes WordNet @@ -996,6 +1001,7 @@ YYYYMMDDToDate YYYYMMDDhhmmssToDateTime Yandex Yasm +ZSTDQAT Zabbix Zipkin ZooKeeper @@ -1342,6 +1348,7 @@ cutToFirstSignificantSubdomainWithWWW cutURLParameter cutWWW cyrus +damerauLevenshteinDistance datacenter datacenters datafiles @@ -1701,6 +1708,8 @@ isZeroOrNull iteratively jaccard jaccardIndex +jaroSimilarity +jaroWinklerSimilarity javaHash javaHashUTF jbod