diff --git a/CMakeLists.txt b/CMakeLists.txt index f65cc3b3d48..0e9d2c13d45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,8 @@ else() set(RECONFIGURE_MESSAGE_LEVEL STATUS) endif() +enable_language(C CXX ASM) + include (cmake/arch.cmake) include (cmake/target.cmake) include (cmake/tools.cmake) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index bd7885bc41b..9d74179902d 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -1,9 +1,9 @@ # This strings autochanged from release_lib.sh: -SET(VERSION_REVISION 54449) +SET(VERSION_REVISION 54450) SET(VERSION_MAJOR 21) -SET(VERSION_MINOR 4) +SET(VERSION_MINOR 5) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH af2135ef9dc72f16fa4f229b731262c3f0a8bbdc) -SET(VERSION_DESCRIBE v21.4.1.1-prestable) -SET(VERSION_STRING 21.4.1.1) +SET(VERSION_GITHASH 3827789b3d8fd2021952e57e5110343d26daa1a1) +SET(VERSION_DESCRIBE v21.5.1.1-prestable) +SET(VERSION_STRING 21.5.1.1) # end of autochange diff --git a/cmake/find/base64.cmake b/cmake/find/base64.cmake index 7427baf9cad..acade11eb2f 100644 --- a/cmake/find/base64.cmake +++ b/cmake/find/base64.cmake @@ -1,4 +1,8 @@ -option (ENABLE_BASE64 "Enable base64" ${ENABLE_LIBRARIES}) +if(ARCH_AMD64 OR ARCH_ARM) + option (ENABLE_BASE64 "Enable base64" ${ENABLE_LIBRARIES}) +elseif(ENABLE_BASE64) + message (${RECONFIGURE_MESSAGE_LEVEL} "base64 library is only supported on x86_64 and aarch64") +endif() if (NOT ENABLE_BASE64) return() diff --git a/cmake/find/fastops.cmake b/cmake/find/fastops.cmake index 5ab320bdb7a..1675646654e 100644 --- a/cmake/find/fastops.cmake +++ b/cmake/find/fastops.cmake @@ -1,7 +1,7 @@ -if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT OS_DARWIN) +if(ARCH_AMD64 AND NOT OS_FREEBSD AND NOT OS_DARWIN) option(ENABLE_FASTOPS "Enable fast vectorized mathematical functions library by Mikhail Parakhin" ${ENABLE_LIBRARIES}) elseif(ENABLE_FASTOPS) - message (${RECONFIGURE_MESSAGE_LEVEL} "Fastops library is not 
supported on ARM, FreeBSD and Darwin") + message (${RECONFIGURE_MESSAGE_LEVEL} "Fastops library is supported on x86_64 only, and not FreeBSD or Darwin") endif() if(NOT ENABLE_FASTOPS) diff --git a/cmake/find/hdfs3.cmake b/cmake/find/hdfs3.cmake index 7b385f24e1e..3aab2b612ef 100644 --- a/cmake/find/hdfs3.cmake +++ b/cmake/find/hdfs3.cmake @@ -1,4 +1,4 @@ -if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF) +if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF AND NOT ARCH_PPC64LE) option(ENABLE_HDFS "Enable HDFS" ${ENABLE_LIBRARIES}) elseif(ENABLE_HDFS OR USE_INTERNAL_HDFS3_LIBRARY) message (${RECONFIGURE_MESSAGE_LEVEL} "Cannot use HDFS3 with current configuration") diff --git a/cmake/find/ldap.cmake b/cmake/find/ldap.cmake index 369c1e42e8d..0dffa334e73 100644 --- a/cmake/find/ldap.cmake +++ b/cmake/find/ldap.cmake @@ -62,6 +62,7 @@ if (NOT OPENLDAP_FOUND AND NOT MISSING_INTERNAL_LDAP_LIBRARY) if ( ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "x86_64" ) OR ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "aarch64" ) OR + ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "ppc64le" ) OR ( "${_system_name}" STREQUAL "freebsd" AND "${_system_processor}" STREQUAL "x86_64" ) OR ( "${_system_name}" STREQUAL "darwin" AND "${_system_processor}" STREQUAL "x86_64" ) ) diff --git a/cmake/find/s3.cmake b/cmake/find/s3.cmake index 1bbf48fd6b0..1b0c652a31a 100644 --- a/cmake/find/s3.cmake +++ b/cmake/find/s3.cmake @@ -1,7 +1,7 @@ -if(NOT OS_FREEBSD AND NOT APPLE AND NOT ARCH_ARM) +if(NOT OS_FREEBSD AND NOT APPLE) option(ENABLE_S3 "Enable S3" ${ENABLE_LIBRARIES}) elseif(ENABLE_S3 OR USE_INTERNAL_AWS_S3_LIBRARY) - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on ARM, Apple or FreeBSD") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on Apple or FreeBSD") endif() if(NOT ENABLE_S3) diff --git a/cmake/linux/default_libs.cmake b/cmake/linux/default_libs.cmake index 
d3a727e9cb8..c1e4d450389 100644 --- a/cmake/linux/default_libs.cmake +++ b/cmake/linux/default_libs.cmake @@ -6,7 +6,7 @@ set (DEFAULT_LIBS "-nodefaultlibs") # We need builtins from Clang's RT even without libcxx - for ubsan+int128. # See https://bugs.llvm.org/show_bug.cgi?id=16404 if (COMPILER_CLANG AND NOT (CMAKE_CROSSCOMPILING AND ARCH_AARCH64)) - execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.builtins-${CMAKE_SYSTEM_PROCESSOR}.a OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) else () set (BUILTINS_LIBRARY "-lgcc") endif () diff --git a/cmake/tools.cmake b/cmake/tools.cmake index abb11843d59..44fc3b3e530 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -86,8 +86,3 @@ if (LINKER_NAME) message(STATUS "Using custom linker by name: ${LINKER_NAME}") endif () -if (ARCH_PPC64LE) - if (COMPILER_CLANG OR (COMPILER_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)) - message(FATAL_ERROR "Only gcc-8 or higher is supported for powerpc architecture") - endif () -endif () diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index b9298f59f2b..0759935a7db 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -160,6 +160,12 @@ if (NOT EXTERNAL_BOOST_FOUND) enable_language(ASM) SET(ASM_OPTIONS "-x assembler-with-cpp") + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread")) add_compile_definitions(BOOST_USE_UCONTEXT) @@ -169,39 +175,34 @@ if (NOT EXTERNAL_BOOST_FOUND) add_compile_definitions(BOOST_USE_TSAN) endif() - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} 
${LIBRARY_DIR}/libs/context/src/fiber.cpp ${LIBRARY_DIR}/libs/context/src/continuation.cpp - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) - elseif (ARCH_ARM) - set (SRCS_CONTEXT + endif() + if (ARCH_ARM) + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + elseif (ARCH_PPC64LE) + set (SRCS_CONTEXT ${SRCS_CONTEXT} + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S ) elseif(OS_DARWIN) - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) else() - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) endif() diff --git a/contrib/cctz-cmake/CMakeLists.txt b/contrib/cctz-cmake/CMakeLists.txt index 90e33dc9f62..a3869478347 100644 --- a/contrib/cctz-cmake/CMakeLists.txt +++ 
b/contrib/cctz-cmake/CMakeLists.txt @@ -97,12 +97,19 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS) set(TZ_OBJS ${TZ_OBJS} ${TZ_OBJ}) # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - add_custom_command(OUTPUT ${TZ_OBJ} - COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} - COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} + # PPC64LE fails to do this with objcopy, use ld or lld instead + if (ARCH_PPC64LE) + add_custom_command(OUTPUT ${TZ_OBJ} + COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} + COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${TZ_OBJ} ${TIMEZONE_ID} + COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) + else() + add_custom_command(OUTPUT ${TZ_OBJ} + COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} + COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} --rename-section .data=.rodata,alloc,load,readonly,data,contents ${TIMEZONE_ID} ${TZ_OBJ} - COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) - + COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) + endif() set_source_files_properties(${TZ_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) endforeach(TIMEZONE) diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index b8a6474413a..73afa99f1d8 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -1,7 +1,7 @@ -if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN)) +if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN)) if (ENABLE_JEMALLOC) message (${RECONFIGURE_MESSAGE_LEVEL} - "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64 or aarch64 on linux 
or freebsd.") + "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64 or ppc64le on linux or freebsd.") endif() set (ENABLE_JEMALLOC OFF) else() @@ -107,6 +107,8 @@ if (ARCH_AMD64) set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64") elseif (ARCH_ARM) set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_aarch64") +elseif (ARCH_PPC64LE) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_ppc64le") else () message (FATAL_ERROR "internal jemalloc: This arch is not supported") endif () diff --git a/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..8068861041f --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,367 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#define JEMALLOC_OVERRIDE___LIBC_CALLOC +#define JEMALLOC_OVERRIDE___LIBC_FREE +#define JEMALLOC_OVERRIDE___LIBC_MALLOC +#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN +#define JEMALLOC_OVERRIDE___LIBC_REALLOC +#define JEMALLOC_OVERRIDE___LIBC_VALLOC +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. 
+ * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 64 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS 1 + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS 1 +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1 + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS 1 +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1 + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +// #define JEMALLOC_HAVE_SECURE_GETENV + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1 + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. 
+ */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1 + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). 
+ */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. 
+ */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. 
+ */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H 1 + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT 1 + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD 1 + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. 
*/ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC 1 + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/libcpuid-cmake/CMakeLists.txt b/contrib/libcpuid-cmake/CMakeLists.txt index 8c1be50b4e6..9baebb3ba1b 100644 --- a/contrib/libcpuid-cmake/CMakeLists.txt +++ b/contrib/libcpuid-cmake/CMakeLists.txt @@ -1,11 +1,9 @@ -if (NOT ARCH_ARM) +if(ARCH_AMD64) option (ENABLE_CPUID "Enable libcpuid library (only internal)" ${ENABLE_LIBRARIES}) -endif() - -if (ARCH_ARM AND ENABLE_CPUID) - message (${RECONFIGURE_MESSAGE_LEVEL} "cpuid is not supported on ARM") +elseif(ENABLE_CPUID) + message (${RECONFIGURE_MESSAGE_LEVEL} "libcpuid is only supported on x86_64") set (ENABLE_CPUID 0) -endif () +endif() if (NOT ENABLE_CPUID) add_library (cpuid INTERFACE) diff --git a/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h b/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h new file mode 100644 index 00000000000..dbd59430527 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h @@ -0,0 +1,63 @@ +/* include/lber_types.h. Generated from lber_types.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +/* + * LBER types + */ + +#ifndef _LBER_TYPES_H +#define _LBER_TYPES_H + +#include + +LDAP_BEGIN_DECL + +/* LBER boolean, enum, integers (32 bits or larger) */ +#define LBER_INT_T int + +/* LBER tags (32 bits or larger) */ +#define LBER_TAG_T long + +/* LBER socket descriptor */ +#define LBER_SOCKET_T int + +/* LBER lengths (32 bits or larger) */ +#define LBER_LEN_T long + +/* ------------------------------------------------------------ */ + +/* booleans, enumerations, and integers */ +typedef LBER_INT_T ber_int_t; + +/* signed and unsigned versions */ +typedef signed LBER_INT_T ber_sint_t; +typedef unsigned LBER_INT_T ber_uint_t; + +/* tags */ +typedef unsigned LBER_TAG_T ber_tag_t; + +/* "socket" descriptors */ +typedef LBER_SOCKET_T ber_socket_t; + +/* lengths */ +typedef unsigned LBER_LEN_T ber_len_t; + +/* signed lengths */ +typedef signed LBER_LEN_T ber_slen_t; + +LDAP_END_DECL + +#endif /* _LBER_TYPES_H */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h b/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h new file mode 100644 index 00000000000..89f7b40b884 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h @@ -0,0 +1,74 @@ +/* include/ldap_config.h. Generated from ldap_config.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * This file works in conjunction with OpenLDAP configure system. + * If you do no like the values below, adjust your configure options. 
+ */ + +#ifndef _LDAP_CONFIG_H +#define _LDAP_CONFIG_H + +/* directory separator */ +#ifndef LDAP_DIRSEP +#ifndef _WIN32 +#define LDAP_DIRSEP "/" +#else +#define LDAP_DIRSEP "\\" +#endif +#endif + +/* directory for temporary files */ +#if defined(_WIN32) +# define LDAP_TMPDIR "C:\\." /* we don't have much of a choice */ +#elif defined( _P_tmpdir ) +# define LDAP_TMPDIR _P_tmpdir +#elif defined( P_tmpdir ) +# define LDAP_TMPDIR P_tmpdir +#elif defined( _PATH_TMPDIR ) +# define LDAP_TMPDIR _PATH_TMPDIR +#else +# define LDAP_TMPDIR LDAP_DIRSEP "tmp" +#endif + +/* directories */ +#ifndef LDAP_BINDIR +#define LDAP_BINDIR "/tmp/ldap-prefix/bin" +#endif +#ifndef LDAP_SBINDIR +#define LDAP_SBINDIR "/tmp/ldap-prefix/sbin" +#endif +#ifndef LDAP_DATADIR +#define LDAP_DATADIR "/tmp/ldap-prefix/share/openldap" +#endif +#ifndef LDAP_SYSCONFDIR +#define LDAP_SYSCONFDIR "/tmp/ldap-prefix/etc/openldap" +#endif +#ifndef LDAP_LIBEXECDIR +#define LDAP_LIBEXECDIR "/tmp/ldap-prefix/libexec" +#endif +#ifndef LDAP_MODULEDIR +#define LDAP_MODULEDIR "/tmp/ldap-prefix/libexec/openldap" +#endif +#ifndef LDAP_RUNDIR +#define LDAP_RUNDIR "/tmp/ldap-prefix/var" +#endif +#ifndef LDAP_LOCALEDIR +#define LDAP_LOCALEDIR "" +#endif + + +#endif /* _LDAP_CONFIG_H */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h b/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h new file mode 100644 index 00000000000..f0cc7c3626f --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h @@ -0,0 +1,61 @@ +/* include/ldap_features.h. Generated from ldap_features.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. 
+ * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * LDAP Features + */ + +#ifndef _LDAP_FEATURES_H +#define _LDAP_FEATURES_H 1 + +/* OpenLDAP API version macros */ +#define LDAP_VENDOR_VERSION 20501 +#define LDAP_VENDOR_VERSION_MAJOR 2 +#define LDAP_VENDOR_VERSION_MINOR 5 +#define LDAP_VENDOR_VERSION_PATCH X + +/* +** WORK IN PROGRESS! +** +** OpenLDAP reentrancy/thread-safeness should be dynamically +** checked using ldap_get_option(). +** +** The -lldap implementation is not thread-safe. +** +** The -lldap_r implementation is: +** LDAP_API_FEATURE_THREAD_SAFE (basic thread safety) +** but also be: +** LDAP_API_FEATURE_SESSION_THREAD_SAFE +** LDAP_API_FEATURE_OPERATION_THREAD_SAFE +** +** The preprocessor flag LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE +** can be used to determine if -lldap_r is available at compile +** time. You must define LDAP_THREAD_SAFE if and only if you +** link with -lldap_r. +** +** If you fail to define LDAP_THREAD_SAFE when linking with +** -lldap_r or define LDAP_THREAD_SAFE when linking with -lldap, +** provided header definitions and declarations may be incorrect. +** +*/ + +/* is -lldap_r available or not */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* LDAP v2 Referrals */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +#endif /* LDAP_FEATURES */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/portable.h b/contrib/openldap-cmake/linux_ppc64le/include/portable.h new file mode 100644 index 00000000000..2924b6713a4 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/portable.h @@ -0,0 +1,1169 @@ +/* include/portable.h. Generated from portable.hin by configure. */ +/* include/portable.hin. Generated from configure.in by autoheader. */ + + +/* begin of portable.h.pre */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _LDAP_PORTABLE_H +#define _LDAP_PORTABLE_H + +/* define this if needed to get reentrant functions */ +#ifndef REENTRANT +#define REENTRANT 1 +#endif +#ifndef _REENTRANT +#define _REENTRANT 1 +#endif + +/* define this if needed to get threadsafe functions */ +#ifndef THREADSAFE +#define THREADSAFE 1 +#endif +#ifndef _THREADSAFE +#define _THREADSAFE 1 +#endif +#ifndef THREAD_SAFE +#define THREAD_SAFE 1 +#endif +#ifndef _THREAD_SAFE +#define _THREAD_SAFE 1 +#endif + +#ifndef _SGI_MP_SOURCE +#define _SGI_MP_SOURCE 1 +#endif + +/* end of portable.h.pre */ + + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* define to use both and */ +/* #undef BOTH_STRINGS_H */ + +/* define if cross compiling */ +/* #undef CROSS_COMPILING */ + +/* set to the number of arguments ctime_r() expects */ +#define CTIME_R_NARGS 2 + +/* define if toupper() requires islower() */ +/* #undef C_UPPER_LOWER */ + +/* define if sys_errlist is not declared in stdio.h or errno.h */ +/* #undef DECL_SYS_ERRLIST */ + +/* define to enable slapi library */ +/* #undef ENABLE_SLAPI */ + +/* defined to be the EXE extension */ +#define EXEEXT "" + +/* set to the number of arguments gethostbyaddr_r() expects */ +#define GETHOSTBYADDR_R_NARGS 8 + +/* set to the number of arguments gethostbyname_r() expects */ +#define GETHOSTBYNAME_R_NARGS 6 + +/* Define to 1 if `TIOCGWINSZ' requires . */ +#define GWINSZ_IN_SYS_IOCTL 1 + +/* define if you have AIX security lib */ +/* #undef HAVE_AIX_SECURITY */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_ARPA_NAMESER_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ASSERT_H 1 + +/* Define to 1 if you have the `bcopy' function. */ +#define HAVE_BCOPY 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_BITS_TYPES_H 1 + +/* Define to 1 if you have the `chroot' function. */ +#define HAVE_CHROOT 1 + +/* Define to 1 if you have the `closesocket' function. */ +/* #undef HAVE_CLOSESOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CONIO_H */ + +/* define if crypt(3) is available */ +/* #undef HAVE_CRYPT */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRYPT_H */ + +/* define if crypt_r() is also available */ +/* #undef HAVE_CRYPT_R */ + +/* Define to 1 if you have the `ctime_r' function. */ +#define HAVE_CTIME_R 1 + +/* define if you have Cyrus SASL */ +/* #undef HAVE_CYRUS_SASL */ + +/* define if your system supports /dev/poll */ +/* #undef HAVE_DEVPOLL */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DIRECT_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#define HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ +/* #undef HAVE_DOPRNT */ + +/* define if system uses EBCDIC instead of ASCII */ +/* #undef HAVE_EBCDIC */ + +/* Define to 1 if you have the `endgrent' function. */ +#define HAVE_ENDGRENT 1 + +/* Define to 1 if you have the `endpwent' function. */ +#define HAVE_ENDPWENT 1 + +/* define if your system supports epoll */ +#define HAVE_EPOLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H 1 + +/* Define to 1 if you have the `fcntl' function. */ +#define HAVE_FCNTL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* define if you actually have FreeBSD fetch(3) */ +/* #undef HAVE_FETCH */ + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_FILIO_H */ + +/* Define to 1 if you have the `flock' function. */ +#define HAVE_FLOCK 1 + +/* Define to 1 if you have the `fstat' function. */ +#define HAVE_FSTAT 1 + +/* Define to 1 if you have the `gai_strerror' function. */ +#define HAVE_GAI_STRERROR 1 + +/* Define to 1 if you have the `getaddrinfo' function. */ +#define HAVE_GETADDRINFO 1 + +/* Define to 1 if you have the `getdtablesize' function. */ +#define HAVE_GETDTABLESIZE 1 + +/* Define to 1 if you have the `geteuid' function. */ +#define HAVE_GETEUID 1 + +/* Define to 1 if you have the `getgrgid' function. */ +#define HAVE_GETGRGID 1 + +/* Define to 1 if you have the `gethostbyaddr_r' function. */ +#define HAVE_GETHOSTBYADDR_R 1 + +/* Define to 1 if you have the `gethostbyname_r' function. */ +#define HAVE_GETHOSTBYNAME_R 1 + +/* Define to 1 if you have the `gethostname' function. */ +#define HAVE_GETHOSTNAME 1 + +/* Define to 1 if you have the `getnameinfo' function. */ +#define HAVE_GETNAMEINFO 1 + +/* Define to 1 if you have the `getopt' function. */ +#define HAVE_GETOPT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_GETOPT_H 1 + +/* Define to 1 if you have the `getpassphrase' function. */ +/* #undef HAVE_GETPASSPHRASE */ + +/* Define to 1 if you have the `getpeereid' function. */ +/* #undef HAVE_GETPEEREID */ + +/* Define to 1 if you have the `getpeerucred' function. */ +/* #undef HAVE_GETPEERUCRED */ + +/* Define to 1 if you have the `getpwnam' function. */ +#define HAVE_GETPWNAM 1 + +/* Define to 1 if you have the `getpwuid' function. */ +#define HAVE_GETPWUID 1 + +/* Define to 1 if you have the `getspnam' function. */ +#define HAVE_GETSPNAM 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GMP_H */ + +/* Define to 1 if you have the `gmtime_r' function. 
*/ +#define HAVE_GMTIME_R 1 + +/* define if you have GNUtls */ +/* #undef HAVE_GNUTLS */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GNUTLS_GNUTLS_H */ + +/* if you have GNU Pth */ +/* #undef HAVE_GNU_PTH */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define to 1 if you have the `hstrerror' function. */ +#define HAVE_HSTRERROR 1 + +/* define to you inet_aton(3) is available */ +#define HAVE_INET_ATON 1 + +/* Define to 1 if you have the `inet_ntoa_b' function. */ +/* #undef HAVE_INET_NTOA_B */ + +/* Define to 1 if you have the `inet_ntop' function. */ +#define HAVE_INET_NTOP 1 + +/* Define to 1 if you have the `initgroups' function. */ +#define HAVE_INITGROUPS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `ioctl' function. */ +#define HAVE_IOCTL 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IO_H */ + +/* define if your system supports kqueue */ +/* #undef HAVE_KQUEUE */ + +/* Define to 1 if you have the `gen' library (-lgen). */ +/* #undef HAVE_LIBGEN */ + +/* Define to 1 if you have the `gmp' library (-lgmp). */ +/* #undef HAVE_LIBGMP */ + +/* Define to 1 if you have the `inet' library (-linet). */ +/* #undef HAVE_LIBINET */ + +/* define if you have libtool -ltdl */ +/* #undef HAVE_LIBLTDL */ + +/* Define to 1 if you have the `net' library (-lnet). */ +/* #undef HAVE_LIBNET */ + +/* Define to 1 if you have the `nsl' library (-lnsl). */ +/* #undef HAVE_LIBNSL */ + +/* Define to 1 if you have the `nsl_s' library (-lnsl_s). */ +/* #undef HAVE_LIBNSL_S */ + +/* Define to 1 if you have the `socket' library (-lsocket). */ +/* #undef HAVE_LIBSOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define to 1 if you have the `V3' library (-lV3). */ +/* #undef HAVE_LIBV3 */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_LIMITS_H 1 + +/* if you have LinuxThreads */ +/* #undef HAVE_LINUX_THREADS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if you have the `localtime_r' function. */ +#define HAVE_LOCALTIME_R 1 + +/* Define to 1 if you have the `lockf' function. */ +#define HAVE_LOCKF 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LTDL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `memcpy' function. */ +#define HAVE_MEMCPY 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memrchr' function. */ +#define HAVE_MEMRCHR 1 + +/* Define to 1 if you have the `mkstemp' function. */ +#define HAVE_MKSTEMP 1 + +/* Define to 1 if you have the `mktemp' function. */ +#define HAVE_MKTEMP 1 + +/* define this if you have mkversion */ +#define HAVE_MKVERSION 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +/* #undef HAVE_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H 1 + +/* define if strerror_r returns char* instead of int */ +/* #undef HAVE_NONPOSIX_STRERROR_R */ + +/* if you have NT Event Log */ +/* #undef HAVE_NT_EVENT_LOG */ + +/* if you have NT Service Manager */ +/* #undef HAVE_NT_SERVICE_MANAGER */ + +/* if you have NT Threads */ +/* #undef HAVE_NT_THREADS */ + +/* define if you have OpenSSL */ +#define HAVE_OPENSSL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_BN_H 1 + +/* define if you have OpenSSL with CRL checking capability */ +#define HAVE_OPENSSL_CRL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_CRYPTO_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_OPENSSL_SSL_H 1 + +/* Define to 1 if you have the `pipe' function. */ +#define HAVE_PIPE 1 + +/* Define to 1 if you have the `poll' function. */ +#define HAVE_POLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PROCESS_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PSAP_H */ + +/* define to pthreads API spec revision */ +#define HAVE_PTHREADS 10 + +/* define if you have pthread_detach function */ +#define HAVE_PTHREAD_DETACH 1 + +/* Define to 1 if you have the `pthread_getconcurrency' function. */ +#define HAVE_PTHREAD_GETCONCURRENCY 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_kill' function. */ +#define HAVE_PTHREAD_KILL 1 + +/* Define to 1 if you have the `pthread_kill_other_threads_np' function. */ +/* #undef HAVE_PTHREAD_KILL_OTHER_THREADS_NP */ + +/* define if you have pthread_rwlock_destroy function */ +#define HAVE_PTHREAD_RWLOCK_DESTROY 1 + +/* Define to 1 if you have the `pthread_setconcurrency' function. */ +#define HAVE_PTHREAD_SETCONCURRENCY 1 + +/* Define to 1 if you have the `pthread_yield' function. */ +#define HAVE_PTHREAD_YIELD 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PTH_H */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PWD_H 1 + +/* Define to 1 if you have the `read' function. */ +#define HAVE_READ 1 + +/* Define to 1 if you have the `recv' function. */ +#define HAVE_RECV 1 + +/* Define to 1 if you have the `recvfrom' function. */ +#define HAVE_RECVFROM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_REGEX_H 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_RESOLV_H */ + +/* define if you have res_query() */ +/* #undef HAVE_RES_QUERY */ + +/* define if OpenSSL needs RSAref */ +/* #undef HAVE_RSAREF */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_SASL_H */ + +/* define if your SASL library has sasl_version() */ +/* #undef HAVE_SASL_VERSION */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the `sched_yield' function. */ +#define HAVE_SCHED_YIELD 1 + +/* Define to 1 if you have the `send' function. */ +#define HAVE_SEND 1 + +/* Define to 1 if you have the `sendmsg' function. */ +#define HAVE_SENDMSG 1 + +/* Define to 1 if you have the `sendto' function. */ +#define HAVE_SENDTO 1 + +/* Define to 1 if you have the `setegid' function. */ +#define HAVE_SETEGID 1 + +/* Define to 1 if you have the `seteuid' function. */ +#define HAVE_SETEUID 1 + +/* Define to 1 if you have the `setgid' function. */ +#define HAVE_SETGID 1 + +/* Define to 1 if you have the `setpwfile' function. */ +/* #undef HAVE_SETPWFILE */ + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the `setuid' function. */ +#define HAVE_SETUID 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SGTTY_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SHADOW_H */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `signal' function. */ +#define HAVE_SIGNAL 1 + +/* Define to 1 if you have the `sigset' function. */ +#define HAVE_SIGSET 1 + +/* define if you have -lslp */ +/* #undef HAVE_SLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SLP_H */ + +/* Define to 1 if you have the `snprintf' function. 
*/ +#define HAVE_SNPRINTF 1 + +/* if you have spawnlp() */ +/* #undef HAVE_SPAWNLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQLEXT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strdup' function. */ +#define HAVE_STRDUP 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the `strerror_r' function. */ +#define HAVE_STRERROR_R 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strpbrk' function. */ +#define HAVE_STRPBRK 1 + +/* Define to 1 if you have the `strrchr' function. */ +#define HAVE_STRRCHR 1 + +/* Define to 1 if you have the `strsep' function. */ +#define HAVE_STRSEP 1 + +/* Define to 1 if you have the `strspn' function. */ +#define HAVE_STRSPN 1 + +/* Define to 1 if you have the `strstr' function. */ +#define HAVE_STRSTR 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoll' function. */ +#define HAVE_STRTOLL 1 + +/* Define to 1 if you have the `strtoq' function. */ +#define HAVE_STRTOQ 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `strtoull' function. */ +#define HAVE_STRTOULL 1 + +/* Define to 1 if you have the `strtouq' function. */ +#define HAVE_STRTOUQ 1 + +/* Define to 1 if `msg_accrightslen' is a member of `struct msghdr'. 
*/ +/* #undef HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTSLEN */ + +/* Define to 1 if `msg_control' is a member of `struct msghdr'. */ +#define HAVE_STRUCT_MSGHDR_MSG_CONTROL 1 + +/* Define to 1 if `pw_gecos' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_GECOS 1 + +/* Define to 1 if `pw_passwd' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_PASSWD 1 + +/* Define to 1 if `st_blksize' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLKSIZE 1 + +/* Define to 1 if `st_fstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE */ + +/* define to 1 if st_fstype is char * */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_CHAR */ + +/* define to 1 if st_fstype is int */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_INT */ + +/* Define to 1 if `st_vfstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_VFSTYPE */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYNCH_H */ + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSEXITS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_DEVPOLL_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_DIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_EPOLL_H 1 + +/* define if you actually have sys_errlist in your libs */ +#define HAVE_SYS_ERRLIST 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_ERRNO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_EVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILE_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_FILIO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_FSTYP_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PRIVGRP_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_UCRED_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_UUID_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_VMOUNT_H */ + +/* Define to 1 if you have that is POSIX.1 compatible. */ +#define HAVE_SYS_WAIT_H 1 + +/* define if you have -lwrap */ +/* #undef HAVE_TCPD */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TCPD_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* if you have Solaris LWP (thr) package */ +/* #undef HAVE_THR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_THREAD_H */ + +/* Define to 1 if you have the `thr_getconcurrency' function. */ +/* #undef HAVE_THR_GETCONCURRENCY */ + +/* Define to 1 if you have the `thr_setconcurrency' function. 
*/ +/* #undef HAVE_THR_SETCONCURRENCY */ + +/* Define to 1 if you have the `thr_yield' function. */ +/* #undef HAVE_THR_YIELD */ + +/* define if you have TLS */ +#define HAVE_TLS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UTIME_H 1 + +/* define if you have uuid_generate() */ +/* #undef HAVE_UUID_GENERATE */ + +/* define if you have uuid_to_str() */ +/* #undef HAVE_UUID_TO_STR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UUID_UUID_H */ + +/* Define to 1 if you have the `vprintf' function. */ +#define HAVE_VPRINTF 1 + +/* Define to 1 if you have the `vsnprintf' function. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 if you have the `wait4' function. */ +#define HAVE_WAIT4 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* define if you have winsock */ +/* #undef HAVE_WINSOCK */ + +/* define if you have winsock2 */ +/* #undef HAVE_WINSOCK2 */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK2_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WIREDTIGER_H */ + +/* Define to 1 if you have the `write' function. */ +#define HAVE_WRITE 1 + +/* define if select implicitly yields */ +#define HAVE_YIELDING_SELECT 1 + +/* Define to 1 if you have the `_vsnprintf' function. 
*/ +/* #undef HAVE__VSNPRINTF */ + +/* define to 32-bit or greater integer type */ +#define LBER_INT_T int + +/* define to large integer type */ +#define LBER_LEN_T long + +/* define to socket descriptor type */ +#define LBER_SOCKET_T int + +/* define to large integer type */ +#define LBER_TAG_T long + +/* define to 1 if library is thread safe */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* define to LDAP VENDOR VERSION */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +/* define this to add debugging code */ +/* #undef LDAP_DEBUG */ + +/* define if LDAP libs are dynamic */ +/* #undef LDAP_LIBS_DYNAMIC */ + +/* define to support PF_INET6 */ +#define LDAP_PF_INET6 1 + +/* define to support PF_LOCAL */ +#define LDAP_PF_LOCAL 1 + +/* define this to add SLAPI code */ +/* #undef LDAP_SLAPI */ + +/* define this to add syslog code */ +/* #undef LDAP_SYSLOG */ + +/* Version */ +#define LDAP_VENDOR_VERSION 20501 + +/* Major */ +#define LDAP_VENDOR_VERSION_MAJOR 2 + +/* Minor */ +#define LDAP_VENDOR_VERSION_MINOR 5 + +/* Patch */ +#define LDAP_VENDOR_VERSION_PATCH X + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* define if memcmp is not 8-bit clean or is otherwise broken */ +/* #undef NEED_MEMCMP_REPLACEMENT */ + +/* define if you have (or want) no threads */ +/* #undef NO_THREADS */ + +/* define to use the original debug style */ +/* #undef OLD_DEBUG */ + +/* Package */ +#define OPENLDAP_PACKAGE "OpenLDAP" + +/* Version */ +#define OPENLDAP_VERSION "2.5.X" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "" + +/* Define to the home page for this package. 
*/ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "" + +/* define if sched_yield yields the entire process */ +/* #undef REPLACE_BROKEN_YIELD */ + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* Define to the type of arg 1 for `select'. */ +#define SELECT_TYPE_ARG1 int + +/* Define to the type of args 2, 3 and 4 for `select'. */ +#define SELECT_TYPE_ARG234 (fd_set *) + +/* Define to the type of arg 5 for `select'. */ +#define SELECT_TYPE_ARG5 (struct timeval *) + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `wchar_t', as computed by sizeof. */ +#define SIZEOF_WCHAR_T 4 + +/* define to support per-object ACIs */ +/* #undef SLAPD_ACI_ENABLED */ + +/* define to support LDAP Async Metadirectory backend */ +/* #undef SLAPD_ASYNCMETA */ + +/* define to support cleartext passwords */ +/* #undef SLAPD_CLEARTEXT */ + +/* define to support crypt(3) passwords */ +/* #undef SLAPD_CRYPT */ + +/* define to support DNS SRV backend */ +/* #undef SLAPD_DNSSRV */ + +/* define to support LDAP backend */ +/* #undef SLAPD_LDAP */ + +/* define to support MDB backend */ +/* #undef SLAPD_MDB */ + +/* define to support LDAP Metadirectory backend */ +/* #undef SLAPD_META */ + +/* define to support modules */ +/* #undef SLAPD_MODULES */ + +/* dynamically linked module */ +#define SLAPD_MOD_DYNAMIC 2 + +/* statically linked module */ +#define SLAPD_MOD_STATIC 1 + +/* define to support cn=Monitor backend */ +/* #undef SLAPD_MONITOR */ + +/* define to support NDB backend */ +/* #undef SLAPD_NDB */ + +/* define to support NULL backend */ +/* #undef SLAPD_NULL */ + +/* define for In-Directory Access Logging overlay */ 
+/* #undef SLAPD_OVER_ACCESSLOG */ + +/* define for Audit Logging overlay */ +/* #undef SLAPD_OVER_AUDITLOG */ + +/* define for Automatic Certificate Authority overlay */ +/* #undef SLAPD_OVER_AUTOCA */ + +/* define for Collect overlay */ +/* #undef SLAPD_OVER_COLLECT */ + +/* define for Attribute Constraint overlay */ +/* #undef SLAPD_OVER_CONSTRAINT */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DDS */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DEREF */ + +/* define for Dynamic Group overlay */ +/* #undef SLAPD_OVER_DYNGROUP */ + +/* define for Dynamic List overlay */ +/* #undef SLAPD_OVER_DYNLIST */ + +/* define for Reverse Group Membership overlay */ +/* #undef SLAPD_OVER_MEMBEROF */ + +/* define for Password Policy overlay */ +/* #undef SLAPD_OVER_PPOLICY */ + +/* define for Proxy Cache overlay */ +/* #undef SLAPD_OVER_PROXYCACHE */ + +/* define for Referential Integrity overlay */ +/* #undef SLAPD_OVER_REFINT */ + +/* define for Return Code overlay */ +/* #undef SLAPD_OVER_RETCODE */ + +/* define for Rewrite/Remap overlay */ +/* #undef SLAPD_OVER_RWM */ + +/* define for Sequential Modify overlay */ +/* #undef SLAPD_OVER_SEQMOD */ + +/* define for ServerSideSort/VLV overlay */ +/* #undef SLAPD_OVER_SSSVLV */ + +/* define for Syncrepl Provider overlay */ +/* #undef SLAPD_OVER_SYNCPROV */ + +/* define for Translucent Proxy overlay */ +/* #undef SLAPD_OVER_TRANSLUCENT */ + +/* define for Attribute Uniqueness overlay */ +/* #undef SLAPD_OVER_UNIQUE */ + +/* define for Value Sorting overlay */ +/* #undef SLAPD_OVER_VALSORT */ + +/* define to support PASSWD backend */ +/* #undef SLAPD_PASSWD */ + +/* define to support PERL backend */ +/* #undef SLAPD_PERL */ + +/* define to support relay backend */ +/* #undef SLAPD_RELAY */ + +/* define to support reverse lookups */ +/* #undef SLAPD_RLOOKUPS */ + +/* define to support SHELL backend */ +/* #undef SLAPD_SHELL */ + +/* define to support SOCK backend */ 
+/* #undef SLAPD_SOCK */ + +/* define to support SASL passwords */ +/* #undef SLAPD_SPASSWD */ + +/* define to support SQL backend */ +/* #undef SLAPD_SQL */ + +/* define to support WiredTiger backend */ +/* #undef SLAPD_WT */ + +/* define to support run-time loadable ACL */ +/* #undef SLAP_DYNACL */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Define to 1 if your declares `struct tm'. */ +/* #undef TM_IN_SYS_TIME */ + +/* set to urandom device */ +#define URANDOM_DEVICE "/dev/urandom" + +/* define to use OpenSSL BIGNUM for MP */ +/* #undef USE_MP_BIGNUM */ + +/* define to use GMP for MP */ +/* #undef USE_MP_GMP */ + +/* define to use 'long' for MP */ +/* #undef USE_MP_LONG */ + +/* define to use 'long long' for MP */ +/* #undef USE_MP_LONG_LONG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to the type of arg 3 for `accept'. */ +#define ber_socklen_t socklen_t + +/* Define to `char *' if does not define. */ +/* #undef caddr_t */ + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define to `int' if does not define. */ +/* #undef mode_t */ + +/* Define to `long' if does not define. */ +/* #undef off_t */ + +/* Define to `int' if does not define. */ +/* #undef pid_t */ + +/* Define to `int' if does not define. */ +/* #undef sig_atomic_t */ + +/* Define to `unsigned' if does not define. */ +/* #undef size_t */ + +/* define to snprintf routine */ +/* #undef snprintf */ + +/* Define like ber_socklen_t if does not define. 
*/ +/* #undef socklen_t */ + +/* Define to `signed int' if does not define. */ +/* #undef ssize_t */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + +/* define as empty if volatile is not supported */ +/* #undef volatile */ + +/* define to snprintf routine */ +/* #undef vsnprintf */ + + +/* begin of portable.h.post */ + +#ifdef _WIN32 +/* don't suck in all of the win32 api */ +# define WIN32_LEAN_AND_MEAN 1 +#endif + +#ifndef LDAP_NEEDS_PROTOTYPES +/* force LDAP_P to always include prototypes */ +#define LDAP_NEEDS_PROTOTYPES 1 +#endif + +#ifndef LDAP_REL_ENG +#if (LDAP_VENDOR_VERSION == 000000) && !defined(LDAP_DEVEL) +#define LDAP_DEVEL +#endif +#if defined(LDAP_DEVEL) && !defined(LDAP_TEST) +#define LDAP_TEST +#endif +#endif + +#ifdef HAVE_STDDEF_H +# include +#endif + +#ifdef HAVE_EBCDIC +/* ASCII/EBCDIC converting replacements for stdio funcs + * vsnprintf and snprintf are used too, but they are already + * checked by the configure script + */ +#define fputs ber_pvt_fputs +#define fgets ber_pvt_fgets +#define printf ber_pvt_printf +#define fprintf ber_pvt_fprintf +#define vfprintf ber_pvt_vfprintf +#define vsprintf ber_pvt_vsprintf +#endif + +#include "ac/fdset.h" + +#include "ldap_cdefs.h" +#include "ldap_features.h" + +#include "ac/assert.h" +#include "ac/localize.h" + +#endif /* _LDAP_PORTABLE_H */ +/* end of portable.h.post */ + diff --git a/debian/changelog b/debian/changelog index 23d63b41099..be77dfdefe9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (21.4.1.1) unstable; urgency=low +clickhouse (21.5.1.1) unstable; urgency=low * Modified source code - -- clickhouse-release Sat, 06 Mar 2021 14:43:27 +0300 + -- clickhouse-release Fri, 02 Apr 2021 18:34:26 +0300 diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index d9cd68254b7..2efba9735ae 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb 
https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.4.1.* +ARG version=21.5.1.* RUN apt-get update \ && apt-get install --yes --no-install-recommends \ diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index d22516eab0a..05ca29f22d4 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:20.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.4.1.* +ARG version=21.5.1.* ARG gosu_ver=1.10 # set non-empty deb_location_url url to create a docker image diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index e727d2a3ecf..976c46ebe27 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.4.1.* +ARG version=21.5.1.* RUN apt-get update && \ apt-get install -y apt-transport-https dirmngr && \ diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 64be52d8e30..2864f7fc4da 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -1,7 +1,7 @@ # docker build -t yandex/clickhouse-fasttest . 
FROM ubuntu:20.04 -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=10 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 RUN apt-get update \ && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ @@ -43,20 +43,20 @@ RUN apt-get update \ clang-tidy-${LLVM_VERSION} \ cmake \ curl \ - lsof \ expect \ fakeroot \ - git \ gdb \ + git \ gperf \ lld-${LLVM_VERSION} \ llvm-${LLVM_VERSION} \ + lsof \ moreutils \ ninja-build \ psmisc \ python3 \ - python3-pip \ python3-lxml \ + python3-pip \ python3-requests \ python3-termcolor \ rename \ diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index c8bfce3848d..c21a115289d 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -8,6 +8,9 @@ trap 'kill $(jobs -pr) ||:' EXIT # that we can run the "everything else" stage from the cloned source. stage=${stage:-} +# Compiler version, normally set by Dockerfile +export LLVM_VERSION=${LLVM_VERSION:-11} + # A variable to pass additional flags to CMake. # Here we explicitly default it to nothing so that bash doesn't complain about # it being undefined. 
Also read it as array so that we can pass an empty list @@ -124,22 +127,26 @@ continue function clone_root { - git clone https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt" + git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt" ( cd "$FASTTEST_SOURCE" if [ "$PULL_REQUEST_NUMBER" != "0" ]; then - if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then + if git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then git checkout FETCH_HEAD - echo 'Clonned merge head' + echo "Checked out pull/$PULL_REQUEST_NUMBER/merge ($(git rev-parse FETCH_HEAD))" else - git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/head" + git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/head" git checkout "$COMMIT_SHA" - echo 'Checked out to commit' + echo "Checked out nominal SHA $COMMIT_SHA for PR $PULL_REQUEST_NUMBER" fi else if [ -v COMMIT_SHA ]; then + git fetch --depth 1 origin "$COMMIT_SHA" git checkout "$COMMIT_SHA" + echo "Checked out nominal SHA $COMMIT_SHA for master" + else + echo "Using default repository head $(git rev-parse HEAD)" fi fi ) @@ -181,7 +188,7 @@ function clone_submodules ) git submodule sync - git submodule update --init --recursive "${SUBMODULES_TO_UPDATE[@]}" + git submodule update --depth 1 --init --recursive "${SUBMODULES_TO_UPDATE[@]}" git submodule foreach git reset --hard git submodule foreach git checkout @ -f git submodule foreach git clean -xfd @@ -215,7 +222,7 @@ function run_cmake ( cd "$FASTTEST_BUILD" - cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt" + cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER="clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="clang-${LLVM_VERSION}" 
"${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt" ) } @@ -223,7 +230,7 @@ function build { ( cd "$FASTTEST_BUILD" - time ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt" + time ninja clickhouse-bundle 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt" if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse" fi @@ -420,7 +427,7 @@ case "$stage" in # See the compatibility hacks in `clone_root` stage above. Remove at the same time, # after Nov 1, 2020. cd "$FASTTEST_WORKSPACE" - clone_submodules | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt" + clone_submodules 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt" ;& "run_cmake") run_cmake @@ -431,7 +438,7 @@ case "$stage" in "configure") # The `install_log.txt` is also needed for compatibility with old CI task -- # if there is no log, it will decide that build failed. - configure | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt" + configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt" ;& "run_tests") run_tests diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 852c6415d13..20132eafb75 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -74,12 +74,17 @@ function run_tests() ADDITIONAL_OPTIONS+=('--order=random') ADDITIONAL_OPTIONS+=('--skip') ADDITIONAL_OPTIONS+=('00000_no_tests_to_skip') - ADDITIONAL_OPTIONS+=('--jobs') - ADDITIONAL_OPTIONS+=('4') + # Note that flaky check must be ran in parallel, but for now we run + # everything in parallel except DatabaseReplicated. See below. fi if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then ADDITIONAL_OPTIONS+=('--replicated-database') + else + # Too many tests fail for DatabaseReplicated in parallel. 
All other + # configurations are OK. + ADDITIONAL_OPTIONS+=('--jobs') + ADDITIONAL_OPTIONS+=('8') fi clickhouse-test --testname --shard --zookeeper --hung-check --print-time \ diff --git a/docs/README.md b/docs/README.md index 8b3066501bf..a4df023a6ad 100644 --- a/docs/README.md +++ b/docs/README.md @@ -126,7 +126,13 @@ Contribute all new information in English language. Other languages are translat ### Adding a New File -When adding a new file: +When you add a new file, it should end with a link like: + +`[Original article](https://clickhouse.tech/docs/) ` + +and there should be **a new empty line** after it. + +{## When adding a new file: - Make symbolic links for all other languages. You can use the following commands: @@ -134,7 +140,7 @@ When adding a new file: $ cd /ClickHouse/clone/directory/docs $ ln -sr en/new/file.md lang/new/file.md ``` - +##} ### Adding a New Language @@ -195,8 +201,11 @@ Templates: - [Function](_description_templates/template-function.md) - [Setting](_description_templates/template-setting.md) +- [Server Setting](_description_templates/template-server-setting.md) - [Database or Table engine](_description_templates/template-engine.md) - [System table](_description_templates/template-system-table.md) +- [Data type](_description_templates/data-type.md) +- [Statement](_description_templates/statement.md) diff --git a/docs/en/development/build-osx.md b/docs/en/development/build-osx.md index e0b1be710f1..886e85bbf86 100644 --- a/docs/en/development/build-osx.md +++ b/docs/en/development/build-osx.md @@ -5,43 +5,77 @@ toc_title: Build on Mac OS X # How to Build ClickHouse on Mac OS X {#how-to-build-clickhouse-on-mac-os-x} -Build should work on Mac OS X 10.15 (Catalina). +Build should work on x86_64 (Intel) based macOS 10.15 (Catalina) and higher with recent Xcode's native AppleClang, or Homebrew's vanilla Clang or GCC compilers. 
## Install Homebrew {#install-homebrew}

``` bash
-$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+$ /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
```

+## Install Xcode and Command Line Tools {#install-xcode-and-command-line-tools}
+
+Install the latest [Xcode](https://apps.apple.com/am/app/xcode/id497799835?mt=12) from App Store.
+
+Open it at least once to accept the end-user license agreement and automatically install the required components.
+
+Then, make sure that the latest Command Line Tools are installed and selected in the system:
+
+``` bash
+$ sudo rm -rf /Library/Developer/CommandLineTools
+$ sudo xcode-select --install
+```
+
+Reboot.
+
 ## Install Required Compilers, Tools, and Libraries {#install-required-compilers-tools-and-libraries}
 
 ``` bash
-$ brew install cmake ninja libtool gettext llvm
+$ brew update
+$ brew install cmake ninja libtool gettext llvm gcc
 ```
 
 ## Checkout ClickHouse Sources {#checkout-clickhouse-sources}
 
 ``` bash
-$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git
-```
-
-or
-
-``` bash
-$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git
-
-$ cd ClickHouse
+$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git # or https://github.com/ClickHouse/ClickHouse.git
 ```
 
 ## Build ClickHouse {#build-clickhouse}
 
-> Please note: ClickHouse doesn't support build with native Apple Clang compiler, we need use clang from LLVM.
+To build using Xcode's native AppleClang compiler:
 
 ``` bash
+$ cd ClickHouse
+$ rm -rf build
 $ mkdir build
 $ cd build
-$ cmake .. -DCMAKE_C_COMPILER=`brew --prefix llvm`/bin/clang -DCMAKE_CXX_COMPILER=`brew --prefix llvm`/bin/clang++ -DCMAKE_PREFIX_PATH=`brew --prefix llvm`
-$ ninja
+$ cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_JEMALLOC=OFF ..
+$ cmake --build . --config RelWithDebInfo
+$ cd ..
+```
+
+To build using Homebrew's vanilla Clang compiler:
+
+``` bash
+$ cd ClickHouse
+$ rm -rf build
+$ mkdir build
+$ cd build
+$ cmake -DCMAKE_C_COMPILER=$(brew --prefix llvm)/bin/clang -DCMAKE_CXX_COMPILER=$(brew --prefix llvm)/bin/clang++ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_JEMALLOC=OFF ..
+$ cmake --build . --config RelWithDebInfo
+$ cd ..
+```
+
+To build using Homebrew's vanilla GCC compiler:
+
+``` bash
+$ cd ClickHouse
+$ rm -rf build
+$ mkdir build
+$ cd build
+$ cmake -DCMAKE_C_COMPILER=$(brew --prefix gcc)/bin/gcc-10 -DCMAKE_CXX_COMPILER=$(brew --prefix gcc)/bin/g++-10 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_JEMALLOC=OFF ..
+$ cmake --build . --config RelWithDebInfo
 $ cd ..
 ```
 
diff --git a/docs/en/engines/database-engines/index.md b/docs/en/engines/database-engines/index.md
index 2db11998483..b6892099378 100644
--- a/docs/en/engines/database-engines/index.md
+++ b/docs/en/engines/database-engines/index.md
@@ -18,4 +18,8 @@ You can also use the following database engines:
 
 - [Lazy](../../engines/database-engines/lazy.md)
 
+- [Atomic](../../engines/database-engines/atomic.md)
+
+- [PostgreSQL](../../engines/database-engines/postgresql.md)
+
 [Original article](https://clickhouse.tech/docs/en/database_engines/)
diff --git a/docs/en/engines/database-engines/postgresql.md b/docs/en/engines/database-engines/postgresql.md
new file mode 100644
index 00000000000..1fa86b7ac21
--- /dev/null
+++ b/docs/en/engines/database-engines/postgresql.md
@@ -0,0 +1,138 @@
+---
+toc_priority: 35
+toc_title: PostgreSQL
+---
+
+# PostgreSQL {#postgresql}
+
+Allows to connect to databases on a remote [PostgreSQL](https://www.postgresql.org) server. Supports read and write operations (`SELECT` and `INSERT` queries) to exchange data between ClickHouse and PostgreSQL.
+
+Gives the real-time access to table list and table structure from remote PostgreSQL with the help of `SHOW TABLES` and `DESCRIBE TABLE` queries.
+
+Supports table structure modifications (`ALTER TABLE ... 
ADD|DROP COLUMN`). If `use_table_cache` parameter (see the Engine Parameters below) is set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries.
+
+## Creating a Database {#creating-a-database}
+
+``` sql
+CREATE DATABASE test_database
+ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]);
+```
+
+**Engine Parameters**
+
+- `host:port` — PostgreSQL server address.
+- `database` — Remote database name.
+- `user` — PostgreSQL user.
+- `password` — User password.
+- `use_table_cache` — Defines if the database table structure is cached or not. Optional. Default value: `0`.
+
+## Data Types Support {#data_types-support}
+
+| PostgreSQL       | ClickHouse                                                   |
+|------------------|--------------------------------------------------------------|
+| DATE             | [Date](../../sql-reference/data-types/date.md)               |
+| TIMESTAMP        | [DateTime](../../sql-reference/data-types/datetime.md)       |
+| REAL             | [Float32](../../sql-reference/data-types/float.md)           |
+| DOUBLE           | [Float64](../../sql-reference/data-types/float.md)           |
+| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md)         |
+| SMALLINT         | [Int16](../../sql-reference/data-types/int-uint.md)          |
+| INTEGER          | [Int32](../../sql-reference/data-types/int-uint.md)          |
+| BIGINT           | [Int64](../../sql-reference/data-types/int-uint.md)          |
+| SERIAL           | [UInt32](../../sql-reference/data-types/int-uint.md)         |
+| BIGSERIAL        | [UInt64](../../sql-reference/data-types/int-uint.md)         |
+| TEXT, CHAR       | [String](../../sql-reference/data-types/string.md)           |
+| INTEGER          | Nullable([Int32](../../sql-reference/data-types/int-uint.md))|
+| ARRAY            | [Array](../../sql-reference/data-types/array.md)             |
+
+
+## Examples of Use {#examples-of-use}
+
+Database in ClickHouse, exchanging data with the PostgreSQL server:
+
+``` sql
+CREATE DATABASE test_database
+ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1);
+```
+
+``` sql
+SHOW 
DATABASES; +``` + +``` text +┌─name──────────┐ +│ default │ +│ test_database │ +│ system │ +└───────────────┘ +``` + +``` sql +SHOW TABLES FROM test_database; +``` + +``` text +┌─name───────┐ +│ test_table │ +└────────────┘ +``` + +Reading data from the PostgreSQL table: + +``` sql +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +└────┴───────┘ +``` + +Writing data to the PostgreSQL table: + +``` sql +INSERT INTO test_database.test_table VALUES (3,4); +SELECT * FROM test_database.test_table; +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +Consider the table structure was modified in PostgreSQL: + +``` sql +postgre> ALTER TABLE test_table ADD COLUMN data Text +``` + +As the `use_table_cache` parameter was set to `1` when the database was created, the table structure in ClickHouse was cached and therefore not modified: + +``` sql +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +└────────┴───────────────────┘ +``` + +After detaching the table and attaching it again, the structure was updated: + +``` sql +DETACH TABLE test_database.test_table; +ATTACH TABLE test_database.test_table; +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +│ data │ Nullable(String) │ +└────────┴───────────────────┘ +``` + +[Original article](https://clickhouse.tech/docs/en/database-engines/postgresql/) diff --git a/docs/en/engines/table-engines/index.md b/docs/en/engines/table-engines/index.md index e60cdf3c899..eb4fc583f88 100644 --- a/docs/en/engines/table-engines/index.md +++ b/docs/en/engines/table-engines/index.md @@ -47,12 +47,17 @@ Engines for communicating with other data storage and processing systems. 
Engines in the family: -- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) -- [MySQL](../../engines/table-engines/integrations/mysql.md#mysql) -- [ODBC](../../engines/table-engines/integrations/odbc.md#table-engine-odbc) -- [JDBC](../../engines/table-engines/integrations/jdbc.md#table-engine-jdbc) -- [HDFS](../../engines/table-engines/integrations/hdfs.md#hdfs) -- [S3](../../engines/table-engines/integrations/s3.md#table-engine-s3) + +- [ODBC](../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../engines/table-engines/integrations/hdfs.md) +- [S3](../../engines/table-engines/integrations/s3.md) +- [Kafka](../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) ### Special Engines {#special-engines} diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index e9e069933e5..88c8973eeab 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 9 toc_title: EmbeddedRocksDB --- diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 0782efe8e72..cf4bb5ecbf7 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +toc_priority: 6 toc_title: HDFS --- diff --git a/docs/en/engines/table-engines/integrations/index.md 
b/docs/en/engines/table-engines/integrations/index.md index 28f38375448..eb1c5411e18 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -1,6 +1,6 @@ --- toc_folder_title: Integrations -toc_priority: 30 +toc_priority: 1 --- # Table Engines for Integrations {#table-engines-for-integrations} @@ -19,5 +19,3 @@ List of supported integrations: - [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md) - [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) - [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md) - -[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/) diff --git a/docs/en/engines/table-engines/integrations/jdbc.md b/docs/en/engines/table-engines/integrations/jdbc.md index edbc5d3ed3e..82efb842ae7 100644 --- a/docs/en/engines/table-engines/integrations/jdbc.md +++ b/docs/en/engines/table-engines/integrations/jdbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 2 +toc_priority: 3 toc_title: JDBC --- diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index 0ec50094a27..2eebf5bdb92 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -1,5 +1,5 @@ --- -toc_priority: 5 +toc_priority: 8 toc_title: Kafka --- diff --git a/docs/en/engines/table-engines/integrations/mongodb.md b/docs/en/engines/table-engines/integrations/mongodb.md index 2fee27ce80d..a378ab03f55 100644 --- a/docs/en/engines/table-engines/integrations/mongodb.md +++ b/docs/en/engines/table-engines/integrations/mongodb.md @@ -1,5 +1,5 @@ --- -toc_priority: 7 +toc_priority: 5 toc_title: MongoDB --- diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index 8b7caa12c91..3847e7a9e0e 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md 
+++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -1,5 +1,5 @@ --- -toc_priority: 3 +toc_priority: 4 toc_title: MySQL --- diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index 99efd870088..26bfb6aeb0d 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 1 +toc_priority: 2 toc_title: ODBC --- diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 8326038407f..ad5bebb3dea 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -1,11 +1,11 @@ --- -toc_priority: 8 +toc_priority: 11 toc_title: PostgreSQL --- # PostgreSQL {#postgresql} -The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server. +The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data that is stored on a remote PostgreSQL server. ## Creating a Table {#creating-a-table} @@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], ... -) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'); +) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); ``` See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. @@ -29,25 +29,51 @@ The table structure can differ from the original PostgreSQL table structure: **Engine Parameters** - `host:port` — PostgreSQL server address. - - `database` — Remote database name. - - `table` — Remote table name. - - `user` — PostgreSQL user. - - `password` — User password. 
+- `schema` — Non-default table schema. Optional. -SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. +## Implementation Details {#implementation-details} -Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server. +`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. + +Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server. All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. -INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. +`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. -PostgreSQL Array types converts into ClickHouse arrays. -Be careful in PostgreSQL an array data created like a type_name[] may contain multi-dimensional arrays of different dimensions in different table rows in same column, but in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. +PostgreSQL `Array` types are converted into ClickHouse arrays. + +!!! info "Note" + Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. + +Replicas priority for PostgreSQL dictionary source is supported. 
The bigger the number in map, the less the priority. The highest priority is `0`. + +In the example below replica `example01-1` has the highest priority: + +```xml + + 5432 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 2 + + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` ## Usage Example {#usage-example} @@ -64,10 +90,10 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | @@ -87,20 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre ``` ``` sql -SELECT * FROM postgresql_table WHERE str IN ('test') +SELECT * FROM postgresql_table WHERE str IN ('test'); ``` ``` text ┌─float_nullable─┬─str──┬─int_id─┐ │ ᴺᵁᴸᴸ │ test │ 1 │ └────────────────┴──────┴────────┘ -1 rows in set. Elapsed: 0.019 sec. ``` +Using Non-default Schema: -## See Also {#see-also} +```text +postgres=# CREATE SCHEMA "nice.schema"; -- [The ‘postgresql’ table function](../../../sql-reference/table-functions/postgresql.md) +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**See Also** + +- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md) - [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/postgresql/) diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index 476192d3969..5fb9ce5b151 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ 
b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 10 toc_title: RabbitMQ --- diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 03340f2d8c9..3d02aa13812 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +toc_priority: 7 toc_title: S3 --- diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 89fcbafe663..0b45488ebf7 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -502,7 +502,15 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa ## max_concurrent_queries {#max-concurrent-queries} -The maximum number of simultaneously processed requests. +The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — Disabled. **Example** @@ -530,6 +538,21 @@ Default value: `0` that means no limit. - [max_concurrent_queries](#max-concurrent-queries) +## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries} + +The minimal number of marks read by the query for applying the [max_concurrent_queries](#max-concurrent-queries) setting. + +Possible values: + +- Positive integer. +- 0 — Disabled. 
+ +**Example** + +``` xml +10 +``` + ## max_connections {#max-connections} The maximum number of inbound connections. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 4da31b44b57..a5c3902f8f2 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1914,7 +1914,7 @@ Default value: `0`. Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key. -By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. +By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. Possible values: diff --git a/docs/en/operations/system-tables/replication_queue.md b/docs/en/operations/system-tables/replication_queue.md index d1c74a771c6..f3e3a35f13b 100644 --- a/docs/en/operations/system-tables/replication_queue.md +++ b/docs/en/operations/system-tables/replication_queue.md @@ -14,7 +14,17 @@ Columns: - `node_name` ([String](../../sql-reference/data-types/string.md)) — Node name in ZooKeeper. -- `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue: `GET_PARTS`, `MERGE_PARTS`, `DETACH_PARTS`, `DROP_PARTS`, or `MUTATE_PARTS`. +- `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue, one of: + - `GET_PART` - Get the part from another replica. + - `ATTACH_PART` - Attach the part, possibly from our own replica (if found in `detached` folder). 
+ You may think of it as a `GET_PART` with some optimisations as they're nearly identical. + - `MERGE_PARTS` - Merge the parts. + - `DROP_RANGE` - Delete the parts in the specified partition in the specified number range. + - `CLEAR_COLUMN` - NOTE: Deprecated. Drop specific column from specified partition. + - `CLEAR_INDEX` - NOTE: Deprecated. Drop specific index from specified partition. + - `REPLACE_RANGE` - Drop certain range of partitions and replace them by new ones + - `MUTATE_PART` - Apply one or several mutations to the part. + - `ALTER_METADATA` - Apply alter modification according to global /metadata and /columns paths - `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution. diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index b7129725820..dc0b6e17198 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -69,6 +69,8 @@ Types of sources (`source_type`): - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - [Redis](#dicts-external_dicts_dict_sources-redis) + - [Cassandra](#dicts-external_dicts_dict_sources-cassandra) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) ## Local File {#dicts-external_dicts_dict_sources-local_file} diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 5e1d9d4ba23..499376a70d4 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -245,7 +245,7 @@ Elements set to `NULL` are handled as normal values. 
Returns the number of elements in the arr array for which func returns something other than 0. If ‘func’ is not specified, it returns the number of non-zero elements in the array. -Note that the `arrayCount` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. +Note that the `arrayCount` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. ## countEqual(arr, x) {#countequalarr-x} @@ -1229,7 +1229,7 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, └────────────────────────────────────┘ ``` -Note that the `arrayReverseFilter` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. +Note that the `arrayReverseFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. ## arraySplit(func, arr1, …) {#array-split} @@ -1293,7 +1293,7 @@ Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference ## arrayMin {#array-min} -Returns the minimum of elements in the source array. +Returns the minimum of elements in the source array. If the `func` function is specified, returns the mininum of elements converted by this function. @@ -1312,9 +1312,9 @@ arrayMin([func,] arr) **Returned value** -- The minimum of function values (or the array minimum). +- The minimum of function values (or the array minimum). -Type: if `func` is specified, matches `func` return value type, else matches the array elements type. +Type: if `func` is specified, matches `func` return value type, else matches the array elements type. 
**Examples** @@ -1348,7 +1348,7 @@ Result: ## arrayMax {#array-max} -Returns the maximum of elements in the source array. +Returns the maximum of elements in the source array. If the `func` function is specified, returns the maximum of elements converted by this function. @@ -1367,9 +1367,9 @@ arrayMax([func,] arr) **Returned value** -- The maximum of function values (or the array maximum). +- The maximum of function values (or the array maximum). -Type: if `func` is specified, matches `func` return value type, else matches the array elements type. +Type: if `func` is specified, matches `func` return value type, else matches the array elements type. **Examples** @@ -1403,7 +1403,7 @@ Result: ## arraySum {#array-sum} -Returns the sum of elements in the source array. +Returns the sum of elements in the source array. If the `func` function is specified, returns the sum of elements converted by this function. @@ -1418,7 +1418,7 @@ arraySum([func,] arr) **Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). **Returned value** @@ -1458,7 +1458,7 @@ Result: ## arrayAvg {#array-avg} -Returns the average of elements in the source array. +Returns the average of elements in the source array. If the `func` function is specified, returns the average of elements converted by this function. @@ -1473,7 +1473,7 @@ arrayAvg([func,] arr) **Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). 
**Returned value** diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 31d09e48e01..e07f28c0f24 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -250,3 +250,53 @@ Result: └───────────────┘ ``` +## bitHammingDistance {#bithammingdistance} + +Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between the bit representations of two integer values. Can be used with [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) functions for detection of semi-duplicate strings. The smaller is the distance, the more likely those strings are the same. + +**Syntax** + +``` sql +bitHammingDistance(int1, int2) +``` + +**Arguments** + +- `int1` — First integer value. [Int64](../../sql-reference/data-types/int-uint.md). +- `int2` — Second integer value. [Int64](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- The Hamming distance. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Examples** + +Query: + +``` sql +SELECT bitHammingDistance(111, 121); +``` + +Result: + +``` text +┌─bitHammingDistance(111, 121)─┐ +│ 3 │ +└──────────────────────────────┘ +``` + +With [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash): + +``` sql +SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat')); +``` + +Result: + +``` text +┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐ +│ 5 │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 945ede4927f..c60067b06af 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -7,6 +7,8 @@ toc_title: Hash Hash functions can be used for the deterministic pseudo-random shuffling of elements. +Simhash is a hash function, which returns close hash values for close (similar) arguments. + ## halfMD5 {#hash-functions-halfmd5} [Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. @@ -482,3 +484,938 @@ Result: - [xxHash](http://cyan4973.github.io/xxHash/). +## ngramSimHash {#ngramsimhash} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHash(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHash('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1627567969 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashCaseInsensitive(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
+ +**Example** + +Query: + +``` sql +SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌──────Hash─┐ +│ 562180645 │ +└───────────┘ +``` + +## ngramSimHashUTF8 {#ngramsimhashutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashUTF8(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHashUTF8('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1628157797 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashCaseInsensitiveUTF8(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. 
Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1636742693 │ +└────────────┘ +``` + +## wordShingleSimHash {#wordshinglesimhash} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHash(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashCaseInsensitive(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## wordShingleSimHashUTF8 {#wordshinglesimhashutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashUTF8(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## ngramMinHash {#ngramminhash} + +Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. 
+ +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHash(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHash('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,9054248444481805918) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive} + +Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. 
+ +**Syntax** + +``` sql +ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (2106263556442004574,13203602793651726206) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashUTF8 {#ngramminhashutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHashUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. 
[UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,6742163577938632877) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple───────────────────────────────────────┐ +│ (12493625717655877135,13203602793651726206) │ +└─────────────────────────────────────────────┘ +``` + +## ngramMinHashArg {#ngramminhasharg} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHash](#ngramminhash) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +ngramMinHashArg(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArg('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgUTF8 {#ngramminhashargutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashUTF8](#ngramminhashutf8) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +ngramMinHashArgUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHash {#wordshingleminhash} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHash(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashUTF8 {#wordshingleminhashutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashArg {#wordshingleminhasharg} + +Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordshingleMinHash](#wordshingleminhash) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +wordShingleMinHashArg(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive} + +Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashUTF8](#wordshingleminhashutf8) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 884e1ef754f..86442835425 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -111,4 +111,55 @@ Result: - [Tuple](../../sql-reference/data-types/tuple.md) -[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-functions/) +## tupleHammingDistance {#tuplehammingdistance} + +Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between two tuples of the same size. + +**Syntax** + +``` sql +tupleHammingDistance(tuple1, tuple2) +``` + +**Arguments** + +- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). + +Tuples should have the same type of the elements. + +**Returned value** + +- The Hamming distance. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Examples** + +Query: + +``` sql +SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance; +``` + +Result: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` + +Can be used with [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) functions for detection of semi-duplicate strings: + +``` sql +SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string); +``` + +Result: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 42396223b86..f7183ba525c 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -40,7 +40,7 @@ Read about setting the partition expression in a section [How to specify the par After the query is executed, you can do whatever you want with the data in the `detached` directory — delete it from the file system, or just leave it. -This query is replicated – it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replica. +This query is replicated – it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. 
Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replicas (as multiple leaders are allowed). ## DROP PARTITION\|PART {#alter_drop-partition} @@ -85,9 +85,15 @@ ALTER TABLE visits ATTACH PART 201901_2_2_0; Read more about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr). -This query is replicated. The replica-initiator checks whether there is data in the `detached` directory. If data exists, the query checks its integrity. If everything is correct, the query adds the data to the table. All other replicas download the data from the replica-initiator. +This query is replicated. The replica-initiator checks whether there is data in the `detached` directory. +If data exists, the query checks its integrity. If everything is correct, the query adds the data to the table. -So you can put data to the `detached` directory on one replica, and use the `ALTER ... ATTACH` query to add it to the table on all replicas. +If the non-initiator replica, receiving the attach command, finds the part with the correct checksums in its own +`detached` folder, it attaches the data without fetching it from other replicas. +If there is no part with the correct checksums, the data is downloaded from any replica having the part. + +You can put data to the `detached` directory on one replica and use the `ALTER ... ATTACH` query to add it to the +table on all replicas. ## ATTACH PARTITION FROM {#alter_attach-partition-from} @@ -95,7 +101,8 @@ So you can put data to the `detached` directory on one replica, and use the `ALT ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1 ``` -This query copies the data partition from the `table1` to `table2` adds data to exsisting in the `table2`. Note that data won’t be deleted from `table1`. +This query copies the data partition from the `table1` to `table2`. 
+Note that data won't be deleted from either `table1` or `table2`. For the query to run successfully, the following conditions must be met: diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 725024efe0c..2348a2a2668 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -264,6 +264,10 @@ Wait until a `ReplicatedMergeTree` table will be synced with other replicas in a SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name ``` +After running this statement the `[db.]replicated_merge_tree_family_table_name` fetches commands from +the common replicated log into its own replication queue, and then the query waits till the replica processes all +of the fetched commands. + ### RESTART REPLICA {#query_language-system-restart-replica} Provides possibility to reinitialize Zookeeper sessions state for `ReplicatedMergeTree` table, will compare current state with Zookeeper as source of true and add tasks to Zookeeper queue if needed diff --git a/docs/en/sql-reference/table-functions/index.md b/docs/en/sql-reference/table-functions/index.md index fef30c04c9d..d65a18ab985 100644 --- a/docs/en/sql-reference/table-functions/index.md +++ b/docs/en/sql-reference/table-functions/index.md @@ -21,16 +21,18 @@ You can use table functions in: !!! warning "Warning" You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled. -| Function | Description | -|-----------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| -| [file](../../sql-reference/table-functions/file.md) | Creates a File-engine table. | -| [merge](../../sql-reference/table-functions/merge.md) | Creates a Merge-engine table. 
| -| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. | -| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a Distributed-engine table. | -| [url](../../sql-reference/table-functions/url.md) | Creates a URL-engine table. | -| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a MySQL-engine table. | -| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a PostgreSQL-engine table. | -| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a JDBC-engine table. | -| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a ODBC-engine table. | -| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a HDFS-engine table. | -| [s3](../../sql-reference/table-functions/s3.md) | Creates a S3-engine table. | +| Function | Description | +|------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| +| [file](../../sql-reference/table-functions/file.md) | Creates a [File](../../engines/table-engines/special/file.md)-engine table. | +| [merge](../../sql-reference/table-functions/merge.md) | Creates a [Merge](../../engines/table-engines/special/merge.md)-engine table. | +| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. | +| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md)-engine table. | +| [url](../../sql-reference/table-functions/url.md) | Creates a [Url](../../engines/table-engines/special/url.md)-engine table. 
| +| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a [MySQL](../../engines/table-engines/integrations/mysql.md)-engine table. | +| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)-engine table. | +| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a [JDBC](../../engines/table-engines/integrations/jdbc.md)-engine table. | +| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a [ODBC](../../engines/table-engines/integrations/odbc.md)-engine table. | +| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a [HDFS](../../engines/table-engines/integrations/hdfs.md)-engine table. | +| [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. | + +[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/) diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md index ad5d8a29904..bfb5fdf9be6 100644 --- a/docs/en/sql-reference/table-functions/postgresql.md +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -10,33 +10,17 @@ Allows `SELECT` and `INSERT` queries to be performed on data that is stored on a **Syntax** ``` sql -postgresql('host:port', 'database', 'table', 'user', 'password') +postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) ``` **Arguments** - `host:port` — PostgreSQL server address. - - `database` — Remote database name. - - `table` — Remote table name. - - `user` — PostgreSQL user. - - `password` — User password. - - -SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. - -Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server. 
- -All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. - -INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. - -PostgreSQL Array types converts into ClickHouse arrays. - -Be careful in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. +- `schema` — Non-default table schema. Optional. **Returned Value** @@ -45,6 +29,23 @@ A table object with the same columns as the original PostgreSQL table. !!! info "Note" In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. +## Implementation Details {#implementation-details} + +`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. + +Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server. + +All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. + +`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. + +PostgreSQL Array types converts into ClickHouse arrays. + +!!! info "Note" + Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. 
+ +Supports replicas priority for PostgreSQL dictionary source. The bigger the number in map, the less the priority. The highest priority is `0`. + **Examples** Table in PostgreSQL: @@ -60,10 +61,10 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | @@ -96,9 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p └────────┴──────────────┴───────┴──────┴────────────────┘ ``` +Using Non-default Schema: + +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + **See Also** -- [The ‘PostgreSQL’ table engine](../../engines/table-engines/integrations/postgresql.md) +- [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md) - [Using PostgreSQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/postgresql/) diff --git a/docs/ru/engines/database-engines/index.md b/docs/ru/engines/database-engines/index.md index ec92edd2888..d4fad8f43a9 100644 --- a/docs/ru/engines/database-engines/index.md +++ b/docs/ru/engines/database-engines/index.md @@ -4,7 +4,7 @@ toc_priority: 27 toc_title: "Введение" --- -# Движки баз данных {#dvizhki-baz-dannykh} +# Движки 
баз данных {#database-engines} Движки баз данных обеспечивают работу с таблицами. @@ -18,3 +18,5 @@ toc_title: "Введение" - [Lazy](../../engines/database-engines/lazy.md) +- [PostgreSQL](../../engines/database-engines/postgresql.md) + diff --git a/docs/ru/engines/database-engines/postgresql.md b/docs/ru/engines/database-engines/postgresql.md new file mode 100644 index 00000000000..c11dab6f1aa --- /dev/null +++ b/docs/ru/engines/database-engines/postgresql.md @@ -0,0 +1,138 @@ +--- +toc_priority: 35 +toc_title: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +Позволяет подключаться к БД на удаленном сервере [PostgreSQL](https://www.postgresql.org). Поддерживает операции чтения и записи (запросы `SELECT` и `INSERT`) для обмена данными между ClickHouse и PostgreSQL. + +Позволяет в реальном времени получать от удаленного сервера PostgreSQL информацию о таблицах БД и их структуре с помощью запросов `SHOW TABLES` и `DESCRIBE TABLE`. + +Поддерживает операции изменения структуры таблиц (`ALTER TABLE ... ADD|DROP COLUMN`). Если параметр `use_table_cache` (см. ниже раздел Параметры движка) установлен в значение `1`, структура таблицы кешируется, и изменения в структуре не отслеживаются, но будут обновлены, если выполнить команды `DETACH` и `ATTACH`. + +## Создание БД {#creating-a-database} + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]); +``` + +**Параметры движка** + +- `host:port` — адрес сервера PostgreSQL. +- `database` — имя удаленной БД. +- `user` — пользователь PostgreSQL. +- `password` — пароль пользователя. +- `use_table_cache` — определяет кеширование структуры таблиц БД. Необязательный параметр. Значение по умолчанию: `0`. 
+ +## Поддерживаемые типы данных {#data_types-support} + +| PostgreSQL | ClickHouse | +|------------------|--------------------------------------------------------------| +| DATE | [Date](../../sql-reference/data-types/date.md) | +| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| REAL | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) | +| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | +| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) | +| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | +| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) | +| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) | +| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) | +| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))| +| ARRAY | [Array](../../sql-reference/data-types/array.md) | + + +## Примеры использования {#examples-of-use} + +Обмен данными между БД ClickHouse и сервером PostgreSQL: + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1); +``` + +``` sql +SHOW DATABASES; +``` + +``` text +┌─name──────────┐ +│ default │ +│ test_database │ +│ system │ +└───────────────┘ +``` + +``` sql +SHOW TABLES FROM test_database; +``` + +``` text +┌─name───────┐ +│ test_table │ +└────────────┘ +``` + +Чтение данных из таблицы PostgreSQL: + +``` sql +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +└────┴───────┘ +``` + +Запись данных в таблицу PostgreSQL: + +``` sql +INSERT INTO test_database.test_table VALUES (3,4); +SELECT * FROM test_database.test_table; +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +Пусть структура таблицы была изменена в PostgreSQL: + +``` sql 
+postgre> ALTER TABLE test_table ADD COLUMN data Text +``` + +Поскольку при создании БД параметр `use_table_cache` был установлен в значение `1`, структура таблицы в ClickHouse была кеширована и поэтому не изменилась: + +``` sql +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +└────────┴───────────────────┘ +``` + +После того как таблицу «отцепили» и затем снова «прицепили», структура обновилась: + +``` sql +DETACH TABLE test_database.test_table; +ATTACH TABLE test_database.test_table; +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +│ data │ Nullable(String) │ +└────────┴───────────────────┘ +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/database-engines/postgresql/) diff --git a/docs/ru/engines/table-engines/index.md b/docs/ru/engines/table-engines/index.md index 6c11011a307..a364a3cb972 100644 --- a/docs/ru/engines/table-engines/index.md +++ b/docs/ru/engines/table-engines/index.md @@ -16,7 +16,7 @@ toc_title: "Введение" - Возможно ли многопоточное выполнение запроса. - Параметры репликации данных. -## Семейства движков {#semeistva-dvizhkov} +## Семейства движков {#engine-families} ### MergeTree {#mergetree} @@ -42,18 +42,23 @@ toc_title: "Введение" - [StripeLog](log-family/stripelog.md#stripelog) - [Log](log-family/log.md#log) -### Движки для интеграции {#dvizhki-dlia-integratsii} +### Движки для интеграции {#integration-engines} Движки для связи с другими системами хранения и обработки данных. 
Движки семейства: -- [Kafka](integrations/kafka.md#kafka) -- [MySQL](integrations/mysql.md#mysql) -- [ODBC](integrations/odbc.md#table-engine-odbc) -- [JDBC](integrations/jdbc.md#table-engine-jdbc) +- [ODBC](../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../engines/table-engines/integrations/hdfs.md) +- [Kafka](../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) -### Специальные движки {#spetsialnye-dvizhki} +### Специальные движки {#special-engines} Движки семейства: diff --git a/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md index f66e789a392..5a7909f63b2 100644 --- a/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 9 toc_title: EmbeddedRocksDB --- diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 3d9cb388a01..b56bbfc0788 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +toc_priority: 6 toc_title: HDFS --- diff --git a/docs/ru/engines/table-engines/integrations/jdbc.md b/docs/ru/engines/table-engines/integrations/jdbc.md index e2db6fac0b2..fd7411a258e 100644 --- a/docs/ru/engines/table-engines/integrations/jdbc.md +++ b/docs/ru/engines/table-engines/integrations/jdbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 2 +toc_priority: 3 toc_title: JDBC --- diff --git 
a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index f053b80aebd..19e2850dd51 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -1,5 +1,5 @@ --- -toc_priority: 5 +toc_priority: 8 toc_title: Kafka --- diff --git a/docs/ru/engines/table-engines/integrations/mongodb.md b/docs/ru/engines/table-engines/integrations/mongodb.md index 5ab63494648..97f903bdf89 100644 --- a/docs/ru/engines/table-engines/integrations/mongodb.md +++ b/docs/ru/engines/table-engines/integrations/mongodb.md @@ -1,5 +1,5 @@ --- -toc_priority: 7 +toc_priority: 5 toc_title: MongoDB --- diff --git a/docs/ru/engines/table-engines/integrations/mysql.md b/docs/ru/engines/table-engines/integrations/mysql.md index 9152a57d122..5011c8a93c6 100644 --- a/docs/ru/engines/table-engines/integrations/mysql.md +++ b/docs/ru/engines/table-engines/integrations/mysql.md @@ -1,5 +1,5 @@ --- -toc_priority: 3 +toc_priority: 4 toc_title: MySQL --- diff --git a/docs/ru/engines/table-engines/integrations/odbc.md b/docs/ru/engines/table-engines/integrations/odbc.md index b2faa9b1e9e..669977ff531 100644 --- a/docs/ru/engines/table-engines/integrations/odbc.md +++ b/docs/ru/engines/table-engines/integrations/odbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 1 +toc_priority: 2 toc_title: ODBC --- diff --git a/docs/ru/engines/table-engines/integrations/postgresql.md b/docs/ru/engines/table-engines/integrations/postgresql.md index ecf431830f8..8964b1dbf02 100644 --- a/docs/ru/engines/table-engines/integrations/postgresql.md +++ b/docs/ru/engines/table-engines/integrations/postgresql.md @@ -1,11 +1,11 @@ --- -toc_priority: 8 +toc_priority: 11 toc_title: PostgreSQL --- -# PosgtreSQL {#postgresql} +# PostgreSQL {#postgresql} -Движок PostgreSQL позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере. 
+Движок PostgreSQL позволяет выполнять запросы `SELECT` и `INSERT` для таблиц на удаленном сервере PostgreSQL. ## Создание таблицы {#creating-a-table} @@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], ... -) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'); +) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); ``` Смотрите подробное описание запроса [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query). @@ -29,25 +29,51 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры движка** - `host:port` — адрес сервера PostgreSQL. - - `database` — Имя базы данных на сервере PostgreSQL. - - `table` — Имя таблицы. - - `user` — Имя пользователя PostgreSQL. - - `password` — Пароль пользователя PostgreSQL. +- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент. -SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса. +## Особенности реализации {#implementation-details} -Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера. +Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`. -Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился. +Простые условия для `WHERE`, такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN`, исполняются на стороне PostgreSQL сервера. -INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... 
fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса. +Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того, как запрос к PostgreSQL закончился. + +Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`. PostgreSQL массивы конвертируются в массивы ClickHouse. -Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. + +!!! info "Внимание" + Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. + +При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`. + +В примере ниже реплика `example01-1` имеет более высокий приоритет: + +```xml + + 5432 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 2 + + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` ## Пример использования {#usage-example} @@ -64,17 +90,17 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | (1 row) ``` -Таблица в ClickHouse, получение данных из PostgreSQL таблицы созданной выше: +Таблица в ClickHouse, получение данных из PostgreSQL таблицы, созданной выше: ``` sql CREATE TABLE default.postgresql_table @@ -87,19 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre ``` ``` sql -SELECT * FROM postgresql_table WHERE str IN ('test') +SELECT * FROM postgresql_table WHERE str IN ('test'); ``` ``` text ┌─float_nullable─┬─str──┬─int_id─┐ │ ᴺᵁᴸᴸ │ test │ 1 │ └────────────────┴──────┴────────┘ -1 rows in set. Elapsed: 0.019 sec. ``` +Using Non-default Schema: -## Смотри также {#see-also} +```text +postgres=# CREATE SCHEMA "nice.schema"; -- [Табличная функция ‘postgresql’](../../../sql-reference/table-functions/postgresql.md) -- [Использование PostgreSQL в качестве истояника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**См. 
также** + +- [Табличная функция `postgresql`](../../../sql-reference/table-functions/postgresql.md) +- [Использование PostgreSQL в качестве источника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/postgresql/) diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index b50347f6196..109146d27f4 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -481,7 +481,15 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ## max_concurrent_queries {#max-concurrent-queries} -Максимальное количество одновременно обрабатываемых запросов. +Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). + +!!! info "Примечание" + Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. + +Возможные значения: + +- Положительное целое число. +- 0 — выключена. **Пример** @@ -509,6 +517,21 @@ ClickHouse проверяет условия для `min_part_size` и `min_part - [max_concurrent_queries](#max-concurrent-queries) +## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries} + +Определяет минимальное количество засечек, считываемых запросом для применения настройки [max_concurrent_queries](#max-concurrent-queries). + +Возможные значения: + +- Положительное целое число. +- 0 — выключена. 
+ +**Пример** + +``` xml +10 +``` + ## max_connections {#max-connections} Максимальное количество входящих соединений. @@ -1159,4 +1182,3 @@ ClickHouse использует ZooKeeper для хранения метадан ``` - diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index f95dc6657b2..d10ac2ab317 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1792,6 +1792,19 @@ ClickHouse генерирует исключение - [Движок Distributed](../../engines/table-engines/special/distributed.md#distributed) - [Управление распределёнными таблицами](../../sql-reference/statements/system.md#query-language-system-distributed) +## insert_distributed_one_random_shard {#insert_distributed_one_random_shard} + +Включает или отключает режим вставки данных в [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицу в случайный шард при отсутствии ключа шардирования. + +По умолчанию при вставке данных в `Distributed` таблицу с несколькими шардами и при отсутствии ключа шардирования сервер ClickHouse будет отклонять любой запрос на вставку данных. Когда `insert_distributed_one_random_shard = 1`, вставки принимаются, а данные записываются в случайный шард. + +Возможные значения: + +- 0 — если у таблицы несколько шардов, но ключ шардирования отсутствует, вставка данных отклоняется. +- 1 — если ключ шардирования отсутствует, то вставка данных осуществляется в случайный шард среди всех доступных шардов. + +Значение по умолчанию: `0`. + ## insert_shard_id {#insert_shard_id} Если не `0`, указывает, в какой шард [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицы данные будут вставлены синхронно. 
diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index e3816e78547..a7999470330 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -69,6 +69,7 @@ SETTINGS(format_csv_allow_single_quotes = 0) - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - [Redis](#dicts-external_dicts_dict_sources-redis) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) ## Локальный файл {#dicts-external_dicts_dict_sources-local_file} diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 4538941a4a4..560795506a0 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1111,6 +1111,78 @@ SELECT Функция `arrayFilter` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. +## arrayFill(func, arr1, …) {#array-fill} + +Перебирает `arr1` от первого элемента к последнему и заменяет `arr1[i]` на `arr1[i - 1]`, если `func` вернула 0. Первый элемент `arr1` остаётся неизменным. + +Примеры: + +``` sql +SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res +``` + +``` text +┌─res──────────────────────────────┐ +│ [1,1,3,11,12,12,12,5,6,14,14,14] │ +└──────────────────────────────────┘ +``` + +Функция `arrayFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. 
+ +## arrayReverseFill(func, arr1, …) {#array-reverse-fill} + +Перебирает `arr1` от последнего элемента к первому и заменяет `arr1[i]` на `arr1[i + 1]`, если `func` вернула 0. Последний элемент `arr1` остаётся неизменным. + +Примеры: + +``` sql +SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res +``` + +``` text +┌─res────────────────────────────────┐ +│ [1,3,3,11,12,5,5,5,6,14,NULL,NULL] │ +└────────────────────────────────────┘ +``` + +Функция `arrayReverseFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. + +## arraySplit(func, arr1, …) {#array-split} + +Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в левую часть. Массив не разбивается по первому элементу. + +Примеры: + +``` sql +SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res +``` + +``` text +┌─res─────────────┐ +│ [[1,2,3],[4,5]] │ +└─────────────────┘ +``` + +Функция `arraySplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. + +## arrayReverseSplit(func, arr1, …) {#array-reverse-split} + +Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в правую часть. Массив не разбивается по последнему элементу. 
+ +Примеры: + +``` sql +SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res +``` + +``` text +┌─res───────────────┐ +│ [[1],[2,3,4],[5]] │ +└───────────────────┘ +``` + +Функция `arrayReverseSplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. + ## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} Возвращает 1, если существует хотя бы один элемент массива `arr`, для которого функция func возвращает не 0. Иначе возвращает 0. @@ -1137,7 +1209,7 @@ SELECT ## arrayMin {#array-min} -Возвращает значение минимального элемента в исходном массиве. +Возвращает значение минимального элемента в исходном массиве. Если передана функция `func`, возвращается минимум из элементов массива, преобразованных этой функцией. @@ -1192,7 +1264,7 @@ SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ## arrayMax {#array-max} -Возвращает значение максимального элемента в исходном массиве. +Возвращает значение максимального элемента в исходном массиве. Если передана функция `func`, возвращается максимум из элементов массива, преобразованных этой функцией. @@ -1247,7 +1319,7 @@ SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; ## arraySum {#array-sum} -Возвращает сумму элементов в исходном массиве. +Возвращает сумму элементов в исходном массиве. Если передана функция `func`, возвращается сумма элементов массива, преобразованных этой функцией. @@ -1262,7 +1334,7 @@ arraySum([func,] arr) **Аргументы** - `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — массив. [Array](../../sql-reference/data-types/array.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). **Возвращаемое значение** @@ -1302,7 +1374,7 @@ SELECT arraySum(x -> x*x, [2, 3]) AS res; ## arrayAvg {#array-avg} -Возвращает среднее значение элементов в исходном массиве. 
+Возвращает среднее значение элементов в исходном массиве. Если передана функция `func`, возвращается среднее значение элементов массива, преобразованных этой функцией. @@ -1317,7 +1389,7 @@ arrayAvg([func,] arr) **Аргументы** - `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — массив. [Array](../../sql-reference/data-types/array.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). **Возвращаемое значение** @@ -1355,7 +1427,7 @@ SELECT arrayAvg(x -> (x * x), [2, 4]) AS res; └─────┘ ``` -**Синтаксис** +**Синтаксис** ``` sql arraySum(arr) @@ -1367,7 +1439,7 @@ arraySum(arr) Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). -**Аргументы** +**Аргументы** - `arr` — [массив](../../sql-reference/data-types/array.md). diff --git a/docs/ru/sql-reference/functions/bit-functions.md b/docs/ru/sql-reference/functions/bit-functions.md index 09844685a6c..a5124e67235 100644 --- a/docs/ru/sql-reference/functions/bit-functions.md +++ b/docs/ru/sql-reference/functions/bit-functions.md @@ -240,3 +240,53 @@ SELECT bitCount(333); └───────────────┘ ``` +## bitHammingDistance {#bithammingdistance} + +Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между битовыми представлениями двух целых чисел. Может быть использовано с функциями [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) для проверки двух строк на схожесть. Чем меньше расстояние, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +bitHammingDistance(int1, int2) +``` + +**Аргументы** + +- `int1` — первое целое число. [Int64](../../sql-reference/data-types/int-uint.md). +- `int2` — второе целое число. [Int64](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Расстояние Хэмминга. 
+ +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Примеры** + +Запрос: + +``` sql +SELECT bitHammingDistance(111, 121); +``` + +Результат: + +``` text +┌─bitHammingDistance(111, 121)─┐ +│ 3 │ +└──────────────────────────────┘ +``` + +Используя [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash): + +``` sql +SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat')); +``` + +Результат: + +``` text +┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐ +│ 5 │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/hash-functions.md b/docs/ru/sql-reference/functions/hash-functions.md index 6797f530346..2efff9c3727 100644 --- a/docs/ru/sql-reference/functions/hash-functions.md +++ b/docs/ru/sql-reference/functions/hash-functions.md @@ -7,6 +7,8 @@ toc_title: "Функции хэширования" Функции хэширования могут использоваться для детерминированного псевдослучайного разбрасывания элементов. +Simhash – это хеш-функция, которая для близких значений возвращает близкий хеш. + ## halfMD5 {#hash-functions-halfmd5} [Интерпретирует](../../sql-reference/functions/hash-functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш [MD5](https://ru.wikipedia.org/wiki/MD5) для каждой из них. Затем объединяет хэши, берет первые 8 байт хэша результирующей строки и интерпретирует их как значение типа `UInt64` с big-endian порядком байтов. @@ -484,3 +486,937 @@ SELECT xxHash32('Hello, world!'); - [xxHash](http://cyan4973.github.io/xxHash/). +## ngramSimHash {#ngramsimhash} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHash(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHash('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1627567969 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashCaseInsensitive(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. 
+ +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌──────Hash─┐ +│ 562180645 │ +└───────────┘ +``` + +## ngramSimHashUTF8 {#ngramsimhashutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashUTF8(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashUTF8('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1628157797 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashCaseInsensitiveUTF8(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1636742693 │ +└────────────┘ +``` + +## wordShingleSimHash {#wordshinglesimhash} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHash(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. 
+ +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHashCaseInsensitive(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## wordShingleSimHashUTF8 {#wordshinglesimhashutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая. 
+ +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHashUTF8(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. 
+ +**Синтаксис** + +``` sql +wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## ngramMinHash {#ngramminhash} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHash(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. 
[UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHash('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,9054248444481805918) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. 
+ +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (2106263556442004574,13203602793651726206) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashUTF8 {#ngramminhashutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** +``` sql +ngramMinHashUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,6742163577938632877) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple───────────────────────────────────────┐ +│ (12493625717655877135,13203602793651726206) │ +└─────────────────────────────────────────────┘ +``` + +## ngramMinHashArg {#ngramminhasharg} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHash](#ngramminhash) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +ngramMinHashArg(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArg('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgUTF8 {#ngramminhashargutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashUTF8](#ngramminhashutf8) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHash {#wordshingleminhash} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHash(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashUTF8 {#wordshingleminhashutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashArg {#wordshingleminhasharg} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHash](#wordshingleminhash) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArg(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashUTF8](#wordshingleminhashutf8) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/tuple-functions.md b/docs/ru/sql-reference/functions/tuple-functions.md index a56eac27db2..381743a450b 100644 --- a/docs/ru/sql-reference/functions/tuple-functions.md +++ b/docs/ru/sql-reference/functions/tuple-functions.md @@ -111,3 +111,55 @@ SELECT untuple((* EXCEPT (v2, v3),)) FROM kv; - [Tuple](../../sql-reference/data-types/tuple.md) +## tupleHammingDistance {#tuplehammingdistance} + +Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между двумя кортежами одинакового размера. + +**Синтаксис** + +``` sql +tupleHammingDistance(tuple1, tuple2) +``` + +**Аргументы** + +- `tuple1` — первый кортеж. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple2` — второй кортеж. [Tuple](../../sql-reference/data-types/tuple.md). + +Кортежи должны иметь одинаковый размер и тип элементов. + +**Возвращаемое значение** + +- Расстояние Хэмминга. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance; +``` + +Результат: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` + +Может быть использовано с функциями [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) для проверки строк на совпадение: + +``` sql +SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string); +``` + +Результат: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` diff --git a/docs/ru/sql-reference/statements/alter/column.md b/docs/ru/sql-reference/statements/alter/column.md index f51859b46f6..87fc1c78cd0 100644 --- a/docs/ru/sql-reference/statements/alter/column.md +++ b/docs/ru/sql-reference/statements/alter/column.md @@ -117,7 +117,7 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] [AFTER name_after | F - TTL - Примеры изменения TTL столбца смотрите в разделе [TTL столбца](ttl.md#mergetree-column-ttl). + Примеры изменения TTL столбца смотрите в разделе [TTL столбца](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). Если указано `IF EXISTS`, запрос не возвращает ошибку, если столбца не существует. diff --git a/docs/ru/sql-reference/table-functions/postgresql.md b/docs/ru/sql-reference/table-functions/postgresql.md index a8ed23db8ed..66637276726 100644 --- a/docs/ru/sql-reference/table-functions/postgresql.md +++ b/docs/ru/sql-reference/table-functions/postgresql.md @@ -5,43 +5,46 @@ toc_title: postgresql # postgresql {#postgresql} -Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере. +Позволяет выполнять запросы `SELECT` и `INSERT` над таблицами удаленной БД PostgreSQL. 
**Синтаксис** + ``` sql -postgresql('host:port', 'database', 'table', 'user', 'password') +postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) ``` -**Параметры** +**Аргументы** - `host:port` — адрес сервера PostgreSQL. - - `database` — имя базы данных на удалённом сервере. - - `table` — имя таблицы на удалённом сервере. - - `user` — пользователь PostgreSQL. - - `password` — пароль пользователя. - - -SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса. - -Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера. - -Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился. - -INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса. - -PostgreSQL массивы конвертируются в массивы ClickHouse. -Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. +- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент. **Возвращаемое значение** -Объект таблицы с теми же столбцами, что и в исходной таблице PostgreSQL. +Таблица с теми же столбцами, что и в исходной таблице PostgreSQL. !!! info "Примечание" -В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. See examples below. 
+ В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже. + +## Особенности реализации {#implementation-details} + +Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`. + +Простые условия для `WHERE` такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN` исполняются на стороне PostgreSQL сервера. + +Все операции объединения, агрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился. + +Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`. + +PostgreSQL массивы конвертируются в массивы ClickHouse. + +!!! info "Примечание" + Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. + +При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`. 
**Примеры** @@ -58,10 +61,10 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | @@ -80,7 +83,7 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p └────────┴──────────────┴───────┴──────┴────────────────┘ ``` -Вставка: +Вставка данных: ```sql INSERT INTO TABLE FUNCTION postgresql('localhost:5432', 'test', 'test', 'postgrsql_user', 'password') (int_id, float) VALUES (2, 3); @@ -94,7 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p └────────┴──────────────┴───────┴──────┴────────────────┘ ``` -**Смотрите также** +Using Non-default Schema: -- [Движок таблиц ‘PostgreSQL’](../../sql-reference/table-functions/postgresql.md) +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**См. 
также** + +- [Движок таблиц PostgreSQL](../../sql-reference/table-functions/postgresql.md) - [Использование PostgreSQL как источника данных для внешнего словаря](../../sql-reference/table-functions/postgresql.md#dicts-external_dicts_dict_sources-postgresql) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/postgresql/) diff --git a/programs/server/CMakeLists.txt b/programs/server/CMakeLists.txt index 198d9081168..697851b294b 100644 --- a/programs/server/CMakeLists.txt +++ b/programs/server/CMakeLists.txt @@ -42,11 +42,16 @@ if (OS_LINUX) set(RESOURCE_OBJS ${RESOURCE_OBJS} ${RESOURCE_OBJ}) # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - add_custom_command(OUTPUT ${RESOURCE_OBJ} - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} - COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents - ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}) - + # PPC64LE fails to do this with objcopy, use ld or lld instead + if (ARCH_PPC64LE) + add_custom_command(OUTPUT ${RESOURCE_OBJ} + COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${RESOURCE_FILE}) + else() + add_custom_command(OUTPUT ${RESOURCE_OBJ} + COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} + COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents + ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}) + endif() set_source_files_properties(${RESOURCE_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) endforeach(RESOURCE_FILE) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp 
index a163ceba4a2..8455ef3117e 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -521,14 +521,17 @@ void HedgedConnections::processNewReplicaState(HedgedConnectionsFactory::State s void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) { + /// It's important to remove file descriptor from epoll exactly before cancelling packet_receiver, + /// because otherwise another thread can try to receive a packet, get this file descriptor + /// from epoll and resume cancelled packet_receiver. + epoll.remove(replica.packet_receiver->getFileDescriptor()); + epoll.remove(replica.change_replica_timeout.getDescriptor()); + replica.packet_receiver->cancel(); replica.change_replica_timeout.reset(); - epoll.remove(replica.packet_receiver->getFileDescriptor()); --offset_states[fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset].active_connection_count; fd_to_replica_location.erase(replica.packet_receiver->getFileDescriptor()); - - epoll.remove(replica.change_replica_timeout.getDescriptor()); timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor()); --active_connection_count; diff --git a/src/Common/HashTable/HashMap.h b/src/Common/HashTable/HashMap.h index 99dc5414107..c3cd09eccb2 100644 --- a/src/Common/HashTable/HashMap.h +++ b/src/Common/HashTable/HashMap.h @@ -48,7 +48,7 @@ struct HashMapCell value_type value; - HashMapCell() {} + HashMapCell() = default; HashMapCell(const Key & key_, const State &) : value(key_, NoInitTag()) {} HashMapCell(const value_type & value_, const State &) : value(value_) {} @@ -114,8 +114,39 @@ struct HashMapCell static void move(HashMapCell * /* old_location */, HashMapCell * /* new_location */) {} + template + auto & get() & { + if constexpr (I == 0) return value.first; + else if constexpr (I == 1) return value.second; + } + + template + auto const & get() const & { + if constexpr (I == 0) return value.first; + else if constexpr (I 
== 1) return value.second; + } + + template + auto && get() && { + if constexpr (I == 0) return std::move(value.first); + else if constexpr (I == 1) return std::move(value.second); + } + }; +namespace std +{ + + template + struct tuple_size> : std::integral_constant { }; + + template + struct tuple_element<0, HashMapCell> { using type = Key; }; + + template + struct tuple_element<1, HashMapCell> { using type = TMapped; }; +} + template struct HashMapCellWithSavedHash : public HashMapCell { @@ -227,6 +258,19 @@ public: } }; +namespace std +{ + + template + struct tuple_size> : std::integral_constant { }; + + template + struct tuple_element<0, HashMapCellWithSavedHash> { using type = Key; }; + + template + struct tuple_element<1, HashMapCellWithSavedHash> { using type = TMapped; }; +} + template < typename Key, diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 57ad3d46177..b1042332cfa 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -530,6 +530,31 @@ public: this->c_end += bytes_to_copy; } + template + void insertFromItself(iterator from_begin, iterator from_end, TAllocatorParams && ... 
allocator_params) + { + static_assert(memcpy_can_be_used_for_assignment, std::decay_t>); + + /// Convert iterators to indexes because reserve can invalidate iterators + size_t start_index = from_begin - begin(); + size_t end_index = from_end - begin(); + size_t copy_size = end_index - start_index; + + assert(start_index <= end_index); + + size_t required_capacity = this->size() + copy_size; + if (required_capacity > this->capacity()) + this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward(allocator_params)...); + + size_t bytes_to_copy = this->byte_size(copy_size); + if (bytes_to_copy) + { + auto begin = this->c_start + this->byte_size(start_index); + memcpy(this->c_end, reinterpret_cast(&*begin), bytes_to_copy); + this->c_end += bytes_to_copy; + } + } + template void insert_assume_reserved(It1 from_begin, It2 from_end) { diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index c4cf7f11e68..9e81cdddbda 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -35,7 +35,7 @@ std::string signalToErrorMessage(int sig, const siginfo_t & info, const ucontext else error << "Address: " << info.si_addr; -#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) +#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) && !defined(__powerpc__) auto err_mask = context.uc_mcontext.gregs[REG_ERR]; if ((err_mask & 0x02)) error << " Access: write."; @@ -186,6 +186,8 @@ static void * getCallerAddress(const ucontext_t & context) # endif #elif defined(__aarch64__) return reinterpret_cast(context.uc_mcontext.pc); +#elif defined(__powerpc64__) + return reinterpret_cast(context.uc_mcontext.gp_regs[PT_NIP]); #else return nullptr; #endif diff --git a/src/Common/tests/gtest_pod_array.cpp b/src/Common/tests/gtest_pod_array.cpp index 63cf7026757..9cc77b88195 100644 --- a/src/Common/tests/gtest_pod_array.cpp +++ b/src/Common/tests/gtest_pod_array.cpp @@ -33,6 +33,19 @@ 
TEST(Common, PODArrayInsert) EXPECT_EQ(str, std::string(chars.data(), chars.size())); } +TEST(Common, PODArrayInsertFromItself) +{ + { + PaddedPODArray array { 1 }; + + for (size_t i = 0; i < 3; ++i) + array.insertFromItself(array.begin(), array.end()); + + PaddedPODArray expected {1,1,1,1,1,1,1,1}; + ASSERT_EQ(array,expected); + } +} + TEST(Common, PODPushBackRawMany) { PODArray chars; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 0b47cf1f2f7..045433dc895 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -228,6 +228,7 @@ class IColumn; M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \ M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout", 0) \ M(Seconds, http_receive_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP receive timeout", 0) \ + M(UInt64, http_max_uri_size, 16384, "Maximum URI length of HTTP request", 0) \ M(Bool, optimize_throw_if_noop, false, "If setting is enabled and OPTIMIZE query didn't actually assign a merge then an explanatory exception is thrown", 0) \ M(Bool, use_index_for_in_with_subqueries, true, "Try using an index if there is a subquery or a table expression on the right side of the IN operator.", 0) \ M(Bool, joined_subquery_requires_alias, true, "Force joined subqueries and table functions to have aliases for correct name qualification.", 0) \ @@ -546,7 +547,7 @@ struct Settings : public BaseSettings { /// For initialization from empty initializer-list to be "value initialization", not "aggregate initialization" in C++14. /// http://en.cppreference.com/w/cpp/language/aggregate_initialization - Settings() {} + Settings() = default; /** Set multiple settings from "profile" (in server configuration file (users.xml), profiles contain groups of multiple settings). * The profile can also be set using the `set` functions, like the profile setting. 
diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index e0078da57b7..b4222a7e349 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -567,7 +567,7 @@ void DatabaseAtomic::renameDictionaryInMemoryUnlocked(const StorageID & old_name auto result = external_loader.getLoadResult(toString(old_name.uuid)); if (!result.object) return; - const auto & dict = dynamic_cast(*result.object); + const auto & dict = dynamic_cast(*result.object); dict.updateDictionaryName(new_name); } void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid) diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index d92f0f1897e..55b04f27c58 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -49,7 +49,7 @@ void DatabaseWithDictionaries::attachDictionary(const String & dictionary_name, /// Attach the dictionary as table too. try { - /// TODO Make StorageDictionary an owner of IDictionaryBase objects. + /// TODO Make StorageDictionary an owner of IDictionary objects. /// All DDL operations with dictionaries will work with StorageDictionary table, /// and StorageDictionary will be responsible for loading of DDL dictionaries. 
/// ExternalLoaderDatabaseConfigRepository and other hacks related to ExternalLoader diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index eedf4dd3d87..535e862af40 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -13,7 +13,9 @@ #include #include #include + #include +#include namespace ProfileEvents { @@ -39,7 +41,6 @@ namespace DB namespace ErrorCodes { extern const int CACHE_DICTIONARY_UPDATE_FAIL; - extern const int TYPE_MISMATCH; extern const int UNSUPPORTED_METHOD; } @@ -70,8 +71,6 @@ CacheDictionary::CacheDictionary( { if (!source_ptr->supportsSelectiveLoad()) throw Exception{full_name + ": source cannot be used with CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - setupHierarchicalAttribute(); } template @@ -120,164 +119,6 @@ const IDictionarySource * CacheDictionary::getSource() cons return source_ptr.get(); } -template -void CacheDictionary::toParent(const PaddedPODArray & ids [[maybe_unused]], PaddedPODArray & out [[maybe_unused]]) const -{ - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - { - /// Run update on requested keys before fetch from storage - const auto & attribute_name = hierarchical_attribute->name; - - auto result_type = std::make_shared(); - auto input_column = result_type->createColumn(); - auto & input_column_typed = assert_cast &>(*input_column); - auto & data = input_column_typed.getData(); - data.insert(ids.begin(), ids.end()); - - auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr}); - const auto & result_column_typed = assert_cast &>(*column); - const auto & result_data = result_column_typed.getData(); - - out.assign(result_data); - } - else - throw Exception("Hierarchy is not supported for complex key CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD); -} - - -/// Allow to use single value in same way as array. 
-static inline UInt64 getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline UInt64 getAt(const UInt64 & value, const size_t) -{ - return value; -} - -template -template -void CacheDictionary::isInImpl(const PaddedPODArray & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - /// Transform all children to parents until ancestor id or null_value will be reached. - - size_t out_size = out.size(); - memset(out.data(), 0xFF, out_size); /// 0xFF means "not calculated" - - const auto null_value = hierarchical_attribute->null_value.get(); - - PaddedPODArray children(out_size, 0); - PaddedPODArray parents(child_ids.begin(), child_ids.end()); - - for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - size_t out_idx = 0; - size_t parents_idx = 0; - size_t new_children_idx = 0; - - while (out_idx < out_size) - { - /// Already calculated - if (out[out_idx] != 0xFF) - { - ++out_idx; - continue; - } - - /// No parent - if (parents[parents_idx] == null_value) - { - out[out_idx] = 0; - } - /// Found ancestor - else if (parents[parents_idx] == getAt(ancestor_ids, parents_idx)) - { - out[out_idx] = 1; - } - /// Loop detected - else if (children[new_children_idx] == parents[parents_idx]) - { - out[out_idx] = 1; - } - /// Found intermediate parent, add this value to search at next loop iteration - else - { - children[new_children_idx] = parents[parents_idx]; - ++new_children_idx; - } - - ++out_idx; - ++parents_idx; - } - - if (new_children_idx == 0) - break; - - /// Transform all children to its parents. 
- children.resize(new_children_idx); - parents.resize(new_children_idx); - - toParent(children, parents); - } -} - -template -void CacheDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -template -void CacheDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const UInt64 ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -template -void CacheDictionary::isInConstantVector(const UInt64 child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - /// Special case with single child value. - - const auto null_value = hierarchical_attribute->null_value.get(); - - PaddedPODArray child(1, child_id); - PaddedPODArray parent(1); - std::vector ancestors(1, child_id); - - /// Iteratively find all ancestors for child. - for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - toParent(child, parent); - - if (parent[0] == null_value) - break; - - child[0] = parent[0]; - ancestors.push_back(parent[0]); - } - - /// Assuming short hierarchy, so linear search is Ok. 
- for (size_t i = 0, out_size = out.size(); i < out_size; ++i) - out[i] = std::find(ancestors.begin(), ancestors.end(), ancestor_ids[i]) != ancestors.end(); -} - -template -void CacheDictionary::setupHierarchicalAttribute() -{ - /// TODO: Move this to DictionaryStructure - for (const auto & attribute : dict_struct.attributes) - { - if (attribute.hierarchical) - { - hierarchical_attribute = &attribute; - - if (attribute.underlying_type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } - } -} - template ColumnPtr CacheDictionary::getColumn( const std::string & attribute_name, @@ -296,23 +137,6 @@ Columns CacheDictionary::getColumns( const Columns & key_columns, const DataTypes & key_types, const Columns & default_values_columns) const -{ - if (dictionary_key_type == DictionaryKeyType::complex) - dict_struct.validateKeyTypes(key_types); - - Arena complex_keys_arena; - DictionaryKeysExtractor extractor(key_columns, complex_keys_arena); - auto & keys = extractor.getKeys(); - - return getColumnsImpl(attribute_names, key_columns, keys, default_values_columns); -} - -template -Columns CacheDictionary::getColumnsImpl( - const Strings & attribute_names, - const Columns & key_columns, - const PaddedPODArray & keys, - const Columns & default_values_columns) const { /** * Flow of getColumsImpl @@ -328,6 +152,13 @@ Columns CacheDictionary::getColumnsImpl( * use default value. 
*/ + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + auto keys = extractor.extractAllKeys(); + DictionaryStorageFetchRequest request(dict_struct, attribute_names, default_values_columns); FetchResult result_of_fetch_from_storage; @@ -440,9 +271,10 @@ ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & k if (dictionary_key_type == DictionaryKeyType::complex) dict_struct.validateKeyTypes(key_types); - Arena complex_keys_arena; - DictionaryKeysExtractor extractor(key_columns, complex_keys_arena); - const auto & keys = extractor.getKeys(); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + const auto keys = extractor.extractAllKeys(); /// We make empty request just to fetch if keys exists DictionaryStorageFetchRequest request(dict_struct, {}, {}); @@ -526,6 +358,37 @@ ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & k return result; } +template +ColumnPtr CacheDictionary::getHierarchy( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + else + return nullptr; +} + +template +ColumnUInt8::Ptr CacheDictionary::isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + 
else + return nullptr; +} + template MutableColumns CacheDictionary::aggregateColumnsInOrderOfKeys( const PaddedPODArray & keys, @@ -618,19 +481,18 @@ MutableColumns CacheDictionary::aggregateColumns( template BlockInputStreamPtr CacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - std::shared_ptr stream; + std::shared_ptr stream; { /// Write lock on storage const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; if constexpr (dictionary_key_type == DictionaryKeyType::simple) - stream = std::make_shared(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names); + stream = std::make_shared(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names); else { auto keys = cache_storage_ptr->getCachedComplexKeys(); - stream = std::make_shared(shared_from_this(), max_block_size, keys, column_names); + stream = std::make_shared(shared_from_this(), max_block_size, keys, column_names); } } @@ -660,14 +522,20 @@ void CacheDictionary::update(CacheDictionaryUpdateUnitPtr requested_keys_extractor(update_unit_ptr->key_columns, update_unit_ptr->complex_key_arena); - const auto & requested_keys = requested_keys_extractor.getKeys(); + Arena * complex_key_arena = update_unit_ptr->complex_keys_arena_holder.getComplexKeyArena(); + DictionaryKeysExtractor requested_keys_extractor(update_unit_ptr->key_columns, complex_key_arena); + auto requested_keys = requested_keys_extractor.extractAllKeys(); HashSet not_found_keys; std::vector requested_keys_vector; std::vector requested_complex_key_rows; + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + requested_keys_vector.reserve(requested_keys.size()); + else + requested_complex_key_rows.reserve(requested_keys.size()); + auto & key_index_to_state_from_storage = update_unit_ptr->key_index_to_state; for (size_t i = 0; i < 
key_index_to_state_from_storage.size(); ++i) @@ -727,8 +595,8 @@ void CacheDictionary::update(CacheDictionaryUpdateUnitPtr keys_extractor(key_columns, update_unit_ptr->complex_key_arena); - const auto & keys_extracted_from_block = keys_extractor.getKeys(); + DictionaryKeysExtractor keys_extractor(key_columns, complex_key_arena); + auto keys_extracted_from_block = keys_extractor.extractAllKeys(); for (size_t index_of_attribute = 0; index_of_attribute < fetched_columns_during_update.size(); ++index_of_attribute) { @@ -740,6 +608,7 @@ void CacheDictionary::update(CacheDictionaryUpdateUnitPtrrequested_keys_to_fetched_columns_during_update_index[fetched_key_from_source] = found_keys_size; found_keys_in_source.emplace_back(fetched_key_from_source); diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index 1192db73737..62cd509d006 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -130,33 +130,18 @@ public: std::exception_ptr getLastException() const override; - bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && hierarchical_attribute; } + bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); } - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; - void isInVectorVector( - const PaddedPODArray & child_ids, - const PaddedPODArray & ancestor_ids, - PaddedPODArray & out) const override; - - void isInVectorConstant( - const PaddedPODArray & child_ids, - const UInt64 ancestor_id, PaddedPODArray & out) const override; - - void isInConstantVector( - const UInt64 child_id, - const PaddedPODArray & ancestor_ids, - PaddedPODArray & out) const override; + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const 
DataTypePtr & key_type) const override; private: using FetchResult = std::conditional_t; - Columns getColumnsImpl( - const Strings & attribute_names, - const Columns & key_columns, - const PaddedPODArray & keys, - const Columns & default_values_columns) const; - static MutableColumns aggregateColumnsInOrderOfKeys( const PaddedPODArray & keys, const DictionaryStorageFetchRequest & request, @@ -171,8 +156,6 @@ private: const MutableColumns & fetched_columns_during_update, const HashMap & found_keys_to_fetched_columns_during_update_index); - void setupHierarchicalAttribute(); - void update(CacheDictionaryUpdateUnitPtr update_unit_ptr); /// Update dictionary source pointer if required and return it. Thread safe. @@ -193,9 +176,6 @@ private: return source_ptr; } - template - void isInImpl(const PaddedPODArray & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - const DictionaryStructure dict_struct; /// Dictionary source should be used with mutex @@ -218,8 +198,6 @@ private: /// readers. Surprisingly this lock is also used for last_exception pointer. 
mutable std::shared_mutex rw_lock; - const DictionaryAttribute * hierarchical_attribute = nullptr; - mutable std::exception_ptr last_exception; mutable std::atomic error_count {0}; mutable std::atomic backoff_end_time{std::chrono::system_clock::time_point{}}; diff --git a/src/Dictionaries/CacheDictionaryUpdateQueue.h b/src/Dictionaries/CacheDictionaryUpdateQueue.h index 2e636af6db6..3d27a157752 100644 --- a/src/Dictionaries/CacheDictionaryUpdateQueue.h +++ b/src/Dictionaries/CacheDictionaryUpdateQueue.h @@ -66,8 +66,9 @@ public: HashMap requested_keys_to_fetched_columns_during_update_index; MutableColumns fetched_columns_during_update; + /// Complex keys are serialized in this arena - Arena complex_key_arena; + DictionaryKeysArenaHolder complex_keys_arena_holder; private: template diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.cpp b/src/Dictionaries/ComplexKeyHashedDictionary.cpp deleted file mode 100644 index 4086082e66d..00000000000 --- a/src/Dictionaries/ComplexKeyHashedDictionary.cpp +++ /dev/null @@ -1,594 +0,0 @@ -#include "ComplexKeyHashedDictionary.h" -#include -#include -#include -#include -#include -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" - -namespace DB -{ -namespace ErrorCodes -{ - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; - extern const int DICTIONARY_IS_EMPTY; -} - -ComplexKeyHashedDictionary::ComplexKeyHashedDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_) - : IDictionaryBase(dict_id_) - , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} - , dict_lifetime(dict_lifetime_) - , require_nonempty(require_nonempty_) - , saved_block{std::move(saved_block_)} -{ - createAttributes(); - loadData(); - calculateBytesAllocated(); -} - -ColumnPtr ComplexKeyHashedDictionary::getColumn( - const std::string & 
attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr & default_values_column) const -{ - dict_struct.validateKeyTypes(key_types); - - ColumnPtr result; - - const auto & attribute = getAttribute(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto keys_size = key_columns.front()->size(); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to = nullptr; - if (attribute.is_nullable) - { - col_null_map_to = ColumnUInt8::create(keys_size, false); - vec_null_map_to = &col_null_map_to->getData(); - } - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto attribute_null_value = std::get(attribute.null_values); - AttributeType null_value = static_cast(attribute_null_value); - DictionaryDefaultValueExtractor default_value_extractor(std::move(null_value), default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - auto * out = column.get(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const StringRef value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out->insertData(value.data, value.size); - }, - default_value_extractor); - } - else - { - auto & out = column->getData(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const auto value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out[row] = value; - }, - default_value_extractor); - } - - result = std::move(column); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - if (attribute.is_nullable) - { - result = 
ColumnNullable::create(result, std::move(col_null_map_to)); - } - - return result; -} - -ColumnUInt8::Ptr ComplexKeyHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const -{ - dict_struct.validateKeyTypes(key_types); - - auto size = key_columns.front()->size(); - auto result = ColumnUInt8::create(size); - auto& out = result->getData(); - - const auto & attribute = attributes.front(); - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - - has(attribute, key_columns, out); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -void ComplexKeyHashedDictionary::createAttributes() -{ - const auto size = dict_struct.attributes.size(); - attributes.reserve(size); - - for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) - throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; - } -} - -void ComplexKeyHashedDictionary::blockToAttributes(const Block & block) -{ - /// created upfront to avoid excess allocations - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - const auto rows = block.rows(); - element_count += rows; - - const auto key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto attribute_column_ptrs = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) - { - return block.safeGetByPosition(keys_size + attribute_idx).column; - }); - - for (const auto row_idx : ext::range(0, rows)) - { - /// 
calculate key once per row - const auto key = placeKeysInPool(row_idx, key_column_ptrs, keys, keys_pool); - - auto should_rollback = false; - - for (const auto attribute_idx : ext::range(0, attributes_size)) - { - const auto & attribute_column = *attribute_column_ptrs[attribute_idx]; - auto & attribute = attributes[attribute_idx]; - const auto inserted = setAttributeValue(attribute, key, attribute_column[row_idx]); - if (!inserted) - should_rollback = true; - } - - /// @note on multiple equal keys the mapped value for the first one is stored - if (should_rollback) - keys_pool.rollback(key.size); - } -} - -void ComplexKeyHashedDictionary::updateData() -{ - /// created upfront to avoid excess allocations - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - - if (!saved_block || saved_block->rows() == 0) - { - auto stream = source_ptr->loadUpdatedAll(); - stream->readPrefix(); - - while (const auto block = stream->read()) - { - /// We are using this method to keep saved data if input stream consists of multiple blocks - if (!saved_block) - saved_block = std::make_shared(block.cloneEmpty()); - for (const auto attribute_idx : ext::range(0, keys_size + attributes_size)) - { - const IColumn & update_column = *block.getByPosition(attribute_idx).column.get(); - MutableColumnPtr saved_column = saved_block->getByPosition(attribute_idx).column->assumeMutable(); - saved_column->insertRangeFrom(update_column, 0, update_column.size()); - } - } - stream->readSuffix(); - } - else - { - auto stream = source_ptr->loadUpdatedAll(); - - stream->readPrefix(); - while (Block block = stream->read()) - { - const auto saved_key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t key_idx) { return saved_block->safeGetByPosition(key_idx).column; }); - - const auto update_key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t key_idx) { return 
block.safeGetByPosition(key_idx).column; }); - - Arena temp_key_pool; - ContainerType> update_key_hash; - - for (size_t i = 0; i < block.rows(); ++i) - { - const auto u_key = placeKeysInPool(i, update_key_column_ptrs, keys, temp_key_pool); - update_key_hash[u_key].push_back(i); - } - - const size_t rows = saved_block->rows(); - IColumn::Filter filter(rows); - - for (size_t i = 0; i < saved_block->rows(); ++i) - { - const auto s_key = placeKeysInPool(i, saved_key_column_ptrs, keys, temp_key_pool); - auto * it = update_key_hash.find(s_key); - if (it) - filter[i] = 0; - else - filter[i] = 1; - } - - auto block_columns = block.mutateColumns(); - for (const auto attribute_idx : ext::range(0, keys_size + attributes_size)) - { - auto & column = saved_block->safeGetByPosition(attribute_idx).column; - const auto & filtered_column = column->filter(filter, -1); - - block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size()); - } - - saved_block->setColumns(std::move(block_columns)); - } - stream->readSuffix(); - } - - if (saved_block) - blockToAttributes(*saved_block.get()); -} - -void ComplexKeyHashedDictionary::loadData() -{ - if (!source_ptr->hasUpdateField()) - { - auto stream = source_ptr->loadAll(); - stream->readPrefix(); - - while (const auto block = stream->read()) - blockToAttributes(block); - - stream->readSuffix(); - } - else - updateData(); - - if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; -} - -template -void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(ContainerType) + map_ref.getBufferSizeInBytes(); - bucket_count = map_ref.getBufferSizeInCells(); -} - -template <> -void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & map_ref = 
std::get>(attribute.maps); - bytes_allocated += sizeof(ContainerType) + map_ref.getBufferSizeInBytes(); - bucket_count = map_ref.getBufferSizeInCells(); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); -} - -void ComplexKeyHashedDictionary::calculateBytesAllocated() -{ - bytes_allocated += attributes.size() * sizeof(attributes.front()); - - for (const auto & attribute : attributes) - { - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - addAttributeSize(attribute); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - } - - bytes_allocated += keys_pool.size(); -} - -template -void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.null_values = T(null_value.get()); - attribute.maps.emplace>(); -} - -template <> -void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); - attribute.maps.emplace>(); -} - -ComplexKeyHashedDictionary::Attribute -ComplexKeyHashedDictionary::createAttribute(const DictionaryAttribute & attribute, const Field & null_value) -{ - auto nullable_set = attribute.is_nullable ? 
std::make_unique() : nullptr; - Attribute attr{attribute.underlying_type, attribute.is_nullable, std::move(nullable_set), {}, {}, {}}; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; -} - - -template -void ComplexKeyHashedDictionary::getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto & attr = std::get>(attribute.maps); - - const auto keys_size = key_columns.size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - - const auto rows = key_columns.front()->size(); - for (const auto i : ext::range(0, rows)) - { - /// copy key data to arena so it is contiguous and return StringRef to it - const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); - - const auto it = attr.find(key); - - if (it) - { - set_value(i, static_cast(it->getMapped()), false); - } - else - { - if (attribute.is_nullable && attribute.nullable_set->find(key) != nullptr) - set_value(i, default_value_extractor[i], true); - else - set_value(i, default_value_extractor[i], false); - } - - /// free memory allocated for the key - temporary_keys_pool.rollback(key.size); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - -template -bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value) -{ - auto & map = std::get>(attribute.maps); - const auto pair = map.insert({key, value}); - return pair.second; -} - -template <> -bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const String value) -{ - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - return 
setAttributeValueImpl(attribute, key, StringRef{string_in_arena, value.size()}); -} - -bool ComplexKeyHashedDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value) -{ - bool result = false; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if (attribute.is_nullable) - { - if (value.isNull()) - { - attribute.nullable_set->insert(key); - result = true; - return; - } - else - { - attribute.nullable_set->erase(key); - } - } - - result = setAttributeValueImpl(attribute, key, value.get()); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -const ComplexKeyHashedDictionary::Attribute & ComplexKeyHashedDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -StringRef ComplexKeyHashedDictionary::placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool) -{ - const auto keys_size = key_columns.size(); - size_t sum_keys_size{}; - - const char * block_start = nullptr; - for (size_t j = 0; j < keys_size; ++j) - { - keys[j] = key_columns[j]->serializeValueIntoArena(row, pool, block_start); - sum_keys_size += keys[j].size; - } - - const auto * key_start = block_start; - for (size_t j = 0; j < keys_size; ++j) - { - keys[j].data = key_start; - key_start += keys[j].size; - } - - return {block_start, sum_keys_size}; -} - -template -void ComplexKeyHashedDictionary::has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const -{ - const auto & attr = std::get>(attribute.maps); - const auto keys_size = key_columns.size(); - StringRefs keys(keys_size); - Arena 
temporary_keys_pool; - const auto rows = key_columns.front()->size(); - - for (const auto i : ext::range(0, rows)) - { - /// copy key data to arena so it is contiguous and return StringRef to it - const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); - - const auto it = attr.find(key); - out[i] = static_cast(it); - - if (attribute.is_nullable && !out[i]) - out[i] = attribute.nullable_set->find(key) != nullptr; - - /// free memory allocated for the key - temporary_keys_pool.rollback(key.size); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -std::vector ComplexKeyHashedDictionary::getKeys() const -{ - const Attribute & attribute = attributes.front(); - - std::vector result; - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if constexpr (std::is_same_v) - { - result = getKeys(attribute); - } - else - { - result = getKeys(attribute); - } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -template -std::vector ComplexKeyHashedDictionary::getKeys(const Attribute & attribute) const -{ - const ContainerType & attr = std::get>(attribute.maps); - std::vector keys; - keys.reserve(attr.size()); - for (const auto & key : attr) - keys.push_back(key.getKey()); - - if (attribute.is_nullable) - { - for (const auto & key: *attribute.nullable_set) - keys.push_back(key.getKey()); - } - - return keys; -} - -BlockInputStreamPtr ComplexKeyHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const -{ - using BlockInputStreamType = DictionaryBlockInputStream; - auto vector_keys = getKeys(); - - PaddedPODArray keys; - keys.reserve(vector_keys.size()); - keys.assign(vector_keys.begin(), vector_keys.end()); - - return std::make_shared(shared_from_this(), max_block_size, keys, column_names); -} - -void registerDictionaryComplexKeyHashed(DictionaryFactory & factory) -{ - 
auto create_layout = [=](const std::string &, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - if (!dict_struct.key) - throw Exception{"'key' is required for dictionary of layout 'complex_key_hashed'", ErrorCodes::BAD_ARGUMENTS}; - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); - }; - factory.registerLayout("complex_key_hashed", create_layout, true); -} - -} diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.h b/src/Dictionaries/ComplexKeyHashedDictionary.h deleted file mode 100644 index 091974bbf43..00000000000 --- a/src/Dictionaries/ComplexKeyHashedDictionary.h +++ /dev/null @@ -1,185 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryStructure.h" -#include "DictionaryHelpers.h" - -namespace DB -{ - -class ComplexKeyHashedDictionary final : public IDictionaryBase -{ -public: - ComplexKeyHashedDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_ = nullptr); - - std::string getKeyDescription() const { return key_description; } - - std::string getTypeName() const override { return "ComplexKeyHashed"; } - - size_t getBytesAllocated() const override { return bytes_allocated; } - - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } - - double getHitRate() const 
override { return 1.0; } - - size_t getElementCount() const override { return element_count; } - - double getLoadFactor() const override { return static_cast(element_count) / bucket_count; } - - std::shared_ptr clone() const override - { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); - } - - const IDictionarySource * getSource() const override { return source_ptr.get(); } - - const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } - - const DictionaryStructure & getStructure() const override { return dict_struct; } - - bool isInjective(const std::string & attribute_name) const override - { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; - } - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } - - ColumnPtr getColumn( - const std::string& attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr & default_values_column) const override; - - ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; - -private: - template - using ContainerType = HashMapWithSavedHash; - - using NullableSet = HashSetWithSavedHash; - - struct Attribute final - { - AttributeUnderlyingType type; - bool is_nullable; - std::unique_ptr nullable_set; - - std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - StringRef> - null_values; - std::variant< - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - 
ContainerType> - maps; - std::unique_ptr string_arena; - }; - - void createAttributes(); - - void blockToAttributes(const Block & block); - - void updateData(); - - void loadData(); - - template - void addAttributeSize(const Attribute & attribute); - - void calculateBytesAllocated(); - - template - static void createAttributeImpl(Attribute & attribute, const Field & null_value); - - static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value); - - template - void getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template - static bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value); - - static bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value); - - const Attribute & getAttribute(const std::string & attribute_name) const; - - static StringRef placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool); - - template - void has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const; - - std::vector getKeys() const; - - template - std::vector getKeys(const Attribute & attribute) const; - - const DictionaryStructure dict_struct; - const DictionarySourcePtr source_ptr; - const DictionaryLifetime dict_lifetime; - const bool require_nonempty; - const std::string key_description{dict_struct.getKeyDescription()}; - - std::map attribute_index_by_name; - std::vector attributes; - Arena keys_pool; - - size_t bytes_allocated = 0; - size_t element_count = 0; - size_t bucket_count = 0; - mutable std::atomic query_count{0}; - - BlockPtr saved_block; -}; - -} diff --git a/src/Dictionaries/DictionaryBlockInputStream.cpp b/src/Dictionaries/DictionaryBlockInputStream.cpp new file mode 100644 index 00000000000..433ff211831 --- /dev/null +++ b/src/Dictionaries/DictionaryBlockInputStream.cpp @@ -0,0 
+1,200 @@ +#include "DictionaryBlockInputStream.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, UInt64 max_block_size_, PaddedPODArray && ids_, const Names & column_names_) + : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , ids(std::move(ids_)) + , key_type(DictionaryInputStreamKeyType::Id) +{ +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, + UInt64 max_block_size_, + const PaddedPODArray & keys, + const Names & column_names_) + : DictionaryBlockInputStreamBase(keys.size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , key_type(DictionaryInputStreamKeyType::ComplexKey) +{ + const DictionaryStructure & dictionary_structure = dictionary->getStructure(); + fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns); +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, + UInt64 max_block_size_, + const Columns & data_columns_, + const Names & column_names_, + GetColumnsFunction && get_key_columns_function_, + GetColumnsFunction && get_view_columns_function_) + : DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , data_columns(data_columns_) + , get_key_columns_function(std::move(get_key_columns_function_)) + , get_view_columns_function(std::move(get_view_columns_function_)) + , key_type(DictionaryInputStreamKeyType::Callback) +{ +} + +Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const +{ + /// TODO: Rewrite + switch (key_type) + { + case DictionaryInputStreamKeyType::ComplexKey: + { + Columns columns; + ColumnsWithTypeAndName view_columns; + columns.reserve(key_columns.size()); + for (const auto & key_column : 
key_columns) + { + ColumnPtr column = key_column.column->cut(start, length); + columns.emplace_back(column); + view_columns.emplace_back(column, key_column.type, key_column.name); + } + return fillBlock({}, columns, {}, std::move(view_columns)); + } + + case DictionaryInputStreamKeyType::Id: + { + PaddedPODArray ids_to_fill(ids.begin() + start, ids.begin() + start + length); + return fillBlock(ids_to_fill, {}, {}, {}); + } + + case DictionaryInputStreamKeyType::Callback: + { + Columns columns; + columns.reserve(data_columns.size()); + for (const auto & data_column : data_columns) + columns.push_back(data_column->cut(start, length)); + const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); + const auto & attributes = *dictionaty_structure.key; + ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes); + ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes); + DataTypes types; + columns.clear(); + for (const auto & key_column : keys_with_type_and_name) + { + columns.push_back(key_column.column); + types.push_back(key_column.type); + } + return fillBlock({}, columns, types, std::move(view_with_type_and_name)); + } + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected DictionaryInputStreamKeyType."); +} + +Block DictionaryBlockInputStream::fillBlock( + const PaddedPODArray & ids_to_fill, + const Columns & keys, + const DataTypes & types, + ColumnsWithTypeAndName && view) const +{ + std::unordered_set names(column_names.begin(), column_names.end()); + + DataTypes data_types = types; + ColumnsWithTypeAndName block_columns; + + data_types.reserve(keys.size()); + const DictionaryStructure & dictionary_structure = dictionary->getStructure(); + if (data_types.empty() && dictionary_structure.key) + for (const auto & key : *dictionary_structure.key) + data_types.push_back(key.type); + + for (const auto & column : view) + if (names.find(column.name) != names.end()) 
+ block_columns.push_back(column); + + const DictionaryStructure & structure = dictionary->getStructure(); + ColumnPtr ids_column = getColumnFromIds(ids_to_fill); + + if (structure.id && names.find(structure.id->name) != names.end()) + { + block_columns.emplace_back(ids_column, std::make_shared(), structure.id->name); + } + + auto dictionary_key_type = dictionary->getKeyType(); + + for (const auto idx : ext::range(0, structure.attributes.size())) + { + const DictionaryAttribute & attribute = structure.attributes[idx]; + if (names.find(attribute.name) != names.end()) + { + ColumnPtr column; + + if (dictionary_key_type == DictionaryKeyType::simple) + { + column = dictionary->getColumn( + attribute.name, + attribute.type, + {ids_column}, + {std::make_shared()}, + nullptr /* default_values_column */); + } + else + { + column = dictionary->getColumn( + attribute.name, + attribute.type, + keys, + data_types, + nullptr /* default_values_column*/); + } + + block_columns.emplace_back(column, attribute.type, attribute.name); + } + } + + return Block(block_columns); +} + +ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray & ids_to_fill) +{ + auto column_vector = ColumnVector::create(); + column_vector->getData().assign(ids_to_fill); + return column_vector; +} + +void DictionaryBlockInputStream::fillKeyColumns( + const PaddedPODArray & keys, + size_t start, + size_t size, + const DictionaryStructure & dictionary_structure, + ColumnsWithTypeAndName & result) +{ + MutableColumns columns; + columns.reserve(dictionary_structure.key->size()); + + for (const DictionaryAttribute & attribute : *dictionary_structure.key) + columns.emplace_back(attribute.type->createColumn()); + + for (auto idx : ext::range(start, size)) + { + const auto & key = keys[idx]; + const auto *ptr = key.data; + for (auto & column : columns) + ptr = column->deserializeAndInsertFromArena(ptr); + } + + for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i) + { + const auto 
& dictionary_attribute = (*dictionary_structure.key)[i]; + result.emplace_back(ColumnWithTypeAndName{std::move(columns[i]), dictionary_attribute.type, dictionary_attribute.name}); + } +} + +} diff --git a/src/Dictionaries/DictionaryBlockInputStream.h b/src/Dictionaries/DictionaryBlockInputStream.h index 71615efa7f8..5197df411fa 100644 --- a/src/Dictionaries/DictionaryBlockInputStream.h +++ b/src/Dictionaries/DictionaryBlockInputStream.h @@ -16,27 +16,22 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} /// TODO: Remove this class /* BlockInputStream implementation for external dictionaries * read() returns blocks consisting of the in-memory contents of the dictionaries */ -template class DictionaryBlockInputStream : public DictionaryBlockInputStreamBase { public: DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, - PaddedPODArray && ids, + PaddedPODArray && ids, const Names & column_names); DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, const PaddedPODArray & keys, const Names & column_names); @@ -48,7 +43,7 @@ public: // and get_view_columns_function to get key representation. 
// Now used in trie dictionary, where columns are stored as ip and mask, and are showed as string DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, const Columns & data_columns, const Names & column_names, @@ -61,21 +56,24 @@ protected: Block getBlock(size_t start, size_t length) const override; private: - Block - fillBlock(const PaddedPODArray & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const; + Block fillBlock( + const PaddedPODArray & ids_to_fill, + const Columns & keys, + const DataTypes & types, + ColumnsWithTypeAndName && view) const; - ColumnPtr getColumnFromIds(const PaddedPODArray & ids_to_fill) const; + static ColumnPtr getColumnFromIds(const PaddedPODArray & ids_to_fill); - void fillKeyColumns( + static void fillKeyColumns( const PaddedPODArray & keys, size_t start, size_t size, const DictionaryStructure & dictionary_structure, - ColumnsWithTypeAndName & columns) const; + ColumnsWithTypeAndName & result); - std::shared_ptr dictionary; + std::shared_ptr dictionary; Names column_names; - PaddedPODArray ids; + PaddedPODArray ids; ColumnsWithTypeAndName key_columns; Columns data_columns; @@ -92,200 +90,4 @@ private: DictionaryInputStreamKeyType key_type; }; - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, UInt64 max_block_size_, PaddedPODArray && ids_, const Names & column_names_) - : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , ids(std::move(ids_)) - , key_type(DictionaryInputStreamKeyType::Id) -{ -} - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, - UInt64 max_block_size_, - const PaddedPODArray & keys, - const Names & column_names_) - : DictionaryBlockInputStreamBase(keys.size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , 
key_type(DictionaryInputStreamKeyType::ComplexKey) -{ - const DictionaryStructure & dictionary_structure = dictionary->getStructure(); - fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns); -} - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, - UInt64 max_block_size_, - const Columns & data_columns_, - const Names & column_names_, - GetColumnsFunction && get_key_columns_function_, - GetColumnsFunction && get_view_columns_function_) - : DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , data_columns(data_columns_) - , get_key_columns_function(std::move(get_key_columns_function_)) - , get_view_columns_function(std::move(get_view_columns_function_)) - , key_type(DictionaryInputStreamKeyType::Callback) -{ -} - - -template -Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const -{ - /// TODO: Rewrite - switch (key_type) - { - case DictionaryInputStreamKeyType::ComplexKey: - { - Columns columns; - ColumnsWithTypeAndName view_columns; - columns.reserve(key_columns.size()); - for (const auto & key_column : key_columns) - { - ColumnPtr column = key_column.column->cut(start, length); - columns.emplace_back(column); - view_columns.emplace_back(column, key_column.type, key_column.name); - } - return fillBlock({}, columns, {}, std::move(view_columns)); - } - - case DictionaryInputStreamKeyType::Id: - { - PaddedPODArray ids_to_fill(ids.begin() + start, ids.begin() + start + length); - return fillBlock(ids_to_fill, {}, {}, {}); - } - - case DictionaryInputStreamKeyType::Callback: - { - Columns columns; - columns.reserve(data_columns.size()); - for (const auto & data_column : data_columns) - columns.push_back(data_column->cut(start, length)); - const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); - const auto & attributes = *dictionaty_structure.key; - 
ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes); - ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes); - DataTypes types; - columns.clear(); - for (const auto & key_column : keys_with_type_and_name) - { - columns.push_back(key_column.column); - types.push_back(key_column.type); - } - return fillBlock({}, columns, types, std::move(view_with_type_and_name)); - } - } - - throw Exception("Unexpected DictionaryInputStreamKeyType.", ErrorCodes::LOGICAL_ERROR); -} - -template -Block DictionaryBlockInputStream::fillBlock( - const PaddedPODArray & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const -{ - std::unordered_set names(column_names.begin(), column_names.end()); - - DataTypes data_types = types; - ColumnsWithTypeAndName block_columns; - - data_types.reserve(keys.size()); - const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); - if (data_types.empty() && dictionaty_structure.key) - for (const auto & key : *dictionaty_structure.key) - data_types.push_back(key.type); - - for (const auto & column : view) - if (names.find(column.name) != names.end()) - block_columns.push_back(column); - - const DictionaryStructure & structure = dictionary->getStructure(); - ColumnPtr ids_column = getColumnFromIds(ids_to_fill); - - if (structure.id && names.find(structure.id->name) != names.end()) - { - block_columns.emplace_back(ids_column, std::make_shared(), structure.id->name); - } - - auto dictionary_key_type = dictionary->getKeyType(); - - for (const auto idx : ext::range(0, structure.attributes.size())) - { - const DictionaryAttribute & attribute = structure.attributes[idx]; - if (names.find(attribute.name) != names.end()) - { - ColumnPtr column; - - if (dictionary_key_type == DictionaryKeyType::simple) - { - column = dictionary->getColumn( - attribute.name, - attribute.type, - {ids_column}, - {std::make_shared()}, - 
nullptr /* default_values_column */); - } - else - { - column = dictionary->getColumn( - attribute.name, - attribute.type, - keys, - data_types, - nullptr /* default_values_column*/); - } - - block_columns.emplace_back(column, attribute.type, attribute.name); - } - } - - return Block(block_columns); -} - -template -ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray & ids_to_fill) const -{ - auto column_vector = ColumnVector::create(); - column_vector->getData().reserve(ids_to_fill.size()); - for (UInt64 id : ids_to_fill) - column_vector->insertValue(id); - return column_vector; -} - - -template -void DictionaryBlockInputStream::fillKeyColumns( - const PaddedPODArray & keys, - size_t start, - size_t size, - const DictionaryStructure & dictionary_structure, - ColumnsWithTypeAndName & res) const -{ - MutableColumns columns; - columns.reserve(dictionary_structure.key->size()); - - for (const DictionaryAttribute & attribute : *dictionary_structure.key) - columns.emplace_back(attribute.type->createColumn()); - - for (auto idx : ext::range(start, size)) - { - const auto & key = keys[idx]; - const auto *ptr = key.data; - for (auto & column : columns) - ptr = column->deserializeAndInsertFromArena(ptr); - } - - for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i) - res.emplace_back( - ColumnWithTypeAndName{std::move(columns[i]), (*dictionary_structure.key)[i].type, (*dictionary_structure.key)[i].name}); -} - } diff --git a/src/Dictionaries/DictionaryHelpers.h b/src/Dictionaries/DictionaryHelpers.h index 5fda5f2599e..3e7063bb9ef 100644 --- a/src/Dictionaries/DictionaryHelpers.h +++ b/src/Dictionaries/DictionaryHelpers.h @@ -295,6 +295,28 @@ private: bool use_default_value_from_column = false; }; +template +class DictionaryKeysArenaHolder; + +template <> +class DictionaryKeysArenaHolder +{ +public: + static Arena * getComplexKeyArena() { return nullptr; } +}; + +template <> +class DictionaryKeysArenaHolder +{ +public: + + Arena * 
getComplexKeyArena() { return &complex_key_arena; } + +private: + Arena complex_key_arena; +}; + + template class DictionaryKeysExtractor { @@ -302,67 +324,96 @@ public: using KeyType = std::conditional_t; static_assert(key_type != DictionaryKeyType::range, "Range key type is not supported by DictionaryKeysExtractor"); - explicit DictionaryKeysExtractor(const Columns & key_columns, Arena & existing_arena) + explicit DictionaryKeysExtractor(const Columns & key_columns_, Arena * complex_key_arena_) + : key_columns(key_columns_) + , complex_key_arena(complex_key_arena_) { assert(!key_columns.empty()); if constexpr (key_type == DictionaryKeyType::simple) - keys = getColumnVectorData(key_columns.front()); + { + key_columns[0] = key_columns[0]->convertToFullColumnIfConst(); + + const auto * vector_col = checkAndGetColumn>(key_columns[0].get()); + if (!vector_col) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key expected UInt64"); + } + + keys_size = key_columns.front()->size(); + } + + inline size_t getKeysSize() const + { + return keys_size; + } + + inline size_t getCurrentKeyIndex() const + { + return current_key_index; + } + + inline KeyType extractCurrentKey() + { + assert(current_key_index < keys_size); + + if constexpr (key_type == DictionaryKeyType::simple) + { + const auto & column_vector = static_cast &>(*key_columns[0]); + const auto & data = column_vector.getData(); + + auto key = data[current_key_index]; + ++current_key_index; + return key; + } else - keys = deserializeKeyColumnsInArena(key_columns, existing_arena); - } - - - const PaddedPODArray & getKeys() const - { - return keys; - } - -private: - static PaddedPODArray getColumnVectorData(const ColumnPtr column) - { - PaddedPODArray result; - - auto full_column = column->convertToFullColumnIfConst(); - const auto *vector_col = checkAndGetColumn>(full_column.get()); - - if (!vector_col) - throw Exception{ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key 
expected UInt64"}; - - result.assign(vector_col->getData()); - - return result; - } - - static PaddedPODArray deserializeKeyColumnsInArena(const Columns & key_columns, Arena & temporary_arena) - { - size_t keys_size = key_columns.front()->size(); - - PaddedPODArray result; - result.reserve(keys_size); - - PaddedPODArray temporary_column_data(key_columns.size()); - - for (size_t key_index = 0; key_index < keys_size; ++key_index) { size_t allocated_size_for_columns = 0; const char * block_start = nullptr; - for (size_t column_index = 0; column_index < key_columns.size(); ++column_index) + for (const auto & column : key_columns) { - const auto & column = key_columns[column_index]; - temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_arena, block_start); - allocated_size_for_columns += temporary_column_data[column_index].size; + StringRef serialized_data = column->serializeValueIntoArena(current_key_index, *complex_key_arena, block_start); + allocated_size_for_columns += serialized_data.size; } - result.push_back(StringRef{block_start, allocated_size_for_columns}); + ++current_key_index; + current_complex_key = StringRef{block_start, allocated_size_for_columns}; + return current_complex_key; + } + } + + void rollbackCurrentKey() const + { + if constexpr (key_type == DictionaryKeyType::complex) + complex_key_arena->rollback(current_complex_key.size); + } + + PaddedPODArray extractAllKeys() + { + PaddedPODArray result; + result.reserve(keys_size - current_key_index); + + for (; current_key_index < keys_size;) + { + auto value = extractCurrentKey(); + result.emplace_back(value); } return result; } - PaddedPODArray keys; + void reset() + { + current_key_index = 0; + } +private: + Columns key_columns; + size_t keys_size = 0; + size_t current_key_index = 0; + + KeyType current_complex_key {}; + Arena * complex_key_arena; }; /** @@ -370,9 +421,10 @@ private: * If column is constant parameter backup_storage is used to store values. 
*/ +/// TODO: Remove template static const PaddedPODArray & getColumnVectorData( - const IDictionaryBase * dictionary, + const IDictionary * dictionary, const ColumnPtr column, PaddedPODArray & backup_storage) { diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index d3ee194bf60..806ee0b80e0 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -200,8 +200,21 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration for (size_t i = 0; i < attributes.size(); ++i) { - const auto & attribute_name = attributes[i].name; + const auto & attribute = attributes[i]; + const auto & attribute_name = attribute.name; attribute_name_to_index[attribute_name] = i; + + if (attribute.hierarchical) + { + if (id && attribute.underlying_type != AttributeUnderlyingType::utUInt64) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Hierarchical attribute type for dictionary with simple key must be UInt64. 
Actual ({})", + toString(attribute.underlying_type)); + else if (key) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary with complex key does not support hierarchy"); + + hierarchical_attribute_index = i; + } } if (attributes.empty()) diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h index 2dedb1be0ce..4f03b4ff09e 100644 --- a/src/Dictionaries/DictionaryStructure.h +++ b/src/Dictionaries/DictionaryStructure.h @@ -153,6 +153,8 @@ struct DictionaryStructure final std::unordered_map attribute_name_to_index; std::optional range_min; std::optional range_max; + std::optional hierarchical_attribute_index; + bool has_expressions = false; bool access_to_key_from_attributes = false; diff --git a/src/Dictionaries/DirectDictionary.cpp b/src/Dictionaries/DirectDictionary.cpp index 4cb9e0cd629..96ef259106a 100644 --- a/src/Dictionaries/DirectDictionary.cpp +++ b/src/Dictionaries/DirectDictionary.cpp @@ -1,158 +1,33 @@ #include "DirectDictionary.h" -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include + +#include +#include namespace DB { namespace ErrorCodes { - extern const int TYPE_MISMATCH; extern const int UNSUPPORTED_METHOD; extern const int BAD_ARGUMENTS; } -namespace -{ - - inline UInt64 getAt(const PaddedPODArray & arr, const size_t idx) - { - return arr[idx]; - } - - inline UInt64 getAt(const UInt64 & value, const size_t) - { - return value; - } - -} - template DirectDictionary::DirectDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_) + DictionarySourcePtr source_ptr_) : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} - , saved_block{std::move(saved_block_)} { if (!source_ptr->supportsSelectiveLoad()) throw Exception{full_name + ": source cannot be used with 
DirectDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - setup(); -} - -template -void DirectDictionary::toParent(const PaddedPODArray & ids [[maybe_unused]], PaddedPODArray & out [[maybe_unused]]) const -{ - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - { - const auto & attribute_name = hierarchical_attribute->name; - - auto result_type = std::make_shared(); - auto input_column = result_type->createColumn(); - auto & input_column_typed = assert_cast &>(*input_column); - auto & data = input_column_typed.getData(); - data.insert(ids.begin(), ids.end()); - - auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr}); - const auto & result_column_typed = assert_cast &>(*column); - const auto & result_data = result_column_typed.getData(); - - out.assign(result_data); - } - else - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Hierarchy is not supported for complex key DirectDictionary"); -} - -template -UInt64 DirectDictionary::getValueOrNullByKey(const Key & to_find) const -{ - std::vector required_key = {to_find}; - - auto stream = source_ptr->loadIds(required_key); - stream->readPrefix(); - - bool is_found = false; - UInt64 result = hierarchical_attribute->null_value.template get(); - - while (const auto block = stream->read()) - { - const IColumn & id_column = *block.safeGetByPosition(0).column; - - for (const size_t attribute_idx : ext::range(0, dict_struct.attributes.size())) - { - if (is_found) - break; - - const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column; - - for (const auto row_idx : ext::range(0, id_column.size())) - { - const auto key = id_column[row_idx].get(); - - if (key == to_find && hierarchical_attribute->name == attribute_name_by_index.at(attribute_idx)) - { - result = attribute_column[row_idx].get(); - is_found = true; - break; - } - } - } - } - - stream->readSuffix(); - - return result; -} - -template -template -void 
DirectDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = hierarchical_attribute->null_value.template get(); - const auto rows = out.size(); - - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - id = getValueOrNullByKey(id); - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -template -void DirectDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -template -void DirectDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const UInt64 ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -template -void DirectDictionary::isInConstantVector(const UInt64 child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_id, ancestor_ids, out); } template @@ -166,20 +41,20 @@ ColumnPtr DirectDictionary::getColumn( if constexpr (dictionary_key_type == DictionaryKeyType::complex) dict_struct.validateKeyTypes(key_types); - Arena complex_key_arena; + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + const auto requested_keys = extractor.extractAllKeys(); const DictionaryAttribute & attribute = dict_struct.getAttribute(attribute_name, result_type); DefaultValueProvider default_value_provider(attribute.null_value, default_values_column); - DictionaryKeysExtractor extractor(key_columns, complex_key_arena); - const auto & requested_keys = extractor.getKeys(); - HashMap key_to_fetched_index; key_to_fetched_index.reserve(requested_keys.size()); 
auto fetched_from_storage = attribute.type->createColumn(); + size_t fetched_key_index = 0; - size_t requested_attribute_index = attribute_index_by_name.find(attribute_name)->second; + size_t requested_attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; Columns block_key_columns; size_t dictionary_keys_size = dict_struct.getKeysNames().size(); @@ -191,26 +66,19 @@ ColumnPtr DirectDictionary::getColumn( while (const auto block = stream->read()) { - auto block_columns = block.getColumns(); - /// Split into keys columns and attribute columns for (size_t i = 0; i < dictionary_keys_size; ++i) - { - block_key_columns.emplace_back(*block_columns.begin()); - block_columns.erase(block_columns.begin()); - } + block_key_columns.emplace_back(block.safeGetByPosition(i).column); - DictionaryKeysExtractor block_keys_extractor(block_key_columns, complex_key_arena); - const auto & block_keys = block_keys_extractor.getKeys(); - size_t block_keys_size = block_keys.size(); + DictionaryKeysExtractor block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena()); + auto block_keys = block_keys_extractor.extractAllKeys(); const auto & block_column = block.safeGetByPosition(dictionary_keys_size + requested_attribute_index).column; - fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys_size); + fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys.size()); - for (size_t block_key_index = 0; block_key_index < block_keys_size; ++block_key_index) + for (size_t block_key_index = 0; block_key_index < block_keys.size(); ++block_key_index) { - const auto & block_key = block_keys[block_key_index]; - + auto block_key = block_keys[block_key_index]; key_to_fetched_index[block_key] = fetched_key_index; ++fetched_key_index; } @@ -223,10 +91,10 @@ ColumnPtr DirectDictionary::getColumn( Field value_to_insert; size_t requested_keys_size = requested_keys.size(); + auto result = fetched_from_storage->cloneEmpty(); 
result->reserve(requested_keys_size); - for (size_t requested_key_index = 0; requested_key_index < requested_keys_size; ++requested_key_index) { const auto requested_key = requested_keys[requested_key_index]; @@ -251,10 +119,9 @@ ColumnUInt8::Ptr DirectDictionary::hasKeys(const Columns & if constexpr (dictionary_key_type == DictionaryKeyType::complex) dict_struct.validateKeyTypes(key_types); - Arena complex_key_arena; - - DictionaryKeysExtractor requested_keys_extractor(key_columns, complex_key_arena); - const auto & requested_keys = requested_keys_extractor.getKeys(); + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor requested_keys_extractor(key_columns, arena_holder.getComplexKeyArena()); + auto requested_keys = requested_keys_extractor.extractAllKeys(); size_t requested_keys_size = requested_keys.size(); HashMap requested_key_to_index; @@ -279,25 +146,24 @@ ColumnUInt8::Ptr DirectDictionary::hasKeys(const Columns & while (const auto block = stream->read()) { - auto block_columns = block.getColumns(); - /// Split into keys columns and attribute columns for (size_t i = 0; i < dictionary_keys_size; ++i) - { - block_key_columns.emplace_back(*block_columns.begin()); - block_columns.erase(block_columns.begin()); - } + block_key_columns.emplace_back(block.safeGetByPosition(i).column); - DictionaryKeysExtractor block_keys_extractor(block_key_columns, complex_key_arena); - const auto & block_keys = block_keys_extractor.getKeys(); + DictionaryKeysExtractor block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena()); + size_t block_keys_size = block_keys_extractor.getKeysSize(); - for (const auto & block_key : block_keys) + for (size_t i = 0; i < block_keys_size; ++i) { + auto block_key = block_keys_extractor.extractCurrentKey(); + const auto * it = requested_key_to_index.find(block_key); assert(it); size_t result_data_found_index = it->getMapped(); result_data[result_data_found_index] = true; + + block_keys_extractor.rollbackCurrentKey(); 
} block_key_columns.clear(); @@ -310,6 +176,37 @@ ColumnUInt8::Ptr DirectDictionary::hasKeys(const Columns & return result; } +template +ColumnPtr DirectDictionary::getHierarchy( + ColumnPtr key_column, + const DataTypePtr & key_type) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + else + return nullptr; +} + +template +ColumnUInt8::Ptr DirectDictionary::isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + else + return nullptr; +} + template BlockInputStreamPtr DirectDictionary::getSourceBlockInputStream( const Columns & key_columns [[maybe_unused]], @@ -342,32 +239,6 @@ BlockInputStreamPtr DirectDictionary::getSourceBlockInputSt return stream; } -template -void DirectDictionary::setup() -{ - /// TODO: Move this to DictionaryStructure - size_t dictionary_attributes_size = dict_struct.attributes.size(); - for (size_t i = 0; i < dictionary_attributes_size; ++i) - { - const auto & attribute = dict_struct.attributes[i]; - attribute_index_by_name[attribute.name] = i; - attribute_name_by_index[i] = attribute.name; - - if (attribute.hierarchical) - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "({}): hierarchical attributes are not supported for complex key direct dictionary", - full_name); - - hierarchical_attribute = &attribute; - - if (attribute.underlying_type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; 
- } - } -} - template BlockInputStreamPtr DirectDictionary::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const { diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index 685fd707ded..6bca6ac6a18 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -18,11 +18,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - template class DirectDictionary final : public IDictionary { @@ -33,8 +28,7 @@ public: DirectDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_ = nullptr); + DictionarySourcePtr source_ptr_); std::string getTypeName() const override { @@ -56,7 +50,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), saved_block); + return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone()); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -67,26 +61,9 @@ public: bool isInjective(const std::string & attribute_name) const override { - auto it = attribute_index_by_name.find(attribute_name); - - if (it == attribute_index_by_name.end()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "({}): no attribute with name ({}) in dictionary", - full_name, - attribute_name); - - return dict_struct.attributes[it->second].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const UInt64 ancestor_id, PaddedPODArray & out) const override; - void 
isInConstantVector(const UInt64 child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - DictionaryKeyType getKeyType() const override { return dictionary_key_type; } ColumnPtr getColumn( @@ -98,30 +75,25 @@ public: ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; + bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: - void setup(); - BlockInputStreamPtr getSourceBlockInputStream(const Columns & key_columns, const PaddedPODArray & requested_keys) const; - UInt64 getValueOrNullByKey(const UInt64 & to_find) const; - - template - void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; - std::unordered_map attribute_index_by_name; - std::unordered_map attribute_name_by_index; - - const DictionaryAttribute * hierarchical_attribute = nullptr; - mutable std::atomic query_count{0}; - - BlockPtr saved_block; }; extern template class DirectDictionary; diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index eb63d716913..2d8d208d76b 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -1,20 +1,22 @@ #include "FlatDictionary.h" #include +#include + #include #include #include #include #include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" +#include +#include +#include namespace DB { namespace ErrorCodes { - extern const int TYPE_MISMATCH; 
extern const int ARGUMENT_OUT_OF_BOUND; extern const int BAD_ARGUMENTS; extern const int DICTIONARY_IS_EMPTY; @@ -24,7 +26,6 @@ namespace ErrorCodes static const auto initial_array_size = 1024; static const auto max_array_size = 500000; - FlatDictionary::FlatDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, @@ -45,69 +46,6 @@ FlatDictionary::FlatDictionary( calculateBytesAllocated(); } - -void FlatDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - DictionaryDefaultValueExtractor extractor(null_value); - - getItemsImpl( - *hierarchical_attribute, - ids, - [&](const size_t row, const UInt64 value) { out[row] = value; }, - extractor); -} - - -/// Allow to use single value in same way as array. -static inline FlatDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline FlatDictionary::Key getAt(const FlatDictionary::Key & value, const size_t) -{ - return value; -} - -template -void FlatDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - const auto & attr = std::get>(hierarchical_attribute->arrays); - const auto rows = out.size(); - - size_t loaded_size = attr.size(); - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id < loaded_size && id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - id = attr[id]; - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - -void FlatDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, 
out); -} - -void FlatDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void FlatDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_id, ancestor_ids, out); -} - ColumnPtr FlatDictionary::getColumn( const std::string & attribute_name, const DataTypePtr & result_type, @@ -117,14 +55,16 @@ ColumnPtr FlatDictionary::getColumn( { ColumnPtr result; - PaddedPODArray backup_storage; + PaddedPODArray backup_storage; const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); auto size = ids.size(); - const auto & attribute = getAttribute(attribute_name); const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + const auto & attribute = attributes[attribute_index]; + auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; @@ -183,10 +123,9 @@ ColumnPtr FlatDictionary::getColumn( return result; } - ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const { - PaddedPODArray backup_storage; + PaddedPODArray backup_storage; const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); auto result = ColumnUInt8::create(ext::size(ids)); @@ -205,24 +144,118 @@ ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const Data return result; } +ColumnPtr FlatDictionary::getHierarchy(ColumnPtr key_column, const DataTypePtr &) const +{ + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 
null_value = std::get(hierarchical_attribute.null_values); + const ContainerType & parent_keys = std::get>(hierarchical_attribute.arrays); + + auto is_key_valid_func = [&, this](auto & key) + { + return key < loaded_ids.size() && loaded_ids[key]; + }; + + auto get_parent_key_func = [&, this](auto & hierarchy_key) + { + std::optional result; + + if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key]) + return result; + + result = parent_keys[hierarchy_key]; + + return result; + }; + + auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_key_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return dictionary_hierarchy_array; +} + +ColumnUInt8::Ptr FlatDictionary::isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr &) const +{ + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + PaddedPODArray keys_in_backup_storage; + const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = std::get(hierarchical_attribute.null_values); + const ContainerType & parent_keys = std::get>(hierarchical_attribute.arrays); + + auto is_key_valid_func = [&, this](auto & key) + { + return key < loaded_ids.size() && loaded_ids[key]; + }; + + auto get_parent_key_func = [&, this](auto & hierarchy_key) + { + std::optional result; + + if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key]) + return result; + + result = parent_keys[hierarchy_key]; + + return result; + }; + + auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_key_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; +} 
+ +ColumnPtr FlatDictionary::getDescendants( + ColumnPtr key_column, + const DataTypePtr &, + size_t level) const +{ + PaddedPODArray keys_backup; + const auto & keys = getColumnVectorData(this, key_column, keys_backup); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + const ContainerType & parent_keys = std::get>(hierarchical_attribute.arrays); + + HashMap> parent_to_child; + + for (size_t i = 0; i < parent_keys.size(); ++i) + { + auto parent_key = parent_keys[i]; + + if (loaded_ids[i]) + parent_to_child[parent_key].emplace_back(static_cast(i)); + } + + auto result = getKeysDescendantsArray(keys, parent_to_child, level); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; +} + void FlatDictionary::createAttributes() { const auto size = dict_struct.attributes.size(); attributes.reserve(size); for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) - { - hierarchical_attribute = &attributes.back(); - - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } - } } void FlatDictionary::blockToAttributes(const Block & block) @@ -271,7 +304,7 @@ void FlatDictionary::updateData() const auto & saved_id_column = *saved_block->safeGetByPosition(0).column; const auto & update_id_column = *block.safeGetByPosition(0).column; - std::unordered_map> update_ids; + std::unordered_map> update_ids; for (size_t row = 0; row < update_id_column.size(); ++row) { const auto id = update_id_column.get64(row); @@ -280,7 +313,7 @@ void FlatDictionary::updateData() const size_t saved_rows = saved_id_column.size(); IColumn::Filter filter(saved_rows); - 
std::unordered_map>::iterator it; + std::unordered_map>::iterator it; for (size_t row = 0; row < saved_id_column.size(); ++row) { @@ -385,7 +418,6 @@ void FlatDictionary::createAttributeImpl(Attribute & attribute, const Fi attribute.arrays.emplace>(initial_array_size, StringRef(string_in_arena, string.size())); } - FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value) { auto nullable_set = attribute.is_nullable ? std::make_optional() : std::optional{}; @@ -408,7 +440,7 @@ FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttrib template void FlatDictionary::getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const { @@ -425,7 +457,7 @@ void FlatDictionary::getItemsImpl( } template -void FlatDictionary::resize(Attribute & attribute, const Key id) +void FlatDictionary::resize(Attribute & attribute, const UInt64 id) { if (id >= max_array_size) throw Exception{full_name + ": identifier should be less than " + toString(max_array_size), ErrorCodes::ARGUMENT_OUT_OF_BOUND}; @@ -440,7 +472,7 @@ void FlatDictionary::resize(Attribute & attribute, const Key id) } template -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T & value) +void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value) { auto & array = std::get>(attribute.arrays); array[id] = value; @@ -448,13 +480,13 @@ void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, } template <> -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const String & value) +void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const String & value) { const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); 
setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()}); } -void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) +void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value) { auto type_call = [&](const auto &dictionary_attribute_type) { @@ -484,21 +516,11 @@ void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, cons callOnDictionaryAttributeType(attribute.type, type_call); } - -const FlatDictionary::Attribute & FlatDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -PaddedPODArray FlatDictionary::getIds() const +PaddedPODArray FlatDictionary::getIds() const { const auto ids_count = ext::size(loaded_ids); - PaddedPODArray ids; + PaddedPODArray ids; ids.reserve(ids_count); for (auto idx : ext::range(0, ids_count)) @@ -509,8 +531,7 @@ PaddedPODArray FlatDictionary::getIds() const BlockInputStreamPtr FlatDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); + return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); } void registerDictionaryFlat(DictionaryFactory & factory) diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index f491eb28641..09721bf1a99 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -59,18 +59,9 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + 
return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } ColumnPtr getColumn( @@ -82,13 +73,27 @@ public: ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; + bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + + ColumnPtr getDescendants( + ColumnPtr key_column, + const DataTypePtr & key_type, + size_t level) const override; + BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: template using ContainerType = PaddedPODArray; - using NullableSet = HashSet>; + using NullableSet = HashSet>; struct Attribute final { @@ -151,24 +156,24 @@ private: template void getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; template - void resize(Attribute & attribute, const Key id); + void resize(Attribute & attribute, const UInt64 id); template - void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value); + void 
setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value); - void setAttributeValue(Attribute & attribute, const Key id, const Field & value); + void setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value); const Attribute & getAttribute(const std::string & attribute_name) const; template void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - PaddedPODArray getIds() const; + PaddedPODArray getIds() const; const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; @@ -177,7 +182,6 @@ private: std::map attribute_index_by_name; std::vector attributes; - const Attribute * hierarchical_attribute = nullptr; std::vector loaded_ids; size_t bytes_allocated = 0; @@ -185,6 +189,7 @@ private: size_t bucket_count = 0; mutable std::atomic query_count{0}; + /// TODO: Remove BlockPtr saved_block; }; diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index 708be7945f1..d45e4ade1cf 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -1,13 +1,18 @@ #include "HashedDictionary.h" + #include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include "ClickHouseDictionarySource.h" + +#include + #include -#include +#include #include #include -#include +#include + +#include +#include +#include namespace { @@ -15,136 +20,74 @@ namespace /// NOTE: Trailing return type is explicitly specified for SFINAE. 
/// google::sparse_hash_map -template auto first(const T & value) -> decltype(value.first) { return value.first; } // NOLINT -template auto second(const T & value) -> decltype(value.second) { return value.second; } // NOLINT +template auto getKeyFromCell(const T & value) -> decltype(value->first) { return value->first; } // NOLINT +template auto getValueFromCell(const T & value) -> decltype(value->second) { return value->second; } // NOLINT /// HashMap -template auto first(const T & value) -> decltype(value.getKey()) { return value.getKey(); } // NOLINT -template auto second(const T & value) -> decltype(value.getMapped()) { return value.getMapped(); } // NOLINT +template auto getKeyFromCell(const T & value) -> decltype(value->getKey()) { return value->getKey(); } // NOLINT +template auto getValueFromCell(const T & value) -> decltype(value->getMapped()) { return value->getMapped(); } // NOLINT } namespace DB { + namespace ErrorCodes { - extern const int TYPE_MISMATCH; extern const int BAD_ARGUMENTS; extern const int DICTIONARY_IS_EMPTY; extern const int UNSUPPORTED_METHOD; } - -HashedDictionary::HashedDictionary( +template +HashedDictionary::HashedDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, - bool sparse_, BlockPtr saved_block_) : IDictionary(dict_id_) , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} + , source_ptr(std::move(source_ptr_)) , dict_lifetime(dict_lifetime_) , require_nonempty(require_nonempty_) - , sparse(sparse_) - , saved_block{std::move(saved_block_)} + , saved_block(std::move(saved_block_)) { createAttributes(); loadData(); calculateBytesAllocated(); } - -void HashedDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - DictionaryDefaultValueExtractor extractor(null_value); - - getItemsImpl( - 
*hierarchical_attribute, - ids, - [&](const size_t row, const UInt64 value) { out[row] = value; }, - extractor); -} - - -/// Allow to use single value in same way as array. -static inline HashedDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline HashedDictionary::Key getAt(const HashedDictionary::Key & value, const size_t) -{ - return value; -} - -template -void HashedDictionary::isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - const auto rows = out.size(); - - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - auto it = attr.find(id); - if (it != std::end(attr)) - id = second(*it); - else - break; - } - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} -template -void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - if (!sparse) - return isInAttrImpl(*std::get>(hierarchical_attribute->maps), child_ids, ancestor_ids, out); - return isInAttrImpl(*std::get>(hierarchical_attribute->sparse_maps), child_ids, ancestor_ids, out); -} - -void HashedDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -void HashedDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void HashedDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const 
-{ - isInImpl(child_id, ancestor_ids, out); -} - -ColumnPtr HashedDictionary::getColumn( +template +ColumnPtr HashedDictionary::getColumn( const std::string & attribute_name, const DataTypePtr & result_type, const Columns & key_columns, - const DataTypes &, + const DataTypes & key_types [[maybe_unused]], const ColumnPtr & default_values_column) const { + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + ColumnPtr result; - PaddedPODArray backup_storage; - const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); - auto size = ids.size(); + const size_t size = extractor.getKeysSize(); - const auto & attribute = getAttribute(attribute_name); const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + auto & attribute = attributes[attribute_index]; + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + if (attribute.is_nullable_set) + { + col_null_map_to = ColumnUInt8::create(size, false); + vec_null_map_to = &col_null_map_to->getData(); + } auto type_call = [&](const auto & dictionary_attribute_type) { @@ -159,24 +102,34 @@ ColumnPtr HashedDictionary::getColumn( auto column = ColumnProvider::getColumn(dictionary_attribute, size); - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { auto * out = column.get(); - getItemsImpl( + getItemsImpl( attribute, - ids, + extractor, [&](const size_t, const StringRef value) { out->insertData(value.data, value.size); }, + [&](const size_t row) + { + out->insertDefault(); + (*vec_null_map_to)[row] = true; + }, default_value_extractor); } else { auto & out = column->getData(); - getItemsImpl( + getItemsImpl( attribute, - ids, + extractor, [&](const 
size_t row, const auto value) { return out[row] = value; }, + [&](const size_t row) + { + out[row] = 0; + (*vec_null_map_to)[row] = true; + }, default_value_extractor); } @@ -185,87 +138,214 @@ ColumnPtr HashedDictionary::getColumn( callOnDictionaryAttributeType(attribute.type, type_call); - if (attribute.nullable_set) - { - ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(size, false); - ColumnUInt8::Container& vec_null_map_to = col_null_map_to->getData(); - - for (size_t row = 0; row < ids.size(); ++row) - { - auto id = ids[row]; - - if (attribute.nullable_set->find(id) != nullptr) - vec_null_map_to[row] = true; - } - + if (attribute.is_nullable_set) result = ColumnNullable::create(result, std::move(col_null_map_to)); + + return result; +} + +template +ColumnUInt8::Ptr HashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const +{ + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + + size_t keys_size = extractor.getKeysSize(); + + auto result = ColumnUInt8::create(keys_size, false); + auto & out = result->getData(); + + if (attributes.empty()) + { + query_count.fetch_add(keys_size, std::memory_order_relaxed); + return result; } - return result; -} - -ColumnUInt8::Ptr HashedDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const -{ - PaddedPODArray backup_storage; - const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); - - size_t ids_count = ext::size(ids); - - auto result = ColumnUInt8::create(ext::size(ids)); - auto& out = result->getData(); - const auto & attribute = attributes.front(); + bool is_attribute_nullable = attribute.is_nullable_set.has_value(); - auto type_call = [&](const auto & dictionary_attribute_type) + getAttributeContainer(0, [&](const auto & container) { - using Type = 
std::decay_t; - using AttributeType = typename Type::AttributeType; - has(attribute, ids, out); - }; + for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index) + { + auto requested_key = extractor.extractCurrentKey(); - callOnDictionaryAttributeType(attribute.type, type_call); + out[requested_key_index] = container.find(requested_key) != container.end(); - query_count.fetch_add(ids_count, std::memory_order_relaxed); + if (is_attribute_nullable && !out[requested_key_index]) + out[requested_key_index] = attribute.is_nullable_set->find(requested_key) != nullptr; + + extractor.rollbackCurrentKey(); + } + }); + + query_count.fetch_add(keys_size, std::memory_order_relaxed); return result; } -void HashedDictionary::createAttributes() +template +ColumnPtr HashedDictionary::getHierarchy(ColumnPtr key_column [[maybe_unused]], const DataTypePtr &) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + + const auto & dictionary_attribute = dict_struct.attributes[hierarchical_attribute_index]; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = dictionary_attribute.null_value.get(); + const CollectionType & parent_keys_map = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&](auto & key) { return parent_keys_map.find(key) != parent_keys_map.end(); }; + + auto get_parent_func = [&](auto & hierarchy_key) + { + std::optional result; + + auto it = parent_keys_map.find(hierarchy_key); + + if (it != parent_keys_map.end()) + result = getValueFromCell(it); + + return result; + }; + + auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_func); + + query_count.fetch_add(keys.size(), 
std::memory_order_relaxed); + + return dictionary_hierarchy_array; + } + else + return nullptr; +} + +template +ColumnUInt8::Ptr HashedDictionary::isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr &) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + PaddedPODArray keys_in_backup_storage; + const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + + const auto & dictionary_attribute = dict_struct.attributes[hierarchical_attribute_index]; + auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = dictionary_attribute.null_value.get(); + const CollectionType & parent_keys_map = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&](auto & key) { return parent_keys_map.find(key) != parent_keys_map.end(); }; + + auto get_parent_func = [&](auto & hierarchy_key) + { + std::optional result; + + auto it = parent_keys_map.find(hierarchy_key); + + if (it != parent_keys_map.end()) + result = getValueFromCell(it); + + return result; + }; + + auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; + } + else + return nullptr; +} + +template +ColumnPtr HashedDictionary::getDescendants( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr &, + size_t level [[maybe_unused]]) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup; + const auto & keys = getColumnVectorData(this, key_column, keys_backup); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; 
+ + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + const CollectionType & parent_keys = std::get>(hierarchical_attribute.container); + + HashMap> parent_to_child; + + for (const auto & [key, value] : parent_keys) + parent_to_child[value].emplace_back(key); + + auto result = getKeysDescendantsArray(keys, parent_to_child, level); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; + } + else + return nullptr; +} + +template +void HashedDictionary::createAttributes() { const auto size = dict_struct.attributes.size(); attributes.reserve(size); - for (const auto & attribute : dict_struct.attributes) + for (const auto & dictionary_attribute : dict_struct.attributes) { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) + auto type_call = [&, this](const auto & dictionary_attribute_type) { - hierarchical_attribute = &attributes.back(); + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } + auto is_nullable_set = dictionary_attribute.is_nullable ? std::make_optional() : std::optional{}; + std::unique_ptr string_arena = std::is_same_v ? 
std::make_unique() : nullptr; + + ValueType default_value; + + if constexpr (std::is_same_v) + { + string_arena = std::make_unique(); + + const auto & string_null_value = dictionary_attribute.null_value.get(); + const size_t string_null_value_size = string_null_value.size(); + + const char * string_in_arena = string_arena->insert(string_null_value.data(), string_null_value_size); + default_value = {string_in_arena, string_null_value_size}; + } + else + default_value = dictionary_attribute.null_value.get>(); + + Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), default_value, CollectionType(), std::move(string_arena)}; + attributes.emplace_back(std::move(attribute)); + }; + + callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call); } } -void HashedDictionary::blockToAttributes(const Block & block) -{ - const auto & id_column = *block.safeGetByPosition(0).column; - - for (const size_t attribute_idx : ext::range(0, attributes.size())) - { - const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column; - auto & attribute = attributes[attribute_idx]; - - for (const auto row_idx : ext::range(0, id_column.size())) - if (setAttributeValue(attribute, id_column[row_idx].get(), attribute_column[row_idx])) - ++element_count; - } -} - -void HashedDictionary::updateData() +template +void HashedDictionary::updateData() { if (!saved_block || saved_block->rows() == 0) { @@ -277,6 +357,7 @@ void HashedDictionary::updateData() /// We are using this to keep saved data if input stream consists of multiple blocks if (!saved_block) saved_block = std::make_shared(block.cloneEmpty()); + for (const auto attribute_idx : ext::range(0, attributes.size() + 1)) { const IColumn & update_column = *block.getByPosition(attribute_idx).column.get(); @@ -288,34 +369,50 @@ void HashedDictionary::updateData() } else { + size_t skip_keys_size_offset = dict_struct.getKeysSize(); + + Columns saved_block_key_columns; + 
saved_block_key_columns.reserve(skip_keys_size_offset); + + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) + saved_block_key_columns.emplace_back(saved_block->safeGetByPosition(i).column); + + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor saved_keys_extractor(saved_block_key_columns, arena_holder.getComplexKeyArena()); + auto saved_keys_extracted_from_block = saved_keys_extractor.extractAllKeys(); + auto stream = source_ptr->loadUpdatedAll(); stream->readPrefix(); while (Block block = stream->read()) { - const auto & saved_id_column = *saved_block->safeGetByPosition(0).column; - const auto & update_id_column = *block.safeGetByPosition(0).column; + /// TODO: Rewrite + Columns block_key_columns; + block_key_columns.reserve(skip_keys_size_offset); - std::unordered_map> update_ids; - for (size_t row = 0; row < update_id_column.size(); ++row) + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) + block_key_columns.emplace_back(block.safeGetByPosition(i).column); + + DictionaryKeysExtractor block_keys_extractor(saved_block_key_columns, arena_holder.getComplexKeyArena()); + auto keys_extracted_from_block = block_keys_extractor.extractAllKeys(); + + absl::flat_hash_map, DefaultHash> update_keys; + for (size_t row = 0; row < keys_extracted_from_block.size(); ++row) { - const auto id = update_id_column.get64(row); - update_ids[id].push_back(row); + auto key = keys_extracted_from_block[row]; + update_keys[key].push_back(row); } - const size_t saved_rows = saved_id_column.size(); - IColumn::Filter filter(saved_rows); - std::unordered_map>::iterator it; + IColumn::Filter filter(saved_keys_extracted_from_block.size()); - for (size_t row = 0; row < saved_id_column.size(); ++row) + for (size_t row = 0; row < saved_keys_extracted_from_block.size(); ++row) { - auto id = saved_id_column.get64(row); - it = update_ids.find(id); - - if (it != 
update_ids.end()) - filter[row] = 0; - else - filter[row] = 1; + auto key = saved_keys_extracted_from_block[row]; + auto it = update_keys.find(key); + filter[row] = (it == update_keys.end()); } auto block_columns = block.mutateColumns(); @@ -323,12 +420,12 @@ void HashedDictionary::updateData() { auto & column = saved_block->safeGetByPosition(attribute_idx).column; const auto & filtered_column = column->filter(filter, -1); - block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size()); } saved_block->setColumns(std::move(block_columns)); } + stream->readSuffix(); } @@ -339,48 +436,154 @@ void HashedDictionary::updateData() } } -template -void HashedDictionary::resize(Attribute & attribute, size_t added_rows) +template +void HashedDictionary::blockToAttributes(const Block & block [[maybe_unused]]) { - if (!sparse) + size_t skip_keys_size_offset = dict_struct.getKeysSize(); + + Columns key_columns; + key_columns.reserve(skip_keys_size_offset); + + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) + key_columns.emplace_back(block.safeGetByPosition(i).column); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor keys_extractor(key_columns, arena_holder.getComplexKeyArena()); + const size_t keys_size = keys_extractor.getKeysSize(); + + Field column_value_to_insert; + + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { - const auto & map_ref = std::get>(attribute.maps); - added_rows += map_ref->size(); - map_ref->reserve(added_rows); - } - else - { - const auto & map_ref = std::get>(attribute.sparse_maps); - added_rows += map_ref->size(); - map_ref->resize(added_rows); + const IColumn & attribute_column = *block.safeGetByPosition(skip_keys_size_offset + attribute_index).column; + auto & attribute = attributes[attribute_index]; + bool attribute_is_nullable = attribute.is_nullable_set.has_value(); + + 
getAttributeContainer(attribute_index, [&](auto & container) + { + using ContainerType = std::decay_t; + using AttributeValueType = typename ContainerType::mapped_type; + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys_extractor.extractCurrentKey(); + + auto it = container.find(key); + bool key_is_nullable_and_already_exists = attribute_is_nullable && attribute.is_nullable_set->find(key) != nullptr; + + if (key_is_nullable_and_already_exists || it != container.end()) + { + keys_extractor.rollbackCurrentKey(); + continue; + } + + if constexpr (std::is_same_v) + key = copyKeyInArena(key); + + attribute_column.get(key_index, column_value_to_insert); + + if (attribute.is_nullable_set && column_value_to_insert.isNull()) + { + attribute.is_nullable_set->insert(key); + keys_extractor.rollbackCurrentKey(); + continue; + } + + if constexpr (std::is_same_v) + { + String & value_to_insert = column_value_to_insert.get(); + size_t value_to_insert_size = value_to_insert.size(); + + const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size); + + StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size}; + container.insert({key, string_in_arena_reference}); + } + else + { + auto value_to_insert = column_value_to_insert.get>(); + container.insert({key, value_to_insert}); + } + + ++element_count; + + keys_extractor.rollbackCurrentKey(); + } + + keys_extractor.reset(); + }); } } -template <> -void HashedDictionary::resize(Attribute & attribute, size_t added_rows) +template +void HashedDictionary::resize(size_t added_rows) { - resize(attribute, added_rows); -} - -void HashedDictionary::resize(size_t added_rows) -{ - if (!added_rows) + if (unlikely(!added_rows)) return; - for (auto & attribute : attributes) + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { - auto type_call = [&](const auto & dictionary_attribute_type) + 
getAttributeContainer(attribute_index, [added_rows](auto & attribute_map) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - resize(attribute, added_rows); - }; + size_t reserve_size = added_rows + attribute_map.size(); - callOnDictionaryAttributeType(attribute.type, type_call); + if constexpr (sparse) + attribute_map.resize(reserve_size); + else + attribute_map.reserve(reserve_size); + }); } } -void HashedDictionary::loadData() +template +template +void HashedDictionary::getItemsImpl( + const Attribute & attribute, + DictionaryKeysExtractor & keys_extractor, + ValueSetter && set_value [[maybe_unused]], + NullableValueSetter && set_nullable_value [[maybe_unused]], + DefaultValueExtractor & default_value_extractor) const +{ + const auto & attribute_container = std::get>(attribute.container); + const size_t keys_size = keys_extractor.getKeysSize(); + + bool is_attribute_nullable = attribute.is_nullable_set.has_value(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys_extractor.extractCurrentKey(); + + const auto it = attribute_container.find(key); + + if (it != attribute_container.end()) + set_value(key_index, getValueFromCell(it)); + else + { + if (is_attribute_nullable && attribute.is_nullable_set->find(key) != nullptr) + set_nullable_value(key_index); + else + set_value(key_index, default_value_extractor[key_index]); + } + + keys_extractor.rollbackCurrentKey(); + } + + query_count.fetch_add(keys_size, std::memory_order_relaxed); +} + +template +StringRef HashedDictionary::copyKeyInArena(StringRef key) +{ + size_t key_size = key.size; + char * place_for_key = complex_key_arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); + StringRef updated_key{place_for_key, key_size}; + return updated_key; +} + +template +void HashedDictionary::loadData() { if (!source_ptr->hasUpdateField()) { @@ -400,263 +603,116 @@ void HashedDictionary::loadData() 
updateData(); if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; + throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, + "({}): dictionary source is empty and 'require_nonempty' property is set.", + full_name); } -template -void HashedDictionary::addAttributeSize(const Attribute & attribute) -{ - if (!sparse) - { - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(CollectionType) + map_ref->getBufferSizeInBytes(); - bucket_count = map_ref->getBufferSizeInCells(); - } - else - { - const auto & map_ref = std::get>(attribute.sparse_maps); - bucket_count = map_ref->bucket_count(); - - /** TODO: more accurate calculation */ - bytes_allocated += sizeof(SparseCollectionType); - bytes_allocated += bucket_count; - bytes_allocated += map_ref->size() * (sizeof(Key) + sizeof(T)); - } -} - -template <> -void HashedDictionary::addAttributeSize(const Attribute & attribute) -{ - addAttributeSize(attribute); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); -} - -void HashedDictionary::calculateBytesAllocated() +template +void HashedDictionary::calculateBytesAllocated() { bytes_allocated += attributes.size() * sizeof(attributes.front()); - for (const auto & attribute : attributes) + for (size_t i = 0; i < attributes.size(); ++i) { - auto type_call = [&](const auto & dictionary_attribute_type) + getAttributeContainer(i, [&](const auto & container) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - addAttributeSize(attribute); - }; + using ContainerType = std::decay_t; + using AttributeValueType = typename ContainerType::mapped_type; - callOnDictionaryAttributeType(attribute.type, type_call); - } -} + bytes_allocated += sizeof(container); -template -void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.null_values = 
T(null_value.get()); - if (!sparse) - attribute.maps = std::make_unique>(); - else - attribute.sparse_maps = std::make_unique>(); -} - -template <> -void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); - - if (!sparse) - attribute.maps = std::make_unique>(); - else - attribute.sparse_maps = std::make_unique>(); -} - -HashedDictionary::Attribute HashedDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value) -{ - auto nullable_set = attribute.is_nullable ? std::make_optional() : std::optional{}; - Attribute attr{attribute.underlying_type, std::move(nullable_set), {}, {}, {}, {}}; - - auto type_call = [&, this](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; -} - - -template -void HashedDictionary::getItemsAttrImpl( - const MapType & attr, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto rows = ext::size(ids); - - for (const auto i : ext::range(0, rows)) - { - const auto it = attr.find(ids[i]); - set_value(i, it != attr.end() ? 
static_cast(second(*it)) : default_value_extractor[i]); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -template -void HashedDictionary::getItemsImpl( - const Attribute & attribute, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - if (!sparse) - return getItemsAttrImpl(*std::get>(attribute.maps), ids, set_value, default_value_extractor); - return getItemsAttrImpl(*std::get>(attribute.sparse_maps), ids, set_value, default_value_extractor); -} - - -template -bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T value) -{ - if (!sparse) - { - auto & map = *std::get>(attribute.maps); - return map.insert({id, value}).second; - } - else - { - auto & map = *std::get>(attribute.sparse_maps); - return map.insert({id, value}).second; - } -} - -template <> -bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const String value) -{ - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - return setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()}); -} - -bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) -{ - bool result = false; - - auto type_call = [&, this](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if (attribute.nullable_set) - { - if (value.isNull()) + if constexpr (sparse || std::is_same_v) { - result = attribute.nullable_set->insert(id).second; - return; + bytes_allocated += container.max_size() * (sizeof(KeyType) + sizeof(AttributeValueType)); + bucket_count = container.bucket_count(); } else { - attribute.nullable_set->erase(id); + bytes_allocated += container.getBufferSizeInBytes(); + bucket_count = container.getBufferSizeInCells(); } - } + }); - result = setAttributeValueImpl(attribute, id, value.get()); - }; - - 
callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -const HashedDictionary::Attribute & HashedDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -template -void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto & attr = *std::get>(attribute.maps); - const auto rows = ext::size(ids); - - for (const auto i : ext::range(0, rows)) - { - out[i] = attr.find(ids[i]) != nullptr; - - if (attribute.nullable_set && !out[i]) - out[i] = attribute.nullable_set->find(ids[i]) != nullptr; + if (attributes[i].string_arena) + bytes_allocated += attributes[i].string_arena->size(); } + + bytes_allocated += complex_key_arena.size(); } -template <> -void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const +template +BlockInputStreamPtr HashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - has(attribute, ids, out); + PaddedPODArray keys; + + if (!attributes.empty()) + { + const auto & attribute = attributes.front(); + + getAttributeContainer(0, [&](auto & container) + { + keys.reserve(container.size()); + + for (const auto & [key, value] : container) + { + (void)(value); + keys.emplace_back(key); + } + + if (attribute.is_nullable_set) + { + const auto & is_nullable_set = *attribute.is_nullable_set; + keys.reserve(is_nullable_set.size()); + + for (auto & node : is_nullable_set) + keys.emplace_back(node.getKey()); + } + }); + } + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return std::make_shared(shared_from_this(), max_block_size, std::move(keys), column_names); + else + return 
std::make_shared(shared_from_this(), max_block_size, keys, column_names); } -template -PaddedPODArray HashedDictionary::getIdsAttrImpl(const AttrType & attr) const +template +template +void HashedDictionary::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) { - PaddedPODArray ids; - ids.reserve(attr.size()); - for (const auto & value : attr) - ids.push_back(first(value)); + assert(attribute_index < attributes.size()); - return ids; -} -template -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const -{ - if (!sparse) - return getIdsAttrImpl(*std::get>(attribute.maps)); - return getIdsAttrImpl(*std::get>(attribute.sparse_maps)); -} - -template <> -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const -{ - return getIds(attribute); -} - -PaddedPODArray HashedDictionary::getIds() const -{ - const auto & attribute = attributes.front(); - PaddedPODArray result; + auto & attribute = attributes[attribute_index]; auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; - /// TODO: Check if order is satisfied - result = getIds(attribute); + using ValueType = DictionaryValueType; - if (attribute.nullable_set) - { - for (const auto& value: *attribute.nullable_set) - result.push_back(value.getKey()); - } + auto & attribute_container = std::get>(attribute.container); + std::forward(get_container_func)(attribute_container); }; callOnDictionaryAttributeType(attribute.type, type_call); - - return result; } -BlockInputStreamPtr HashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const +template +template +void HashedDictionary::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const { - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); + const_cast 
*>(this)->getAttributeContainer(attribute_index, [&](auto & attribute_container) + { + std::forward(get_container_func)(attribute_container); + }); } +template class HashedDictionary; +template class HashedDictionary; +template class HashedDictionary; +template class HashedDictionary; + void registerDictionaryHashed(DictionaryFactory & factory) { auto create_layout = [](const std::string & full_name, @@ -664,10 +720,13 @@ void registerDictionaryHashed(DictionaryFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, DictionarySourcePtr source_ptr, + DictionaryKeyType dictionary_key_type, bool sparse) -> DictionaryPtr { - if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'hashed'", ErrorCodes::UNSUPPORTED_METHOD}; + if (dictionary_key_type == DictionaryKeyType::simple && dict_struct.key) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for simple key hashed dictionary"); + else if (dictionary_key_type == DictionaryKeyType::complex && dict_struct.id) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for complex key hashed dictionary"); if (dict_struct.range_min || dict_struct.range_max) throw Exception{full_name @@ -678,13 +737,34 @@ void registerDictionaryHashed(DictionaryFactory & factory) const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty, sparse); + + if (dictionary_key_type == DictionaryKeyType::simple) + { + if (sparse) + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + else + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + } 
+ else + { + if (sparse) + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + else + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + } }; + using namespace std::placeholders; + factory.registerLayout("hashed", - [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), /* sparse = */ false); }, false); + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::simple, /* sparse = */ false); }, false); factory.registerLayout("sparse_hashed", - [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), /* sparse = */ true); }, false); + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::simple, /* sparse = */ true); }, false); + factory.registerLayout("complex_key_hashed", + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::complex, /* sparse = */ false); }, true); + factory.registerLayout("complex_key_sparse_hashed", + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::complex, /* sparse = */ true); }, true); + } } diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index ab37f1528ca..3882b669324 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -4,17 +4,21 @@ #include #include #include -#include -#include -#include -#include -#include + #include #include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" + +#include +#include +#include + +#include 
+#include + +#include +#include +#include +#include /** This dictionary stores all content in a hash table in memory * (a separate Key -> Value map for each attribute) @@ -24,19 +28,32 @@ namespace DB { +template class HashedDictionary final : public IDictionary { public: + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by hashed dictionary"); + HashedDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, - bool sparse_, BlockPtr saved_block_ = nullptr); - std::string getTypeName() const override { return sparse ? "SparseHashed" : "Hashed"; } + std::string getTypeName() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple && sparse) + return "SparseHashed"; + else if constexpr (dictionary_key_type == DictionaryKeyType::simple && !sparse) + return "Hashed"; + else if constexpr (dictionary_key_type == DictionaryKeyType::complex && sparse) + return "ComplexKeySpareseHashed"; + else + return "ComplexKeyHashed"; + } size_t getBytesAllocated() const override { return bytes_allocated; } @@ -50,7 +67,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block); + return std::make_shared>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -61,14 +78,10 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void 
toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } + DictionaryKeyType getKeyType() const override { return dictionary_key_type; } ColumnPtr getColumn( const std::string& attribute_name, @@ -79,36 +92,52 @@ public: ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; + bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & hierarchy_attribute_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + + ColumnPtr getDescendants( + ColumnPtr key_column, + const DataTypePtr & key_type, + size_t level) const override; BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: template - using CollectionType = HashMap; - template - using CollectionPtrType = std::unique_ptr>; + using CollectionTypeNonSparse = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + HashMap, + HashMapWithSavedHash>>; #if !defined(ARCADIA_BUILD) - template - using SparseCollectionType = google::sparse_hash_map>; + template + using SparseHashMap = google::sparse_hash_map>; #else - template - using SparseCollectionType = google::sparsehash::sparse_hash_map>; + template + using SparseHashMap = google::sparsehash::sparse_hash_map>; 
#endif template - using SparseCollectionPtrType = std::unique_ptr>; + using CollectionTypeSparse = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + SparseHashMap, + SparseHashMap>; - using NullableSet = HashSet>; + template + using CollectionType = std::conditional_t, CollectionTypeNonSparse>; + + using NullableSet = HashSet>; struct Attribute final { AttributeUnderlyingType type; - std::optional nullable_set; + std::optional is_nullable_set; std::variant< UInt8, @@ -127,41 +156,27 @@ private: Float64, StringRef> null_values; + std::variant< - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType> - maps; - std::variant< - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType> - sparse_maps; + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType> + container; + std::unique_ptr string_arena; + }; void createAttributes(); @@ -172,76 +187,47 @@ private: void loadData(); - template - void addAttributeSize(const Attribute & attribute); - void calculateBytesAllocated(); - template - void createAttributeImpl(Attribute & attribute, const Field & null_value); - - Attribute createAttribute(const DictionaryAttribute& attribute, const Field & 
null_value); - - template - void getItemsAttrImpl( - const MapType & attr, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template + template void getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + DictionaryKeysExtractor & keys_extractor, ValueSetter && set_value, + NullableValueSetter && set_nullable_value, DefaultValueExtractor & default_value_extractor) const; - template - bool setAttributeValueImpl(Attribute & attribute, const Key id, const T value); + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func); - bool setAttributeValue(Attribute & attribute, const Key id, const Field & value); + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const; - const Attribute & getAttribute(const std::string & attribute_name) const; - - template - void has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const; - - template - PaddedPODArray getIdsAttrImpl(const AttrType & attr) const; - template - PaddedPODArray getIds(const Attribute & attribute) const; - - PaddedPODArray getIds() const; - - /// Preallocates the hashtable based on query progress - /// (Only while loading all data). 
- /// - /// @see preallocate - template - void resize(Attribute & attribute, size_t added_rows); void resize(size_t added_rows); - template - void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - template - void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; + StringRef copyKeyInArena(StringRef key); const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; const bool require_nonempty; - const bool sparse; - std::map attribute_index_by_name; std::vector attributes; - const Attribute * hierarchical_attribute = nullptr; size_t bytes_allocated = 0; size_t element_count = 0; size_t bucket_count = 0; mutable std::atomic query_count{0}; + /// TODO: Remove BlockPtr saved_block; + Arena complex_key_arena; }; +extern template class HashedDictionary; +extern template class HashedDictionary; + +extern template class HashedDictionary; +extern template class HashedDictionary; + } diff --git a/src/Dictionaries/HierarchyDictionariesUtils.cpp b/src/Dictionaries/HierarchyDictionariesUtils.cpp new file mode 100644 index 00000000000..fffe0d30e0e --- /dev/null +++ b/src/Dictionaries/HierarchyDictionariesUtils.cpp @@ -0,0 +1,156 @@ +#include "HierarchyDictionariesUtils.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNSUPPORTED_METHOD; +} + +namespace +{ + /** In case of cache or direct dictionary we does not have structure with child to parent representation. + * This function build such structure calling getColumn for initial keys to request and for next keys in hierarchy, + * until all keys are requested or result key is null value. + * To distinguish null value key and key that is not present in dictionary, we use special default value column + * with max UInt64 value, if result column key has such value we assume that current key is not presented in dictionary storage. 
+ */ + HashMap getChildToParentHierarchyMapImpl( + const IDictionary * dictionary, + const DictionaryAttribute & hierarchical_attribute, + const PaddedPODArray & initial_keys_to_request, + const DataTypePtr & key_type) + { + UInt64 null_value = hierarchical_attribute.null_value.get(); + + ColumnPtr key_to_request_column = ColumnVector::create(); + auto * key_to_request_column_typed = static_cast *>(key_to_request_column->assumeMutable().get()); + + UInt64 key_not_in_storage_value = std::numeric_limits::max(); + ColumnPtr key_not_in_storage_default_value_column = ColumnVector::create(initial_keys_to_request.size(), key_not_in_storage_value); + + PaddedPODArray & keys_to_request = key_to_request_column_typed->getData(); + keys_to_request.assign(initial_keys_to_request); + + PaddedPODArray next_keys_to_request; + HashSet already_requested_keys; + + HashMap child_to_parent_key; + + while (!keys_to_request.empty()) + { + child_to_parent_key.reserve(child_to_parent_key.size() + keys_to_request.size()); + + auto parent_key_column = dictionary->getColumn( + hierarchical_attribute.name, + hierarchical_attribute.type, + {key_to_request_column}, + {key_type}, + key_not_in_storage_default_value_column); + + const auto * parent_key_column_typed = checkAndGetColumn>(*parent_key_column); + if (!parent_key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Parent key column should be UInt64. 
Actual ({})", + hierarchical_attribute.type->getName()); + + const auto & parent_keys = parent_key_column_typed->getData(); + next_keys_to_request.clear(); + + for (size_t i = 0; i < keys_to_request.size(); ++i) + { + auto key = keys_to_request[i]; + auto parent_key = parent_keys[i]; + + if (parent_key == key_not_in_storage_value) + continue; + + child_to_parent_key[key] = parent_key; + + if (parent_key == null_value || + already_requested_keys.find(parent_key) != nullptr) + continue; + + already_requested_keys.insert(parent_key); + next_keys_to_request.emplace_back(parent_key); + } + + keys_to_request.clear(); + keys_to_request.assign(next_keys_to_request); + } + + return child_to_parent_key; + } +} + +ColumnPtr getKeysHierarchyDefaultImplementation(const IDictionary * dictionary, ColumnPtr key_column, const DataTypePtr & key_type) +{ + key_column = key_column->convertToFullColumnIfConst(); + const auto * key_column_typed = checkAndGetColumn>(*key_column); + if (!key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto & dictionary_structure = dictionary->getStructure(); + size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index; + const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index]; + + const PaddedPODArray & requested_keys = key_column_typed->getData(); + HashMap key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type); + + auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto & key) + { + auto it = key_to_parent_key.find(key); + std::optional result = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return result; + }; + + UInt64 null_value = hierarchical_attribute.null_value.get(); + + auto dictionary_hierarchy_array = getKeysHierarchyArray(requested_keys, null_value, is_key_valid_func, get_parent_key_func); + return dictionary_hierarchy_array; +} + +ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) +{ + key_column = key_column->convertToFullColumnIfConst(); + in_key_column = in_key_column->convertToFullColumnIfConst(); + + const auto * key_column_typed = checkAndGetColumn>(*key_column); + if (!key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto * in_key_column_typed = checkAndGetColumn>(*in_key_column); + if (!in_key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto & dictionary_structure = dictionary->getStructure(); + size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index; + const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index]; + + const PaddedPODArray & requested_keys = key_column_typed->getData(); + HashMap key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type); + + auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto & key) + { + auto it = key_to_parent_key.find(key); + std::optional result = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return result; + }; + + UInt64 null_value = hierarchical_attribute.null_value.get(); + const auto & in_keys = in_key_column_typed->getData(); + + auto result = getKeysIsInHierarchyColumn(requested_keys, in_keys, null_value, is_key_valid_func, get_parent_key_func); + return result; +} + +} diff --git a/src/Dictionaries/HierarchyDictionariesUtils.h b/src/Dictionaries/HierarchyDictionariesUtils.h new file mode 100644 index 00000000000..8b2fe6ef08e --- /dev/null +++ b/src/Dictionaries/HierarchyDictionariesUtils.h @@ -0,0 +1,467 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + +namespace detail +{ + template + struct ElementsAndOffsets + { + PaddedPODArray elements; + PaddedPODArray offsets; + }; + + template + struct IsKeyValidFuncInterface + { + bool operator()(T key [[maybe_unused]]) { return false; } + }; + + template + struct GetParentKeyFuncInterface + { + std::optional operator()(T key [[maybe_unused]]) { return {}; } + }; + + /** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client. + * Hierarchy iteration is stopped if key equals null value, get_parent_key_func returns null optional, or hierarchy depth + * greater or equal than DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH. + * IsKeyValidFunc used for each input hierarchy key, if it returns false result hierarchy for that key will have size 0. + * Hierarchy result is ElementsAndOffsets structure, for each element there is hierarchy array, + * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). + * + * Example: + * id parent_id + * 1 0 + * 2 1 + * 3 1 + * 4 2 + * + * If hierarchy_null_value will be 0. Requested keys [1, 2, 3, 4, 5]. 
+ * Result: [1], [2, 1], [3, 1], [4, 2, 1], [] + * Elements: [1, 2, 1, 3, 1, 4, 2, 1] + * Offsets: [1, 3, 5, 8, 8] + */ + template + ElementsAndOffsets getHierarchy( + const PaddedPODArray & keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_key_func) + { + size_t hierarchy_keys_size = keys.size(); + + PaddedPODArray elements; + elements.reserve(hierarchy_keys_size); + + PaddedPODArray offsets; + offsets.reserve(hierarchy_keys_size); + + struct OffsetInArray + { + size_t offset_index; + size_t array_element_offset; + }; + + HashMap already_processes_keys_to_offset; + already_processes_keys_to_offset.reserve(hierarchy_keys_size); + + for (size_t i = 0; i < hierarchy_keys_size; ++i) + { + auto hierarchy_key = keys[i]; + size_t current_hierarchy_depth = 0; + + bool is_key_valid = std::forward(is_key_valid_func)(hierarchy_key); + + if (!is_key_valid) + { + offsets.emplace_back(elements.size()); + continue; + } + + while (true) + { + const auto * it = already_processes_keys_to_offset.find(hierarchy_key); + + if (it) + { + const auto & index = it->getMapped(); + + size_t offset = index.offset_index; + + bool is_loop = (offset == offsets.size()); + + if (unlikely(is_loop)) + break; + + size_t array_element_offset = index.array_element_offset; + + size_t previous_offset_size = offset > 0 ? 
offsets[offset - 1] : 0; + size_t start_index = previous_offset_size + array_element_offset; + size_t end_index = offsets[offset]; + + elements.insertFromItself(elements.begin() + start_index, elements.begin() + end_index); + break; + } + + if (hierarchy_key == hierarchy_null_value || current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) + break; + + already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth}; + elements.emplace_back(hierarchy_key); + ++current_hierarchy_depth; + + std::optional parent_key = std::forward(get_parent_key_func)(hierarchy_key); + + if (!parent_key.has_value()) + break; + + hierarchy_key = *parent_key; + } + + offsets.emplace_back(elements.size()); + } + + ElementsAndOffsets result = {std::move(elements), std::move(offsets)}; + + return result; + } + + /** Returns array with UInt8 represent if key from in_keys array is in hierarchy of key from keys column. + * If value in result array is 1 that means key from in_keys array is in hierarchy of key from + * keys array with same index, 0 therwise. + * For getting hierarchy implementation uses getKeysHierarchy function. + * + * Not: keys size must be equal to in_keys_size. + */ + template + PaddedPODArray getIsInHierarchy( + const PaddedPODArray & keys, + const PaddedPODArray & in_keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) + { + assert(keys.size() == in_keys.size()); + + PaddedPODArray result; + result.resize_fill(keys.size()); + + detail::ElementsAndOffsets hierarchy = detail::getHierarchy( + keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + std::forward(get_parent_func)); + + auto & offsets = hierarchy.offsets; + auto & elements = hierarchy.elements; + + for (size_t i = 0; i < offsets.size(); ++i) + { + size_t i_elements_start = i > 0 ? 
offsets[i - 1] : 0; + size_t i_elements_end = offsets[i]; + + auto & key_to_find = in_keys[i]; + + const auto * begin = elements.begin() + i_elements_start; + const auto * end = elements.begin() + i_elements_end; + + const auto * it = std::find(begin, end, key_to_find); + + bool contains_key = (it != end); + result[i] = contains_key; + } + + return result; + } + + struct GetAllDescendantsStrategy { size_t level = 0; }; + struct GetDescendantsAtSpecificLevelStrategy { size_t level = 0; }; + + /** Get descendants for keys iterating the hierarchy from parent to child using parent_to_child hash map provided by client. + * GetAllDescendantsStrategy get all descendants for key + * GetDescendantsAtSpecificLevelStrategy get descendants only for specific hierarchy level. + * Hierarchy result is ElementsAndOffsets structure, for each element there is descendants array, + * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). + * + * Example: + * id parent_id + * 1 0 + * 2 1 + * 3 1 + * 4 2 + * + * Example. Strategy GetAllDescendantsStrategy. + * Requested keys [0, 1, 2, 3, 4]. + * Result: [1, 2, 3, 4], [2, 2, 4], [4], [], [] + * Elements: [1, 2, 3, 4, 2, 3, 4, 4] + * Offsets: [4, 7, 8, 8, 8] + * + * Example. Strategy GetDescendantsAtSpecificLevelStrategy with level 1. + * Requested keys [0, 1, 2, 3, 4]. + * Result: [1], [2, 3], [4], [], []; + * Offsets: [1, 3, 4, 4, 4]; + */ + template + ElementsAndOffsets getDescendants( + const PaddedPODArray & keys, + const HashMap> & parent_to_child, + Strategy strategy) + { + /// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants. + /// If strategy is GetDescendantsAtSpecificLevelStrategy we does not use cache strategy. 
+ size_t keys_size = keys.size(); + + PaddedPODArray descendants; + descendants.reserve(keys_size); + + PaddedPODArray descendants_offsets; + descendants_offsets.reserve(keys_size); + + struct Range + { + size_t start_index; + size_t end_index; + }; + + static constexpr Int64 key_range_requires_update = -1; + HashMap already_processed_keys_to_range [[maybe_unused]]; + + if constexpr (std::is_same_v) + already_processed_keys_to_range.reserve(keys_size); + + struct KeyAndDepth + { + KeyType key; + Int64 depth; + }; + + HashSet already_processed_keys_during_loop; + already_processed_keys_during_loop.reserve(keys_size); + + PaddedPODArray next_keys_to_process_stack; + next_keys_to_process_stack.reserve(keys_size); + + Int64 level = static_cast(strategy.level); + + for (size_t i = 0; i < keys_size; ++i) + { + const KeyType & requested_key = keys[i]; + + if (parent_to_child.find(requested_key) == nullptr) + { + descendants_offsets.emplace_back(descendants.size()); + continue; + } + + next_keys_to_process_stack.emplace_back(KeyAndDepth{requested_key, 0}); + + /** To cache range for key without recursive function calls and custom stack we put special + * signaling value on stack key_range_requires_update. + * When we pop such value from stack that means processing descendants for key is finished + * and we can update range with end_index. 
+ */ + while (!next_keys_to_process_stack.empty()) + { + KeyAndDepth key_to_process = next_keys_to_process_stack.back(); + + KeyType key = key_to_process.key; + Int64 depth = key_to_process.depth; + next_keys_to_process_stack.pop_back(); + + if constexpr (std::is_same_v) + { + /// Update end_index for key + if (depth == key_range_requires_update) + { + auto * it = already_processed_keys_to_range.find(key); + assert(it); + + auto & range_to_update = it->getMapped(); + range_to_update.end_index = descendants.size(); + continue; + } + } + + if (unlikely(already_processed_keys_during_loop.find(key) != nullptr)) + { + next_keys_to_process_stack.clear(); + break; + } + + if constexpr (std::is_same_v) + { + const auto * already_processed_it = already_processed_keys_to_range.find(key); + + if (already_processed_it) + { + Range range = already_processed_it->getMapped(); + + if (unlikely(range.start_index > range.end_index)) + { + /// Broken range because there was loop + already_processed_keys_to_range.erase(key); + } + else + { + auto insert_start_iterator = descendants.begin() + range.start_index; + auto insert_end_iterator = descendants.begin() + range.end_index; + descendants.insertFromItself(insert_start_iterator, insert_end_iterator); + continue; + } + } + } + + const auto * it = parent_to_child.find(key); + + if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) + continue; + + if constexpr (std::is_same_v) + { + if (depth > level) + continue; + } + + if constexpr (std::is_same_v) + { + /// Put special signaling value on stack and update cache with range start + size_t range_start_index = descendants.size(); + already_processed_keys_to_range[key].start_index = range_start_index; + next_keys_to_process_stack.emplace_back(KeyAndDepth{key, key_range_requires_update}); + } + + already_processed_keys_during_loop.insert(key); + + ++depth; + + const auto & children = it->getMapped(); + + for (auto child_key : children) + { + /// In case of GetAllDescendantsStrategy we 
add any descendant to result array + /// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level + if (std::is_same_v || depth == level) + descendants.emplace_back(child_key); + + next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth}); + } + } + + already_processed_keys_during_loop.clear(); + + descendants_offsets.emplace_back(descendants.size()); + } + + ElementsAndOffsets result = {std::move(descendants), std::move(descendants_offsets)}; + return result; + } + + /// Converts ElementAndOffsets structure into ArrayColumn + template + ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets) + { + auto elements_column = ColumnVector::create(); + elements_column->getData() = std::move(elements_and_offsets.elements); + + auto offsets_column = ColumnVector::create(); + offsets_column->getData() = std::move(elements_and_offsets.offsets); + + auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column)); + + return column_array; + } +} + +/// Returns hierarchy array column for keys +template +ColumnPtr getKeysHierarchyArray( + const PaddedPODArray & keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) +{ + auto elements_and_offsets = detail::getHierarchy( + keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + std::forward(get_parent_func)); + + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); +} + +/// Returns is in hierarchy column for keys +template +ColumnUInt8::Ptr getKeysIsInHierarchyColumn( + const PaddedPODArray & hierarchy_keys, + const PaddedPODArray & hierarchy_in_keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) +{ + auto is_in_hierarchy_data = detail::getIsInHierarchy( + hierarchy_keys, + hierarchy_in_keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + 
std::forward(get_parent_func)); + + auto result = ColumnUInt8::create(); + result->getData() = std::move(is_in_hierarchy_data); + + return result; +} + +/// Returns descendants array column for keys +template +ColumnPtr getKeysDescendantsArray( + const PaddedPODArray & requested_keys, + const HashMap> & parent_to_child, + size_t level) +{ + if (level == 0) + { + detail::GetAllDescendantsStrategy strategy { .level = level }; + auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy); + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); + } + else + { + detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level }; + auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy); + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); + } +} + +/** Default getHierarchy implementation for dictionaries that does not have structure with child to parent representation. + * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. + * Returns ColumnArray with hierarchy arrays for keys from key_column. + */ +ColumnPtr getKeysHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + const DataTypePtr & key_type); + +/** Default isInHierarchy implementation for dictionaries that does not have structure with child to parent representation. + * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. + * Returns UInt8 column if key from in_key_column is in key hierarchy from key_column. 
+ */ +ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type); + +} diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index 4d51747a652..a7445312409 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -24,8 +24,8 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -struct IDictionaryBase; -using DictionaryPtr = std::unique_ptr; +struct IDictionary; +using DictionaryPtr = std::unique_ptr; /** DictionaryKeyType provides IDictionary client information about * which key type is supported by dictionary. @@ -47,13 +47,11 @@ enum class DictionaryKeyType /** * Base class for Dictionaries implementation. */ -struct IDictionaryBase : public IExternalLoadable +struct IDictionary : public IExternalLoadable { - using Key = UInt64; - - IDictionaryBase(const StorageID & dict_id_) - : dict_id(dict_id_) - , full_name(dict_id.getInternalDictionaryName()) + explicit IDictionary(const StorageID & dictionary_id_) + : dictionary_id(dictionary_id_) + , full_name(dictionary_id.getInternalDictionaryName()) { } @@ -61,14 +59,14 @@ struct IDictionaryBase : public IExternalLoadable StorageID getDictionaryID() const { std::lock_guard lock{name_mutex}; - return dict_id; + return dictionary_id; } void updateDictionaryName(const StorageID & new_name) const { std::lock_guard lock{name_mutex}; - assert(new_name.uuid == dict_id.uuid && dict_id.uuid != UUIDHelpers::Nil); - dict_id = new_name; + assert(new_name.uuid == dictionary_id.uuid && dictionary_id.uuid != UUIDHelpers::Nil); + dictionary_id = new_name; } const std::string & getLoadableName() const override final { return getFullName(); } @@ -80,8 +78,9 @@ struct IDictionaryBase : public IExternalLoadable std::string getDatabaseOrNoDatabaseTag() const { - if (!dict_id.database_name.empty()) - return dict_id.database_name; + if 
(!dictionary_id.database_name.empty()) + return dictionary_id.database_name; + return NO_DATABASE_TAG; } @@ -159,74 +158,65 @@ struct IDictionaryBase : public IExternalLoadable const Columns & key_columns, const DataTypes & key_types) const = 0; + virtual bool hasHierarchy() const { return false; } + + virtual ColumnPtr getHierarchy( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method getHierarchy is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + + virtual ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method isInHierarchy is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + + virtual ColumnPtr getDescendants( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]], + size_t level [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method getDescendants is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + virtual BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const = 0; bool supportUpdates() const override { return true; } bool isModified() const override { - auto source = getSource(); + const auto * source = getSource(); return source && source->isModified(); } virtual std::exception_ptr getLastException() const { return {}; } - std::shared_ptr shared_from_this() + std::shared_ptr shared_from_this() { - return std::static_pointer_cast(IExternalLoadable::shared_from_this()); + return std::static_pointer_cast(IExternalLoadable::shared_from_this()); } - std::shared_ptr shared_from_this() const + std::shared_ptr shared_from_this() const { - return 
std::static_pointer_cast(IExternalLoadable::shared_from_this()); + return std::static_pointer_cast(IExternalLoadable::shared_from_this()); } private: mutable std::mutex name_mutex; - mutable StorageID dict_id; + mutable StorageID dictionary_id; protected: const String full_name; }; -struct IDictionary : IDictionaryBase -{ - IDictionary(const StorageID & dict_id_) : IDictionaryBase(dict_id_) {} - - virtual bool hasHierarchy() const = 0; - - virtual void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const = 0; - - /// TODO: Rewrite - /// Methods for hierarchy. - - virtual void isInVectorVector( - const PaddedPODArray & /*child_ids*/, const PaddedPODArray & /*ancestor_ids*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - virtual void - isInVectorConstant(const PaddedPODArray & /*child_ids*/, const Key /*ancestor_id*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - virtual void - isInConstantVector(const Key /*child_id*/, const PaddedPODArray & /*ancestor_ids*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - void isInConstantConstant(const Key child_id, const Key ancestor_id, UInt8 & out) const - { - PaddedPODArray out_arr(1); - isInVectorConstant(PaddedPODArray(1, child_id), ancestor_id, out_arr); - out = out_arr[0]; - } -}; - } diff --git a/src/Dictionaries/IPAddressDictionary.cpp b/src/Dictionaries/IPAddressDictionary.cpp index 4b51d94f0d8..d66c285bc42 100644 --- a/src/Dictionaries/IPAddressDictionary.cpp +++ b/src/Dictionaries/IPAddressDictionary.cpp @@ -195,7 +195,7 @@ IPAddressDictionary::IPAddressDictionary( DictionarySourcePtr source_ptr_, const 
DictionaryLifetime dict_lifetime_, bool require_nonempty_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) @@ -804,9 +804,6 @@ static auto keyViewGetter() BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - - const bool is_ipv4 = std::get_if(&ip_column) != nullptr; auto get_keys = [is_ipv4](const Columns & columns, const std::vector & dict_attributes) @@ -827,12 +824,12 @@ BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & colum if (is_ipv4) { auto get_view = keyViewGetter, true>(); - return std::make_shared( + return std::make_shared( shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view)); } auto get_view = keyViewGetter(); - return std::make_shared( + return std::make_shared( shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view)); } diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index dcfb26c3c96..cf79caa75fc 100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -20,7 +20,7 @@ namespace DB { -class IPAddressDictionary final : public IDictionaryBase +class IPAddressDictionary final : public IDictionary { public: IPAddressDictionary( diff --git a/src/Dictionaries/PolygonDictionary.cpp b/src/Dictionaries/PolygonDictionary.cpp index 04559d701c9..7046741b3a0 100644 --- a/src/Dictionaries/PolygonDictionary.cpp +++ b/src/Dictionaries/PolygonDictionary.cpp @@ -30,7 +30,7 @@ IPolygonDictionary::IPolygonDictionary( const DictionaryLifetime dict_lifetime_, InputType input_type_, PointType point_type_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr(std::move(source_ptr_)) , 
dict_lifetime(dict_lifetime_) @@ -142,7 +142,6 @@ ColumnPtr IPolygonDictionary::getColumn( callOnDictionaryAttributeType(attribute.underlying_type, type_call); } - query_count.fetch_add(requested_key_points.size(), std::memory_order_relaxed); return result; diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index b82a8b2928f..5974e6461a7 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -24,7 +24,7 @@ namespace bg = boost::geometry; * An implementation should inherit from this base class and preprocess the data upon construction if needed. * It must override the find method of this class which retrieves the polygon containing a single point. */ -class IPolygonDictionary : public IDictionaryBase +class IPolygonDictionary : public IDictionary { public: /** Controls the different types of polygons allowed as input. diff --git a/src/Dictionaries/RangeDictionaryBlockInputStream.h b/src/Dictionaries/RangeDictionaryBlockInputStream.h index 6531f5cba9d..499eea7152f 100644 --- a/src/Dictionaries/RangeDictionaryBlockInputStream.h +++ b/src/Dictionaries/RangeDictionaryBlockInputStream.h @@ -24,7 +24,7 @@ public: using Key = UInt64; RangeDictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, size_t max_block_size, const Names & column_names, PaddedPODArray && ids_to_fill, @@ -49,7 +49,7 @@ private: const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; - std::shared_ptr dictionary; + std::shared_ptr dictionary; NameSet column_names; PaddedPODArray ids; PaddedPODArray start_dates; @@ -59,7 +59,7 @@ private: template RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( - std::shared_ptr dictionary_, + std::shared_ptr dictionary_, size_t max_block_size_, const Names & column_names_, PaddedPODArray && ids_, diff --git a/src/Dictionaries/RangeHashedDictionary.cpp b/src/Dictionaries/RangeHashedDictionary.cpp index 
4196d6ebd72..30395114a8e 100644 --- a/src/Dictionaries/RangeHashedDictionary.cpp +++ b/src/Dictionaries/RangeHashedDictionary.cpp @@ -76,7 +76,7 @@ RangeHashedDictionary::RangeHashedDictionary( DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) @@ -185,10 +185,10 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, con auto range_column_storage_type = std::make_shared(); auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type); - PaddedPODArray key_backup_storage; + PaddedPODArray key_backup_storage; PaddedPODArray range_backup_storage; - const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); + const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); const PaddedPODArray & dates = getColumnVectorData(this, range_column_updated, range_backup_storage); const auto & attribute = attributes.front(); @@ -213,7 +213,7 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, con template ColumnUInt8::Ptr RangeHashedDictionary::hasKeysImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, const PaddedPODArray & dates) const { auto result = ColumnUInt8::create(ids.size()); @@ -388,10 +388,10 @@ void RangeHashedDictionary::getItemsImpl( ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const { - PaddedPODArray key_backup_storage; + PaddedPODArray key_backup_storage; PaddedPODArray range_backup_storage; - const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); + const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); const PaddedPODArray & dates = getColumnVectorData(this, key_columns[1], 
range_backup_storage); const auto & attr = *std::get>(attribute.maps); @@ -436,7 +436,7 @@ void RangeHashedDictionary::getItemsImpl( template -void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value) +void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value) { using ValueType = std::conditional_t, StringRef, T>; auto & map = *std::get>(attribute.maps); @@ -480,7 +480,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const K map.insert({id, Values{std::move(value_to_insert)}}); } -void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value) +void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value) { auto type_call = [&](const auto &dictionary_attribute_type) { @@ -515,7 +515,7 @@ RangeHashedDictionary::getAttributeWithType(const std::string & attribute_name, template void RangeHashedDictionary::getIdsAndDates( - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const { @@ -536,7 +536,7 @@ void RangeHashedDictionary::getIdsAndDates( template void RangeHashedDictionary::getIdsAndDates( const Attribute & attribute, - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const { @@ -567,7 +567,7 @@ void RangeHashedDictionary::getIdsAndDates( template BlockInputStreamPtr RangeHashedDictionary::getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const { - PaddedPODArray ids; + PaddedPODArray ids; PaddedPODArray start_dates; PaddedPODArray end_dates; getIdsAndDates(ids, start_dates, end_dates); diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index f2b24e52dfc..ca2a925df5e 100644 --- 
a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -16,7 +16,7 @@ namespace DB { -class RangeHashedDictionary final : public IDictionaryBase +class RangeHashedDictionary final : public IDictionary { public: RangeHashedDictionary( @@ -160,25 +160,25 @@ private: template ColumnUInt8::Ptr hasKeysImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, const PaddedPODArray & dates) const; template - static void setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value); + static void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value); - static void setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value); + static void setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value); const Attribute & getAttribute(const std::string & attribute_name) const; const Attribute & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const; template - void getIdsAndDates(PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; + void getIdsAndDates(PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; template void getIdsAndDates( const Attribute & attribute, - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; diff --git a/src/Dictionaries/registerDictionaries.cpp b/src/Dictionaries/registerDictionaries.cpp index a7b3c87267d..8d24a6ea979 100644 --- a/src/Dictionaries/registerDictionaries.cpp +++ b/src/Dictionaries/registerDictionaries.cpp @@ -57,7 +57,6 @@ void registerDictionaries() { auto & factory = DictionaryFactory::instance(); registerDictionaryRangeHashed(factory); - registerDictionaryComplexKeyHashed(factory); registerDictionaryTrie(factory); 
registerDictionaryFlat(factory); registerDictionaryHashed(factory); diff --git a/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp b/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp index 20529e91bd3..9fd9dc9b78c 100644 --- a/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp +++ b/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp @@ -1,7 +1,5 @@ #if defined(__linux__) || defined(__FreeBSD__) -#include - #include #include diff --git a/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp b/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp new file mode 100644 index 00000000000..064f57dfe11 --- /dev/null +++ b/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp @@ -0,0 +1,225 @@ +#include + +#include + +#include + +using namespace DB; + +TEST(HierarchyDictionariesUtils, getHierarchy) +{ + { + HashMap child_to_parent; + child_to_parent[1] = 0; + child_to_parent[2] = 1; + child_to_parent[3] = 1; + child_to_parent[4] = 2; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3, 4, 5}; + + auto result = DB::detail::getHierarchy( + keys, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 1, 3, 1, 4, 2, 1}; + PaddedPODArray expected_offsets = {1, 3, 5, 8, 8}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + HashMap child_to_parent; + child_to_parent[1] = 2; + child_to_parent[2] = 1; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3}; + + auto result = DB::detail::getHierarchy( + keys, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 2}; + PaddedPODArray expected_offsets = {2, 3, 3}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } +} + +TEST(HierarchyDictionariesUtils, getIsInHierarchy) +{ + { + HashMap child_to_parent; + child_to_parent[1] = 0; + child_to_parent[2] = 1; + child_to_parent[3] = 1; + child_to_parent[4] = 2; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3, 4, 5}; + PaddedPODArray keys_in = {1, 1, 1, 2, 5}; + + PaddedPODArray actual = DB::detail::getIsInHierarchy( + keys, + keys_in, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + PaddedPODArray expected = {1,1,1,1,0}; + + ASSERT_EQ(actual, expected); + } + { + HashMap child_to_parent; + child_to_parent[1] = 2; + child_to_parent[2] = 1; + + auto is_key_valid_func = [&](auto key) + { + return child_to_parent.find(key) != nullptr; + }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3}; + PaddedPODArray keys_in = {1, 2, 3}; + + PaddedPODArray actual = DB::detail::getIsInHierarchy( + keys, + keys_in, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + PaddedPODArray expected = {1, 1, 0}; + ASSERT_EQ(actual, expected); + } +} + +TEST(HierarchyDictionariesUtils, getDescendants) +{ + { + HashMap> parent_to_child; + parent_to_child[0].emplace_back(1); + parent_to_child[1].emplace_back(2); + parent_to_child[1].emplace_back(3); + parent_to_child[2].emplace_back(4); + + PaddedPODArray keys = {0, 1, 2, 3, 4}; + + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetAllDescendantsStrategy()); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 3, 4, 2, 3, 4, 4}; + PaddedPODArray expected_offsets = {4, 7, 8, 8, 8}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + 
DB::detail::GetDescendantsAtSpecificLevelStrategy{1}); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 3, 4}; + PaddedPODArray expected_offsets = {1, 3, 4, 4, 4}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + } + { + HashMap> parent_to_child; + parent_to_child[1].emplace_back(2); + parent_to_child[2].emplace_back(1); + + PaddedPODArray keys = {1, 2, 3}; + + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetAllDescendantsStrategy()); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {2, 1, 1}; + PaddedPODArray expected_offsets = {2, 3, 3}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetDescendantsAtSpecificLevelStrategy{1}); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {2, 1}; + PaddedPODArray expected_offsets = {1, 2, 2}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + } +} diff --git a/src/Dictionaries/ya.make b/src/Dictionaries/ya.make index 4df58211118..dc58d3f0a14 100644 --- a/src/Dictionaries/ya.make +++ b/src/Dictionaries/ya.make @@ -26,7 +26,7 @@ SRCS( CassandraDictionarySource.cpp CassandraHelpers.cpp ClickHouseDictionarySource.cpp - ComplexKeyHashedDictionary.cpp + DictionaryBlockInputStream.cpp DictionaryBlockInputStreamBase.cpp DictionaryFactory.cpp DictionarySourceFactory.cpp @@ -48,6 +48,7 @@ SRCS( FlatDictionary.cpp HTTPDictionarySource.cpp HashedDictionary.cpp + HierarchyDictionariesUtils.cpp IPAddressDictionary.cpp LibraryDictionarySource.cpp LibraryDictionarySourceExternal.cpp 
diff --git a/src/Functions/FunctionsExternalDictionaries.cpp b/src/Functions/FunctionsExternalDictionaries.cpp index f037a3bd808..6b83f761086 100644 --- a/src/Functions/FunctionsExternalDictionaries.cpp +++ b/src/Functions/FunctionsExternalDictionaries.cpp @@ -24,6 +24,8 @@ void registerFunctionsExternalDictionaries(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); @@ -40,6 +42,7 @@ void registerFunctionsExternalDictionaries(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction>(); factory.registerFunction>(); + factory.registerFunction(); } } diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index 2c322698327..1f8ef60b4af 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -28,16 +29,6 @@ #include #include - -#include -#include -#include -#include -#include -#include -#include -#include - #include #include @@ -49,7 +40,6 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int UNSUPPORTED_METHOD; - extern const int UNKNOWN_TYPE; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; @@ -77,7 +67,7 @@ class FunctionDictHelper public: explicit FunctionDictHelper(const Context & context_) : context(context_) {} - std::shared_ptr getDictionary(const String & dictionary_name) + std::shared_ptr getDictionary(const String & dictionary_name) { auto dict = context.getExternalDictionariesLoader().getDictionary(dictionary_name, context); @@ -90,9 +80,13 @@ public: return dict; } - std::shared_ptr getDictionary(const ColumnWithTypeAndName & column) + std::shared_ptr 
getDictionary(const ColumnPtr & column) { - const auto * dict_name_col = checkAndGetColumnConst(column.column.get()); + const auto * dict_name_col = checkAndGetColumnConst(column.get()); + + if (!dict_name_col) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Expected const String column"); + return getDictionary(dict_name_col->getValue()); } @@ -148,7 +142,6 @@ public: String getName() const override { return name; } -private: size_t getNumberOfArguments() const override { return 0; } bool isVariadic() const override { return true; } @@ -187,7 +180,7 @@ private: if (input_rows_count == 0) return result_type->createColumn(); - auto dictionary = helper.getDictionary(arguments[0]); + auto dictionary = helper.getDictionary(arguments[0].column); auto dictionary_key_type = dictionary->getKeyType(); const ColumnWithTypeAndName & key_column_with_type = arguments[1]; @@ -238,6 +231,7 @@ private: return dictionary->hasKeys({key_column, range_col}, {std::make_shared(), range_col_type}); } +private: mutable FunctionDictHelper helper; }; @@ -302,7 +296,7 @@ public: } if (types.size() > 1) - return std::make_shared(types); + return std::make_shared(types, attribute_names); else return types.front(); } @@ -701,6 +695,163 @@ using FunctionDictGetDecimal64OrDefault = FunctionDictGetOrDefault, NameDictGetDecimal128OrDefault>; using FunctionDictGetStringOrDefault = FunctionDictGetOrDefault; +class FunctionDictGetOrNull final : public IFunction +{ +public: + static constexpr auto name = "dictGetOrNull"; + + static FunctionPtr create(const Context &context) + { + return std::make_shared(context); + } + + explicit FunctionDictGetOrNull(const Context & context_) + : dictionary_get_func_impl(context_) + , dictionary_has_func_impl(context_) + {} + + String getName() const override { return name; } + +private: + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + + bool useDefaultImplementationForConstants() const override 
{ return true; } + + bool useDefaultImplementationForNulls() const override { return false; } + + bool isDeterministic() const override { return false; } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + + bool isInjective(const ColumnsWithTypeAndName & sample_columns) const override + { + return dictionary_get_func_impl.isInjective(sample_columns); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + auto result_type = dictionary_get_func_impl.getReturnTypeImpl(arguments); + + WhichDataType result_data_type(result_type); + if (result_data_type.isTuple()) + { + const auto & data_type_tuple = static_cast(*result_type); + auto elements_types_copy = data_type_tuple.getElements(); + for (auto & element_type : elements_types_copy) + element_type = makeNullable(element_type); + + result_type = std::make_shared(elements_types_copy, data_type_tuple.getElementNames()); + } + else + result_type = makeNullable(result_type); + + return result_type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + /** We call dictHas function to get which map is key presented in dictionary. + For key that presented in dictionary dict has result for that key index value will be 1. Otherwise 0. + We invert result, and then for key that is not presented in dictionary value will be 1. Otherwise 0. + This inverted result will be used as null column map. + After that we call dict get function, by contract for key that are not presented in dictionary we + return default value. + We create nullable column from dict get result column and null column map. + + 2 additional implementation details: + 1. Result from dict get can be tuple if client requested multiple attributes we apply such operation on each result column. + 2. If column is already nullable we merge column null map with null map that we get from dict has. 
+ */ + + auto dict_has_arguments = filterAttributeNameArgumentForDictHas(arguments); + auto is_key_in_dictionary_column = dictionary_has_func_impl.executeImpl(dict_has_arguments, std::make_shared(), input_rows_count); + auto is_key_in_dictionary_column_mutable = is_key_in_dictionary_column->assumeMutable(); + ColumnVector & is_key_in_dictionary_column_typed = assert_cast &>(*is_key_in_dictionary_column_mutable); + PaddedPODArray & is_key_in_dictionary_data = is_key_in_dictionary_column_typed.getData(); + for (auto & key : is_key_in_dictionary_data) + key = !key; + + auto result_type = dictionary_get_func_impl.getReturnTypeImpl(arguments); + auto dictionary_get_result_column = dictionary_get_func_impl.executeImpl(arguments, result_type, input_rows_count); + + ColumnPtr result; + + WhichDataType result_data_type(result_type); + auto dictionary_get_result_column_mutable = dictionary_get_result_column->assumeMutable(); + + if (result_data_type.isTuple()) + { + ColumnTuple & column_tuple = assert_cast(*dictionary_get_result_column_mutable); + + const auto & columns = column_tuple.getColumns(); + size_t tuple_size = columns.size(); + + MutableColumns new_columns(tuple_size); + for (size_t tuple_column_index = 0; tuple_column_index < tuple_size; ++tuple_column_index) + { + auto nullable_column_map = ColumnVector::create(); + auto & nullable_column_map_data = nullable_column_map->getData(); + nullable_column_map_data.assign(is_key_in_dictionary_data); + + auto mutable_column = columns[tuple_column_index]->assumeMutable(); + if (ColumnNullable * nullable_column = typeid_cast(mutable_column.get())) + { + auto & null_map_data = nullable_column->getNullMapData(); + addNullMap(null_map_data, is_key_in_dictionary_data); + new_columns[tuple_column_index] = std::move(mutable_column); + } + else + new_columns[tuple_column_index] = ColumnNullable::create(std::move(mutable_column), std::move(nullable_column_map)); + } + + result = ColumnTuple::create(std::move(new_columns)); + } + 
else + { + if (ColumnNullable * nullable_column = typeid_cast(dictionary_get_result_column_mutable.get())) + { + auto & null_map_data = nullable_column->getNullMapData(); + addNullMap(null_map_data, is_key_in_dictionary_data); + result = std::move(dictionary_get_result_column); + } + else + result = ColumnNullable::create(std::move(dictionary_get_result_column), std::move(is_key_in_dictionary_column_mutable)); + } + + return result; + } + + static void addNullMap(PaddedPODArray & null_map, PaddedPODArray & null_map_to_add) + { + assert(null_map.size() == null_map_to_add.size()); + + for (size_t i = 0; i < null_map.size(); ++i) + null_map[i] = null_map[i] || null_map_to_add[i]; + } + + static ColumnsWithTypeAndName filterAttributeNameArgumentForDictHas(const ColumnsWithTypeAndName & arguments) + { + ColumnsWithTypeAndName dict_has_arguments; + dict_has_arguments.reserve(arguments.size() - 1); + size_t attribute_name_argument_index = 1; + + for (size_t i = 0; i < arguments.size(); ++i) + { + if (i == attribute_name_argument_index) + continue; + + dict_has_arguments.emplace_back(arguments[i]); + } + + return dict_has_arguments; + } + + const FunctionDictGetNoType dictionary_get_func_impl; + const FunctionDictHas dictionary_has_func_impl; +}; /// Functions to work with hierarchies. class FunctionDictGetHierarchy final : public IFunction @@ -727,12 +878,16 @@ private: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception{"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() - + ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected String. 
Actual type ({})", + getName(), + arguments[0]->getName()); if (!WhichDataType(arguments[1]).isUInt64()) - throw Exception{"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName() - + ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})", + getName(), + arguments[1]->getName()); return std::make_shared(std::make_shared()); } @@ -744,109 +899,15 @@ private: if (input_rows_count == 0) return result_type->createColumn(); - auto dict = helper.getDictionary(arguments[0]); - ColumnPtr res; + auto dictionary = helper.getDictionary(arguments[0].column); - /// TODO: Rewrite this - if (!((res = executeDispatch(arguments, result_type, dict)) - || (res = executeDispatch>(arguments, result_type, dict)) - || (res = executeDispatch(arguments, result_type, dict)) - || (res = executeDispatch>(arguments, result_type, dict)))) - throw Exception{"Unsupported dictionary type " + dict->getTypeName(), ErrorCodes::UNKNOWN_TYPE}; + if (!dictionary->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Dictionary ({}) does not support hierarchy", + dictionary->getFullName()); - return res; - } - - template - ColumnPtr executeDispatch(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const std::shared_ptr & dict_ptr) const - { - const auto * dict = typeid_cast(dict_ptr.get()); - if (!dict) - return nullptr; - - if (!dict->hasHierarchy()) - throw Exception{"Dictionary does not have a hierarchy", ErrorCodes::UNSUPPORTED_METHOD}; - - const auto get_hierarchies = [&] (const PaddedPODArray & in, PaddedPODArray & out, PaddedPODArray & offsets) - { - const auto size = in.size(); - - /// copy of `in` array - auto in_array = std::make_unique>(std::begin(in), std::end(in)); - /// used for storing and handling result of ::toParent call - auto out_array = std::make_unique>(size); - 
/// resulting hierarchies - std::vector> hierarchies(size); /// TODO Bad code, poor performance. - - /// total number of non-zero elements, used for allocating all the required memory upfront - size_t total_count = 0; - - while (true) - { - auto all_zeroes = true; - - /// erase zeroed identifiers, store non-zeroed ones - for (const auto i : ext::range(0, size)) - { - const auto id = (*in_array)[i]; - if (0 == id) - continue; - - - auto & hierarchy = hierarchies[i]; - - /// Checking for loop - if (std::find(std::begin(hierarchy), std::end(hierarchy), id) != std::end(hierarchy)) - continue; - - all_zeroes = false; - /// place id at it's corresponding place - hierarchy.push_back(id); - - ++total_count; - } - - if (all_zeroes) - break; - - /// translate all non-zero identifiers at once - dict->toParent(*in_array, *out_array); - - /// we're going to use the `in_array` from this iteration as `out_array` on the next one - std::swap(in_array, out_array); - } - - out.reserve(total_count); - offsets.resize(size); - - for (const auto i : ext::range(0, size)) - { - const auto & ids = hierarchies[i]; - out.insert_assume_reserved(std::begin(ids), std::end(ids)); - offsets[i] = out.size(); - } - }; - - const auto * id_col_untyped = arguments[1].column.get(); - if (const auto * id_col = checkAndGetColumn(id_col_untyped)) - { - const auto & in = id_col->getData(); - auto backend = ColumnUInt64::create(); - auto offsets = ColumnArray::ColumnOffsets::create(); - get_hierarchies(in, backend->getData(), offsets->getData()); - return ColumnArray::create(std::move(backend), std::move(offsets)); - } - else if (const auto * id_col_const = checkAndGetColumnConst>(id_col_untyped)) - { - const PaddedPODArray in(1, id_col_const->getValue()); - auto backend = ColumnUInt64::create(); - auto offsets = ColumnArray::ColumnOffsets::create(); - get_hierarchies(in, backend->getData(), offsets->getData()); - auto array = ColumnArray::create(std::move(backend), std::move(offsets)); - return 
result_type->createColumnConst(id_col_const->size(), (*array)[0].get()); - } - else - throw Exception{"Second argument of function " + getName() + " must be UInt64", ErrorCodes::ILLEGAL_COLUMN}; + ColumnPtr result = dictionary->getHierarchy(arguments[1].column, std::make_shared()); + return result; } mutable FunctionDictHelper helper; @@ -877,16 +938,22 @@ private: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception{"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() - + ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected String. Actual type ({})", + getName(), + arguments[0]->getName()); if (!WhichDataType(arguments[1]).isUInt64()) - throw Exception{"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName() - + ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})", + getName(), + arguments[1]->getName()); if (!WhichDataType(arguments[2]).isUInt64()) - throw Exception{"Illegal type " + arguments[2]->getName() + " of third argument of function " + getName() - + ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of third argument of function ({}). Expected UInt64. 
Actual type ({})", + getName(), + arguments[2]->getName()); return std::make_shared(); } @@ -898,105 +965,163 @@ private: if (input_rows_count == 0) return result_type->createColumn(); - auto dict = helper.getDictionary(arguments[0]); + auto dict = helper.getDictionary(arguments[0].column); - ColumnPtr res; - if (!((res = executeDispatch(arguments, dict)) - || (res = executeDispatch>(arguments, dict)) - || (res = executeDispatch(arguments, dict)) - || (res = executeDispatch>(arguments, dict)))) - throw Exception{"Unsupported dictionary type " + dict->getTypeName(), ErrorCodes::UNKNOWN_TYPE}; + if (!dict->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary ({}) does not support hierarchy", dict->getFullName()); + + ColumnPtr res = dict->isInHierarchy(arguments[1].column, arguments[2].column, std::make_shared()); return res; } - template - ColumnPtr executeDispatch(const ColumnsWithTypeAndName & arguments, const std::shared_ptr & dict_ptr) const + mutable FunctionDictHelper helper; +}; + +class FunctionDictGetChildren final : public IFunction +{ +public: + static constexpr auto name = "dictGetChildren"; + + static FunctionPtr create(const Context & context) { - const auto * dict = typeid_cast(dict_ptr.get()); - if (!dict) - return nullptr; - - if (!dict->hasHierarchy()) - throw Exception{"Dictionary does not have a hierarchy", ErrorCodes::UNSUPPORTED_METHOD}; - - const auto * child_id_col_untyped = arguments[1].column.get(); - const auto * ancestor_id_col_untyped = arguments[2].column.get(); - - if (const auto * child_id_col = checkAndGetColumn(child_id_col_untyped)) - return execute(dict, child_id_col, ancestor_id_col_untyped); - else if (const auto * child_id_col_const = checkAndGetColumnConst>(child_id_col_untyped)) - return execute(dict, child_id_col_const, ancestor_id_col_untyped); - else - throw Exception{"Illegal column " + child_id_col_untyped->getName() - + " of second argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN}; 
+ return std::make_shared(context); } - template - ColumnPtr execute(const DictionaryType * dict, - const ColumnUInt64 * child_id_col, const IColumn * ancestor_id_col_untyped) const + explicit FunctionDictGetChildren(const Context & context_) + : helper(context_) {} + + String getName() const override { return name; } + +private: + size_t getNumberOfArguments() const override { return 2; } + + bool useDefaultImplementationForConstants() const final { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0}; } + bool isDeterministic() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (const auto * ancestor_id_col = checkAndGetColumn(ancestor_id_col_untyped)) - { - auto out = ColumnUInt8::create(); + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected String. Actual type ({})", + getName(), + arguments[0]->getName()); - const auto & child_ids = child_id_col->getData(); - const auto & ancestor_ids = ancestor_id_col->getData(); - auto & data = out->getData(); - const auto size = child_id_col->size(); - data.resize(size); + if (!WhichDataType(arguments[1]).isUInt64()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. 
Actual type ({})", + getName(), + arguments[1]->getName()); - dict->isInVectorVector(child_ids, ancestor_ids, data); - return out; - } - else if (const auto * ancestor_id_col_const = checkAndGetColumnConst>(ancestor_id_col_untyped)) - { - auto out = ColumnUInt8::create(); - - const auto & child_ids = child_id_col->getData(); - const auto ancestor_id = ancestor_id_col_const->getValue(); - auto & data = out->getData(); - const auto size = child_id_col->size(); - data.resize(size); - - dict->isInVectorConstant(child_ids, ancestor_id, data); - return out; - } - else - { - throw Exception{"Illegal column " + ancestor_id_col_untyped->getName() - + " of third argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN}; - } + return std::make_shared(std::make_shared()); } - template - ColumnPtr execute(const DictionaryType * dict, const ColumnConst * child_id_col, const IColumn * ancestor_id_col_untyped) const + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { - if (const auto * ancestor_id_col = checkAndGetColumn(ancestor_id_col_untyped)) + if (input_rows_count == 0) + return result_type->createColumn(); + + auto dictionary = helper.getDictionary(arguments[0].column); + + if (!dictionary->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Dictionary ({}) does not support hierarchy", + dictionary->getFullName()); + + ColumnPtr result = dictionary->getDescendants(arguments[1].column, std::make_shared(), 1); + + return result; + } + + mutable FunctionDictHelper helper; +}; + +class FunctionDictGetDescendants final : public IFunction +{ +public: + static constexpr auto name = "dictGetDescendants"; + + static FunctionPtr create(const Context & context) + { + return std::make_shared(context); + } + + explicit FunctionDictGetDescendants(const Context & context_) + : helper(context_) {} + + String getName() const override { return name; } + +private: + size_t 
getNumberOfArguments() const override { return 0; } + bool isVariadic() const override { return true; } + + bool useDefaultImplementationForConstants() const final { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0}; } + bool isDeterministic() const override { return false; } + + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + size_t arguments_size = arguments.size(); + if (arguments_size < 2 || arguments_size > 3) { - auto out = ColumnUInt8::create(); - - const auto child_id = child_id_col->getValue(); - const auto & ancestor_ids = ancestor_id_col->getData(); - auto & data = out->getData(); - const auto size = child_id_col->size(); - data.resize(size); - - dict->isInConstantVector(child_id, ancestor_ids, data); - return out; + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Illegal arguments size of function ({}). Expects 2 or 3 arguments size. Actual size ({})", + getName(), + arguments_size); } - else if (const auto * ancestor_id_col_const = checkAndGetColumnConst>(ancestor_id_col_untyped)) + + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected const String. Actual type ({})", + getName(), + arguments[0]->getName()); + + if (!WhichDataType(arguments[1]).isUInt64()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})", + getName(), + arguments[1]->getName()); + + if (arguments.size() == 3 && !isUnsignedInteger(arguments[2])) { - const auto child_id = child_id_col->getValue(); - const auto ancestor_id = ancestor_id_col_const->getValue(); - UInt8 res = 0; - - dict->isInConstantConstant(child_id, ancestor_id, res); - return DataTypeUInt8().createColumnConst(child_id_col->size(), res); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of third argument of function ({}). 
Expected const unsigned integer. Actual type ({})", + getName(), + arguments[2]->getName()); } - else - throw Exception{"Illegal column " + ancestor_id_col_untyped->getName() - + " of third argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN}; + + return std::make_shared(std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + if (input_rows_count == 0) + return result_type->createColumn(); + + auto dictionary = helper.getDictionary(arguments[0].column); + + size_t level = 0; + + if (arguments.size() == 3) + { + if (!isColumnConst(*arguments[2].column)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of third argument of function ({}). Expected const unsigned integer.", + getName()); + + level = static_cast(arguments[2].column->get64(0)); + } + + if (!dictionary->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Dictionary ({}) does not support hierarchy", + dictionary->getFullName()); + + ColumnPtr res = dictionary->getDescendants(arguments[1].column, std::make_shared(), level); + + return res; } mutable FunctionDictHelper helper; diff --git a/src/Functions/visitParamExtractBool.cpp b/src/Functions/visitParamExtractBool.cpp index 7f989ccbb9e..059115b5b13 100644 --- a/src/Functions/visitParamExtractBool.cpp +++ b/src/Functions/visitParamExtractBool.cpp @@ -19,10 +19,13 @@ struct ExtractBool struct NameVisitParamExtractBool { static constexpr auto name = "visitParamExtractBool"; }; using FunctionVisitParamExtractBool = FunctionsStringSearch, NameVisitParamExtractBool>; +struct NameSimpleJSONExtractBool { static constexpr auto name = "simpleJSONExtractBool"; }; +using FunctionSimpleJSONExtractBool = FunctionsStringSearch, NameSimpleJSONExtractBool>; void registerFunctionVisitParamExtractBool(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git 
a/src/Functions/visitParamExtractFloat.cpp b/src/Functions/visitParamExtractFloat.cpp index b02b0209daf..7a55cff365c 100644 --- a/src/Functions/visitParamExtractFloat.cpp +++ b/src/Functions/visitParamExtractFloat.cpp @@ -9,10 +9,13 @@ namespace DB struct NameVisitParamExtractFloat { static constexpr auto name = "visitParamExtractFloat"; }; using FunctionVisitParamExtractFloat = FunctionsStringSearch>, NameVisitParamExtractFloat>; +struct NameSimpleJSONExtractFloat { static constexpr auto name = "simpleJSONExtractFloat"; }; +using FunctionSimpleJSONExtractFloat = FunctionsStringSearch>, NameSimpleJSONExtractFloat>; void registerFunctionVisitParamExtractFloat(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractInt.cpp b/src/Functions/visitParamExtractInt.cpp index f3f30f566e6..7c2188c10fc 100644 --- a/src/Functions/visitParamExtractInt.cpp +++ b/src/Functions/visitParamExtractInt.cpp @@ -9,10 +9,13 @@ namespace DB struct NameVisitParamExtractInt { static constexpr auto name = "visitParamExtractInt"; }; using FunctionVisitParamExtractInt = FunctionsStringSearch>, NameVisitParamExtractInt>; +struct NameSimpleJSONExtractInt { static constexpr auto name = "simpleJSONExtractInt"; }; +using FunctionSimpleJSONExtractInt = FunctionsStringSearch>, NameSimpleJSONExtractInt>; void registerFunctionVisitParamExtractInt(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractRaw.cpp b/src/Functions/visitParamExtractRaw.cpp index add882f003f..734fe107557 100644 --- a/src/Functions/visitParamExtractRaw.cpp +++ b/src/Functions/visitParamExtractRaw.cpp @@ -59,10 +59,13 @@ struct ExtractRaw struct NameVisitParamExtractRaw { static constexpr auto name = "visitParamExtractRaw"; }; using FunctionVisitParamExtractRaw = FunctionsStringSearchToString, NameVisitParamExtractRaw>; +struct NameSimpleJSONExtractRaw { static constexpr 
auto name = "simpleJSONExtractRaw"; }; +using FunctionSimpleJSONExtractRaw = FunctionsStringSearchToString, NameSimpleJSONExtractRaw>; void registerFunctionVisitParamExtractRaw(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractString.cpp b/src/Functions/visitParamExtractString.cpp index b633a59807e..23f24b9e3b8 100644 --- a/src/Functions/visitParamExtractString.cpp +++ b/src/Functions/visitParamExtractString.cpp @@ -20,10 +20,13 @@ struct ExtractString struct NameVisitParamExtractString { static constexpr auto name = "visitParamExtractString"; }; using FunctionVisitParamExtractString = FunctionsStringSearchToString, NameVisitParamExtractString>; +struct NameSimpleJSONExtractString { static constexpr auto name = "simpleJSONExtractString"; }; +using FunctionSimpleJSONExtractString = FunctionsStringSearchToString, NameSimpleJSONExtractString>; void registerFunctionVisitParamExtractString(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractUInt.cpp b/src/Functions/visitParamExtractUInt.cpp index 5e70eed8253..f5466a63b0d 100644 --- a/src/Functions/visitParamExtractUInt.cpp +++ b/src/Functions/visitParamExtractUInt.cpp @@ -9,10 +9,14 @@ namespace DB struct NameVisitParamExtractUInt { static constexpr auto name = "visitParamExtractUInt"; }; using FunctionVisitParamExtractUInt = FunctionsStringSearch>, NameVisitParamExtractUInt>; +struct NameSimpleJSONExtractUInt { static constexpr auto name = "simpleJSONExtractUInt"; }; +using FunctionSimpleJSONExtractUInt = FunctionsStringSearch>, NameSimpleJSONExtractUInt>; + void registerFunctionVisitParamExtractUInt(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamHas.cpp b/src/Functions/visitParamHas.cpp index 5fbedfb4995..f4f377f9e8f 100644 --- a/src/Functions/visitParamHas.cpp +++ 
b/src/Functions/visitParamHas.cpp @@ -19,10 +19,13 @@ struct HasParam struct NameVisitParamHas { static constexpr auto name = "visitParamHas"; }; using FunctionVisitParamHas = FunctionsStringSearch, NameVisitParamHas>; +struct NameSimpleJSONHas { static constexpr auto name = "simpleJSONHas"; }; +using FunctionSimpleJSONHas = FunctionsStringSearch, NameSimpleJSONHas>; void registerFunctionVisitParamHas(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/IO/HTTPChunkedReadBuffer.cpp b/src/IO/HTTPChunkedReadBuffer.cpp index bd9bbba4c6c..374e04031d0 100644 --- a/src/IO/HTTPChunkedReadBuffer.cpp +++ b/src/IO/HTTPChunkedReadBuffer.cpp @@ -14,7 +14,6 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; extern const int UNEXPECTED_END_OF_FILE; extern const int CORRUPTED_DATA; - extern const int TOO_MANY_BYTES; } size_t HTTPChunkedReadBuffer::readChunkHeader() @@ -40,9 +39,6 @@ size_t HTTPChunkedReadBuffer::readChunkHeader() if (in->eof()) throw Exception("Unexpected end of file while reading chunk header of HTTP chunked data", ErrorCodes::UNEXPECTED_END_OF_FILE); - if (res > max_size) - throw Exception("Chunk size is too large", ErrorCodes::TOO_MANY_BYTES); - assertString("\n", *in); return res; } diff --git a/src/IO/HTTPChunkedReadBuffer.h b/src/IO/HTTPChunkedReadBuffer.h index 0ccebc69d08..378835cafc0 100644 --- a/src/IO/HTTPChunkedReadBuffer.h +++ b/src/IO/HTTPChunkedReadBuffer.h @@ -10,11 +10,10 @@ namespace DB class HTTPChunkedReadBuffer : public BufferWithOwnMemory { public: - HTTPChunkedReadBuffer(std::unique_ptr in_, size_t max_chunk_size) : in(std::move(in_)), max_size(max_chunk_size) {} + explicit HTTPChunkedReadBuffer(std::unique_ptr in_) : in(std::move(in_)) {} private: std::unique_ptr in; - const size_t max_size; size_t readChunkHeader(); void readChunkFooter(); diff --git a/src/Interpreters/ExternalDictionariesLoader.cpp b/src/Interpreters/ExternalDictionariesLoader.cpp index 
1632b7cbf78..8df29459b72 100644 --- a/src/Interpreters/ExternalDictionariesLoader.cpp +++ b/src/Interpreters/ExternalDictionariesLoader.cpp @@ -46,13 +46,13 @@ ExternalLoader::LoadablePtr ExternalDictionariesLoader::create( ExternalDictionariesLoader::DictPtr ExternalDictionariesLoader::getDictionary(const std::string & dictionary_name, const Context & context) const { std::string resolved_dictionary_name = resolveDictionaryName(dictionary_name, context.getCurrentDatabase()); - return std::static_pointer_cast(load(resolved_dictionary_name)); + return std::static_pointer_cast(load(resolved_dictionary_name)); } ExternalDictionariesLoader::DictPtr ExternalDictionariesLoader::tryGetDictionary(const std::string & dictionary_name, const Context & context) const { std::string resolved_dictionary_name = resolveDictionaryName(dictionary_name, context.getCurrentDatabase()); - return std::static_pointer_cast(tryLoad(resolved_dictionary_name)); + return std::static_pointer_cast(tryLoad(resolved_dictionary_name)); } diff --git a/src/Interpreters/ExternalDictionariesLoader.h b/src/Interpreters/ExternalDictionariesLoader.h index 0f64715b243..ce5b2512741 100644 --- a/src/Interpreters/ExternalDictionariesLoader.h +++ b/src/Interpreters/ExternalDictionariesLoader.h @@ -15,7 +15,7 @@ class IExternalLoaderConfigRepository; class ExternalDictionariesLoader : public ExternalLoader { public: - using DictPtr = std::shared_ptr; + using DictPtr = std::shared_ptr; /// Dictionaries will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. 
explicit ExternalDictionariesLoader(Context & global_context_); diff --git a/src/Interpreters/IdentifierSemantic.cpp b/src/Interpreters/IdentifierSemantic.cpp index a1fc533eb7f..81bd499ea2e 100644 --- a/src/Interpreters/IdentifierSemantic.cpp +++ b/src/Interpreters/IdentifierSemantic.cpp @@ -209,7 +209,7 @@ IdentifierSemantic::ColumnMatch IdentifierSemantic::canReferColumnToTable(const return canReferColumnToTable(identifier, table_with_columns.table); } -/// Strip qualificators from left side of column name. +/// Strip qualifications from left side of column name. /// Example: 'database.table.name' -> 'name'. void IdentifierSemantic::setColumnShortName(ASTIdentifier & identifier, const DatabaseAndTableWithAlias & db_and_table) { diff --git a/src/Interpreters/PredicateExpressionsOptimizer.cpp b/src/Interpreters/PredicateExpressionsOptimizer.cpp index 00b47be408a..476bdaaceea 100644 --- a/src/Interpreters/PredicateExpressionsOptimizer.cpp +++ b/src/Interpreters/PredicateExpressionsOptimizer.cpp @@ -146,7 +146,7 @@ bool PredicateExpressionsOptimizer::tryRewritePredicatesToTables(ASTs & tables_e break; /// Skip left and right table optimization is_rewrite_tables |= tryRewritePredicatesToTable(tables_element[table_pos], tables_predicates[table_pos], - tables_with_columns[table_pos].columns.getNames()); + tables_with_columns[table_pos]); if (table_element->table_join && isRight(table_element->table_join->as()->kind)) break; /// Skip left table optimization @@ -156,13 +156,13 @@ bool PredicateExpressionsOptimizer::tryRewritePredicatesToTables(ASTs & tables_e return is_rewrite_tables; } -bool PredicateExpressionsOptimizer::tryRewritePredicatesToTable(ASTPtr & table_element, const ASTs & table_predicates, Names && table_columns) const +bool PredicateExpressionsOptimizer::tryRewritePredicatesToTable(ASTPtr & table_element, const ASTs & table_predicates, const TableWithColumnNamesAndTypes & table_columns) const { if (!table_predicates.empty()) { auto optimize_final = 
enable_optimize_predicate_expression_to_final_subquery; auto optimize_with = allow_push_predicate_when_subquery_contains_with; - PredicateRewriteVisitor::Data data(context, table_predicates, std::move(table_columns), optimize_final, optimize_with); + PredicateRewriteVisitor::Data data(context, table_predicates, table_columns, optimize_final, optimize_with); PredicateRewriteVisitor(data).visit(table_element); return data.is_rewrite; diff --git a/src/Interpreters/PredicateExpressionsOptimizer.h b/src/Interpreters/PredicateExpressionsOptimizer.h index 8cceda93164..223ac1e8998 100644 --- a/src/Interpreters/PredicateExpressionsOptimizer.h +++ b/src/Interpreters/PredicateExpressionsOptimizer.h @@ -33,7 +33,8 @@ private: bool tryRewritePredicatesToTables(ASTs & tables_element, const std::vector & tables_predicates); - bool tryRewritePredicatesToTable(ASTPtr & table_element, const ASTs & table_predicates, Names && table_columns) const; + bool tryRewritePredicatesToTable( + ASTPtr & table_element, const ASTs & table_predicates, const TableWithColumnNamesAndTypes & table_columns) const; bool tryMovePredicatesFromHavingToWhere(ASTSelectQuery & select_query); }; diff --git a/src/Interpreters/PredicateRewriteVisitor.cpp b/src/Interpreters/PredicateRewriteVisitor.cpp index 9e6d5543f2f..6f28b9050df 100644 --- a/src/Interpreters/PredicateRewriteVisitor.cpp +++ b/src/Interpreters/PredicateRewriteVisitor.cpp @@ -17,8 +17,8 @@ namespace DB { PredicateRewriteVisitorData::PredicateRewriteVisitorData( - const Context & context_, const ASTs & predicates_, Names && column_names_, bool optimize_final_, bool optimize_with_) - : context(context_), predicates(predicates_), column_names(column_names_), optimize_final(optimize_final_), optimize_with(optimize_with_) + const Context & context_, const ASTs & predicates_, const TableWithColumnNamesAndTypes & table_columns_, bool optimize_final_, bool optimize_with_) + : context(context_), predicates(predicates_), table_columns(table_columns_), 
optimize_final(optimize_final_), optimize_with(optimize_with_) { } @@ -42,7 +42,8 @@ void PredicateRewriteVisitorData::visit(ASTSelectWithUnionQuery & union_select_q void PredicateRewriteVisitorData::visitFirstInternalSelect(ASTSelectQuery & select_query, ASTPtr &) { - is_rewrite |= rewriteSubquery(select_query, column_names, column_names); + /// In this case inner_columns same as outer_columns from table_columns + is_rewrite |= rewriteSubquery(select_query, table_columns.columns.getNames()); } void PredicateRewriteVisitorData::visitOtherInternalSelect(ASTSelectQuery & select_query, ASTPtr &) @@ -65,7 +66,7 @@ void PredicateRewriteVisitorData::visitOtherInternalSelect(ASTSelectQuery & sele const Names & internal_columns = InterpreterSelectQuery( temp_internal_select, context, SelectQueryOptions().analyze()).getSampleBlock().getNames(); - if (rewriteSubquery(*temp_select_query, column_names, internal_columns)) + if (rewriteSubquery(*temp_select_query, internal_columns)) { is_rewrite |= true; select_query.setExpression(ASTSelectQuery::Expression::SELECT, std::move(temp_select_query->refSelect())); @@ -89,7 +90,7 @@ static void cleanAliasAndCollectIdentifiers(ASTPtr & predicate, std::vector identifiers; @@ -106,13 +108,16 @@ bool PredicateRewriteVisitorData::rewriteSubquery(ASTSelectQuery & subquery, con for (const auto & identifier : identifiers) { - const auto & column_name = identifier->shortName(); - const auto & outer_column_iterator = std::find(outer_columns.begin(), outer_columns.end(), column_name); + IdentifierSemantic::setColumnShortName(*identifier, table_columns.table); + const auto & column_name = identifier->name(); /// For lambda functions, we can't always find them in the list of columns /// For example: SELECT * FROM system.one WHERE arrayMap(x -> x, [dummy]) = [0] + const auto & outer_column_iterator = std::find(outer_columns.begin(), outer_columns.end(), column_name); if (outer_column_iterator != outer_columns.end()) + { 
identifier->setShortName(inner_columns[outer_column_iterator - outer_columns.begin()]); + } } /// We only need to push all the predicates to subquery having diff --git a/src/Interpreters/PredicateRewriteVisitor.h b/src/Interpreters/PredicateRewriteVisitor.h index 02c8b9ca422..1132d93a5ec 100644 --- a/src/Interpreters/PredicateRewriteVisitor.h +++ b/src/Interpreters/PredicateRewriteVisitor.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -24,12 +25,13 @@ public: return true; } - PredicateRewriteVisitorData(const Context & context_, const ASTs & predicates_, Names && column_names_, bool optimize_final_, bool optimize_with_); + PredicateRewriteVisitorData(const Context & context_, const ASTs & predicates_, + const TableWithColumnNamesAndTypes & table_columns_, bool optimize_final_, bool optimize_with_); private: const Context & context; const ASTs & predicates; - const Names column_names; + const TableWithColumnNamesAndTypes & table_columns; bool optimize_final; bool optimize_with; @@ -37,7 +39,7 @@ private: void visitOtherInternalSelect(ASTSelectQuery & select_query, ASTPtr &); - bool rewriteSubquery(ASTSelectQuery & subquery, const Names & outer_columns, const Names & inner_columns); + bool rewriteSubquery(ASTSelectQuery & subquery, const Names & inner_columns); }; using PredicateRewriteMatcher = OneTypeMatcher; diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 90b840ce8bd..ed920539bea 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -397,8 +397,11 @@ Field convertFieldToTypeOrThrow(const Field & from_value, const IDataType & to_t throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot convert NULL to {}", to_type.getName()); Field converted = convertFieldToType(from_value, to_type, from_type_hint); if (!is_null && converted.isNull()) - throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Cannot convert value{}: it cannot be represented as {}", - 
from_type_hint ? " from " + from_type_hint->getName() : "", to_type.getName()); + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "Cannot convert value '{}'{}: it cannot be represented as {}", + toString(from_value), + from_type_hint ? " from " + from_type_hint->getName() : "", + to_type.getName()); return converted; } diff --git a/src/Parsers/New/ClickHouseLexer.h b/src/Parsers/New/ClickHouseLexer.h index e925c5d271f..62de0792f3c 100644 --- a/src/Parsers/New/ClickHouseLexer.h +++ b/src/Parsers/New/ClickHouseLexer.h @@ -13,51 +13,51 @@ namespace DB { class ClickHouseLexer : public antlr4::Lexer { public: enum { - ADD = 1, AFTER = 2, ALIAS = 3, ALL = 4, ALTER = 5, AND = 6, ANTI = 7, - ANY = 8, ARRAY = 9, AS = 10, ASCENDING = 11, ASOF = 12, ASYNC = 13, - ATTACH = 14, BETWEEN = 15, BOTH = 16, BY = 17, CASE = 18, CAST = 19, - CHECK = 20, CLEAR = 21, CLUSTER = 22, CODEC = 23, COLLATE = 24, COLUMN = 25, - COMMENT = 26, CONSTRAINT = 27, CREATE = 28, CROSS = 29, CUBE = 30, DATABASE = 31, - DATABASES = 32, DATE = 33, DAY = 34, DEDUPLICATE = 35, DEFAULT = 36, - DELAY = 37, DELETE = 38, DESC = 39, DESCENDING = 40, DESCRIBE = 41, - DETACH = 42, DICTIONARIES = 43, DICTIONARY = 44, DISK = 45, DISTINCT = 46, - DISTRIBUTED = 47, DROP = 48, ELSE = 49, END = 50, ENGINE = 51, EVENTS = 52, - EXISTS = 53, EXPLAIN = 54, EXPRESSION = 55, EXTRACT = 56, FETCHES = 57, - FINAL = 58, FIRST = 59, FLUSH = 60, FOR = 61, FORMAT = 62, FREEZE = 63, - FROM = 64, FULL = 65, FUNCTION = 66, GLOBAL = 67, GRANULARITY = 68, - GROUP = 69, HAVING = 70, HIERARCHICAL = 71, HOUR = 72, ID = 73, IF = 74, - ILIKE = 75, IN = 76, INDEX = 77, INF = 78, INJECTIVE = 79, INNER = 80, - INSERT = 81, INTERVAL = 82, INTO = 83, IS = 84, IS_OBJECT_ID = 85, JOIN = 86, - KEY = 87, KILL = 88, LAST = 89, LAYOUT = 90, LEADING = 91, LEFT = 92, - LIFETIME = 93, LIKE = 94, LIMIT = 95, LIVE = 96, LOCAL = 97, LOGS = 98, - MATERIALIZED = 99, MAX = 100, MERGES = 101, MIN = 102, MINUTE = 103, - MODIFY = 104, MONTH = 105, MOVE = 106, 
MUTATION = 107, NAN_SQL = 108, - NO = 109, NOT = 110, NULL_SQL = 111, NULLS = 112, OFFSET = 113, ON = 114, - OPTIMIZE = 115, OR = 116, ORDER = 117, OUTER = 118, OUTFILE = 119, PARTITION = 120, - POPULATE = 121, PREWHERE = 122, PRIMARY = 123, QUARTER = 124, RANGE = 125, - RELOAD = 126, REMOVE = 127, RENAME = 128, REPLACE = 129, REPLICA = 130, - REPLICATED = 131, RIGHT = 132, ROLLUP = 133, SAMPLE = 134, SECOND = 135, - SELECT = 136, SEMI = 137, SENDS = 138, SET = 139, SETTINGS = 140, SHOW = 141, - SOURCE = 142, START = 143, STOP = 144, SUBSTRING = 145, SYNC = 146, - SYNTAX = 147, SYSTEM = 148, TABLE = 149, TABLES = 150, TEMPORARY = 151, - TEST = 152, THEN = 153, TIES = 154, TIMEOUT = 155, TIMESTAMP = 156, - TO = 157, TOP = 158, TOTALS = 159, TRAILING = 160, TRIM = 161, TRUNCATE = 162, - TTL = 163, TYPE = 164, UNION = 165, UPDATE = 166, USE = 167, USING = 168, - UUID = 169, VALUES = 170, VIEW = 171, VOLUME = 172, WATCH = 173, WEEK = 174, - WHEN = 175, WHERE = 176, WITH = 177, YEAR = 178, JSON_FALSE = 179, JSON_TRUE = 180, - IDENTIFIER = 181, FLOATING_LITERAL = 182, OCTAL_LITERAL = 183, DECIMAL_LITERAL = 184, - HEXADECIMAL_LITERAL = 185, STRING_LITERAL = 186, ARROW = 187, ASTERISK = 188, - BACKQUOTE = 189, BACKSLASH = 190, COLON = 191, COMMA = 192, CONCAT = 193, - DASH = 194, DOT = 195, EQ_DOUBLE = 196, EQ_SINGLE = 197, GE = 198, GT = 199, - LBRACE = 200, LBRACKET = 201, LE = 202, LPAREN = 203, LT = 204, NOT_EQ = 205, - PERCENT = 206, PLUS = 207, QUERY = 208, QUOTE_DOUBLE = 209, QUOTE_SINGLE = 210, - RBRACE = 211, RBRACKET = 212, RPAREN = 213, SEMICOLON = 214, SLASH = 215, - UNDERSCORE = 216, MULTI_LINE_COMMENT = 217, SINGLE_LINE_COMMENT = 218, + ADD = 1, AFTER = 2, ALIAS = 3, ALL = 4, ALTER = 5, AND = 6, ANTI = 7, + ANY = 8, ARRAY = 9, AS = 10, ASCENDING = 11, ASOF = 12, ASYNC = 13, + ATTACH = 14, BETWEEN = 15, BOTH = 16, BY = 17, CASE = 18, CAST = 19, + CHECK = 20, CLEAR = 21, CLUSTER = 22, CODEC = 23, COLLATE = 24, COLUMN = 25, + COMMENT = 26, CONSTRAINT = 27, 
CREATE = 28, CROSS = 29, CUBE = 30, DATABASE = 31, + DATABASES = 32, DATE = 33, DAY = 34, DEDUPLICATE = 35, DEFAULT = 36, + DELAY = 37, DELETE = 38, DESC = 39, DESCENDING = 40, DESCRIBE = 41, + DETACH = 42, DICTIONARIES = 43, DICTIONARY = 44, DISK = 45, DISTINCT = 46, + DISTRIBUTED = 47, DROP = 48, ELSE = 49, END = 50, ENGINE = 51, EVENTS = 52, + EXISTS = 53, EXPLAIN = 54, EXPRESSION = 55, EXTRACT = 56, FETCHES = 57, + FINAL = 58, FIRST = 59, FLUSH = 60, FOR = 61, FORMAT = 62, FREEZE = 63, + FROM = 64, FULL = 65, FUNCTION = 66, GLOBAL = 67, GRANULARITY = 68, + GROUP = 69, HAVING = 70, HIERARCHICAL = 71, HOUR = 72, ID = 73, IF = 74, + ILIKE = 75, IN = 76, INDEX = 77, INF = 78, INJECTIVE = 79, INNER = 80, + INSERT = 81, INTERVAL = 82, INTO = 83, IS = 84, IS_OBJECT_ID = 85, JOIN = 86, + KEY = 87, KILL = 88, LAST = 89, LAYOUT = 90, LEADING = 91, LEFT = 92, + LIFETIME = 93, LIKE = 94, LIMIT = 95, LIVE = 96, LOCAL = 97, LOGS = 98, + MATERIALIZED = 99, MAX = 100, MERGES = 101, MIN = 102, MINUTE = 103, + MODIFY = 104, MONTH = 105, MOVE = 106, MUTATION = 107, NAN_SQL = 108, + NO = 109, NOT = 110, NULL_SQL = 111, NULLS = 112, OFFSET = 113, ON = 114, + OPTIMIZE = 115, OR = 116, ORDER = 117, OUTER = 118, OUTFILE = 119, PARTITION = 120, + POPULATE = 121, PREWHERE = 122, PRIMARY = 123, QUARTER = 124, RANGE = 125, + RELOAD = 126, REMOVE = 127, RENAME = 128, REPLACE = 129, REPLICA = 130, + REPLICATED = 131, RIGHT = 132, ROLLUP = 133, SAMPLE = 134, SECOND = 135, + SELECT = 136, SEMI = 137, SENDS = 138, SET = 139, SETTINGS = 140, SHOW = 141, + SOURCE = 142, START = 143, STOP = 144, SUBSTRING = 145, SYNC = 146, + SYNTAX = 147, SYSTEM = 148, TABLE = 149, TABLES = 150, TEMPORARY = 151, + TEST = 152, THEN = 153, TIES = 154, TIMEOUT = 155, TIMESTAMP = 156, + TO = 157, TOP = 158, TOTALS = 159, TRAILING = 160, TRIM = 161, TRUNCATE = 162, + TTL = 163, TYPE = 164, UNION = 165, UPDATE = 166, USE = 167, USING = 168, + UUID = 169, VALUES = 170, VIEW = 171, VOLUME = 172, WATCH = 173, WEEK = 174, 
+ WHEN = 175, WHERE = 176, WITH = 177, YEAR = 178, JSON_FALSE = 179, JSON_TRUE = 180, + IDENTIFIER = 181, FLOATING_LITERAL = 182, OCTAL_LITERAL = 183, DECIMAL_LITERAL = 184, + HEXADECIMAL_LITERAL = 185, STRING_LITERAL = 186, ARROW = 187, ASTERISK = 188, + BACKQUOTE = 189, BACKSLASH = 190, COLON = 191, COMMA = 192, CONCAT = 193, + DASH = 194, DOT = 195, EQ_DOUBLE = 196, EQ_SINGLE = 197, GE = 198, GT = 199, + LBRACE = 200, LBRACKET = 201, LE = 202, LPAREN = 203, LT = 204, NOT_EQ = 205, + PERCENT = 206, PLUS = 207, QUERY = 208, QUOTE_DOUBLE = 209, QUOTE_SINGLE = 210, + RBRACE = 211, RBRACKET = 212, RPAREN = 213, SEMICOLON = 214, SLASH = 215, + UNDERSCORE = 216, MULTI_LINE_COMMENT = 217, SINGLE_LINE_COMMENT = 218, WHITESPACE = 219 }; ClickHouseLexer(antlr4::CharStream *input); - ~ClickHouseLexer(); + ~ClickHouseLexer() override; virtual std::string getGrammarFileName() const override; virtual const std::vector& getRuleNames() const override; diff --git a/src/Parsers/New/ClickHouseParser.h b/src/Parsers/New/ClickHouseParser.h index 11beadb182e..35e8d81d7f8 100644 --- a/src/Parsers/New/ClickHouseParser.h +++ b/src/Parsers/New/ClickHouseParser.h @@ -91,7 +91,7 @@ public: }; ClickHouseParser(antlr4::TokenStream *input); - ~ClickHouseParser(); + ~ClickHouseParser() override; virtual std::string getGrammarFileName() const override; virtual const antlr4::atn::ATN& getATN() const override { return _atn; }; diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 4a5282c1e6b..16d028f0fc1 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -48,7 +49,10 @@ static int compareValuesWithOffset(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - const auto offset = _offset.get(); + // Note that the storage type of offset returned 
by get<> is different, so + // we need to specify the type explicitly. + const typename ColumnType::ValueType offset + = _offset.get(); assert(offset >= 0); const auto compared_value_data = compared_column->getDataAt(compared_row); @@ -62,32 +66,32 @@ static int compareValuesWithOffset(const IColumn * _compared_column, reference_value_data.data); bool is_overflow; - bool overflow_to_negative; if (offset_is_preceding) { is_overflow = __builtin_sub_overflow(reference_value, offset, &reference_value); - overflow_to_negative = offset > 0; } else { is_overflow = __builtin_add_overflow(reference_value, offset, &reference_value); - overflow_to_negative = offset < 0; } // fmt::print(stderr, -// "compared [{}] = {}, ref [{}] = {}, offset {} preceding {} overflow {} to negative {}\n", +// "compared [{}] = {}, old ref {}, shifted ref [{}] = {}, offset {} preceding {} overflow {} to negative {}\n", // compared_row, toString(compared_value), +// // fmt doesn't like char8_t. +// static_cast(unalignedLoad(reference_value_data.data)), // reference_row, toString(reference_value), // toString(offset), offset_is_preceding, -// is_overflow, overflow_to_negative); +// is_overflow, offset_is_preceding); if (is_overflow) { - if (overflow_to_negative) + if (offset_is_preceding) { // Overflow to the negative, [compared] must be greater. + // We know that because offset is >= 0. 
return 1; } else @@ -263,6 +267,14 @@ WindowTransform::WindowTransform(const Block & input_header_, window_description.frame.begin_offset = convertFieldToTypeOrThrow( window_description.frame.begin_offset, *entry.type); + + if (applyVisitor(FieldVisitorAccurateLess{}, + window_description.frame.begin_offset, Field(0))) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Window frame start offset must be nonnegative, {} given", + window_description.frame.begin_offset); + } } if (window_description.frame.end_type == WindowFrame::BoundaryType::Offset) @@ -270,6 +282,14 @@ WindowTransform::WindowTransform(const Block & input_header_, window_description.frame.end_offset = convertFieldToTypeOrThrow( window_description.frame.end_offset, *entry.type); + + if (applyVisitor(FieldVisitorAccurateLess{}, + window_description.frame.end_offset, Field(0))) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Window frame end offset must be nonnegative, {} given", + window_description.frame.end_offset); + } } } } diff --git a/src/Server/HTTP/HTMLForm.cpp b/src/Server/HTTP/HTMLForm.cpp index a00950c8e27..7a87f484b5c 100644 --- a/src/Server/HTTP/HTMLForm.cpp +++ b/src/Server/HTTP/HTMLForm.cpp @@ -71,23 +71,6 @@ HTMLForm::HTMLForm(const Poco::URI & uri) : field_limit(DFL_FIELD_LIMIT), value_ } -void HTMLForm::setEncoding(const std::string & encoding_) -{ - encoding = encoding_; -} - - -void HTMLForm::addPart(const std::string & name, Poco::Net::PartSource * source) -{ - poco_check_ptr(source); - - Part part; - part.name = name; - part.source = std::unique_ptr(source); - parts.push_back(std::move(part)); -} - - void HTMLForm::load(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody, PartHandler & handler) { clear(); @@ -126,36 +109,12 @@ void HTMLForm::load(const Poco::Net::HTTPRequest & request, ReadBuffer & request } -void 
HTMLForm::read(ReadBuffer & in, PartHandler & handler) -{ - if (encoding == ENCODING_URL) - readQuery(in); - else - readMultipart(in, handler); -} - - void HTMLForm::read(ReadBuffer & in) { readQuery(in); } -void HTMLForm::read(const std::string & queryString) -{ - ReadBufferFromString istr(queryString); - readQuery(istr); -} - - void HTMLForm::readQuery(ReadBuffer & in) { size_t fields = 0; @@ -269,22 +228,6 @@ void HTMLForm::readMultipart(ReadBuffer & in_, PartHandler & handler) } -void HTMLForm::setFieldLimit(int limit) -{ - poco_assert(limit >= 0); - - field_limit = limit; -} - - -void HTMLForm::setValueLengthLimit(int limit) -{ - poco_assert(limit >= 0); - - value_length_limit = limit; -} - - HTMLForm::MultipartReadBuffer::MultipartReadBuffer(ReadBuffer & in_, const std::string & boundary_) : ReadBuffer(nullptr, 0), in(in_), boundary("--" + boundary_) { diff --git a/src/Server/HTTP/HTMLForm.h b/src/Server/HTTP/HTMLForm.h index 27be712e1d5..8d8fb0d1719 100644 --- a/src/Server/HTTP/HTMLForm.h +++ b/src/Server/HTTP/HTMLForm.h @@ -52,24 +52,6 @@ public: return (it != end()) ? DB::parse(it->second) : default_value; } - template - T getParsed(const std::string & key) - { - return DB::parse(get(key)); - } - - /// Sets the encoding used for posting the form. - /// Encoding must be either "application/x-www-form-urlencoded" (which is the default) or "multipart/form-data". - void setEncoding(const std::string & encoding); - - /// Returns the encoding used for posting the form. - const std::string & getEncoding() const { return encoding; } - - /// Adds an part/attachment (file upload) to the form. - /// The form takes ownership of the PartSource and deletes it when it is no longer needed. - /// The part will only be sent if the encoding set for the form is "multipart/form-data" - void addPart(const std::string & name, Poco::Net::PartSource * pSource); - /// Reads the form data from the given HTTP request. /// Uploaded files are passed to the given PartHandler. 
void load(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody, PartHandler & handler); @@ -78,41 +60,10 @@ public: /// Uploaded files are silently discarded. void load(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody); - /// Reads the form data from the given HTTP request. - /// The request must be a GET request and the form data must be in the query string (URL encoded). - /// For POST requests, you must use one of the overloads taking an additional input stream for the request body. - void load(const Poco::Net::HTTPRequest & request); - - /// Reads the form data from the given input stream. - /// The form data read from the stream must be in the encoding specified for the form. - /// Note that read() does not clear the form before reading the new values. - void read(ReadBuffer & in, PartHandler & handler); - /// Reads the URL-encoded form data from the given input stream. /// Note that read() does not clear the form before reading the new values. void read(ReadBuffer & in); - /// Reads the form data from the given HTTP query string. - /// Note that read() does not clear the form before reading the new values. - void read(const std::string & queryString); - - /// Returns the MIME boundary used for writing multipart form data. - const std::string & getBoundary() const { return boundary; } - - /// Returns the maximum number of header fields allowed. - /// See setFieldLimit() for more information. - int getFieldLimit() const { return field_limit; } - - /// Sets the maximum number of header fields allowed. This limit is used to defend certain kinds of denial-of-service attacks. - /// Specify 0 for unlimited (not recommended). The default limit is 100. - void setFieldLimit(int limit); - - /// Sets the maximum size for form field values stored as strings. - void setValueLengthLimit(int limit); - - /// Returns the maximum size for form field values stored as strings. 
- int getValueLengthLimit() const { return value_length_limit; } - static const std::string ENCODING_URL; /// "application/x-www-form-urlencoded" static const std::string ENCODING_MULTIPART; /// "multipart/form-data" static const int UNKNOWN_CONTENT_LENGTH; diff --git a/src/Server/HTTP/HTTPServer.cpp b/src/Server/HTTP/HTTPServer.cpp index 3e050080bdd..5554a0ee31d 100644 --- a/src/Server/HTTP/HTTPServer.cpp +++ b/src/Server/HTTP/HTTPServer.cpp @@ -8,9 +8,9 @@ namespace DB HTTPServer::HTTPServer( const Context & context, HTTPRequestHandlerFactoryPtr factory_, - UInt16 portNumber, + UInt16 port_number, Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), portNumber, params), factory(factory_) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), port_number, params), factory(factory_) { } @@ -26,10 +26,10 @@ HTTPServer::HTTPServer( HTTPServer::HTTPServer( const Context & context, HTTPRequestHandlerFactoryPtr factory_, - Poco::ThreadPool & threadPool, + Poco::ThreadPool & thread_pool, const Poco::Net::ServerSocket & socket, Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), threadPool, socket, params), factory(factory_) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), thread_pool, socket, params), factory(factory_) { } diff --git a/src/Server/HTTP/HTTPServer.h b/src/Server/HTTP/HTTPServer.h index 1ce62c65ca2..3d2a2ac9cf4 100644 --- a/src/Server/HTTP/HTTPServer.h +++ b/src/Server/HTTP/HTTPServer.h @@ -19,7 +19,7 @@ public: explicit HTTPServer( const Context & context, HTTPRequestHandlerFactoryPtr factory, - UInt16 portNumber = 80, + UInt16 port_number = 80, Poco::Net::HTTPServerParams::Ptr params = new Poco::Net::HTTPServerParams); HTTPServer( @@ -31,13 +31,13 @@ public: HTTPServer( const Context & context, HTTPRequestHandlerFactoryPtr factory, - Poco::ThreadPool & threadPool, + Poco::ThreadPool & 
thread_pool, const Poco::Net::ServerSocket & socket, Poco::Net::HTTPServerParams::Ptr params); ~HTTPServer() override; - void stopAll(bool abortCurrent = false); + void stopAll(bool abort_current = false); private: HTTPRequestHandlerFactoryPtr factory; diff --git a/src/Server/HTTP/HTTPServerConnection.cpp b/src/Server/HTTP/HTTPServerConnection.cpp index e2ee4c8882b..7a6cd4cab54 100644 --- a/src/Server/HTTP/HTTPServerConnection.cpp +++ b/src/Server/HTTP/HTTPServerConnection.cpp @@ -67,15 +67,15 @@ void HTTPServerConnection::run() } } } - catch (Poco::Net::NoMessageException &) + catch (const Poco::Net::NoMessageException &) { break; } - catch (Poco::Net::MessageException &) + catch (const Poco::Net::MessageException &) { sendErrorResponse(session, Poco::Net::HTTPResponse::HTTP_BAD_REQUEST); } - catch (Poco::Exception &) + catch (const Poco::Exception &) { if (session.networkException()) { @@ -98,31 +98,4 @@ void HTTPServerConnection::sendErrorResponse(Poco::Net::HTTPServerSession & sess session.setKeepAlive(false); } -void HTTPServerConnection::onServerStopped(const bool & abortCurrent) -{ - stopped = true; - if (abortCurrent) - { - try - { - socket().shutdown(); - } - catch (...) - { - } - } - else - { - std::unique_lock lock(mutex); - - try - { - socket().shutdown(); - } - catch (...) 
- { - } - } -} - } diff --git a/src/Server/HTTP/HTTPServerConnection.h b/src/Server/HTTP/HTTPServerConnection.h index 589c33025bf..55b6e921d9f 100644 --- a/src/Server/HTTP/HTTPServerConnection.h +++ b/src/Server/HTTP/HTTPServerConnection.h @@ -23,7 +23,6 @@ public: protected: static void sendErrorResponse(Poco::Net::HTTPServerSession & session, Poco::Net::HTTPResponse::HTTPStatus status); - void onServerStopped(const bool & abortCurrent); private: Context context; diff --git a/src/Server/HTTP/HTTPServerRequest.cpp b/src/Server/HTTP/HTTPServerRequest.cpp index bdba6a51d91..ab8b803c29d 100644 --- a/src/Server/HTTP/HTTPServerRequest.cpp +++ b/src/Server/HTTP/HTTPServerRequest.cpp @@ -15,8 +15,8 @@ namespace DB { - HTTPServerRequest::HTTPServerRequest(const Context & context, HTTPServerResponse & response, Poco::Net::HTTPServerSession & session) + : max_uri_size(context.getSettingsRef().http_max_uri_size) { response.attachRequest(this); @@ -26,7 +26,6 @@ HTTPServerRequest::HTTPServerRequest(const Context & context, HTTPServerResponse auto receive_timeout = context.getSettingsRef().http_receive_timeout; auto send_timeout = context.getSettingsRef().http_send_timeout; - auto max_query_size = context.getSettingsRef().max_query_size; session.socket().setReceiveTimeout(receive_timeout); session.socket().setSendTimeout(send_timeout); @@ -37,7 +36,7 @@ HTTPServerRequest::HTTPServerRequest(const Context & context, HTTPServerResponse readRequest(*in); /// Try parse according to RFC7230 if (getChunkedTransferEncoding()) - stream = std::make_unique(std::move(in), max_query_size); + stream = std::make_unique(std::move(in)); else if (hasContentLength()) stream = std::make_unique(std::move(in), getContentLength(), false); else if (getMethod() != HTTPRequest::HTTP_GET && getMethod() != HTTPRequest::HTTP_HEAD && getMethod() != HTTPRequest::HTTP_DELETE) @@ -93,10 +92,10 @@ void HTTPServerRequest::readRequest(ReadBuffer & in) skipWhitespaceIfAny(in); - while (in.read(ch) && 
!Poco::Ascii::isSpace(ch) && uri.size() <= MAX_URI_LENGTH) + while (in.read(ch) && !Poco::Ascii::isSpace(ch) && uri.size() <= max_uri_size) uri += ch; - if (uri.size() > MAX_URI_LENGTH) + if (uri.size() > max_uri_size) throw Poco::Net::MessageException("HTTP request URI invalid or too long"); skipWhitespaceIfAny(in); diff --git a/src/Server/HTTP/HTTPServerRequest.h b/src/Server/HTTP/HTTPServerRequest.h index 7fd54850212..a0f022f32ec 100644 --- a/src/Server/HTTP/HTTPServerRequest.h +++ b/src/Server/HTTP/HTTPServerRequest.h @@ -43,11 +43,12 @@ private: MAX_NAME_LENGTH = 256, MAX_VALUE_LENGTH = 8192, MAX_METHOD_LENGTH = 32, - MAX_URI_LENGTH = 16384, MAX_VERSION_LENGTH = 8, MAX_FIELDS_NUMBER = 100, }; + const size_t max_uri_size; + std::unique_ptr stream; Poco::Net::SocketImpl * socket; Poco::Net::SocketAddress client_address; diff --git a/src/Server/HTTP/HTTPServerResponse.cpp b/src/Server/HTTP/HTTPServerResponse.cpp index e3d52fffa80..db5cfb132e3 100644 --- a/src/Server/HTTP/HTTPServerResponse.cpp +++ b/src/Server/HTTP/HTTPServerResponse.cpp @@ -94,32 +94,6 @@ std::pair, std::shared_ptr> HTTPServ return std::make_pair(header_stream, stream); } -void HTTPServerResponse::sendFile(const std::string & path, const std::string & mediaType) -{ - poco_assert(!stream); - - Poco::File f(path); - Poco::Timestamp date_time = f.getLastModified(); - Poco::File::FileSize length = f.getSize(); - set("Last-Modified", Poco::DateTimeFormatter::format(date_time, Poco::DateTimeFormat::HTTP_FORMAT)); - setContentLength64(length); - setContentType(mediaType); - setChunkedTransferEncoding(false); - - Poco::FileInputStream istr(path); - if (istr.good()) - { - stream = std::make_shared(session); - write(*stream); - if (request && request->getMethod() != HTTPRequest::HTTP_HEAD) - { - Poco::StreamCopier::copyStream(istr, *stream); - } - } - else - throw Poco::OpenFileException(path); -} - void HTTPServerResponse::sendBuffer(const void * buffer, std::size_t length) { poco_assert(!stream); @@ 
-135,20 +109,6 @@ void HTTPServerResponse::sendBuffer(const void * buffer, std::size_t length) } } -void HTTPServerResponse::redirect(const std::string & uri, HTTPStatus status) -{ - poco_assert(!stream); - - setContentLength(0); - setChunkedTransferEncoding(false); - - setStatusAndReason(status); - set("Location", uri); - - stream = std::make_shared(session); - write(*stream); -} - void HTTPServerResponse::requireAuthentication(const std::string & realm) { poco_assert(!stream); diff --git a/src/Server/HTTP/HTTPServerResponse.h b/src/Server/HTTP/HTTPServerResponse.h index 82221ce3a83..f5b7a70dc79 100644 --- a/src/Server/HTTP/HTTPServerResponse.h +++ b/src/Server/HTTP/HTTPServerResponse.h @@ -36,17 +36,6 @@ public: /// or redirect() has been called. std::pair, std::shared_ptr> beginSend(); /// TODO: use some WriteBuffer implementation here. - /// Sends the response header to the client, followed - /// by the content of the given file. - /// - /// Must not be called after send(), sendBuffer() - /// or redirect() has been called. - /// - /// Throws a FileNotFoundException if the file - /// cannot be found, or an OpenFileException if - /// the file cannot be opened. - void sendFile(const std::string & path, const std::string & mediaType); - /// Sends the response header to the client, followed /// by the contents of the given buffer. /// @@ -61,16 +50,6 @@ public: /// or redirect() has been called. void sendBuffer(const void * pBuffer, std::size_t length); /// FIXME: do we need this one? - /// Sets the status code, which must be one of - /// HTTP_MOVED_PERMANENTLY (301), HTTP_FOUND (302), - /// or HTTP_SEE_OTHER (303), - /// and sets the "Location" header field - /// to the given URI, which according to - /// the HTTP specification, must be absolute. - /// - /// Must not be called after send() has been called. 
- void redirect(const std::string & uri, Poco::Net::HTTPResponse::HTTPStatus status = Poco::Net::HTTPResponse::HTTP_FOUND); - void requireAuthentication(const std::string & realm); /// Sets the status code to 401 (Unauthorized) /// and sets the "WWW-Authenticate" header field @@ -83,7 +62,7 @@ public: private: Poco::Net::HTTPServerSession & session; - HTTPServerRequest * request; + HTTPServerRequest * request = nullptr; std::shared_ptr stream; std::shared_ptr header_stream; }; diff --git a/src/Server/HTTP/README.md b/src/Server/HTTP/README.md new file mode 100644 index 00000000000..71730962780 --- /dev/null +++ b/src/Server/HTTP/README.md @@ -0,0 +1,3 @@ +# Notice + +The source code located in this folder is based on some files from the POCO project, from here `contrib/poco/Net/src`. diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index cf8de4456dd..862a3088f89 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -543,11 +543,22 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( static const String TMP_PREFIX = "tmp_fetch_"; String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; + /// We will remove directory if it's already exists. Make precautions. + if (tmp_prefix.empty() + || part_name.empty() + || std::string::npos != tmp_prefix.find_first_of("/.") + || std::string::npos != part_name.find_first_of("/.")) + throw Exception("Logical error: tmp_prefix and part_name cannot be empty or contain '.' or '/' characters.", ErrorCodes::LOGICAL_ERROR); + String part_relative_path = String(to_detached ? 
"detached/" : "") + tmp_prefix + part_name; String part_download_path = data.getRelativeDataPath() + part_relative_path + "/"; if (disk->exists(part_download_path)) - throw Exception("Directory " + fullPath(disk, part_download_path) + " already exists.", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + { + LOG_WARNING(log, "Directory {} already exists, probably result of a failed fetch. Will remove it before fetching part.", + fullPath(disk, part_download_path)); + disk->removeRecursive(part_download_path); + } disk->createDirectories(part_download_path); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 7c9f7b8104d..c79e754f61a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -769,7 +769,8 @@ void IMergeTreeDataPart::loadPartitionAndMinMaxIndex() void IMergeTreeDataPart::loadChecksums(bool require) { - String path = getFullRelativePath() + "checksums.txt"; + const String path = getFullRelativePath() + "checksums.txt"; + if (volume->getDisk()->exists(path)) { auto buf = openForReading(volume->getDisk(), path); @@ -784,12 +785,14 @@ void IMergeTreeDataPart::loadChecksums(bool require) else { if (require) - throw Exception("No checksums.txt in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + throw Exception(ErrorCodes::NO_FILE_IN_DATA_PART, "No checksums.txt in part {}", name); /// If the checksums file is not present, calculate the checksums and write them to disk. /// Check the data while we are at it. LOG_WARNING(storage.log, "Checksums for part {} not found. 
Will calculate them from data on disk.", name); + checksums = checkDataPart(shared_from_this(), false); + { auto out = volume->getDisk()->writeFile(getFullRelativePath() + "checksums.txt.tmp", 4096); checksums.write(*out); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index b64022d2b5a..03f6564788a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -203,8 +203,8 @@ public: * * Possible state transitions: * Temporary -> Precommitted: we are trying to commit a fetched, inserted or merged part to active set - * Precommitted -> Outdated: we could not to add a part to active set and doing a rollback (for example it is duplicated part) - * Precommitted -> Committed: we successfully committed a part to active dataset + * Precommitted -> Outdated: we could not add a part to active set and are doing a rollback (for example it is duplicated part) + * Precommitted -> Committed: we successfully committed a part to active dataset * Precommitted -> Outdated: a part was replaced by a covering part or DROP PARTITION * Outdated -> Deleting: a cleaner selected this part for deletion * Deleting -> Outdated: if an ZooKeeper error occurred during the deletion, we will retry deletion diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 3cd4187af0a..71564cb1f54 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2280,7 +2280,7 @@ MergeTreeData::DataPartsVector MergeTreeData::removePartsInRangeFromWorkingSet(c void MergeTreeData::forgetPartAndMoveToDetached(const MergeTreeData::DataPartPtr & part_to_detach, const String & prefix, bool restore_covered) { - LOG_INFO(log, "Renaming {} to {}{} and forgiving it.", part_to_detach->relative_path, prefix, part_to_detach->name); + LOG_INFO(log, "Renaming {} to {}{} and forgetting it.", part_to_detach->relative_path, prefix, 
part_to_detach->name); auto lock = lockParts(); @@ -2746,12 +2746,12 @@ void MergeTreeData::checkAlterPartitionIsPossible(const PartitionCommands & comm if (command.part) { auto part_name = command.partition->as().value.safeGet(); - /// We able to parse it + /// We are able to parse it MergeTreePartInfo::fromPartName(part_name, format_version); } else { - /// We able to parse it + /// We are able to parse it getPartitionIDFromQuery(command.partition, global_context); } } @@ -3179,15 +3179,18 @@ void MergeTreeData::dropDetached(const ASTPtr & partition, bool part, const Cont MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const ASTPtr & partition, bool attach_part, const Context & context, PartsTemporaryRename & renamed_parts) { - String source_dir = "detached/"; + const String source_dir = "detached/"; std::map name_to_disk; + /// Let's compose a list of parts that should be added. if (attach_part) { - String part_id = partition->as().value.safeGet(); + const String part_id = partition->as().value.safeGet(); + validateDetachedPartName(part_id); renamed_parts.addPart(part_id, "attaching_" + part_id); + if (MergeTreePartInfo::tryParsePartName(part_id, nullptr, format_version)) name_to_disk[part_id] = getDiskForPart(part_id, source_dir); } @@ -3198,12 +3201,14 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const ActiveDataPartSet active_parts(format_version); const auto disks = getStoragePolicy()->getDisks(); + for (const auto & disk : disks) { for (auto it = disk->iterateDirectory(relative_data_path + source_dir); it->isValid(); it->next()) { const String & name = it->name(); MergeTreePartInfo part_info; + // TODO what if name contains "_tryN" suffix? /// Parts with prefix in name (e.g. 
attaching_1_3_3_0, deleting_1_3_3_0) will be ignored if (!MergeTreePartInfo::tryParsePartName(name, &part_info, format_version) @@ -3211,21 +3216,23 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const { continue; } + LOG_DEBUG(log, "Found part {}", name); active_parts.add(name); name_to_disk[name] = disk; } } LOG_DEBUG(log, "{} of them are active", active_parts.size()); - /// Inactive parts rename so they can not be attached in case of repeated ATTACH. + + /// Inactive parts are renamed so they can not be attached in case of repeated ATTACH. for (const auto & [name, disk] : name_to_disk) { - String containing_part = active_parts.getContainingPart(name); + const String containing_part = active_parts.getContainingPart(name); + if (!containing_part.empty() && containing_part != name) - { // TODO maybe use PartsTemporaryRename here? - disk->moveDirectory(relative_data_path + source_dir + name, relative_data_path + source_dir + "inactive_" + name); - } + disk->moveDirectory(relative_data_path + source_dir + name, + relative_data_path + source_dir + "inactive_" + name); else renamed_parts.addPart(name, "attaching_" + name); } @@ -3240,11 +3247,13 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const MutableDataPartsVector loaded_parts; loaded_parts.reserve(renamed_parts.old_and_new_names.size()); - for (const auto & part_names : renamed_parts.old_and_new_names) + for (const auto & [old_name, new_name] : renamed_parts.old_and_new_names) { - LOG_DEBUG(log, "Checking part {}", part_names.second); - auto single_disk_volume = std::make_shared("volume_" + part_names.first, name_to_disk[part_names.first], 0); - MutableDataPartPtr part = createPart(part_names.first, single_disk_volume, source_dir + part_names.second); + LOG_DEBUG(log, "Checking part {}", new_name); + + auto single_disk_volume = std::make_shared("volume_" + old_name, name_to_disk[old_name]); + MutableDataPartPtr part = createPart(old_name, 
single_disk_volume, source_dir + new_name); + loadPartAndFixMetadataImpl(part); loaded_parts.push_back(part); } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 2b6da96fede..63d776a838c 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -463,7 +463,7 @@ public: /// Remove parts from working set immediately (without wait for background /// process). Transfer part state to temporary. Have very limited usage only - /// for new parts which don't already present in table. + /// for new parts which aren't already present in table. void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove); /// Removes parts from the working set parts. diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp index dd141a68248..b0eb1cbea70 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp @@ -293,11 +293,8 @@ String MergeTreeDataPartChecksums::getTotalChecksumHex() const { SipHash hash_of_all_files; - for (const auto & elem : files) + for (const auto & [name, checksum] : files) { - const String & name = elem.first; - const auto & checksum = elem.second; - updateHash(hash_of_all_files, name); hash_of_all_files.update(checksum.file_hash); } @@ -376,11 +373,8 @@ void MinimalisticDataPartChecksums::computeTotalChecksums(const MergeTreeDataPar SipHash hash_of_uncompressed_files_state; SipHash uncompressed_hash_of_compressed_files_state; - for (const auto & elem : full_checksums_.files) + for (const auto & [name, checksum] : full_checksums_.files) { - const String & name = elem.first; - const auto & checksum = elem.second; - updateHash(hash_of_all_files_state, name); hash_of_all_files_state.update(checksum.file_hash); diff --git a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h 
b/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h index ebbe9865313..9877db8ee30 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h +++ b/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h @@ -6,7 +6,7 @@ namespace DB { -class MergeTreeIndexAggregatorBloomFilter : public IMergeTreeIndexAggregator +class MergeTreeIndexAggregatorBloomFilter final : public IMergeTreeIndexAggregator { public: MergeTreeIndexAggregatorBloomFilter(size_t bits_per_row_, size_t hash_functions_, const Names & columns_name_); diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h index b0d9a295bcd..1aac2c22aa0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h @@ -8,7 +8,7 @@ namespace DB { -class MergeTreeIndexBloomFilter : public IMergeTreeIndex +class MergeTreeIndexBloomFilter final : public IMergeTreeIndex { public: MergeTreeIndexBloomFilter( diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h index 34fb45c86a5..0b02e64d43c 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h @@ -13,7 +13,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -class MergeTreeIndexConditionBloomFilter : public IMergeTreeIndexCondition +class MergeTreeIndexConditionBloomFilter final : public IMergeTreeIndexCondition { public: struct RPNElement diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 3e8b9cc704b..419a417c3e8 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -43,15 +43,29 @@ namespace ErrorCodes /// Adds all tokens from string to bloom filter. 
static void stringToBloomFilter( + const String & string, TokenExtractorPtr token_extractor, BloomFilter & bloom_filter) +{ + const char * data = string.data(); + size_t size = string.size(); + + size_t cur = 0; + size_t token_start = 0; + size_t token_len = 0; + while (cur < size && token_extractor->nextInField(data, size, &cur, &token_start, &token_len)) + bloom_filter.add(data + token_start, token_len); +} + +static void columnToBloomFilter( const char * data, size_t size, TokenExtractorPtr token_extractor, BloomFilter & bloom_filter) { size_t cur = 0; size_t token_start = 0; size_t token_len = 0; - while (cur < size && token_extractor->next(data, size, &cur, &token_start, &token_len)) + while (cur < size && token_extractor->nextInColumn(data, size, &cur, &token_start, &token_len)) bloom_filter.add(data + token_start, token_len); } + /// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.) static void likeStringToBloomFilter( const String & data, TokenExtractorPtr token_extractor, BloomFilter & bloom_filter) @@ -61,15 +75,14 @@ static void likeStringToBloomFilter( while (cur < data.size() && token_extractor->nextLike(data, &cur, token)) bloom_filter.add(token.c_str(), token.size()); } + /// Unified condition for equals, startsWith and endsWith bool MergeTreeConditionFullText::createFunctionEqualsCondition( RPNElement & out, const Field & value, const BloomFilterParameters & params, TokenExtractorPtr token_extractor) { out.function = RPNElement::FUNCTION_EQUALS; out.bloom_filter = std::make_unique(params); - - const auto & str = value.get(); - stringToBloomFilter(str.c_str(), str.size(), token_extractor, *out.bloom_filter); + stringToBloomFilter(value.get(), token_extractor, *out.bloom_filter); return true; } @@ -143,7 +156,7 @@ void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, for (size_t i = 0; i < rows_read; ++i) { auto ref = column->getDataAt(*pos + i); - 
stringToBloomFilter(ref.data, ref.size, token_extractor, granule->bloom_filters[col]); + columnToBloomFilter(ref.data, ref.size, token_extractor, granule->bloom_filters[col]); } } granule->has_elems = true; @@ -367,9 +380,7 @@ bool MergeTreeConditionFullText::atomFromAST( out.key_column = key_column_num; out.function = RPNElement::FUNCTION_NOT_EQUALS; out.bloom_filter = std::make_unique(params); - - const auto & str = const_value.get(); - stringToBloomFilter(str.c_str(), str.size(), token_extractor, *out.bloom_filter); + stringToBloomFilter(const_value.get(), token_extractor, *out.bloom_filter); return true; } else if (func_name == "equals") @@ -382,9 +393,7 @@ bool MergeTreeConditionFullText::atomFromAST( out.key_column = key_column_num; out.function = RPNElement::FUNCTION_EQUALS; out.bloom_filter = std::make_unique(params); - - const auto & str = const_value.get(); - likeStringToBloomFilter(str, token_extractor, *out.bloom_filter); + likeStringToBloomFilter(const_value.get(), token_extractor, *out.bloom_filter); return true; } else if (func_name == "notLike") @@ -392,9 +401,7 @@ bool MergeTreeConditionFullText::atomFromAST( out.key_column = key_column_num; out.function = RPNElement::FUNCTION_NOT_EQUALS; out.bloom_filter = std::make_unique(params); - - const auto & str = const_value.get(); - likeStringToBloomFilter(str, token_extractor, *out.bloom_filter); + likeStringToBloomFilter(const_value.get(), token_extractor, *out.bloom_filter); return true; } else if (func_name == "hasToken") @@ -402,9 +409,7 @@ bool MergeTreeConditionFullText::atomFromAST( out.key_column = key_column_num; out.function = RPNElement::FUNCTION_EQUALS; out.bloom_filter = std::make_unique(params); - - const auto & str = const_value.get(); - stringToBloomFilter(str.c_str(), str.size(), token_extractor, *out.bloom_filter); + stringToBloomFilter(const_value.get(), token_extractor, *out.bloom_filter); return true; } else if (func_name == "startsWith") @@ -431,8 +436,7 @@ bool 
MergeTreeConditionFullText::atomFromAST( return false; bloom_filters.back().emplace_back(params); - const auto & str = element.get(); - stringToBloomFilter(str.c_str(), str.size(), token_extractor, bloom_filters.back().back()); + stringToBloomFilter(element.get(), token_extractor, bloom_filters.back().back()); } out.set_bloom_filters = std::move(bloom_filters); return true; @@ -541,7 +545,7 @@ bool MergeTreeConditionFullText::tryPrepareSetBloomFilter( { bloom_filters.back().emplace_back(params); auto ref = column->getDataAt(row); - stringToBloomFilter(ref.data, ref.size, token_extractor, bloom_filters.back().back()); + columnToBloomFilter(ref.data, ref.size, token_extractor, bloom_filters.back().back()); } } @@ -573,7 +577,7 @@ bool MergeTreeIndexFullText::mayBenefitFromIndexForIn(const ASTPtr & node) const } -bool NgramTokenExtractor::next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const +bool NgramTokenExtractor::nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const { *token_start = *pos; *token_len = 0; @@ -635,7 +639,33 @@ bool NgramTokenExtractor::nextLike(const String & str, size_t * pos, String & to return false; } -bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const + +bool SplitTokenExtractor::nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const +{ + *token_start = *pos; + *token_len = 0; + + while (*pos < len) + { + if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos])) + { + /// Finish current token if any + if (*token_len > 0) + return true; + *token_start = ++*pos; + } + else + { + /// Note that UTF-8 sequence is completely consisted of non-ASCII bytes. 
+ ++*pos; + ++*token_len; + } + } + + return *token_len > 0; +} + +bool SplitTokenExtractor::nextInColumn(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const { *token_start = *pos; *token_len = 0; diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.h b/src/Storages/MergeTree/MergeTreeIndexFullText.h index c3c1ff8de8b..d861751c7df 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.h +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.h @@ -14,10 +14,18 @@ namespace DB struct ITokenExtractor { virtual ~ITokenExtractor() = default; + /// Fast inplace implementation for regular use. /// Gets string (data ptr and len) and start position for extracting next token (state of extractor). /// Returns false if parsing is finished, otherwise returns true. - virtual bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const = 0; + virtual bool nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const = 0; + + /// Optimized version that can assume at least 15 padding bytes after data + len (as our Columns provide). + virtual bool nextInColumn(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const + { + return nextInField(data, len, pos, token_start, token_len); + } + /// Special implementation for creating bloom filter for LIKE function. /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight. 
virtual bool nextLike(const String & str, size_t * pos, String & out) const = 0; @@ -27,7 +35,7 @@ struct ITokenExtractor using TokenExtractorPtr = const ITokenExtractor *; -struct MergeTreeIndexGranuleFullText : public IMergeTreeIndexGranule +struct MergeTreeIndexGranuleFullText final : public IMergeTreeIndexGranule { explicit MergeTreeIndexGranuleFullText( const String & index_name_, @@ -50,7 +58,7 @@ struct MergeTreeIndexGranuleFullText : public IMergeTreeIndexGranule using MergeTreeIndexGranuleFullTextPtr = std::shared_ptr; -struct MergeTreeIndexAggregatorFullText : IMergeTreeIndexAggregator +struct MergeTreeIndexAggregatorFullText final : IMergeTreeIndexAggregator { explicit MergeTreeIndexAggregatorFullText( const Names & index_columns_, @@ -74,7 +82,7 @@ struct MergeTreeIndexAggregatorFullText : IMergeTreeIndexAggregator }; -class MergeTreeConditionFullText : public IMergeTreeIndexCondition +class MergeTreeConditionFullText final : public IMergeTreeIndexCondition { public: MergeTreeConditionFullText( @@ -156,13 +164,13 @@ private: /// Parser extracting all ngrams from string. -struct NgramTokenExtractor : public ITokenExtractor +struct NgramTokenExtractor final : public ITokenExtractor { NgramTokenExtractor(size_t n_) : n(n_) {} static String getName() { return "ngrambf_v1"; } - bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override; + bool nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override; bool nextLike(const String & str, size_t * pos, String & token) const override; bool supportLike() const override { return true; } @@ -171,18 +179,19 @@ struct NgramTokenExtractor : public ITokenExtractor }; /// Parser extracting tokens (sequences of numbers and ascii letters). 
-struct SplitTokenExtractor : public ITokenExtractor +struct SplitTokenExtractor final : public ITokenExtractor { static String getName() { return "tokenbf_v1"; } - bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override; + bool nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override; + bool nextInColumn(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override; bool nextLike(const String & str, size_t * pos, String & token) const override; bool supportLike() const override { return true; } }; -class MergeTreeIndexFullText : public IMergeTreeIndex +class MergeTreeIndexFullText final : public IMergeTreeIndex { public: MergeTreeIndexFullText( diff --git a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h index 54e2c105db8..cdd4b92f80c 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h @@ -6,7 +6,7 @@ namespace DB { -class MergeTreeIndexGranuleBloomFilter : public IMergeTreeIndexGranule +class MergeTreeIndexGranuleBloomFilter final : public IMergeTreeIndexGranule { public: MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, size_t index_columns_); diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.h b/src/Storages/MergeTree/MergeTreeIndexMinMax.h index 3956b1d9f9a..8d782d9a7dc 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.h +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.h @@ -10,7 +10,7 @@ namespace DB { -struct MergeTreeIndexGranuleMinMax : public IMergeTreeIndexGranule +struct MergeTreeIndexGranuleMinMax final : public IMergeTreeIndexGranule { MergeTreeIndexGranuleMinMax(const String & index_name_, const Block & index_sample_block_); MergeTreeIndexGranuleMinMax( @@ -31,7 +31,7 @@ struct MergeTreeIndexGranuleMinMax : 
public IMergeTreeIndexGranule }; -struct MergeTreeIndexAggregatorMinMax : IMergeTreeIndexAggregator +struct MergeTreeIndexAggregatorMinMax final : IMergeTreeIndexAggregator { MergeTreeIndexAggregatorMinMax(const String & index_name_, const Block & index_sample_block); ~MergeTreeIndexAggregatorMinMax() override = default; @@ -46,7 +46,7 @@ struct MergeTreeIndexAggregatorMinMax : IMergeTreeIndexAggregator }; -class MergeTreeIndexConditionMinMax : public IMergeTreeIndexCondition +class MergeTreeIndexConditionMinMax final : public IMergeTreeIndexCondition { public: MergeTreeIndexConditionMinMax( diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.h b/src/Storages/MergeTree/MergeTreeIndexSet.h index d84991f5e85..90389264d53 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.h +++ b/src/Storages/MergeTree/MergeTreeIndexSet.h @@ -14,7 +14,7 @@ namespace DB class MergeTreeIndexSet; -struct MergeTreeIndexGranuleSet : public IMergeTreeIndexGranule +struct MergeTreeIndexGranuleSet final : public IMergeTreeIndexGranule { explicit MergeTreeIndexGranuleSet( const String & index_name_, @@ -42,7 +42,7 @@ struct MergeTreeIndexGranuleSet : public IMergeTreeIndexGranule }; -struct MergeTreeIndexAggregatorSet : IMergeTreeIndexAggregator +struct MergeTreeIndexAggregatorSet final : IMergeTreeIndexAggregator { explicit MergeTreeIndexAggregatorSet( const String & index_name_, @@ -79,7 +79,7 @@ private: }; -class MergeTreeIndexConditionSet : public IMergeTreeIndexCondition +class MergeTreeIndexConditionSet final : public IMergeTreeIndexCondition { public: MergeTreeIndexConditionSet( @@ -113,7 +113,7 @@ private: }; -class MergeTreeIndexSet : public IMergeTreeIndex +class MergeTreeIndexSet final : public IMergeTreeIndex { public: MergeTreeIndexSet( diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp index 6f90d9f00a9..529e3d2ab49 100644 --- 
a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp @@ -41,12 +41,14 @@ ReplicatedMergeTreeBlockOutputStream::ReplicatedMergeTreeBlockOutputStream( size_t max_parts_per_block_, bool quorum_parallel_, bool deduplicate_, - bool optimize_on_insert_) + bool optimize_on_insert_, + bool is_attach_) : storage(storage_) , metadata_snapshot(metadata_snapshot_) , quorum(quorum_) , quorum_timeout_ms(quorum_timeout_ms_) , max_parts_per_block(max_parts_per_block_) + , is_attach(is_attach_) , quorum_parallel(quorum_parallel_) , deduplicate(deduplicate_) , log(&Poco::Logger::get(storage.getLogName() + " (Replicated OutputStream)")) @@ -263,10 +265,20 @@ void ReplicatedMergeTreeBlockOutputStream::commitPart( part->name = part->getNewName(part->info); - /// Will add log entry about new part. - StorageReplicatedMergeTree::LogEntry log_entry; - log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART; + + if (is_attach) + { + log_entry.type = StorageReplicatedMergeTree::LogEntry::ATTACH_PART; + + /// We don't need to involve ZooKeeper to obtain the checksums as by the time we get + /// the MutableDataPartPtr here, we already have the data thus being able to + /// calculate the checksums. 
+ log_entry.part_checksum = part->checksums.getTotalChecksumHex(); + } + else + log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART; + log_entry.create_time = time(nullptr); log_entry.source_replica = storage.replica_name; log_entry.new_part_name = part->name; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h index 3ac2c4bcfcb..860b0c4ed12 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h @@ -30,7 +30,10 @@ public: size_t max_parts_per_block_, bool quorum_parallel_, bool deduplicate_, - bool optimize_on_insert); + bool optimize_on_insert, + // special flag to determine the ALTER TABLE ATTACH PART without the query context, + // needed to set the special LogEntryType::ATTACH_PART + bool is_attach_ = false); Block getHeader() const override; void writePrefix() override; @@ -66,6 +69,7 @@ private: size_t quorum_timeout_ms; size_t max_parts_per_block; + bool is_attach = false; bool quorum_parallel = false; bool deduplicate = true; bool last_block_is_duplicate = false; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.cpp index 07c64d9c95c..7d8ba0e4a30 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.cpp @@ -1,4 +1,5 @@ #include +#include "Access/IAccessEntity.h" #include #include @@ -52,6 +53,11 @@ void ReplicatedMergeTreeLogEntryData::writeText(WriteBuffer & out) const out << "get\n" << new_part_name; break; + case ATTACH_PART: + out << "attach\n" << new_part_name << "\n" + << "part_checksum: " << part_checksum; + break; + case MERGE_PARTS: out << "merge\n"; for (const String & s : source_parts) @@ -136,7 +142,7 @@ void ReplicatedMergeTreeLogEntryData::writeText(WriteBuffer & out) const break; default: - throw Exception("Unknown log 
entry type: " + DB::toString(type), ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown log entry type: {}", static_cast(type)); } out << '\n'; @@ -156,7 +162,8 @@ void ReplicatedMergeTreeLogEntryData::readText(ReadBuffer & in) in >> "format version: " >> format_version >> "\n"; if (format_version < 1 || format_version >= FORMAT_LAST) - throw Exception("Unknown ReplicatedMergeTreeLogEntry format version: " + DB::toString(format_version), ErrorCodes::UNKNOWN_FORMAT_VERSION); + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown ReplicatedMergeTreeLogEntry format version: {}", + DB::toString(format_version)); if (format_version >= FORMAT_WITH_CREATE_TIME) { @@ -177,11 +184,17 @@ void ReplicatedMergeTreeLogEntryData::readText(ReadBuffer & in) in >> type_str >> "\n"; bool trailing_newline_found = false; + if (type_str == "get") { type = GET_PART; in >> new_part_name; } + else if (type_str == "attach") + { + type = ATTACH_PART; + in >> new_part_name >> "\npart_checksum: " >> part_checksum; + } else if (type_str == "merge") { type = MERGE_PARTS; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h index afd8c963943..309120560e7 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h @@ -30,29 +30,32 @@ struct ReplicatedMergeTreeLogEntryData { enum Type { - EMPTY, /// Not used. - GET_PART, /// Get the part from another replica. - MERGE_PARTS, /// Merge the parts. - DROP_RANGE, /// Delete the parts in the specified partition in the specified number range. - CLEAR_COLUMN, /// NOTE: Deprecated. Drop specific column from specified partition. - CLEAR_INDEX, /// NOTE: Deprecated. Drop specific index from specified partition. - REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones - MUTATE_PART, /// Apply one or several mutations to the part. 
- ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths + EMPTY, /// Not used. + GET_PART, /// Get the part from another replica. + ATTACH_PART, /// Attach the part, possibly from our own replica (if found in /detached folder). + /// You may think of it as a GET_PART with some optimisations as they're nearly identical. + MERGE_PARTS, /// Merge the parts. + DROP_RANGE, /// Delete the parts in the specified partition in the specified number range. + CLEAR_COLUMN, /// NOTE: Deprecated. Drop specific column from specified partition. + CLEAR_INDEX, /// NOTE: Deprecated. Drop specific index from specified partition. + REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones + MUTATE_PART, /// Apply one or several mutations to the part. + ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths }; static String typeToString(Type type) { switch (type) { - case ReplicatedMergeTreeLogEntryData::GET_PART: return "GET_PART"; - case ReplicatedMergeTreeLogEntryData::MERGE_PARTS: return "MERGE_PARTS"; - case ReplicatedMergeTreeLogEntryData::DROP_RANGE: return "DROP_RANGE"; - case ReplicatedMergeTreeLogEntryData::CLEAR_COLUMN: return "CLEAR_COLUMN"; - case ReplicatedMergeTreeLogEntryData::CLEAR_INDEX: return "CLEAR_INDEX"; - case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE"; - case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART"; - case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA"; + case ReplicatedMergeTreeLogEntryData::GET_PART: return "GET_PART"; + case ReplicatedMergeTreeLogEntryData::ATTACH_PART: return "ATTACH_PART"; + case ReplicatedMergeTreeLogEntryData::MERGE_PARTS: return "MERGE_PARTS"; + case ReplicatedMergeTreeLogEntryData::DROP_RANGE: return "DROP_RANGE"; + case ReplicatedMergeTreeLogEntryData::CLEAR_COLUMN: return "CLEAR_COLUMN"; + case ReplicatedMergeTreeLogEntryData::CLEAR_INDEX: return 
"CLEAR_INDEX"; + case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE"; + case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART"; + case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA"; default: throw Exception("Unknown log entry type: " + DB::toString(type), ErrorCodes::LOGICAL_ERROR); } @@ -72,6 +75,8 @@ struct ReplicatedMergeTreeLogEntryData Type type = EMPTY; String source_replica; /// Empty string means that this entry was added to the queue immediately, and not copied from the log. + String part_checksum; /// Part checksum for ATTACH_PART, empty otherwise. + /// The name of resulting part for GET_PART and MERGE_PARTS /// Part range for DROP_RANGE and CLEAR_COLUMN String new_part_name; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index b2a144ca748..95883c65abb 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -213,7 +213,7 @@ std::pair ReplicatedMergeTreePartCheckThread::findLo /// because our checks of local storage and zookeeper are not consistent. /// If part exists in zookeeper and doesn't exists in local storage definitely require /// to fetch this part. But if we check local storage first and than check zookeeper - /// some background process can successfully commit part between this checks (both to the local stoarge and zookeeper), + /// some background process can successfully commit part between this checks (both to the local storage and zookeeper), /// but checker thread will remove part from zookeeper and queue fetch. 
bool exists_in_zookeeper = zookeeper->exists(part_path); @@ -234,6 +234,8 @@ CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_na auto [exists_in_zookeeper, part] = findLocalPart(part_name); + LOG_TRACE(log, "Part {} in zookeeper: {}, locally: {}", part_name, exists_in_zookeeper, part != nullptr); + /// We do not have this or a covering part. if (!part) { @@ -250,6 +252,9 @@ CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_na auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums( part->getColumns(), part->checksums); + /// The double get scheme is needed to retain compatibility with very old parts that were created + /// before the ReplicatedMergeTreePartHeader was introduced. + String part_path = storage.replica_path + "/parts/" + part_name; String part_znode; /// If the part is in ZooKeeper, check its data with its checksums, and them with ZooKeeper. diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 26a916d2356..ad41bbe1a08 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -145,7 +145,7 @@ void ReplicatedMergeTreeQueue::insertUnlocked( else queue.push_front(entry); - if (entry->type == LogEntry::GET_PART) + if (entry->type == LogEntry::GET_PART || entry->type == LogEntry::ATTACH_PART) { inserts_by_time.insert(entry); @@ -184,7 +184,7 @@ void ReplicatedMergeTreeQueue::updateStateOnQueueEntryRemoval( std::unique_lock & state_lock) { /// Update insert times. 
- if (entry->type == LogEntry::GET_PART) + if (entry->type == LogEntry::GET_PART || entry->type == LogEntry::ATTACH_PART) { inserts_by_time.erase(entry); @@ -563,7 +563,7 @@ int32_t ReplicatedMergeTreeQueue::pullLogsToQueue(zkutil::ZooKeeperPtr zookeeper replica_path + "/queue/queue-", res.data, zkutil::CreateMode::PersistentSequential)); const auto & entry = *copied_entries.back(); - if (entry.type == LogEntry::GET_PART) + if (entry.type == LogEntry::GET_PART || entry.type == LogEntry::ATTACH_PART) { std::lock_guard state_lock(state_mutex); if (entry.create_time && (!min_unprocessed_insert_time || entry.create_time < min_unprocessed_insert_time)) @@ -871,7 +871,12 @@ ReplicatedMergeTreeQueue::StringSet ReplicatedMergeTreeQueue::moveSiblingPartsFo if (it0 == merge_entry) break; - if (((*it0)->type == LogEntry::MERGE_PARTS || (*it0)->type == LogEntry::GET_PART || (*it0)->type == LogEntry::MUTATE_PART) + const auto t = (*it0)->type; + + if ((t == LogEntry::MERGE_PARTS || + t == LogEntry::GET_PART || + t == LogEntry::ATTACH_PART || + t == LogEntry::MUTATE_PART) && parts_for_merge.count((*it0)->new_part_name)) { queue.splice(queue.end(), queue, it0, it); @@ -921,7 +926,10 @@ void ReplicatedMergeTreeQueue::removePartProducingOpsInRange( { auto type = (*it)->type; - if (((type == LogEntry::GET_PART || type == LogEntry::MERGE_PARTS || type == LogEntry::MUTATE_PART) + if (((type == LogEntry::GET_PART || + type == LogEntry::ATTACH_PART || + type == LogEntry::MERGE_PARTS || + type == LogEntry::MUTATE_PART) && part_info.contains(MergeTreePartInfo::fromPartName((*it)->new_part_name, format_version))) || checkReplaceRangeCanBeRemoved(part_info, *it, current)) { @@ -1066,6 +1074,7 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( /// some other entry which is currently executing, then we can postpone this entry. 
if (entry.type == LogEntry::MERGE_PARTS || entry.type == LogEntry::GET_PART + || entry.type == LogEntry::ATTACH_PART || entry.type == LogEntry::MUTATE_PART) { for (const String & new_part_name : entry.getBlockingPartNames()) @@ -1076,7 +1085,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( } /// Check that fetches pool is not overloaded - if (entry.type == LogEntry::GET_PART && !storage.canExecuteFetch(entry, out_postpone_reason)) + if ((entry.type == LogEntry::GET_PART || entry.type == LogEntry::ATTACH_PART) + && !storage.canExecuteFetch(entry, out_postpone_reason)) { /// Don't print log message about this, because we can have a lot of fetches, /// for example during replica recovery. @@ -1643,7 +1653,7 @@ ReplicatedMergeTreeQueue::Status ReplicatedMergeTreeQueue::getStatus() const if (entry->create_time && (!res.queue_oldest_time || entry->create_time < res.queue_oldest_time)) res.queue_oldest_time = entry->create_time; - if (entry->type == LogEntry::GET_PART) + if (entry->type == LogEntry::GET_PART || entry->type == LogEntry::ATTACH_PART) { ++res.inserts_in_queue; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 410410feb11..f9d63132a1b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -59,6 +59,7 @@ #include #include +#include "Storages/MergeTree/MergeTreeReaderCompact.h" #include #include @@ -1353,6 +1354,48 @@ String StorageReplicatedMergeTree::getChecksumsForZooKeeper(const MergeTreeDataP getSettings()->use_minimalistic_checksums_in_zookeeper); } +MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::attachPartHelperFoundValidPart(const LogEntry& entry) const +{ + const MergeTreePartInfo actual_part_info = MergeTreePartInfo::fromPartName(entry.new_part_name, format_version); + const String part_new_name = actual_part_info.getPartName(); + + for (const DiskPtr & disk : getStoragePolicy()->getDisks()) + for (const auto it = 
disk->iterateDirectory(relative_data_path + "detached/"); it->isValid(); it->next()) + { + MergeTreePartInfo part_info; + + if (!MergeTreePartInfo::tryParsePartName(it->name(), &part_info, format_version) || + part_info.partition_id != actual_part_info.partition_id) + continue; + + const String part_old_name = part_info.getPartName(); + const String part_path = "detached/" + part_old_name; + + const VolumePtr volume = std::make_shared("volume_" + part_old_name, disk); + + /// actual_part_info is more recent than part_info so we use it + MergeTreeData::MutableDataPartPtr part = createPart(part_new_name, actual_part_info, volume, part_path); + + try + { + part->loadColumnsChecksumsIndexes(true, true); + } + catch (const Exception&) + { + /// This method throws if the part data is corrupted or partly missing. In this case, we simply don't + /// process the part. + continue; + } + + if (entry.part_checksum == part->checksums.getTotalChecksumHex()) + { + part->modification_time = disk->getLastModified(part->getFullRelativePath()).epochTime(); + return part; + } + } + + return {}; +} bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) { @@ -1368,32 +1411,54 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) return true; } - if (entry.type == LogEntry::GET_PART || - entry.type == LogEntry::MERGE_PARTS || - entry.type == LogEntry::MUTATE_PART) + const bool is_get_or_attach = entry.type == LogEntry::GET_PART || entry.type == LogEntry::ATTACH_PART; + + if (is_get_or_attach || entry.type == LogEntry::MERGE_PARTS || entry.type == LogEntry::MUTATE_PART) { /// If we already have this part or a part covering it, we do not need to do anything. /// The part may be still in the PreCommitted -> Committed transition so we first search /// among PreCommitted parts to definitely find the desired part if it exists. 
DataPartPtr existing_part = getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreCommitted}); + if (!existing_part) existing_part = getActiveContainingPart(entry.new_part_name); - /// Even if the part is locally, it (in exceptional cases) may not be in ZooKeeper. Let's check that it is there. + /// Even if the part is local, it (in exceptional cases) may not be in ZooKeeper. Let's check that it is there. if (existing_part && getZooKeeper()->exists(replica_path + "/parts/" + existing_part->name)) { - if (!(entry.type == LogEntry::GET_PART && entry.source_replica == replica_name)) - { - LOG_DEBUG(log, "Skipping action for part {} because part {} already exists.", entry.new_part_name, existing_part->name); - } + if (!is_get_or_attach || entry.source_replica != replica_name) + LOG_DEBUG(log, "Skipping action for part {} because part {} already exists.", + entry.new_part_name, existing_part->name); + return true; } } - if (entry.type == LogEntry::GET_PART && entry.source_replica == replica_name) + if (entry.type == LogEntry::ATTACH_PART) + { + if (MutableDataPartPtr part = attachPartHelperFoundValidPart(entry); part) + { + LOG_TRACE(log, "Found valid part to attach from local data, preparing the transaction"); + + Transaction transaction(*this); + + renameTempPartAndReplace(part, nullptr, &transaction); + checkPartChecksumsAndCommit(transaction, part); + + writePartLog(PartLogElement::Type::NEW_PART, {}, 0 /** log entry is fake so we don't measure the time */, + part->name, part, {} /** log entry is fake so there are no initial parts */, nullptr); + + return true; + } + + LOG_TRACE(log, "Didn't find part with the correct checksums, will fetch it from other replica"); + } + + if (is_get_or_attach && entry.source_replica == replica_name) LOG_WARNING(log, "Part {} from own log doesn't exist.", entry.new_part_name); - /// Perhaps we don't need this part, because during write with quorum, the quorum has failed (see below about `/quorum/failed_parts`). 
+ /// Perhaps we don't need this part, because during write with quorum, the quorum has failed + /// (see below about `/quorum/failed_parts`). if (entry.quorum && getZooKeeper()->exists(zookeeper_path + "/quorum/failed_parts/" + entry.new_part_name)) { LOG_DEBUG(log, "Skipping action for part {} because quorum for that part was failed.", entry.new_part_name); @@ -1401,28 +1466,28 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) } bool do_fetch = false; - if (entry.type == LogEntry::GET_PART) + + switch (entry.type) { - do_fetch = true; - } - else if (entry.type == LogEntry::MERGE_PARTS) - { - /// Sometimes it's better to fetch merged part instead of merge - /// For example when we don't have all source parts for merge - do_fetch = !tryExecuteMerge(entry); - } - else if (entry.type == LogEntry::MUTATE_PART) - { - /// Sometimes it's better to fetch mutated part instead of merge - do_fetch = !tryExecutePartMutation(entry); - } - else if (entry.type == LogEntry::ALTER_METADATA) - { - return executeMetadataAlter(entry); - } - else - { - throw Exception("Unexpected log entry type: " + toString(static_cast(entry.type)), ErrorCodes::LOGICAL_ERROR); + case LogEntry::ATTACH_PART: + /// We surely don't have this part locally as we've checked it before, so download it. + [[fallthrough]]; + case LogEntry::GET_PART: + do_fetch = true; + break; + case LogEntry::MERGE_PARTS: + /// Sometimes it's better to fetch the merged part instead of merging, + /// e.g when we don't have all the source parts. + do_fetch = !tryExecuteMerge(entry); + break; + case LogEntry::MUTATE_PART: + /// Sometimes it's better to fetch mutated part instead of merging. 
+ do_fetch = !tryExecutePartMutation(entry); + break; + case LogEntry::ALTER_METADATA: + return executeMetadataAlter(entry); + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected log entry type: {}", static_cast(entry.type)); } if (do_fetch) @@ -1433,7 +1498,8 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) { - LOG_TRACE(log, "Executing log entry to merge parts {} to {}", boost::algorithm::join(entry.source_parts, ", "), entry.new_part_name); + LOG_TRACE(log, "Executing log entry to merge parts {} to {}", + fmt::join(entry.source_parts, ", "), entry.new_part_name); const auto storage_settings_ptr = getSettings(); @@ -1458,6 +1524,7 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) /// instead of doing exactly the same merge cluster-wise std::optional replica_to_execute_merge; bool replica_to_execute_merge_picked = false; + if (merge_strategy_picker.shouldMergeOnSingleReplica(entry)) { replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry); @@ -1465,16 +1532,21 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) if (replica_to_execute_merge) { - LOG_DEBUG(log, "Prefer fetching part {} from replica {} due execute_merges_on_single_replica_time_threshold", entry.new_part_name, replica_to_execute_merge.value()); + LOG_DEBUG(log, + "Prefer fetching part {} from replica {} due to execute_merges_on_single_replica_time_threshold", + entry.new_part_name, replica_to_execute_merge.value()); + return false; } } DataPartsVector parts; bool have_all_parts = true; + for (const String & name : entry.source_parts) { DataPartPtr part = getActiveContainingPart(name); + if (!part) { have_all_parts = false; @@ -1557,8 +1629,7 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) if (storage_settings_ptr->allow_s3_zero_copy_replication) { - auto disk = reserved_space->getDisk(); - if 
(disk->getType() == DB::DiskType::Type::S3) + if (auto disk = reserved_space->getDisk(); disk->getType() == DB::DiskType::Type::S3) { if (merge_strategy_picker.shouldMergeOnSingleReplicaS3Shared(entry)) { @@ -1567,7 +1638,9 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) if (replica_to_execute_merge) { - LOG_DEBUG(log, "Prefer fetching part {} from replica {} due s3_execute_merges_on_single_replica_time_threshold", entry.new_part_name, replica_to_execute_merge.value()); + LOG_DEBUG(log, + "Prefer fetching part {} from replica {} due s3_execute_merges_on_single_replica_time_threshold", + entry.new_part_name, replica_to_execute_merge.value()); return false; } } @@ -1579,8 +1652,10 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) global_context.getMergeList().bookMergeWithTTL(); auto table_id = getStorageID(); + /// Add merge to list - MergeList::EntryPtr merge_entry = global_context.getMergeList().insert(table_id.database_name, table_id.table_name, future_merged_part); + MergeList::EntryPtr merge_entry = global_context.getMergeList().insert( + table_id.database_name, table_id.table_name, future_merged_part); Transaction transaction(*this); MutableDataPartPtr part; @@ -1614,7 +1689,16 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) ProfileEvents::increment(ProfileEvents::DataAfterMergeDiffersFromReplica); - LOG_ERROR(log, "{}. Data after merge is not byte-identical to data on another replicas. There could be several reasons: 1. Using newer version of compression library after server update. 2. Using another compression method. 3. Non-deterministic compression algorithm (highly unlikely). 4. Non-deterministic merge algorithm due to logical error in code. 5. Data corruption in memory due to bug in code. 6. Data corruption in memory due to hardware issue. 7. Manual modification of source data after server startup. 8. Manual modification of checksums stored in ZooKeeper. 9. 
Part format related settings like 'enable_mixed_granularity_parts' are different on different replicas. We will download merged part from replica to force byte-identical result.", getCurrentExceptionMessage(false)); + LOG_ERROR(log, + "{}. Data after merge is not byte-identical to data on another replicas. There could be several" + " reasons: 1. Using newer version of compression library after server update. 2. Using another" + " compression method. 3. Non-deterministic compression algorithm (highly unlikely). 4." + " Non-deterministic merge algorithm due to logical error in code. 5. Data corruption in memory due" + " to bug in code. 6. Data corruption in memory due to hardware issue. 7. Manual modification of" + " source data after server startup. 8. Manual modification of checksums stored in ZooKeeper. 9." + " Part format related settings like 'enable_mixed_granularity_parts' are different on different" + " replicas. We will download merged part from replica to force byte-identical result.", + getCurrentExceptionMessage(false)); write_part_log(ExecutionStatus::fromCurrentException()); @@ -1778,21 +1862,18 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry) const auto storage_settings_ptr = getSettings(); auto metadata_snapshot = getInMemoryMetadataPtr(); - if (storage_settings_ptr->replicated_max_parallel_fetches && total_fetches >= storage_settings_ptr->replicated_max_parallel_fetches) - { - throw Exception("Too many total fetches from replicas, maximum: " + storage_settings_ptr->replicated_max_parallel_fetches.toString(), - ErrorCodes::TOO_MANY_FETCHES); - } + if (storage_settings_ptr->replicated_max_parallel_fetches && + total_fetches >= storage_settings_ptr->replicated_max_parallel_fetches) + throw Exception(ErrorCodes::TOO_MANY_FETCHES, "Too many total fetches from replicas, maximum: {} ", + storage_settings_ptr->replicated_max_parallel_fetches.toString()); ++total_fetches; SCOPE_EXIT({--total_fetches;}); if 
(storage_settings_ptr->replicated_max_parallel_fetches_for_table && current_table_fetches >= storage_settings_ptr->replicated_max_parallel_fetches_for_table) - { - throw Exception("Too many fetches from replicas for table, maximum: " + storage_settings_ptr->replicated_max_parallel_fetches_for_table.toString(), - ErrorCodes::TOO_MANY_FETCHES); - } + throw Exception(ErrorCodes::TOO_MANY_FETCHES, "Too many fetches from replicas for table, maximum: {}", + storage_settings_ptr->replicated_max_parallel_fetches_for_table.toString()); ++current_table_fetches; SCOPE_EXIT({--current_table_fetches;}); @@ -3279,12 +3360,16 @@ String StorageReplicatedMergeTree::findReplicaHavingPart(const String & part_nam /// Select replicas in uniformly random order. std::shuffle(replicas.begin(), replicas.end(), thread_local_rng); + LOG_TRACE(log, "Candidate replicas: {}", replicas.size()); + for (const String & replica : replicas) { - /// We don't interested in ourself. + /// We aren't interested in ourself. if (replica == replica_name) continue; + LOG_TRACE(log, "Candidate replica: {}", replica); + if (checkReplicaHavePart(replica, part_name) && (!active || zookeeper->exists(zookeeper_path + "/replicas/" + replica + "/is_active"))) return replica; @@ -3670,6 +3755,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora String interserver_scheme; std::optional tagger_ptr; std::function get_part; + if (part_to_clone) { get_part = [&, part_to_clone]() @@ -4756,13 +4842,20 @@ PartitionCommandsResultInfo StorageReplicatedMergeTree::attachPartition( PartsTemporaryRename renamed_parts(*this, "detached/"); MutableDataPartsVector loaded_parts = tryLoadPartsToAttach(partition, attach_part, query_context, renamed_parts); - ReplicatedMergeTreeBlockOutputStream output(*this, metadata_snapshot, 0, 0, 0, false, false, false); /// TODO Allow to use quorum here. + /// TODO Allow to use quorum here. 
+ ReplicatedMergeTreeBlockOutputStream output(*this, metadata_snapshot, 0, 0, 0, false, false, false, + /*is_attach*/true); + for (size_t i = 0; i < loaded_parts.size(); ++i) { - String old_name = loaded_parts[i]->name; + const String old_name = loaded_parts[i]->name; + output.writeExistingPart(loaded_parts[i]); + renamed_parts.old_and_new_names[i].first.clear(); + LOG_DEBUG(log, "Attached part {} as {}", old_name, loaded_parts[i]->name); + results.push_back(PartitionCommandResultInfo{ .partition_id = loaded_parts[i]->info.partition_id, .part_name = loaded_parts[i]->name, diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 6393f4d58a5..0c8aca18c6a 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -39,13 +39,14 @@ namespace DB * - the structure of the table (/metadata, /columns) * - action log with data (/log/log-...,/replicas/replica_name/queue/queue-...); * - a replica list (/replicas), and replica activity tag (/replicas/replica_name/is_active), replica addresses (/replicas/replica_name/host); - * - select the leader replica (/leader_election) - these are the replicas that assigning merges, mutations and partition manipulations + * - the leader replica election (/leader_election) - these are the replicas that assign merges, mutations + * and partition manipulations. * (after ClickHouse version 20.5 we allow multiple leaders to act concurrently); * - a set of parts of data on each replica (/replicas/replica_name/parts); * - list of the last N blocks of data with checksum, for deduplication (/blocks); * - the list of incremental block numbers (/block_numbers) that we are about to insert, * to ensure the linear order of data insertion and data merge only on the intervals in this sequence; - * - coordinates writes with quorum (/quorum). + * - coordinate writes with quorum (/quorum). * - Storage of mutation entries (ALTER DELETE, ALTER UPDATE etc.) 
to execute (/mutations). * See comments in StorageReplicatedMergeTree::mutate() for details. */ @@ -65,6 +66,8 @@ namespace DB * - if the part is corrupt (removePartAndEnqueueFetch) or absent during the check (at start - checkParts, while running - searchForMissingPart), * actions are put on GET from other replicas; * + * TODO Update the GET part after rewriting the code (search locally). + * * The replica to which INSERT was made in the queue will also have an entry of the GET of this data. * Such an entry is considered to be executed as soon as the queue handler sees it. * @@ -251,6 +254,8 @@ private: using LogEntry = ReplicatedMergeTreeLogEntry; using LogEntryPtr = LogEntry::Ptr; + using MergeTreeData::MutableDataPartPtr; + zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. @@ -420,6 +425,9 @@ private: */ bool executeLogEntry(LogEntry & entry); + /// Lookup the part for the entry in the detached/ folder. + /// returns nullptr if the part is corrupt or missing. + MutableDataPartPtr attachPartHelperFoundValidPart(const LogEntry& entry) const; void executeDropRange(const LogEntry & entry); diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index 25b432252f9..e20ce233190 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -38,7 +38,21 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, const Context for (const auto & name_and_database : databases) { if (const auto * replicated = typeid_cast(name_and_database.second.get())) - writeCluster(res_columns, {name_and_database.first, replicated->getCluster()}); + { + // A quick fix for stateless tests with DatabaseReplicated. Its ZK + // node can be destroyed at any time. 
If another test lists + // system.clusters to get client command line suggestions, it will + // get an error when trying to get the info about DB from ZK. + // Just ignore these inaccessible databases. A good example of a + // failing test is `01526_client_start_and_exit`. + try { + writeCluster(res_columns, {name_and_database.first, replicated->getCluster()}); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } } } diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index fd4807e550c..46ead225102 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -4,6 +4,7 @@ const char * auto_contributors[] { "20018712", "243f6a88 85a308d3", "243f6a8885a308d313198a2e037", + "3ldar-nasyrov", "821008736@qq.com", "Akazz", "Alain BERRIER", @@ -58,6 +59,7 @@ const char * auto_contributors[] { "Alexey Vasiliev", "Alexey Zatelepin", "Alexsey Shestakov", + "Ali Demirci", "Aliaksandr Pliutau", "Aliaksandr Shylau", "Amos Bird", @@ -138,6 +140,7 @@ const char * auto_contributors[] { "Brett Hoerner", "Bulat Gaifullin", "Carbyn", + "Chao Ma", "Chao Wang", "Chen Yufei", "Chienlung Cheung", @@ -258,6 +261,7 @@ const char * auto_contributors[] { "Ilya Skrypitsa", "Ilya Yatsishin", "ImgBotApp", + "Islam Israfilov", "Islam Israfilov (Islam93)", "Ivan", "Ivan A. Torgashov", @@ -367,6 +371,7 @@ const char * auto_contributors[] { "Mikahil Nacharov", "Mike", "Mike F", + "Mike Kot", "Mikhail", "Mikhail Cheshkov", "Mikhail Fandyushin", @@ -378,6 +383,7 @@ const char * auto_contributors[] { "Mikhail Salosin", "Mikhail Surin", "Mikhail f. 
Shiryaev", + "MikuSugar", "Milad Arabi", "Mohammad Hossein Sekhavat", "MovElb", @@ -388,6 +394,7 @@ const char * auto_contributors[] { "Narek Galstyan", "NeZeD [Mac Pro]", "Neeke Gao", + "Neng Liu", "Nico Mandery", "Nico Piderman", "Nicolae Vartolomei", @@ -439,6 +446,7 @@ const char * auto_contributors[] { "Philippe Ombredanne", "Potya", "Pradeep Chhetri", + "Pysaoke", "Quid37", "Rafael David Tinoco", "Ramazan Polat", @@ -455,6 +463,7 @@ const char * auto_contributors[] { "Roman Peshkurov", "Roman Tsisyk", "Ruslan", + "Ruslan Savchenko", "Russ Frank", "Ruzal Ibragimov", "S.M.A. Djawadi", @@ -468,6 +477,7 @@ const char * auto_contributors[] { "Sergei Semin", "Sergei Shtykov", "Sergei Tsetlin (rekub)", + "Sergey Demurin", "Sergey Elantsev", "Sergey Fedorov", "Sergey Kononenko", @@ -483,6 +493,7 @@ const char * auto_contributors[] { "SevaCode", "Sherry Wang", "Silviu Caragea", + "Simeon Emanuilov", "Simon Liu", "Simon Podlipsky", "Sina", @@ -504,7 +515,9 @@ const char * auto_contributors[] { "TCeason", "Tagir Kuskarov", "Tai White", + "Taleh Zaliyev", "Tangaev", + "Tatiana Kirillova", "Tema Novikov", "The-Alchemist", "TiunovNN", @@ -534,6 +547,7 @@ const char * auto_contributors[] { "Veselkov Konstantin", "Victor Tarnavsky", "Viktor Taranenko", + "Vitaliy Fedorchenko", "Vitaliy Karnienko", "Vitaliy Kozlovskiy", "Vitaliy Lyudvichenko", @@ -566,6 +580,7 @@ const char * auto_contributors[] { "William Shallum", "Winter Zhang", "Xianda Ke", + "Xiang Zhou", "Y Lu", "Yangkuan Liu", "Yatsishin Ilya", @@ -683,6 +698,7 @@ const char * auto_contributors[] { "frank", "franklee", "fredchenbj", + "fuqi", "fuwhu", "g-arslan", "ggerogery", @@ -701,8 +717,10 @@ const char * auto_contributors[] { "idfer", "igor", "igor.lapko", + "ikarishinjieva", "ikopylov", "imgbot[bot]", + "ip", "it1804", "ivan-kush", "ivanzhukov", @@ -715,6 +733,8 @@ const char * auto_contributors[] { "jianmei zhang", "jyz0309", "keenwolf", + "kevin wan", + "kirillikoff", "kmeaw", "koshachy", "kreuzerkrieg", @@ 
-744,6 +764,7 @@ const char * auto_contributors[] { "malkfilipp", "manmitya", "maqroll", + "mastertheknife", "maxim", "maxim-babenko", "maxkuzn", @@ -754,6 +775,7 @@ const char * auto_contributors[] { "mergify[bot]", "mf5137", "mfridental", + "michon470", "miha-g", "mikepop7", "millb", @@ -791,6 +813,7 @@ const char * auto_contributors[] { "r1j1k", "rainbowsysu", "ritaank", + "robert", "robot-clickhouse", "robot-metrika-test", "rodrigargar", @@ -850,6 +873,7 @@ const char * auto_contributors[] { "ygrek", "yhgcn", "yiguolei", + "yingjinghan", "ylchou", "yonesko", "yuefoo", @@ -863,6 +887,7 @@ const char * auto_contributors[] { "zhen ni", "zhukai", "zlx19950903", + "zvonand", "zvrr", "zvvr", "zzsmdfj", @@ -879,6 +904,7 @@ const char * auto_contributors[] { "张健", "张风啸", "徐炘", + "曲正鹏", "极客青年", "谢磊", "贾顺名(Jarvis)", diff --git a/src/Storages/System/StorageSystemDictionaries.cpp b/src/Storages/System/StorageSystemDictionaries.cpp index cccd23ffbd1..378905b7dc0 100644 --- a/src/Storages/System/StorageSystemDictionaries.cpp +++ b/src/Storages/System/StorageSystemDictionaries.cpp @@ -58,7 +58,7 @@ void StorageSystemDictionaries::fillData(MutableColumns & res_columns, const Con const auto & external_dictionaries = context.getExternalDictionariesLoader(); for (const auto & load_result : external_dictionaries.getLoadResults()) { - const auto dict_ptr = std::dynamic_pointer_cast(load_result.object); + const auto dict_ptr = std::dynamic_pointer_cast(load_result.object); DictionaryStructure dictionary_structure = ExternalDictionariesLoader::getDictionaryStructure(*load_result.config); StorageID dict_id = StorageID::createEmpty(); diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index 39cc651e147..02627a3ba03 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -84,7 +84,7 @@ StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, const 
MutableColumnPtr database_column_mut = ColumnString::create(); for (const auto & database : databases) { - /// Checck if database can contain MergeTree tables, + /// Check if database can contain MergeTree tables, /// if not it's unnecessary to load all tables of database just to filter all of them. if (database.second->canContainMergeTreeTables()) database_column_mut->insert(database.first); diff --git a/src/Storages/tests/gtest_SplitTokenExtractor.cpp b/src/Storages/tests/gtest_SplitTokenExtractor.cpp index b5a26c9cd8e..ee6a55f50b8 100644 --- a/src/Storages/tests/gtest_SplitTokenExtractor.cpp +++ b/src/Storages/tests/gtest_SplitTokenExtractor.cpp @@ -61,12 +61,12 @@ TEST_P(SplitTokenExtractorTest, next) for (const auto & expected_token : param.tokens) { SCOPED_TRACE(++i); - ASSERT_TRUE(token_extractor.next(data->data(), data->size(), &pos, &token_start, &token_len)); + ASSERT_TRUE(token_extractor.nextInColumn(data->data(), data->size(), &pos, &token_start, &token_len)); EXPECT_EQ(expected_token, std::string_view(data->data() + token_start, token_len)) << " token_start:" << token_start << " token_len: " << token_len; } - ASSERT_FALSE(token_extractor.next(data->data(), data->size(), &pos, &token_start, &token_len)) + ASSERT_FALSE(token_extractor.nextInColumn(data->data(), data->size(), &pos, &token_start, &token_len)) << "\n\t=> \"" << param.source.substr(token_start, token_len) << "\"" << "\n\t" << token_start << ", " << token_len << ", " << pos << ", " << data->size(); } diff --git a/tests/clickhouse-test b/tests/clickhouse-test index afb02e7ea0b..a44f7972397 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -305,6 +305,9 @@ def run_tests_array(all_tests_with_params): failures_total = 0 failures = 0 failures_chain = 0 + start_time = datetime.now() + + is_concurrent = multiprocessing.current_process().name != "MainProcess" client_options = get_additional_client_options(args) @@ -315,7 +318,7 @@ def run_tests_array(all_tests_with_params): return '' 
if all_tests: - print("\nRunning {} {} tests.".format(len(all_tests), suite) + "\n") + print(f"\nRunning {len(all_tests)} {suite} tests ({multiprocessing.current_process().name}).\n") for case in all_tests: if SERVER_DIED: @@ -330,7 +333,6 @@ def run_tests_array(all_tests_with_params): try: status = '' - is_concurrent = multiprocessing.current_process().name != "MainProcess" if not is_concurrent: sys.stdout.flush() sys.stdout.write("{0:72}".format(name + ": ")) @@ -499,12 +501,18 @@ def run_tests_array(all_tests_with_params): failures_total = failures_total + failures if failures_total > 0: - print(colored("\nHaving {failures_total} errors! {passed_total} tests passed. {skipped_total} tests skipped.".format( - passed_total = passed_total, skipped_total = skipped_total, failures_total = failures_total), args, "red", attrs=["bold"])) + print(colored(f"\nHaving {failures_total} errors! {passed_total} tests passed." + f" {skipped_total} tests skipped. {(datetime.now() - start_time).total_seconds():.2f} s elapsed" + f' ({multiprocessing.current_process().name}).', + args, "red", attrs=["bold"])) exit_code = 1 else: - print(colored("\n{passed_total} tests passed. {skipped_total} tests skipped.".format( - passed_total = passed_total, skipped_total = skipped_total), args, "green", attrs=["bold"])) + print(colored(f"\n{passed_total} tests passed. {skipped_total} tests skipped." + f" {(datetime.now() - start_time).total_seconds():.2f} s elapsed" + f' ({multiprocessing.current_process().name}).', + args, "green", attrs=["bold"])) + + sys.stdout.flush() server_logs_level = "warning" @@ -799,7 +807,8 @@ def main(args): if jobs > run_total: run_total = jobs - batch_size = len(parallel_tests) // jobs + # Create two batches per process for more uniform execution time. 
+ batch_size = max(1, len(parallel_tests) // (jobs * 2)) parallel_tests_array = [] for i in range(0, len(parallel_tests), batch_size): parallel_tests_array.append((parallel_tests[i:i+batch_size], suite, suite_dir, suite_tmp_dir)) diff --git a/tests/config/config.d/database_replicated.xml b/tests/config/config.d/database_replicated.xml index ed5845bad48..c2e62f9645a 100644 --- a/tests/config/config.d/database_replicated.xml +++ b/tests/config/config.d/database_replicated.xml @@ -21,6 +21,9 @@ 5000 10000 + 1000 + 2000 + 4000 trace false diff --git a/tests/integration/helpers/corrupt_part_data_on_disk.py b/tests/integration/helpers/corrupt_part_data_on_disk.py new file mode 100644 index 00000000000..1a6f384da9e --- /dev/null +++ b/tests/integration/helpers/corrupt_part_data_on_disk.py @@ -0,0 +1,14 @@ +def corrupt_part_data_on_disk(node, table, part_name): + part_path = node.query("SELECT path FROM system.parts WHERE table = '{}' and name = '{}'" + .format(table, part_name)).strip() + + corrupt_part_data_by_path(node, part_path) + +def corrupt_part_data_by_path(node, part_path): + print("Corrupting part", part_path, "at", node.name) + print("Will corrupt: ", + node.exec_in_container(['bash', '-c', 'cd {p} && ls *.bin | head -n 1'.format(p=part_path)])) + + node.exec_in_container(['bash', '-c', + 'cd {p} && ls *.bin | head -n 1 | xargs -I{{}} sh -c \'echo "1" >> $1\' -- {{}}'.format( + p=part_path)], privileged=True) diff --git a/tests/integration/test_attach_without_fetching/__init__.py b/tests/integration/test_attach_without_fetching/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_attach_without_fetching/configs/remote_servers.xml b/tests/integration/test_attach_without_fetching/configs/remote_servers.xml new file mode 100644 index 00000000000..7978f921b2e --- /dev/null +++ b/tests/integration/test_attach_without_fetching/configs/remote_servers.xml @@ -0,0 +1,21 @@ + + + + + true + + node_1_1 + 9000 + + + node_1_2 + 
9000 + + + node_1_3 + 9000 + + + + + diff --git a/tests/integration/test_attach_without_fetching/test.py b/tests/integration/test_attach_without_fetching/test.py new file mode 100644 index 00000000000..a79a7babc08 --- /dev/null +++ b/tests/integration/test_attach_without_fetching/test.py @@ -0,0 +1,130 @@ +import time +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry +from helpers.network import PartitionManager +from helpers.corrupt_part_data_on_disk import corrupt_part_data_by_path + +def fill_node(node): + node.query( + ''' + CREATE TABLE test(n UInt32) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test', '{replica}') + ORDER BY n PARTITION BY n % 10; + '''.format(replica=node.name)) + +cluster = ClickHouseCluster(__file__) +configs =["configs/remote_servers.xml"] + +node_1 = cluster.add_instance('replica1', with_zookeeper=True, main_configs=configs) +node_2 = cluster.add_instance('replica2', with_zookeeper=True, main_configs=configs) +node_3 = cluster.add_instance('replica3', with_zookeeper=True, main_configs=configs) + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + fill_node(node_1) + fill_node(node_2) + # the third node is filled after the DETACH query + yield cluster + + except Exception as ex: + print(ex) + + finally: + cluster.shutdown() + +def check_data(nodes, detached_parts): + for node in nodes: + print("> Replication queue for", node.name, "\n> table\treplica_name\tsource_replica\ttype\tposition\n", + node.query("SELECT table, replica_name, source_replica, type, position FROM system.replication_queue")) + + node.query("SYSTEM SYNC REPLICA test") + + print("> Checking data integrity for", node.name) + + for i in range(10): + assert_eq_with_retry(node, "SELECT count() FROM test WHERE n % 10 == " + str(i), + "0\n" if i in detached_parts else "10\n") + + assert_eq_with_retry(node, "SELECT count() FROM system.parts WHERE table='test'", + str(10 - 
len(detached_parts)) + "\n") + + res: str = node.query("SELECT * FROM test ORDER BY n") + + for other in nodes: + if other != node: + print("> Checking data consistency,", other.name, "vs", node.name) + assert_eq_with_retry(other, "SELECT * FROM test ORDER BY n", res) + + +# 1. Check that ALTER TABLE ATTACH PART|PARTITION does not fetch data from other replicas if it's present in the +# detached/ folder. +# 2. Check that ALTER TABLE ATTACH PART|PARTITION downloads the data from other replicas if the detached/ folder +# does not contain the part with the correct checksums. +def test_attach_without_fetching(start_cluster): + # Note here requests are used for both PARTITION and PART. This is done for better test diversity. + # The partition and part are used interchangeably which is not true in most cases. + # 0. Insert data on two replicas + node_1.query("INSERT INTO test SELECT * FROM numbers(100)") + + check_data([node_1, node_2], detached_parts=[]) + + # 1. + # This part will be fetched from other replicas as it would be missing in the detached/ folder and + # also attached locally. + node_1.query("ALTER TABLE test DETACH PART '0_0_0_0'") + # This partition will be just fetched from other replicas as the checksums won't match + # (we'll manually break the data). + node_1.query("ALTER TABLE test DETACH PARTITION 1") + # This partition will be just fetched from other replicas as the part data will be corrupted with one of the + # files missing. + node_1.query("ALTER TABLE test DETACH PARTITION 2") + + + check_data([node_1, node_2], detached_parts=[0, 1, 2]) + + # 2. Create the third replica + fill_node(node_3) + + # 3. Break the part data on the second node to corrupt the checksums. + # Replica 3 should download the data from replica 1 as there is no local data. + # Replica 2 should also download the data from 1 as the checksums won't match. 
+ print("Checking attach with corrupted part data with files missing") + + print("Before deleting:", node_2.exec_in_container(['bash', '-c', + 'cd {p} && ls *.bin'.format( + p="/var/lib/clickhouse/data/default/test/detached/2_0_0_0")], privileged=True)) + + node_2.exec_in_container(['bash', '-c', + 'cd {p} && rm -fr *.bin'.format( + p="/var/lib/clickhouse/data/default/test/detached/2_0_0_0")], privileged=True) + + node_1.query("ALTER TABLE test ATTACH PARTITION 2") + check_data([node_1, node_2, node_3], detached_parts=[0, 1]) + + # 4. Break the part data on the second node to corrupt the checksums. + # Replica 3 should download the data from replica 1 as there is no local data. + # Replica 2 should also download the data from 1 as the checksums won't match. + print("Checking attach with corrupted part data with all of the files present") + + corrupt_part_data_by_path(node_2, "/var/lib/clickhouse/data/default/test/detached/1_0_0_0") + + node_1.query("ALTER TABLE test ATTACH PARTITION 1") + check_data([node_1, node_2, node_3], detached_parts=[0]) + + # 5. Attach the first part and check if it has been fetched correctly. + # Replica 2 should attach the local data from detached/. + # Replica 3 should download the data from replica 2 as there is no local data and other connections are broken. + print("Checking attach with valid checksums") + + with PartitionManager() as pm: + # If something goes wrong and replica 2 wants to fetch data, the test will fail. 
+ pm.partition_instances(node_2, node_1) + pm.partition_instances(node_1, node_3) + + node_1.query("ALTER TABLE test ATTACH PART '0_0_0_0'") + + check_data([node_1, node_2, node_3], detached_parts=[]) diff --git a/tests/integration/test_broken_part_during_merge/test.py b/tests/integration/test_broken_part_during_merge/test.py index 33719166f4a..910dbc1d1a9 100644 --- a/tests/integration/test_broken_part_during_merge/test.py +++ b/tests/integration/test_broken_part_during_merge/test.py @@ -3,6 +3,7 @@ import pytest from helpers.cluster import ClickHouseCluster from multiprocessing.dummy import Pool from helpers.network import PartitionManager +from helpers.corrupt_part_data_on_disk import corrupt_part_data_on_disk import time cluster = ClickHouseCluster(__file__) @@ -25,13 +26,6 @@ def started_cluster(): finally: cluster.shutdown() -def corrupt_data_part_on_disk(node, table, part_name): - part_path = node.query( - "SELECT path FROM system.parts WHERE table = '{}' and name = '{}'".format(table, part_name)).strip() - node.exec_in_container(['bash', '-c', - 'cd {p} && ls *.bin | head -n 1 | xargs -I{{}} sh -c \'echo "1" >> $1\' -- {{}}'.format( - p=part_path)], privileged=True) - def test_merge_and_part_corruption(started_cluster): node1.query("SYSTEM STOP REPLICATION QUEUES replicated_mt") @@ -43,7 +37,7 @@ def test_merge_and_part_corruption(started_cluster): # Need to corrupt "border part" (left or right). If we will corrupt something in the middle # clickhouse will not consider merge as broken, because we have parts with the same min and max # block numbers. 
- corrupt_data_part_on_disk(node1, 'replicated_mt', 'all_3_3_0') + corrupt_part_data_on_disk(node1, 'replicated_mt', 'all_3_3_0') with Pool(1) as p: def optimize_with_delay(x): diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index d4c3ae06b72..5bc30ab1d6b 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -96,7 +96,7 @@ def test_s3_zero_copy_on_hybrid_storage(cluster): node1.query( """ CREATE TABLE hybrid_test ON CLUSTER test_cluster (id UInt32, value String) - ENGINE=ReplicatedMergeTree('/clickhouse/tables/s3_test', '{}') + ENGINE=ReplicatedMergeTree('/clickhouse/tables/hybrid_test', '{}') ORDER BY id SETTINGS storage_policy='hybrid' """ @@ -131,3 +131,6 @@ def test_s3_zero_copy_on_hybrid_storage(cluster): assert node1.query("SELECT * FROM hybrid_test ORDER BY id FORMAT Values") == "(0,'data'),(1,'data')" assert node2.query("SELECT * FROM hybrid_test ORDER BY id FORMAT Values") == "(0,'data'),(1,'data')" + + node1.query("DROP TABLE IF EXISTS hybrid_test NO DELAY") + node2.query("DROP TABLE IF EXISTS hybrid_test NO DELAY") diff --git a/tests/performance/direct_dictionary.xml b/tests/performance/direct_dictionary.xml index 68b52d917dd..97ecdfe3e95 100644 --- a/tests/performance/direct_dictionary.xml +++ b/tests/performance/direct_dictionary.xml @@ -1,38 +1,17 @@ - CREATE TABLE simple_direct_dictionary_test_table + CREATE TABLE simple_key_direct_dictionary_source_table ( id UInt64, value_int UInt64, value_string String, value_decimal Decimal64(8), value_string_nullable Nullable(String) - ) ENGINE = TinyLog; + ) ENGINE = Memory; - INSERT INTO simple_direct_dictionary_test_table - SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) - FROM system.numbers - LIMIT 100000; - - - - CREATE DICTIONARY simple_direct_dictionary - ( - id UInt64, - value_int UInt64, - value_string String, - 
value_decimal Decimal64(8), - value_string_nullable Nullable(String) - ) - PRIMARY KEY id - SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_direct_dictionary_test_table')) - LAYOUT(DIRECT()) - - - - CREATE TABLE complex_direct_dictionary_test_table + CREATE TABLE complex_key_direct_dictionary_source_table ( id UInt64, id_key String, @@ -44,14 +23,21 @@ - INSERT INTO complex_direct_dictionary_test_table - SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number) - FROM system.numbers - LIMIT 100000; + CREATE DICTIONARY simple_key_direct_dictionary + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id + SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_direct_dictionary_source_table')) + LAYOUT(DIRECT()) - CREATE DICTIONARY complex_direct_dictionary + CREATE DICTIONARY complex_key_direct_dictionary ( id UInt64, id_key String, @@ -61,20 +47,76 @@ value_string_nullable Nullable(String) ) PRIMARY KEY id, id_key - SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_direct_dictionary_test_table')) + SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_key_direct_dictionary_source_table')) LAYOUT(COMPLEX_KEY_DIRECT()) - SELECT dictGet('default.simple_direct_dictionary', 'value_int', number) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.simple_direct_dictionary', 'value_string', number) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.simple_direct_dictionary', 'value_decimal', number) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.simple_direct_dictionary', 'value_string_nullable', number) FROM system.numbers LIMIT 150000; - SELECT dictHas('default.simple_direct_dictionary', number) FROM system.numbers LIMIT 150000; + + INSERT INTO simple_key_direct_dictionary_source_table + SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 100000; + - SELECT 
dictGet('default.complex_direct_dictionary', 'value_int', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.complex_direct_dictionary', 'value_string', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.complex_direct_dictionary', 'value_decimal', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.complex_direct_dictionary', 'value_string_nullable', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictHas('default.complex_direct_dictionary', (number, toString(number))) FROM system.numbers LIMIT 150000; + + INSERT INTO complex_key_direct_dictionary_source_table + SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 100000; + + + + + column_name + + 'value_int' + 'value_string' + 'value_decimal' + 'value_string_nullable' + + + + + elements_count + + 25000 + 50000 + 75000 + 100000 + + + + + + SELECT dictGet('default.simple_key_direct_dictionary', {column_name}, number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.simple_key_direct_dictionary', number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + + SELECT dictGet('default.complex_key_direct_dictionary', {column_name}, (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.complex_key_direct_dictionary', (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + DROP TABLE IF EXISTS simple_key_direct_dictionary_source_table; + DROP TABLE IF EXISTS complex_key_direct_dictionary_source_table; + + DROP DICTIONARY IF EXISTS simple_key_direct_dictionary; + DROP DICTIONARY IF EXISTS complex_key_direct_dictionary; diff --git a/tests/performance/flat_dictionary.xml b/tests/performance/flat_dictionary.xml new file mode 100644 index 
00000000000..426aa929bbc --- /dev/null +++ b/tests/performance/flat_dictionary.xml @@ -0,0 +1,75 @@ + + + CREATE TABLE simple_key_flat_dictionary_source_table + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory; + + + + CREATE DICTIONARY simple_key_flat_dictionary + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id + SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_flat_dictionary_source_table')) + LAYOUT(FLAT()) + LIFETIME(MIN 0 MAX 1000) + + + + INSERT INTO simple_key_flat_dictionary_source_table + SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 500000; + + + + + column_name + + 'value_int' + 'value_string' + 'value_decimal' + 'value_string_nullable' + + + + + elements_count + + 250000 + 500000 + 750000 + 1000000 + + + + + + SELECT dictGet('default.simple_key_flat_dictionary', {column_name}, number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + + SELECT dictHas('default.simple_key_flat_dictionary', number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + DROP TABLE IF EXISTS simple_key_flat_dictionary_source_table + + DROP DICTIONARY IF EXISTS simple_key_flat_dictionary + + diff --git a/tests/performance/hashed_dictionary.xml b/tests/performance/hashed_dictionary.xml new file mode 100644 index 00000000000..a38d2f30c23 --- /dev/null +++ b/tests/performance/hashed_dictionary.xml @@ -0,0 +1,124 @@ + + + CREATE TABLE simple_key_hashed_dictionary_source_table + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory; + + + + CREATE TABLE complex_key_hashed_dictionary_source_table + ( + id UInt64, + id_key String, + value_int UInt64, + value_string String, + value_decimal 
Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory; + + + + CREATE DICTIONARY simple_key_hashed_dictionary + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id + SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_hashed_dictionary_source_table')) + LAYOUT(HASHED()) + LIFETIME(MIN 0 MAX 1000); + + + + CREATE DICTIONARY complex_key_hashed_dictionary + ( + id UInt64, + id_key String, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id, id_key + SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_key_hashed_dictionary_source_table')) + LAYOUT(COMPLEX_KEY_HASHED()) + LIFETIME(MIN 0 MAX 1000); + + + + INSERT INTO simple_key_hashed_dictionary_source_table + SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 5000000; + + + + INSERT INTO complex_key_hashed_dictionary_source_table + SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 5000000; + + + + + column_name + + 'value_int' + 'value_string' + 'value_decimal' + 'value_string_nullable' + + + + + elements_count + + 2500000 + 5000000 + 7500000 + 10000000 + + + + + + SELECT dictGet('default.simple_key_hashed_dictionary', {column_name}, number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.simple_key_hashed_dictionary', number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + + SELECT dictGet('default.complex_key_hashed_dictionary', {column_name}, (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.complex_key_hashed_dictionary', (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + DROP TABLE IF EXISTS 
simple_key_hashed_dictionary_source_table; + DROP TABLE IF EXISTS complex_key_hashed_dictionary_source_table; + + DROP DICTIONARY IF EXISTS simple_key_hashed_dictionary; + DROP DICTIONARY IF EXISTS complex_key_hashed_dictionary; + + diff --git a/tests/queries/0_stateless/00539_functions_for_working_with_json.reference b/tests/queries/0_stateless/00539_functions_for_working_with_json.reference index c0399f8ab2e..4d3527722a1 100644 --- a/tests/queries/0_stateless/00539_functions_for_working_with_json.reference +++ b/tests/queries/0_stateless/00539_functions_for_working_with_json.reference @@ -13,3 +13,10 @@ test"string "[" ["]", "2", "3"] {"nested" : [1,2,3]} +-1 +0 +0 +-1 +1 +test_string +test"string diff --git a/tests/queries/0_stateless/00539_functions_for_working_with_json.sql b/tests/queries/0_stateless/00539_functions_for_working_with_json.sql index 514b5f2e5ea..31853e92262 100644 --- a/tests/queries/0_stateless/00539_functions_for_working_with_json.sql +++ b/tests/queries/0_stateless/00539_functions_for_working_with_json.sql @@ -15,3 +15,11 @@ SELECT visitParamExtractRaw('{"myparam": "{"}', 'myparam'); SELECT visitParamExtractRaw('{"myparam": "["}', 'myparam'); SELECT visitParamExtractRaw('{"myparam": ["]", "2", "3"], "other":123}', 'myparam'); SELECT visitParamExtractRaw('{"myparam": {"nested" : [1,2,3]}, "other":123}', 'myparam'); + +SELECT simpleJSONExtractInt('{"myparam":-1}', 'myparam'); +SELECT simpleJSONExtractUInt('{"myparam":-1}', 'myparam'); +SELECT simpleJSONExtractFloat('{"myparam":null}', 'myparam'); +SELECT simpleJSONExtractFloat('{"myparam":-1}', 'myparam'); +SELECT simpleJSONExtractBool('{"myparam":true}', 'myparam'); +SELECT simpleJSONExtractString('{"myparam":"test_string"}', 'myparam'); +SELECT simpleJSONExtractString('{"myparam":"test\\"string"}', 'myparam'); diff --git a/tests/queries/0_stateless/00597_push_down_predicate.reference b/tests/queries/0_stateless/00597_push_down_predicate.reference index bd1c4791df4..59313c35b81 100644 --- 
a/tests/queries/0_stateless/00597_push_down_predicate.reference +++ b/tests/queries/0_stateless/00597_push_down_predicate.reference @@ -585,3 +585,15 @@ SEMI LEFT JOIN ) AS r USING (id) WHERE r.id = 1 2000-01-01 1 test string 1 1 2000-01-01 test string 1 1 +SELECT value + t1.value AS expr +FROM +( + SELECT + value, + t1.value + FROM test_00597 AS t0 + ALL FULL OUTER JOIN test_00597 AS t1 USING (date) + WHERE (value + `t1.value`) < 3 +) +WHERE expr < 3 +2 diff --git a/tests/queries/0_stateless/00597_push_down_predicate.sql b/tests/queries/0_stateless/00597_push_down_predicate.sql index ec306ac6792..2e3357241ad 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate.sql +++ b/tests/queries/0_stateless/00597_push_down_predicate.sql @@ -135,5 +135,9 @@ SELECT * FROM (SELECT * FROM (SELECT * FROM test_00597) AS a ANY LEFT JOIN (SELE EXPLAIN SYNTAX SELECT * FROM (SELECT * FROM test_00597) ANY INNER JOIN (SELECT * FROM (SELECT * FROM test_00597)) as r USING id WHERE r.id = 1; SELECT * FROM (SELECT * FROM test_00597) ANY INNER JOIN (SELECT * FROM (SELECT * FROM test_00597)) as r USING id WHERE r.id = 1; +-- issue 20497 +EXPLAIN SYNTAX SELECT value + t1.value AS expr FROM (SELECT t0.value, t1.value FROM test_00597 AS t0 FULL JOIN test_00597 AS t1 USING date) WHERE expr < 3; +SELECT value + t1.value AS expr FROM (SELECT t0.value, t1.value FROM test_00597 AS t0 FULL JOIN test_00597 AS t1 USING date) WHERE expr < 3; + DROP TABLE IF EXISTS test_00597; DROP TABLE IF EXISTS test_view_00597; diff --git a/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.reference b/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.reference index 067189f73fc..f93aae0225a 100644 --- a/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.reference +++ b/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.reference @@ -6,11 +6,9 @@ 4 1 
0 -0 6 2 ----- 6 3 0 -0 diff --git a/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.sql b/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.sql index 4d4dbda922d..44dd0412aea 100644 --- a/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.sql +++ b/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.sql @@ -62,7 +62,11 @@ OPTIMIZE TABLE four_rows_per_granule FINAL; SELECT COUNT(*) FROM four_rows_per_granule; -SELECT distinct(marks) from system.parts WHERE table = 'four_rows_per_granule' and database=currentDatabase() and active=1; +-- We expect zero marks here, so we might get zero rows if all the parts were +-- deleted already. This can happen in parallel runs where there may be a long delay +-- between queries. So we must write the query in such a way that it always returns +-- zero rows if OK. +SELECT distinct(marks) d from system.parts WHERE table = 'four_rows_per_granule' and database=currentDatabase() and active=1 having d > 0; INSERT INTO four_rows_per_granule (p, k, v1, v2, Sign, Version) VALUES ('2018-05-15', 1, 1000, 2000, 1, 1), ('2018-05-16', 2, 3000, 4000, 1, 1), ('2018-05-17', 3, 5000, 6000, 1, 1), ('2018-05-18', 4, 7000, 8000, 1, 1); @@ -120,6 +124,10 @@ OPTIMIZE TABLE six_rows_per_granule FINAL; SELECT COUNT(*) FROM six_rows_per_granule; -SELECT distinct(marks) from system.parts WHERE table = 'six_rows_per_granule' and database=currentDatabase() and active=1; +-- We expect zero marks here, so we might get zero rows if all the parts were +-- deleted already. This can happen in parallel runs where there may be a long delay +-- between queries. So we must write the query in such a way that it always returns +-- zero rows if OK. 
+SELECT distinct(marks) d from system.parts WHERE table = 'six_rows_per_granule' and database=currentDatabase() and active=1 having d > 0; DROP TABLE IF EXISTS six_rows_per_granule; diff --git a/tests/queries/0_stateless/00976_system_stop_ttl_merges.sql b/tests/queries/0_stateless/00976_system_stop_ttl_merges.sql index 41f2428d9e6..b27e4275d5d 100644 --- a/tests/queries/0_stateless/00976_system_stop_ttl_merges.sql +++ b/tests/queries/0_stateless/00976_system_stop_ttl_merges.sql @@ -2,7 +2,7 @@ drop table if exists ttl; create table ttl (d Date, a Int) engine = MergeTree order by a partition by toDayOfMonth(d) ttl d + interval 1 day; -system stop ttl merges; +system stop ttl merges ttl; insert into ttl values (toDateTime('2000-10-10 00:00:00'), 1), (toDateTime('2000-10-10 00:00:00'), 2) insert into ttl values (toDateTime('2100-10-10 00:00:00'), 3), (toDateTime('2100-10-10 00:00:00'), 4); @@ -11,7 +11,7 @@ select sleep(1) format Null; -- wait if very fast merge happen optimize table ttl partition 10 final; select * from ttl order by d, a; -system start ttl merges; +system start ttl merges ttl; optimize table ttl partition 10 final; select * from ttl order by d, a; diff --git a/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference b/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference index 757d2858524..0a2c97efb42 100644 --- a/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference +++ b/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference @@ -29,10 +29,10 @@ 1 1 1 -255 -255 0 -255 +0 +0 +0 [11,22] [22,11] [11,22] diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 14e5889a811..9067ee8d955 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -771,6 +771,28 @@ order by x; 125 124 127 4 126 125 127 3 127 126 127 2 +-- We need large offsets to 
trigger overflow to positive direction, or +-- else the frame end runs into partition end w/o overflow and doesn't move +-- after that. The frame from this query is equivalent to the entire partition. +select x, min(x) over w, max(x) over w, count(x) over w +from ( + select toUInt8(if(mod(number, 2), + toInt64(255 - intDiv(number, 2)), + toInt64(intDiv(number, 2)))) x + from numbers(10) +) +window w as (order by x range between 255 preceding and 255 following) +order by x; +0 0 255 10 +1 0 255 10 +2 0 255 10 +3 0 255 10 +4 0 255 10 +251 0 255 10 +252 0 255 10 +253 0 255 10 +254 0 255 10 +255 0 255 10 -- RANGE OFFSET ORDER BY DESC select x, min(x) over w, max(x) over w, count(x) over w from ( select toUInt8(number) x from numbers(11)) t @@ -1004,6 +1026,8 @@ from numbers(5); 3 \N -- variants of lag/lead that respect the frame select number, p, pp, + lagInFrame(number) over w as lag1, + lagInFrame(number, number - pp) over w as lag2, lagInFrame(number, number - pp, number * 11) over w as lag, leadInFrame(number, number - pp, number * 11) over w as lead from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) @@ -1012,22 +1036,22 @@ window w as (partition by p order by number order by number settings max_block_size = 3; ; -0 0 0 0 0 -1 0 0 0 2 -2 0 0 0 4 -3 0 0 0 33 -4 0 0 0 44 -5 1 5 5 5 -6 1 5 5 7 -7 1 5 5 9 -8 1 5 5 88 -9 1 5 5 99 -10 2 10 10 10 -11 2 10 10 12 -12 2 10 10 14 -13 2 10 10 143 -14 2 10 10 154 -15 3 15 15 15 +0 0 0 0 0 0 0 +1 0 0 0 0 0 2 +2 0 0 1 0 0 4 +3 0 0 2 0 0 33 +4 0 0 3 0 0 44 +5 1 5 0 5 5 5 +6 1 5 5 5 5 7 +7 1 5 6 5 5 9 +8 1 5 7 5 5 88 +9 1 5 8 5 5 99 +10 2 10 0 10 10 10 +11 2 10 10 10 10 12 +12 2 10 11 10 10 14 +13 2 10 12 10 10 143 +14 2 10 13 10 10 154 +15 3 15 0 15 15 15 -- case-insensitive SQL-standard synonyms for any and anyLast select number, @@ -1054,14 +1078,29 @@ select count() over () from numbers(4) where number < 2; 2 -- floating point RANGE frame select - count(*) over (order by (toFloat32(number) as f32) range 5. 
preceding), - count(*) over (order by (toFloat64(number) as f64) range 5. preceding) + count(*) over (order by toFloat32(number) range 5. preceding), + count(*) over (order by toFloat64(number) range 5. preceding), + count(*) over (order by toFloat32(number) range between current row and 5. following), + count(*) over (order by toFloat64(number) range between current row and 5. following) from numbers(7) ; -1 1 -2 2 -3 3 -4 4 -5 5 -6 6 -6 6 +1 1 6 6 +2 2 6 6 +3 3 5 5 +4 4 4 4 +5 5 3 3 +6 6 2 2 +6 6 1 1 +-- negative offsets should not be allowed +select count() over (order by toInt64(number) range between -1 preceding and unbounded following) from numbers(1); -- { serverError 36 } +select count() over (order by toInt64(number) range between -1 following and unbounded following) from numbers(1); -- { serverError 36 } +select count() over (order by toInt64(number) range between unbounded preceding and -1 preceding) from numbers(1); -- { serverError 36 } +select count() over (order by toInt64(number) range between unbounded preceding and -1 following) from numbers(1); -- { serverError 36 } +---- a test with aggregate function that allocates memory in arena +select sum(a[length(a)]) +from ( + select groupArray(number) over (partition by modulo(number, 11) + order by modulo(number, 1111), number) a + from numbers_mt(10000) +) settings max_block_size = 7; +49995000 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 30847e09246..85856dd797d 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -242,6 +242,19 @@ from ( window w as (order by x range between 1 preceding and 2 following) order by x; +-- We need large offsets to trigger overflow to positive direction, or +-- else the frame end runs into partition end w/o overflow and doesn't move +-- after that. The frame from this query is equivalent to the entire partition. 
+select x, min(x) over w, max(x) over w, count(x) over w +from ( + select toUInt8(if(mod(number, 2), + toInt64(255 - intDiv(number, 2)), + toInt64(intDiv(number, 2)))) x + from numbers(10) +) +window w as (order by x range between 255 preceding and 255 following) +order by x; + -- RANGE OFFSET ORDER BY DESC select x, min(x) over w, max(x) over w, count(x) over w from ( select toUInt8(number) x from numbers(11)) t @@ -349,6 +362,8 @@ from numbers(5); -- variants of lag/lead that respect the frame select number, p, pp, + lagInFrame(number) over w as lag1, + lagInFrame(number, number - pp) over w as lag2, lagInFrame(number, number - pp, number * 11) over w as lag, leadInFrame(number, number - pp, number * 11) over w as lead from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) @@ -374,7 +389,23 @@ select count() over () from numbers(4) where number < 2; -- floating point RANGE frame select - count(*) over (order by (toFloat32(number) as f32) range 5. preceding), - count(*) over (order by (toFloat64(number) as f64) range 5. preceding) + count(*) over (order by toFloat32(number) range 5. preceding), + count(*) over (order by toFloat64(number) range 5. preceding), + count(*) over (order by toFloat32(number) range between current row and 5. following), + count(*) over (order by toFloat64(number) range between current row and 5. 
following) from numbers(7) ; + +-- negative offsets should not be allowed +select count() over (order by toInt64(number) range between -1 preceding and unbounded following) from numbers(1); -- { serverError 36 } +select count() over (order by toInt64(number) range between -1 following and unbounded following) from numbers(1); -- { serverError 36 } +select count() over (order by toInt64(number) range between unbounded preceding and -1 preceding) from numbers(1); -- { serverError 36 } +select count() over (order by toInt64(number) range between unbounded preceding and -1 following) from numbers(1); -- { serverError 36 } + +---- a test with aggregate function that allocates memory in arena +select sum(a[length(a)]) +from ( + select groupArray(number) over (partition by modulo(number, 11) + order by modulo(number, 1111), number) a + from numbers_mt(10000) +) settings max_block_size = 7; diff --git a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.reference b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.reference index a08a20dc95d..25880a7d740 100644 --- a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.reference +++ b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.reference @@ -12,4 +12,3 @@ Check if another query is passed Modify max_concurrent_queries back to 1 Check if another query with less marks to read is throttled yes -finished long_running_query default select sleepEachRow(0.01) from simple settings max_block_size = 1 format Null diff --git a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh index e32a83c9560..5bb93371483 100755 --- a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh +++ b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh @@ -18,9 +18,11 @@ settings index_granularity = 1, max_concurrent_queries = 1, min_marks_to_honor_m insert into simple select number, number + 100 from numbers(1000); " 
+query_id="long_running_query-$CLICKHOUSE_DATABASE" + echo "Spin up a long running query" -${CLICKHOUSE_CLIENT} --query "select sleepEachRow(0.01) from simple settings max_block_size = 1 format Null" --query_id "long_running_query" > /dev/null 2>&1 & -wait_for_query_to_start 'long_running_query' +${CLICKHOUSE_CLIENT} --query "select sleepEachRow(0.01) from simple settings max_block_size = 1 format Null" --query_id "$query_id" > /dev/null 2>&1 & +wait_for_query_to_start "$query_id" # query which reads marks >= min_marks_to_honor_max_concurrent_queries is throttled echo "Check if another query with some marks to read is throttled" @@ -61,7 +63,7 @@ CODE=$? [ "$CODE" -ne "202" ] && echo "Expected error code: 202 but got: $CODE" && exit 1; echo "yes" -${CLICKHOUSE_CLIENT} --query "KILL QUERY WHERE query_id = 'long_running_query' SYNC" +${CLICKHOUSE_CLIENT} --query "KILL QUERY WHERE query_id = '$query_id' SYNC FORMAT Null" wait ${CLICKHOUSE_CLIENT} --multiline --multiquery --query " diff --git a/tests/queries/0_stateless/01753_max_uri_size.reference b/tests/queries/0_stateless/01753_max_uri_size.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/01753_max_uri_size.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/01753_max_uri_size.sh b/tests/queries/0_stateless/01753_max_uri_size.sh new file mode 100755 index 00000000000..5c63d9274fd --- /dev/null +++ b/tests/queries/0_stateless/01753_max_uri_size.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# NOTE: since 'max_uri_size' doesn't affect the request itself, this test hardly depends on the default value of this setting (16Kb). 
+ +LONG_REQUEST=$(python3 -c "print('&max_uri_size=1'*2000, end='')") # ~30K + +${CLICKHOUSE_CURL} -sSv "${CLICKHOUSE_URL}${LONG_REQUEST}&query=SELECT+1" 2>&1 | grep -Fc "HTTP/1.1 400 Bad Request" diff --git a/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.reference b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.reference new file mode 100644 index 00000000000..2cc0a8668a2 --- /dev/null +++ b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.reference @@ -0,0 +1,132 @@ +Dictionary hashed_dictionary_simple_key_simple_attributes +dictGet existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 value_second_1 +2 value_2 value_second_2 +Dictionary sparse_hashed_dictionary_simple_key_simple_attributes +dictGet existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 value_second_1 +2 value_2 value_second_2 +Dictionary hashed_dictionary_simple_key_complex_attributes +dictGet existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 
+dictGet with non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 \N +2 value_2 value_second_2 +Dictionary sparse_hashed_dictionary_simple_key_complex_attributes +dictGet existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 \N +2 value_2 value_second_2 +Dictionary hashed_dictionary_simple_key_hierarchy +dictGet +0 +0 +1 +1 +2 +dictGetHierarchy +[1] +[4,2,1] +Dictionary sparse_hashed_dictionary_simple_key_hierarchy +dictGet +0 +0 +1 +1 +2 +dictGetHierarchy +[1] +[4,2,1] diff --git a/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.sql b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.sql new file mode 100644 index 00000000000..7502c6a93bb --- /dev/null +++ b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.sql @@ -0,0 +1,207 @@ +DROP DATABASE IF EXISTS 01765_db; +CREATE DATABASE 01765_db; + +CREATE TABLE 01765_db.simple_key_simple_attributes_source_table +( + id UInt64, + value_first String, + value_second String +) +ENGINE = TinyLog; + +INSERT INTO 01765_db.simple_key_simple_attributes_source_table VALUES(0, 'value_0', 'value_second_0'); +INSERT INTO 01765_db.simple_key_simple_attributes_source_table VALUES(1, 
'value_1', 'value_second_1'); +INSERT INTO 01765_db.simple_key_simple_attributes_source_table VALUES(2, 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01765_db.hashed_dictionary_simple_key_simple_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second String DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 'Dictionary hashed_dictionary_simple_key_simple_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.hashed_dictionary_simple_key_simple_attributes', number) FROM 
system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.hashed_dictionary_simple_key_simple_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.hashed_dictionary_simple_key_simple_attributes; + +CREATE DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_simple_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second String DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(SPARSE_HASHED()); + +SELECT 'Dictionary sparse_hashed_dictionary_simple_key_simple_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number, 
toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.sparse_hashed_dictionary_simple_key_simple_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_simple_attributes; + +DROP TABLE 01765_db.simple_key_simple_attributes_source_table; + +CREATE TABLE 01765_db.simple_key_complex_attributes_source_table +( + id UInt64, + value_first String, + value_second Nullable(String) +) +ENGINE = TinyLog; + +INSERT INTO 01765_db.simple_key_complex_attributes_source_table VALUES(0, 'value_0', 'value_second_0'); +INSERT INTO 01765_db.simple_key_complex_attributes_source_table VALUES(1, 'value_1', NULL); +INSERT INTO 01765_db.simple_key_complex_attributes_source_table VALUES(2, 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01765_db.hashed_dictionary_simple_key_complex_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second Nullable(String) DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 'Dictionary hashed_dictionary_simple_key_complex_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 
4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.hashed_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.hashed_dictionary_simple_key_complex_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.hashed_dictionary_simple_key_complex_attributes; + +CREATE DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_complex_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second Nullable(String) DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 'Dictionary sparse_hashed_dictionary_simple_key_complex_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as 
value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.sparse_hashed_dictionary_simple_key_complex_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_complex_attributes; + +DROP TABLE 01765_db.simple_key_complex_attributes_source_table; + +CREATE TABLE 01765_db.simple_key_hierarchy_table +( + id UInt64, + parent_id UInt64 +) ENGINE = TinyLog(); + +INSERT INTO 01765_db.simple_key_hierarchy_table VALUES (1, 0); +INSERT INTO 01765_db.simple_key_hierarchy_table VALUES (2, 1); +INSERT INTO 01765_db.simple_key_hierarchy_table VALUES (3, 1); +INSERT INTO 01765_db.simple_key_hierarchy_table VALUES (4, 2); + +CREATE DICTIONARY 01765_db.hashed_dictionary_simple_key_hierarchy +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 
'Dictionary hashed_dictionary_simple_key_hierarchy'; +SELECT 'dictGet'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_hierarchy', 'parent_id', number) FROM system.numbers LIMIT 5; +SELECT 'dictGetHierarchy'; +SELECT dictGetHierarchy('01765_db.hashed_dictionary_simple_key_hierarchy', toUInt64(1)); +SELECT dictGetHierarchy('01765_db.hashed_dictionary_simple_key_hierarchy', toUInt64(4)); + +DROP DICTIONARY 01765_db.hashed_dictionary_simple_key_hierarchy; + +CREATE DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_hierarchy +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 'Dictionary sparse_hashed_dictionary_simple_key_hierarchy'; +SELECT 'dictGet'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_hierarchy', 'parent_id', number) FROM system.numbers LIMIT 5; +SELECT 'dictGetHierarchy'; +SELECT dictGetHierarchy('01765_db.sparse_hashed_dictionary_simple_key_hierarchy', toUInt64(1)); +SELECT dictGetHierarchy('01765_db.sparse_hashed_dictionary_simple_key_hierarchy', toUInt64(4)); + +DROP DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_hierarchy; + +DROP TABLE 01765_db.simple_key_hierarchy_table; + +DROP DATABASE 01765_db; diff --git a/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.reference b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.reference new file mode 100644 index 00000000000..12c210581c2 --- /dev/null +++ b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.reference @@ -0,0 +1,56 @@ +Dictionary hashed_dictionary_complex_key_simple_attributes +dictGet existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value 
+value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 id_key_0 value_0 value_second_0 +1 id_key_1 value_1 value_second_1 +2 id_key_2 value_2 value_second_2 +Dictionary hashed_dictionary_complex_key_complex_attributes +dictGet existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 id_key_0 value_0 value_second_0 +1 id_key_1 value_1 \N +2 id_key_2 value_2 value_second_2 diff --git a/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.sql b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.sql new file mode 100644 index 00000000000..de7ab5b5a1a --- /dev/null +++ b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.sql @@ -0,0 +1,98 @@ +DROP DATABASE IF EXISTS 01766_db; +CREATE DATABASE 01766_db; + +CREATE TABLE 01766_db.complex_key_simple_attributes_source_table +( + id UInt64, + id_key String, + value_first String, + value_second String +) +ENGINE = TinyLog; + +INSERT INTO 01766_db.complex_key_simple_attributes_source_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0'); +INSERT INTO 01766_db.complex_key_simple_attributes_source_table VALUES(1, 'id_key_1', 'value_1', 'value_second_1'); +INSERT INTO 01766_db.complex_key_simple_attributes_source_table VALUES(2, 'id_key_2', 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01766_db.hashed_dictionary_complex_key_simple_attributes +( + id UInt64, + id_key String, + 
value_first String DEFAULT 'value_first_default', + value_second String DEFAULT 'value_second_default' +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_simple_attributes_source_table' DB '01766_db')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_HASHED()); + +SELECT 'Dictionary hashed_dictionary_complex_key_simple_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers 
LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01766_db.hashed_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01766_db.hashed_dictionary_complex_key_simple_attributes ORDER BY (id, id_key); + +DROP DICTIONARY 01766_db.hashed_dictionary_complex_key_simple_attributes; + +DROP TABLE 01766_db.complex_key_simple_attributes_source_table; + +CREATE TABLE 01766_db.complex_key_complex_attributes_source_table +( + id UInt64, + id_key String, + value_first String, + value_second Nullable(String) +) +ENGINE = TinyLog; + +INSERT INTO 01766_db.complex_key_complex_attributes_source_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0'); +INSERT INTO 01766_db.complex_key_complex_attributes_source_table VALUES(1, 'id_key_1', 'value_1', NULL); +INSERT INTO 01766_db.complex_key_complex_attributes_source_table VALUES(2, 'id_key_2', 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01766_db.hashed_dictionary_complex_key_complex_attributes +( + id UInt64, + id_key String, + + value_first String DEFAULT 'value_first_default', + value_second Nullable(String) DEFAULT 'value_second_default' +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_complex_attributes_source_table' DB '01766_db')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_HASHED()); + +SELECT 'Dictionary hashed_dictionary_complex_key_complex_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 
'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01766_db.hashed_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01766_db.hashed_dictionary_complex_key_complex_attributes ORDER BY (id, id_key); + +DROP DICTIONARY 01766_db.hashed_dictionary_complex_key_complex_attributes; +DROP TABLE 01766_db.complex_key_complex_attributes_source_table; + +DROP DATABASE 01766_db; diff --git a/tests/queries/0_stateless/01778_hierarchical_dictionaries.reference b/tests/queries/0_stateless/01778_hierarchical_dictionaries.reference new file mode 100644 index 00000000000..5fe5f5f1db6 --- /dev/null +++ b/tests/queries/0_stateless/01778_hierarchical_dictionaries.reference @@ -0,0 +1,102 @@ +Flat dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] 
+Get is in hierarchy +0 +1 +1 +1 +1 +0 +Get children +[1] +[2,3] +[4] +[] +[] +[] +Get all descendants +[1,2,3,4] +[2,3,4] +[4] +[] +[] +[] +Get descendants at first level +[1] +[2,3] +[4] +[] +[] +[] +Hashed dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] +Get is in hierarchy +0 +1 +1 +1 +1 +0 +Get children +[1] +[3,2] +[4] +[] +[] +[] +Get all descendants +[1,3,2,4] +[3,2,4] +[4] +[] +[] +[] +Get descendants at first level +[1] +[3,2] +[4] +[] +[] +[] +Cache dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] +Get is in hierarchy +0 +1 +1 +1 +1 +0 +Direct dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] +Get is in hierarchy +0 +1 +1 +1 +1 +0 diff --git a/tests/queries/0_stateless/01778_hierarchical_dictionaries.sql b/tests/queries/0_stateless/01778_hierarchical_dictionaries.sql new file mode 100644 index 00000000000..f6e1a7c9375 --- /dev/null +++ b/tests/queries/0_stateless/01778_hierarchical_dictionaries.sql @@ -0,0 +1,95 @@ +DROP DATABASE IF EXISTS 01778_db; +CREATE DATABASE 01778_db; + +CREATE TABLE 01778_db.hierarchy_source_table (id UInt64, parent_id UInt64) ENGINE = TinyLog; +INSERT INTO 01778_db.hierarchy_source_table VALUES (1, 0), (2, 1), (3, 1), (4, 2); + +CREATE DICTIONARY 01778_db.hierarchy_flat_dictionary +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(FLAT()) +LIFETIME(MIN 1 MAX 1000); + +SELECT 'Flat dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_flat_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_flat_dictionary', number, number) FROM system.numbers LIMIT 6; +SELECT 'Get children'; +SELECT dictGetChildren('01778_db.hierarchy_flat_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get all descendants'; +SELECT 
dictGetDescendants('01778_db.hierarchy_flat_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get descendants at first level'; +SELECT dictGetDescendants('01778_db.hierarchy_flat_dictionary', number, 1) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_flat_dictionary; + +CREATE DICTIONARY 01778_db.hierarchy_hashed_dictionary +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(HASHED()) +LIFETIME(MIN 1 MAX 1000); + +SELECT 'Hashed dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_hashed_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_hashed_dictionary', number, number) FROM system.numbers LIMIT 6; +SELECT 'Get children'; +SELECT dictGetChildren('01778_db.hierarchy_hashed_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get all descendants'; +SELECT dictGetDescendants('01778_db.hierarchy_hashed_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get descendants at first level'; +SELECT dictGetDescendants('01778_db.hierarchy_hashed_dictionary', number, 1) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_hashed_dictionary; + +CREATE DICTIONARY 01778_db.hierarchy_cache_dictionary +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(CACHE(SIZE_IN_CELLS 10)) +LIFETIME(MIN 1 MAX 1000); + +SELECT 'Cache dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_cache_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_cache_dictionary', number, number) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_cache_dictionary; + +CREATE 
DICTIONARY 01778_db.hierarchy_direct_dictionary +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(DIRECT()); + +SELECT 'Direct dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_direct_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_direct_dictionary', number, number) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_direct_dictionary; + +DROP TABLE 01778_db.hierarchy_source_table; +DROP DATABASE 01778_db; diff --git a/tests/queries/0_stateless/01780_dict_get_or_null.reference b/tests/queries/0_stateless/01780_dict_get_or_null.reference new file mode 100644 index 00000000000..4baca9ec91b --- /dev/null +++ b/tests/queries/0_stateless/01780_dict_get_or_null.reference @@ -0,0 +1,18 @@ +Simple key dictionary dictGetOrNull +0 0 \N \N (NULL,NULL) +1 1 First First ('First','First') +2 1 Second \N ('Second',NULL) +3 1 Third Third ('Third','Third') +4 0 \N \N (NULL,NULL) +Complex key dictionary dictGetOrNull +(0,'key') 0 \N \N (NULL,NULL) +(1,'key') 1 First First ('First','First') +(2,'key') 1 Second \N ('Second',NULL) +(3,'key') 1 Third Third ('Third','Third') +(4,'key') 0 \N \N (NULL,NULL) +Range key dictionary dictGetOrNull +(0,'2019-05-20') 0 \N \N (NULL,NULL) +(1,'2019-05-20') 1 First First ('First','First') +(2,'2019-05-20') 1 Second \N ('Second',NULL) +(3,'2019-05-20') 1 Third Third ('Third','Third') +(4,'2019-05-20') 0 \N \N (NULL,NULL) diff --git a/tests/queries/0_stateless/01780_dict_get_or_null.sql b/tests/queries/0_stateless/01780_dict_get_or_null.sql new file mode 100644 index 00000000000..f13bcf57d27 --- /dev/null +++ b/tests/queries/0_stateless/01780_dict_get_or_null.sql @@ -0,0 +1,116 @@ +DROP TABLE IF EXISTS simple_key_dictionary_source_table; +CREATE TABLE simple_key_dictionary_source_table +( + id 
UInt64, + value String, + value_nullable Nullable(String) +) ENGINE = TinyLog; + +INSERT INTO simple_key_dictionary_source_table VALUES (1, 'First', 'First'); +INSERT INTO simple_key_dictionary_source_table VALUES (2, 'Second', NULL); +INSERT INTO simple_key_dictionary_source_table VALUES (3, 'Third', 'Third'); + +DROP DICTIONARY IF EXISTS simple_key_dictionary; +CREATE DICTIONARY simple_key_dictionary +( + id UInt64, + value String, + value_nullable Nullable(String) +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_dictionary_source_table')) +LAYOUT(DIRECT()); + +SELECT 'Simple key dictionary dictGetOrNull'; + +SELECT + number, + dictHas('simple_key_dictionary', number), + dictGetOrNull('simple_key_dictionary', 'value', number), + dictGetOrNull('simple_key_dictionary', 'value_nullable', number), + dictGetOrNull('simple_key_dictionary', ('value', 'value_nullable'), number) +FROM system.numbers LIMIT 5; + +DROP DICTIONARY simple_key_dictionary; +DROP TABLE simple_key_dictionary_source_table; + +DROP TABLE IF EXISTS complex_key_dictionary_source_table; +CREATE TABLE complex_key_dictionary_source_table +( + id UInt64, + id_key String, + value String, + value_nullable Nullable(String) +) ENGINE = TinyLog; + +INSERT INTO complex_key_dictionary_source_table VALUES (1, 'key', 'First', 'First'); +INSERT INTO complex_key_dictionary_source_table VALUES (2, 'key', 'Second', NULL); +INSERT INTO complex_key_dictionary_source_table VALUES (3, 'key', 'Third', 'Third'); + +DROP DICTIONARY IF EXISTS complex_key_dictionary; +CREATE DICTIONARY complex_key_dictionary +( + id UInt64, + id_key String, + value String, + value_nullable Nullable(String) +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_dictionary_source_table')) +LAYOUT(COMPLEX_KEY_DIRECT()); + +SELECT 'Complex key dictionary dictGetOrNull'; + +SELECT + (number, 'key'), + dictHas('complex_key_dictionary', (number, 'key')), + 
dictGetOrNull('complex_key_dictionary', 'value', (number, 'key')), + dictGetOrNull('complex_key_dictionary', 'value_nullable', (number, 'key')), + dictGetOrNull('complex_key_dictionary', ('value', 'value_nullable'), (number, 'key')) +FROM system.numbers LIMIT 5; + +DROP DICTIONARY complex_key_dictionary; +DROP TABLE complex_key_dictionary_source_table; + +DROP TABLE IF EXISTS range_key_dictionary_source_table; +CREATE TABLE range_key_dictionary_source_table +( + key UInt64, + start_date Date, + end_date Date, + value String, + value_nullable Nullable(String) +) +ENGINE = TinyLog(); + +INSERT INTO range_key_dictionary_source_table VALUES(1, toDate('2019-05-20'), toDate('2019-05-20'), 'First', 'First'); +INSERT INTO range_key_dictionary_source_table VALUES(2, toDate('2019-05-20'), toDate('2019-05-20'), 'Second', NULL); +INSERT INTO range_key_dictionary_source_table VALUES(3, toDate('2019-05-20'), toDate('2019-05-20'), 'Third', 'Third'); + +DROP DICTIONARY IF EXISTS range_key_dictionary; +CREATE DICTIONARY range_key_dictionary +( + key UInt64, + start_date Date, + end_date Date, + value String, + value_nullable Nullable(String) +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'range_key_dictionary_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start_date MAX end_date); + +SELECT 'Range key dictionary dictGetOrNull'; + +SELECT + (number, toDate('2019-05-20')), + dictHas('range_key_dictionary', number, toDate('2019-05-20')), + dictGetOrNull('range_key_dictionary', 'value', number, toDate('2019-05-20')), + dictGetOrNull('range_key_dictionary', 'value_nullable', number, toDate('2019-05-20')), + dictGetOrNull('range_key_dictionary', ('value', 'value_nullable'), number, toDate('2019-05-20')) +FROM system.numbers LIMIT 5; + +DROP DICTIONARY range_key_dictionary; +DROP TABLE range_key_dictionary_source_table; diff --git a/tests/queries/0_stateless/01781_token_extractor_buffer_overflow.reference 
b/tests/queries/0_stateless/01781_token_extractor_buffer_overflow.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/01781_token_extractor_buffer_overflow.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/01781_token_extractor_buffer_overflow.sql b/tests/queries/0_stateless/01781_token_extractor_buffer_overflow.sql new file mode 100644 index 00000000000..4cc216955b3 --- /dev/null +++ b/tests/queries/0_stateless/01781_token_extractor_buffer_overflow.sql @@ -0,0 +1,10 @@ +SET max_block_size = 10, min_insert_block_size_rows = 0, min_insert_block_size_bytes = 0, max_threads = 20; + +DROP TABLE IF EXISTS bloom_filter; +CREATE TABLE bloom_filter (`id` UInt64, `s` String, INDEX tok_bf (s, lower(s)) TYPE tokenbf_v1(512, 3, 0) GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8; +INSERT INTO bloom_filter SELECT number, 'yyy,uuu' FROM numbers(1024); + +SELECT max(id) FROM bloom_filter WHERE hasToken(s, 'abc'); +SELECT max(id) FROM bloom_filter WHERE hasToken(s, 'abcabcabcabcabcabcabcab\0'); + +DROP TABLE bloom_filter; diff --git a/tests/queries/0_stateless/01783_http_chunk_size.reference b/tests/queries/0_stateless/01783_http_chunk_size.reference new file mode 100644 index 00000000000..e454a00607c --- /dev/null +++ b/tests/queries/0_stateless/01783_http_chunk_size.reference @@ -0,0 +1 @@ +1234567890 1234567890 1234567890 1234567890 diff --git a/tests/queries/0_stateless/01783_http_chunk_size.sh b/tests/queries/0_stateless/01783_http_chunk_size.sh new file mode 100755 index 00000000000..66ac4dfa975 --- /dev/null +++ b/tests/queries/0_stateless/01783_http_chunk_size.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +URL="${CLICKHOUSE_URL}&session_id=id_${CLICKHOUSE_DATABASE}" + +echo "DROP TABLE IF EXISTS table" | ${CLICKHOUSE_CURL} -sSg "${URL}" -d @- +echo "CREATE TABLE table (a String) ENGINE Memory()" | ${CLICKHOUSE_CURL} -sSg "${URL}" -d @- + +# NOTE: suppose that curl sends everything in a single chunk - there are no options to force the chunk-size. +echo "SET max_query_size=44" | ${CLICKHOUSE_CURL} -sSg "${URL}" -d @- +echo -ne "INSERT INTO TABLE table FORMAT TabSeparated 1234567890 1234567890 1234567890 1234567890\n" | ${CLICKHOUSE_CURL} -H "Transfer-Encoding: chunked" -sS "${URL}" --data-binary @- + +echo "SELECT * from table" | ${CLICKHOUSE_CURL} -sSg "${URL}" -d @- +echo "DROP TABLE table" | ${CLICKHOUSE_CURL} -sSg "${URL}" -d @- diff --git a/tests/queries/1_stateful/00159_parallel_formatting_http.reference b/tests/queries/1_stateful/00159_parallel_formatting_http.reference index 499a0b8a7c7..8eabf5d4f03 100644 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.reference +++ b/tests/queries/1_stateful/00159_parallel_formatting_http.reference @@ -1,12 +1,12 @@ TSV, false -8a984bbbfb127c430f67173f5371c6cb - +6e4ce4996dd0e036d27cb0d2166c8e59 - TSV, true -8a984bbbfb127c430f67173f5371c6cb - +6e4ce4996dd0e036d27cb0d2166c8e59 - CSV, false -ea1c740f03f5dcc43a3044528ad0a98f - +ab6b3616f31e8a952c802ca92562e418 - CSV, true -ea1c740f03f5dcc43a3044528ad0a98f - +ab6b3616f31e8a952c802ca92562e418 - JSONCompactEachRow, false -ba1081a754a06ef6563840b2d8d4d327 - +1651b540b43bd6c62446f4c340bf13c7 - JSONCompactEachRow, true -ba1081a754a06ef6563840b2d8d4d327 - +1651b540b43bd6c62446f4c340bf13c7 - diff --git a/tests/queries/1_stateful/00159_parallel_formatting_http.sh b/tests/queries/1_stateful/00159_parallel_formatting_http.sh index 8fd8c15b7c7..a4e68de6a3f 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.sh +++ b/tests/queries/1_stateful/00159_parallel_formatting_http.sh @@ -10,8 +10,8 @@ FORMATS=('TSV' 'CSV' 
'JSONCompactEachRow') for format in "${FORMATS[@]}" do echo "$format, false"; - ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+Format+$format&output_format_parallel_formatting=false" -d' ' | md5sum + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+LIMIT+1000000+Format+$format&output_format_parallel_formatting=false" -d' ' | md5sum echo "$format, true"; - ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+Format+$format&output_format_parallel_formatting=true" -d' ' | md5sum + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+LIMIT+1000000+Format+$format&output_format_parallel_formatting=true" -d' ' | md5sum done diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index df2090325a3..4759fb95602 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -164,7 +164,8 @@ "00062_replicated_merge_tree_alter_zookeeper", /// Does not support renaming of multiple tables in single query "00634_rename_view", - "00140_rename" + "00140_rename", + "01783_http_chunk_size" ], "polymorphic-parts": [ "01508_partition_pruning_long", /// bug, shoud be fixed @@ -641,6 +642,7 @@ "01542_dictionary_load_exception_race", "01545_system_errors", // looks at the difference of values in system.errors "01560_optimize_on_insert_zookeeper", + "01563_distributed_query_finish", // looks at system.errors which is global "01575_disable_detach_table_of_dictionary", "01593_concurrent_alter_mutations_kill", "01593_concurrent_alter_mutations_kill_many_replicas", @@ -667,6 +669,7 @@ "01702_system_query_log", // Runs many global system queries "01715_background_checker_blather_zookeeper", 
"01721_engine_file_truncate_on_insert", // It's ok to execute in parallel but not several instances of the same test. + "01722_long_brotli_http_compression_json_format", // It is broken in some unimaginable way with the genius error 'cannot write to ofstream'. Not sure how to debug this "01747_alter_partition_key_enum_zookeeper", "01748_dictionary_table_dot", // creates database "01760_polygon_dictionaries", @@ -679,6 +682,19 @@ "live_view", "memory_leak", "memory_limit", - "polygon_dicts" // they use an explicitly specified database + "polygon_dicts", // they use an explicitly specified database + "01658_read_file_to_stringcolumn", + "01721_engine_file_truncate_on_insert", // It's ok to execute in parallel but not several instances of the same test. + "01702_system_query_log", // It's ok to execute in parallel with oter tests but not several instances of the same test. + "01748_dictionary_table_dot", // creates database + "00950_dict_get", + "01683_flat_dictionary", + "01681_cache_dictionary_simple_key", + "01682_cache_dictionary_complex_key", + "01684_ssd_cache_dictionary_simple_key", + "01685_ssd_cache_dictionary_complex_key", + "01760_system_dictionaries", + "01760_polygon_dictionaries", + "01778_hierarchical_dictionaries" ] } diff --git a/tests/testflows/regression.py b/tests/testflows/regression.py index 05fec3ea985..45f1ed64a6c 100755 --- a/tests/testflows/regression.py +++ b/tests/testflows/regression.py @@ -14,10 +14,10 @@ def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): """ args = {"local": local, "clickhouse_binary_path": clickhouse_binary_path, "stress": stress, "parallel": parallel} - Feature(test=load("example.regression", "regression"))(**args) - Feature(test=load("ldap.regression", "regression"))(**args) - Feature(test=load("rbac.regression", "regression"))(**args) - Feature(test=load("aes_encryption.regression", "regression"))(**args) + # Feature(test=load("example.regression", "regression"))(**args) + # 
Feature(test=load("ldap.regression", "regression"))(**args) + # Feature(test=load("rbac.regression", "regression"))(**args) + # Feature(test=load("aes_encryption.regression", "regression"))(**args) # Feature(test=load("kerberos.regression", "regression"))(**args) if main():