diff --git a/CMakeLists.txt b/CMakeLists.txt index 37822f95e10..c4475c38ec1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,8 @@ else() set(RECONFIGURE_MESSAGE_LEVEL STATUS) endif() +enable_language(C CXX ASM) + include (cmake/arch.cmake) include (cmake/target.cmake) include (cmake/tools.cmake) diff --git a/cmake/find/ldap.cmake b/cmake/find/ldap.cmake index 369c1e42e8d..0dffa334e73 100644 --- a/cmake/find/ldap.cmake +++ b/cmake/find/ldap.cmake @@ -62,6 +62,7 @@ if (NOT OPENLDAP_FOUND AND NOT MISSING_INTERNAL_LDAP_LIBRARY) if ( ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "x86_64" ) OR ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "aarch64" ) OR + ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "ppc64le" ) OR ( "${_system_name}" STREQUAL "freebsd" AND "${_system_processor}" STREQUAL "x86_64" ) OR ( "${_system_name}" STREQUAL "darwin" AND "${_system_processor}" STREQUAL "x86_64" ) ) diff --git a/cmake/find/s3.cmake b/cmake/find/s3.cmake index 1bbf48fd6b0..1b0c652a31a 100644 --- a/cmake/find/s3.cmake +++ b/cmake/find/s3.cmake @@ -1,7 +1,7 @@ -if(NOT OS_FREEBSD AND NOT APPLE AND NOT ARCH_ARM) +if(NOT OS_FREEBSD AND NOT APPLE) option(ENABLE_S3 "Enable S3" ${ENABLE_LIBRARIES}) elseif(ENABLE_S3 OR USE_INTERNAL_AWS_S3_LIBRARY) - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on ARM, Apple or FreeBSD") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on Apple or FreeBSD") endif() if(NOT ENABLE_S3) diff --git a/cmake/linux/default_libs.cmake b/cmake/linux/default_libs.cmake index d3a727e9cb8..c1e4d450389 100644 --- a/cmake/linux/default_libs.cmake +++ b/cmake/linux/default_libs.cmake @@ -6,7 +6,7 @@ set (DEFAULT_LIBS "-nodefaultlibs") # We need builtins from Clang's RT even without libcxx - for ubsan+int128. 
# See https://bugs.llvm.org/show_bug.cgi?id=16404 if (COMPILER_CLANG AND NOT (CMAKE_CROSSCOMPILING AND ARCH_AARCH64)) - execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.builtins-${CMAKE_SYSTEM_PROCESSOR}.a OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) else () set (BUILTINS_LIBRARY "-lgcc") endif () diff --git a/cmake/tools.cmake b/cmake/tools.cmake index abb11843d59..44fc3b3e530 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -86,8 +86,3 @@ if (LINKER_NAME) message(STATUS "Using custom linker by name: ${LINKER_NAME}") endif () -if (ARCH_PPC64LE) - if (COMPILER_CLANG OR (COMPILER_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)) - message(FATAL_ERROR "Only gcc-8 or higher is supported for powerpc architecture") - endif () -endif () diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index b9298f59f2b..0759935a7db 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -160,6 +160,12 @@ if (NOT EXTERNAL_BOOST_FOUND) enable_language(ASM) SET(ASM_OPTIONS "-x assembler-with-cpp") + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread")) add_compile_definitions(BOOST_USE_UCONTEXT) @@ -169,39 +175,34 @@ if (NOT EXTERNAL_BOOST_FOUND) add_compile_definitions(BOOST_USE_TSAN) endif() - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/fiber.cpp ${LIBRARY_DIR}/libs/context/src/continuation.cpp - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) - 
elseif (ARCH_ARM) - set (SRCS_CONTEXT + endif() + if (ARCH_ARM) + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + elseif (ARCH_PPC64LE) + set (SRCS_CONTEXT ${SRCS_CONTEXT} + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S ) elseif(OS_DARWIN) - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) else() - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) endif() diff --git a/contrib/cctz-cmake/CMakeLists.txt b/contrib/cctz-cmake/CMakeLists.txt index 90e33dc9f62..a3869478347 100644 --- a/contrib/cctz-cmake/CMakeLists.txt +++ b/contrib/cctz-cmake/CMakeLists.txt @@ -97,12 +97,19 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS) set(TZ_OBJS ${TZ_OBJS} ${TZ_OBJ}) # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - 
add_custom_command(OUTPUT ${TZ_OBJ} - COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} - COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} + # PPC64LE fails to do this with objcopy, use ld or lld instead + if (ARCH_PPC64LE) + add_custom_command(OUTPUT ${TZ_OBJ} + COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} + COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${TZ_OBJ} ${TIMEZONE_ID} + COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) + else() + add_custom_command(OUTPUT ${TZ_OBJ} + COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} + COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} --rename-section .data=.rodata,alloc,load,readonly,data,contents ${TIMEZONE_ID} ${TZ_OBJ} - COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) - + COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) + endif() set_source_files_properties(${TZ_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) endforeach(TIMEZONE) diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index b8a6474413a..73afa99f1d8 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -1,7 +1,7 @@ -if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN)) +if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN)) if (ENABLE_JEMALLOC) message (${RECONFIGURE_MESSAGE_LEVEL} - "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64 or aarch64 on linux or freebsd.") + "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64 or ppc64le on linux or freebsd.") endif() set (ENABLE_JEMALLOC OFF) else() @@ -107,6 +107,8 @@ if (ARCH_AMD64) set(JEMALLOC_INCLUDE_PREFIX 
"${JEMALLOC_INCLUDE_PREFIX}_x86_64") elseif (ARCH_ARM) set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_aarch64") +elseif (ARCH_PPC64LE) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_ppc64le") else () message (FATAL_ERROR "internal jemalloc: This arch is not supported") endif () diff --git a/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..8068861041f --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,367 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#define JEMALLOC_OVERRIDE___LIBC_CALLOC +#define JEMALLOC_OVERRIDE___LIBC_FREE +#define JEMALLOC_OVERRIDE___LIBC_MALLOC +#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN +#define JEMALLOC_OVERRIDE___LIBC_REALLOC +#define JEMALLOC_OVERRIDE___LIBC_VALLOC +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. 
+ */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 64 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS 1 + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS 1 +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1 + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS 1 +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1 + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +// #define JEMALLOC_HAVE_SECURE_GETENV + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1 + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1 + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if _malloc_thread_cleanup() exists. 
At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support optional abort() on OOM. 
*/ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. 
+ */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). 
*/ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H 1 + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT 1 + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD 1 + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC 1 + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. 
+ */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h b/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h new file mode 100644 index 00000000000..dbd59430527 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h @@ -0,0 +1,63 @@ +/* include/lber_types.h. Generated from lber_types.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software <http://www.openldap.org/>. + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + */ + +/* + * LBER types + */ + +#ifndef _LBER_TYPES_H +#define _LBER_TYPES_H + +#include <ldap_cdefs.h> + +LDAP_BEGIN_DECL + +/* LBER boolean, enum, integers (32 bits or larger) */ +#define LBER_INT_T int + +/* LBER tags (32 bits or larger) */ +#define LBER_TAG_T long + +/* LBER socket descriptor */ +#define LBER_SOCKET_T int + +/* LBER lengths (32 bits or larger) */ +#define LBER_LEN_T long + +/* ------------------------------------------------------------ */ + +/* booleans, enumerations, and integers */ +typedef LBER_INT_T ber_int_t; + +/* signed and unsigned versions */ +typedef signed LBER_INT_T ber_sint_t; +typedef unsigned LBER_INT_T ber_uint_t; + +/* tags */ +typedef unsigned LBER_TAG_T ber_tag_t; + +/* "socket" descriptors */ +typedef LBER_SOCKET_T ber_socket_t; + +/* lengths */ +typedef unsigned LBER_LEN_T ber_len_t; + +/* signed lengths */ +typedef signed LBER_LEN_T ber_slen_t; + +LDAP_END_DECL + +#endif /* _LBER_TYPES_H */ diff --git 
a/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h b/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h new file mode 100644 index 00000000000..89f7b40b884 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h @@ -0,0 +1,74 @@ +/* include/ldap_config.h. Generated from ldap_config.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software <http://www.openldap.org/>. + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + */ + +/* + * This file works in conjunction with OpenLDAP configure system. + * If you do not like the values below, adjust your configure options. + */ + +#ifndef _LDAP_CONFIG_H +#define _LDAP_CONFIG_H + +/* directory separator */ +#ifndef LDAP_DIRSEP +#ifndef _WIN32 +#define LDAP_DIRSEP "/" +#else +#define LDAP_DIRSEP "\\" +#endif +#endif + +/* directory for temporary files */ +#if defined(_WIN32) +# define LDAP_TMPDIR "C:\\." 
/* we don't have much of a choice */ +#elif defined( _P_tmpdir ) +# define LDAP_TMPDIR _P_tmpdir +#elif defined( P_tmpdir ) +# define LDAP_TMPDIR P_tmpdir +#elif defined( _PATH_TMPDIR ) +# define LDAP_TMPDIR _PATH_TMPDIR +#else +# define LDAP_TMPDIR LDAP_DIRSEP "tmp" +#endif + +/* directories */ +#ifndef LDAP_BINDIR +#define LDAP_BINDIR "/tmp/ldap-prefix/bin" +#endif +#ifndef LDAP_SBINDIR +#define LDAP_SBINDIR "/tmp/ldap-prefix/sbin" +#endif +#ifndef LDAP_DATADIR +#define LDAP_DATADIR "/tmp/ldap-prefix/share/openldap" +#endif +#ifndef LDAP_SYSCONFDIR +#define LDAP_SYSCONFDIR "/tmp/ldap-prefix/etc/openldap" +#endif +#ifndef LDAP_LIBEXECDIR +#define LDAP_LIBEXECDIR "/tmp/ldap-prefix/libexec" +#endif +#ifndef LDAP_MODULEDIR +#define LDAP_MODULEDIR "/tmp/ldap-prefix/libexec/openldap" +#endif +#ifndef LDAP_RUNDIR +#define LDAP_RUNDIR "/tmp/ldap-prefix/var" +#endif +#ifndef LDAP_LOCALEDIR +#define LDAP_LOCALEDIR "" +#endif + + +#endif /* _LDAP_CONFIG_H */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h b/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h new file mode 100644 index 00000000000..f0cc7c3626f --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h @@ -0,0 +1,61 @@ +/* include/ldap_features.h. Generated from ldap_features.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +/* + * LDAP Features + */ + +#ifndef _LDAP_FEATURES_H +#define _LDAP_FEATURES_H 1 + +/* OpenLDAP API version macros */ +#define LDAP_VENDOR_VERSION 20501 +#define LDAP_VENDOR_VERSION_MAJOR 2 +#define LDAP_VENDOR_VERSION_MINOR 5 +#define LDAP_VENDOR_VERSION_PATCH X + +/* +** WORK IN PROGRESS! +** +** OpenLDAP reentrancy/thread-safeness should be dynamically +** checked using ldap_get_option(). +** +** The -lldap implementation is not thread-safe. +** +** The -lldap_r implementation is: +** LDAP_API_FEATURE_THREAD_SAFE (basic thread safety) +** but also be: +** LDAP_API_FEATURE_SESSION_THREAD_SAFE +** LDAP_API_FEATURE_OPERATION_THREAD_SAFE +** +** The preprocessor flag LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE +** can be used to determine if -lldap_r is available at compile +** time. You must define LDAP_THREAD_SAFE if and only if you +** link with -lldap_r. +** +** If you fail to define LDAP_THREAD_SAFE when linking with +** -lldap_r or define LDAP_THREAD_SAFE when linking with -lldap, +** provided header definitions and declarations may be incorrect. +** +*/ + +/* is -lldap_r available or not */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* LDAP v2 Referrals */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +#endif /* LDAP_FEATURES */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/portable.h b/contrib/openldap-cmake/linux_ppc64le/include/portable.h new file mode 100644 index 00000000000..2924b6713a4 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/portable.h @@ -0,0 +1,1169 @@ +/* include/portable.h. Generated from portable.hin by configure. */ +/* include/portable.hin. Generated from configure.in by autoheader. */ + + +/* begin of portable.h.pre */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _LDAP_PORTABLE_H +#define _LDAP_PORTABLE_H + +/* define this if needed to get reentrant functions */ +#ifndef REENTRANT +#define REENTRANT 1 +#endif +#ifndef _REENTRANT +#define _REENTRANT 1 +#endif + +/* define this if needed to get threadsafe functions */ +#ifndef THREADSAFE +#define THREADSAFE 1 +#endif +#ifndef _THREADSAFE +#define _THREADSAFE 1 +#endif +#ifndef THREAD_SAFE +#define THREAD_SAFE 1 +#endif +#ifndef _THREAD_SAFE +#define _THREAD_SAFE 1 +#endif + +#ifndef _SGI_MP_SOURCE +#define _SGI_MP_SOURCE 1 +#endif + +/* end of portable.h.pre */ + + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* define to use both and */ +/* #undef BOTH_STRINGS_H */ + +/* define if cross compiling */ +/* #undef CROSS_COMPILING */ + +/* set to the number of arguments ctime_r() expects */ +#define CTIME_R_NARGS 2 + +/* define if toupper() requires islower() */ +/* #undef C_UPPER_LOWER */ + +/* define if sys_errlist is not declared in stdio.h or errno.h */ +/* #undef DECL_SYS_ERRLIST */ + +/* define to enable slapi library */ +/* #undef ENABLE_SLAPI */ + +/* defined to be the EXE extension */ +#define EXEEXT "" + +/* set to the number of arguments gethostbyaddr_r() expects */ +#define GETHOSTBYADDR_R_NARGS 8 + +/* set to the number of arguments gethostbyname_r() expects */ +#define GETHOSTBYNAME_R_NARGS 6 + +/* Define to 1 if `TIOCGWINSZ' requires . */ +#define GWINSZ_IN_SYS_IOCTL 1 + +/* define if you have AIX security lib */ +/* #undef HAVE_AIX_SECURITY */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_ARPA_NAMESER_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ASSERT_H 1 + +/* Define to 1 if you have the `bcopy' function. */ +#define HAVE_BCOPY 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_BITS_TYPES_H 1 + +/* Define to 1 if you have the `chroot' function. */ +#define HAVE_CHROOT 1 + +/* Define to 1 if you have the `closesocket' function. */ +/* #undef HAVE_CLOSESOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CONIO_H */ + +/* define if crypt(3) is available */ +/* #undef HAVE_CRYPT */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRYPT_H */ + +/* define if crypt_r() is also available */ +/* #undef HAVE_CRYPT_R */ + +/* Define to 1 if you have the `ctime_r' function. */ +#define HAVE_CTIME_R 1 + +/* define if you have Cyrus SASL */ +/* #undef HAVE_CYRUS_SASL */ + +/* define if your system supports /dev/poll */ +/* #undef HAVE_DEVPOLL */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DIRECT_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#define HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ +/* #undef HAVE_DOPRNT */ + +/* define if system uses EBCDIC instead of ASCII */ +/* #undef HAVE_EBCDIC */ + +/* Define to 1 if you have the `endgrent' function. */ +#define HAVE_ENDGRENT 1 + +/* Define to 1 if you have the `endpwent' function. */ +#define HAVE_ENDPWENT 1 + +/* define if your system supports epoll */ +#define HAVE_EPOLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H 1 + +/* Define to 1 if you have the `fcntl' function. */ +#define HAVE_FCNTL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* define if you actually have FreeBSD fetch(3) */ +/* #undef HAVE_FETCH */ + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_FILIO_H */ + +/* Define to 1 if you have the `flock' function. */ +#define HAVE_FLOCK 1 + +/* Define to 1 if you have the `fstat' function. */ +#define HAVE_FSTAT 1 + +/* Define to 1 if you have the `gai_strerror' function. */ +#define HAVE_GAI_STRERROR 1 + +/* Define to 1 if you have the `getaddrinfo' function. */ +#define HAVE_GETADDRINFO 1 + +/* Define to 1 if you have the `getdtablesize' function. */ +#define HAVE_GETDTABLESIZE 1 + +/* Define to 1 if you have the `geteuid' function. */ +#define HAVE_GETEUID 1 + +/* Define to 1 if you have the `getgrgid' function. */ +#define HAVE_GETGRGID 1 + +/* Define to 1 if you have the `gethostbyaddr_r' function. */ +#define HAVE_GETHOSTBYADDR_R 1 + +/* Define to 1 if you have the `gethostbyname_r' function. */ +#define HAVE_GETHOSTBYNAME_R 1 + +/* Define to 1 if you have the `gethostname' function. */ +#define HAVE_GETHOSTNAME 1 + +/* Define to 1 if you have the `getnameinfo' function. */ +#define HAVE_GETNAMEINFO 1 + +/* Define to 1 if you have the `getopt' function. */ +#define HAVE_GETOPT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_GETOPT_H 1 + +/* Define to 1 if you have the `getpassphrase' function. */ +/* #undef HAVE_GETPASSPHRASE */ + +/* Define to 1 if you have the `getpeereid' function. */ +/* #undef HAVE_GETPEEREID */ + +/* Define to 1 if you have the `getpeerucred' function. */ +/* #undef HAVE_GETPEERUCRED */ + +/* Define to 1 if you have the `getpwnam' function. */ +#define HAVE_GETPWNAM 1 + +/* Define to 1 if you have the `getpwuid' function. */ +#define HAVE_GETPWUID 1 + +/* Define to 1 if you have the `getspnam' function. */ +#define HAVE_GETSPNAM 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GMP_H */ + +/* Define to 1 if you have the `gmtime_r' function. 
*/ +#define HAVE_GMTIME_R 1 + +/* define if you have GNUtls */ +/* #undef HAVE_GNUTLS */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GNUTLS_GNUTLS_H */ + +/* if you have GNU Pth */ +/* #undef HAVE_GNU_PTH */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define to 1 if you have the `hstrerror' function. */ +#define HAVE_HSTRERROR 1 + +/* define to you inet_aton(3) is available */ +#define HAVE_INET_ATON 1 + +/* Define to 1 if you have the `inet_ntoa_b' function. */ +/* #undef HAVE_INET_NTOA_B */ + +/* Define to 1 if you have the `inet_ntop' function. */ +#define HAVE_INET_NTOP 1 + +/* Define to 1 if you have the `initgroups' function. */ +#define HAVE_INITGROUPS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `ioctl' function. */ +#define HAVE_IOCTL 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IO_H */ + +/* define if your system supports kqueue */ +/* #undef HAVE_KQUEUE */ + +/* Define to 1 if you have the `gen' library (-lgen). */ +/* #undef HAVE_LIBGEN */ + +/* Define to 1 if you have the `gmp' library (-lgmp). */ +/* #undef HAVE_LIBGMP */ + +/* Define to 1 if you have the `inet' library (-linet). */ +/* #undef HAVE_LIBINET */ + +/* define if you have libtool -ltdl */ +/* #undef HAVE_LIBLTDL */ + +/* Define to 1 if you have the `net' library (-lnet). */ +/* #undef HAVE_LIBNET */ + +/* Define to 1 if you have the `nsl' library (-lnsl). */ +/* #undef HAVE_LIBNSL */ + +/* Define to 1 if you have the `nsl_s' library (-lnsl_s). */ +/* #undef HAVE_LIBNSL_S */ + +/* Define to 1 if you have the `socket' library (-lsocket). */ +/* #undef HAVE_LIBSOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define to 1 if you have the `V3' library (-lV3). */ +/* #undef HAVE_LIBV3 */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_LIMITS_H 1 + +/* if you have LinuxThreads */ +/* #undef HAVE_LINUX_THREADS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if you have the `localtime_r' function. */ +#define HAVE_LOCALTIME_R 1 + +/* Define to 1 if you have the `lockf' function. */ +#define HAVE_LOCKF 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LTDL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `memcpy' function. */ +#define HAVE_MEMCPY 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memrchr' function. */ +#define HAVE_MEMRCHR 1 + +/* Define to 1 if you have the `mkstemp' function. */ +#define HAVE_MKSTEMP 1 + +/* Define to 1 if you have the `mktemp' function. */ +#define HAVE_MKTEMP 1 + +/* define this if you have mkversion */ +#define HAVE_MKVERSION 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +/* #undef HAVE_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H 1 + +/* define if strerror_r returns char* instead of int */ +/* #undef HAVE_NONPOSIX_STRERROR_R */ + +/* if you have NT Event Log */ +/* #undef HAVE_NT_EVENT_LOG */ + +/* if you have NT Service Manager */ +/* #undef HAVE_NT_SERVICE_MANAGER */ + +/* if you have NT Threads */ +/* #undef HAVE_NT_THREADS */ + +/* define if you have OpenSSL */ +#define HAVE_OPENSSL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_BN_H 1 + +/* define if you have OpenSSL with CRL checking capability */ +#define HAVE_OPENSSL_CRL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_CRYPTO_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_OPENSSL_SSL_H 1 + +/* Define to 1 if you have the `pipe' function. */ +#define HAVE_PIPE 1 + +/* Define to 1 if you have the `poll' function. */ +#define HAVE_POLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PROCESS_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PSAP_H */ + +/* define to pthreads API spec revision */ +#define HAVE_PTHREADS 10 + +/* define if you have pthread_detach function */ +#define HAVE_PTHREAD_DETACH 1 + +/* Define to 1 if you have the `pthread_getconcurrency' function. */ +#define HAVE_PTHREAD_GETCONCURRENCY 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_kill' function. */ +#define HAVE_PTHREAD_KILL 1 + +/* Define to 1 if you have the `pthread_kill_other_threads_np' function. */ +/* #undef HAVE_PTHREAD_KILL_OTHER_THREADS_NP */ + +/* define if you have pthread_rwlock_destroy function */ +#define HAVE_PTHREAD_RWLOCK_DESTROY 1 + +/* Define to 1 if you have the `pthread_setconcurrency' function. */ +#define HAVE_PTHREAD_SETCONCURRENCY 1 + +/* Define to 1 if you have the `pthread_yield' function. */ +#define HAVE_PTHREAD_YIELD 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PTH_H */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PWD_H 1 + +/* Define to 1 if you have the `read' function. */ +#define HAVE_READ 1 + +/* Define to 1 if you have the `recv' function. */ +#define HAVE_RECV 1 + +/* Define to 1 if you have the `recvfrom' function. */ +#define HAVE_RECVFROM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_REGEX_H 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_RESOLV_H */ + +/* define if you have res_query() */ +/* #undef HAVE_RES_QUERY */ + +/* define if OpenSSL needs RSAref */ +/* #undef HAVE_RSAREF */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_SASL_H */ + +/* define if your SASL library has sasl_version() */ +/* #undef HAVE_SASL_VERSION */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the `sched_yield' function. */ +#define HAVE_SCHED_YIELD 1 + +/* Define to 1 if you have the `send' function. */ +#define HAVE_SEND 1 + +/* Define to 1 if you have the `sendmsg' function. */ +#define HAVE_SENDMSG 1 + +/* Define to 1 if you have the `sendto' function. */ +#define HAVE_SENDTO 1 + +/* Define to 1 if you have the `setegid' function. */ +#define HAVE_SETEGID 1 + +/* Define to 1 if you have the `seteuid' function. */ +#define HAVE_SETEUID 1 + +/* Define to 1 if you have the `setgid' function. */ +#define HAVE_SETGID 1 + +/* Define to 1 if you have the `setpwfile' function. */ +/* #undef HAVE_SETPWFILE */ + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the `setuid' function. */ +#define HAVE_SETUID 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SGTTY_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SHADOW_H */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `signal' function. */ +#define HAVE_SIGNAL 1 + +/* Define to 1 if you have the `sigset' function. */ +#define HAVE_SIGSET 1 + +/* define if you have -lslp */ +/* #undef HAVE_SLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SLP_H */ + +/* Define to 1 if you have the `snprintf' function. 
*/ +#define HAVE_SNPRINTF 1 + +/* if you have spawnlp() */ +/* #undef HAVE_SPAWNLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQLEXT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strdup' function. */ +#define HAVE_STRDUP 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the `strerror_r' function. */ +#define HAVE_STRERROR_R 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strpbrk' function. */ +#define HAVE_STRPBRK 1 + +/* Define to 1 if you have the `strrchr' function. */ +#define HAVE_STRRCHR 1 + +/* Define to 1 if you have the `strsep' function. */ +#define HAVE_STRSEP 1 + +/* Define to 1 if you have the `strspn' function. */ +#define HAVE_STRSPN 1 + +/* Define to 1 if you have the `strstr' function. */ +#define HAVE_STRSTR 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoll' function. */ +#define HAVE_STRTOLL 1 + +/* Define to 1 if you have the `strtoq' function. */ +#define HAVE_STRTOQ 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `strtoull' function. */ +#define HAVE_STRTOULL 1 + +/* Define to 1 if you have the `strtouq' function. */ +#define HAVE_STRTOUQ 1 + +/* Define to 1 if `msg_accrightslen' is a member of `struct msghdr'. 
*/ +/* #undef HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTSLEN */ + +/* Define to 1 if `msg_control' is a member of `struct msghdr'. */ +#define HAVE_STRUCT_MSGHDR_MSG_CONTROL 1 + +/* Define to 1 if `pw_gecos' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_GECOS 1 + +/* Define to 1 if `pw_passwd' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_PASSWD 1 + +/* Define to 1 if `st_blksize' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLKSIZE 1 + +/* Define to 1 if `st_fstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE */ + +/* define to 1 if st_fstype is char * */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_CHAR */ + +/* define to 1 if st_fstype is int */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_INT */ + +/* Define to 1 if `st_vfstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_VFSTYPE */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYNCH_H */ + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSEXITS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_DEVPOLL_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_DIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_EPOLL_H 1 + +/* define if you actually have sys_errlist in your libs */ +#define HAVE_SYS_ERRLIST 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_ERRNO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_EVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILE_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_FILIO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_FSTYP_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PRIVGRP_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_UCRED_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_UUID_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_VMOUNT_H */ + +/* Define to 1 if you have that is POSIX.1 compatible. */ +#define HAVE_SYS_WAIT_H 1 + +/* define if you have -lwrap */ +/* #undef HAVE_TCPD */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TCPD_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* if you have Solaris LWP (thr) package */ +/* #undef HAVE_THR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_THREAD_H */ + +/* Define to 1 if you have the `thr_getconcurrency' function. */ +/* #undef HAVE_THR_GETCONCURRENCY */ + +/* Define to 1 if you have the `thr_setconcurrency' function. 
*/ +/* #undef HAVE_THR_SETCONCURRENCY */ + +/* Define to 1 if you have the `thr_yield' function. */ +/* #undef HAVE_THR_YIELD */ + +/* define if you have TLS */ +#define HAVE_TLS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UTIME_H 1 + +/* define if you have uuid_generate() */ +/* #undef HAVE_UUID_GENERATE */ + +/* define if you have uuid_to_str() */ +/* #undef HAVE_UUID_TO_STR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UUID_UUID_H */ + +/* Define to 1 if you have the `vprintf' function. */ +#define HAVE_VPRINTF 1 + +/* Define to 1 if you have the `vsnprintf' function. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 if you have the `wait4' function. */ +#define HAVE_WAIT4 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* define if you have winsock */ +/* #undef HAVE_WINSOCK */ + +/* define if you have winsock2 */ +/* #undef HAVE_WINSOCK2 */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK2_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WIREDTIGER_H */ + +/* Define to 1 if you have the `write' function. */ +#define HAVE_WRITE 1 + +/* define if select implicitly yields */ +#define HAVE_YIELDING_SELECT 1 + +/* Define to 1 if you have the `_vsnprintf' function. 
*/ +/* #undef HAVE__VSNPRINTF */ + +/* define to 32-bit or greater integer type */ +#define LBER_INT_T int + +/* define to large integer type */ +#define LBER_LEN_T long + +/* define to socket descriptor type */ +#define LBER_SOCKET_T int + +/* define to large integer type */ +#define LBER_TAG_T long + +/* define to 1 if library is thread safe */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* define to LDAP VENDOR VERSION */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +/* define this to add debugging code */ +/* #undef LDAP_DEBUG */ + +/* define if LDAP libs are dynamic */ +/* #undef LDAP_LIBS_DYNAMIC */ + +/* define to support PF_INET6 */ +#define LDAP_PF_INET6 1 + +/* define to support PF_LOCAL */ +#define LDAP_PF_LOCAL 1 + +/* define this to add SLAPI code */ +/* #undef LDAP_SLAPI */ + +/* define this to add syslog code */ +/* #undef LDAP_SYSLOG */ + +/* Version */ +#define LDAP_VENDOR_VERSION 20501 + +/* Major */ +#define LDAP_VENDOR_VERSION_MAJOR 2 + +/* Minor */ +#define LDAP_VENDOR_VERSION_MINOR 5 + +/* Patch */ +#define LDAP_VENDOR_VERSION_PATCH X + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* define if memcmp is not 8-bit clean or is otherwise broken */ +/* #undef NEED_MEMCMP_REPLACEMENT */ + +/* define if you have (or want) no threads */ +/* #undef NO_THREADS */ + +/* define to use the original debug style */ +/* #undef OLD_DEBUG */ + +/* Package */ +#define OPENLDAP_PACKAGE "OpenLDAP" + +/* Version */ +#define OPENLDAP_VERSION "2.5.X" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "" + +/* Define to the home page for this package. 
*/ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "" + +/* define if sched_yield yields the entire process */ +/* #undef REPLACE_BROKEN_YIELD */ + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* Define to the type of arg 1 for `select'. */ +#define SELECT_TYPE_ARG1 int + +/* Define to the type of args 2, 3 and 4 for `select'. */ +#define SELECT_TYPE_ARG234 (fd_set *) + +/* Define to the type of arg 5 for `select'. */ +#define SELECT_TYPE_ARG5 (struct timeval *) + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `wchar_t', as computed by sizeof. */ +#define SIZEOF_WCHAR_T 4 + +/* define to support per-object ACIs */ +/* #undef SLAPD_ACI_ENABLED */ + +/* define to support LDAP Async Metadirectory backend */ +/* #undef SLAPD_ASYNCMETA */ + +/* define to support cleartext passwords */ +/* #undef SLAPD_CLEARTEXT */ + +/* define to support crypt(3) passwords */ +/* #undef SLAPD_CRYPT */ + +/* define to support DNS SRV backend */ +/* #undef SLAPD_DNSSRV */ + +/* define to support LDAP backend */ +/* #undef SLAPD_LDAP */ + +/* define to support MDB backend */ +/* #undef SLAPD_MDB */ + +/* define to support LDAP Metadirectory backend */ +/* #undef SLAPD_META */ + +/* define to support modules */ +/* #undef SLAPD_MODULES */ + +/* dynamically linked module */ +#define SLAPD_MOD_DYNAMIC 2 + +/* statically linked module */ +#define SLAPD_MOD_STATIC 1 + +/* define to support cn=Monitor backend */ +/* #undef SLAPD_MONITOR */ + +/* define to support NDB backend */ +/* #undef SLAPD_NDB */ + +/* define to support NULL backend */ +/* #undef SLAPD_NULL */ + +/* define for In-Directory Access Logging overlay */ 
+/* #undef SLAPD_OVER_ACCESSLOG */ + +/* define for Audit Logging overlay */ +/* #undef SLAPD_OVER_AUDITLOG */ + +/* define for Automatic Certificate Authority overlay */ +/* #undef SLAPD_OVER_AUTOCA */ + +/* define for Collect overlay */ +/* #undef SLAPD_OVER_COLLECT */ + +/* define for Attribute Constraint overlay */ +/* #undef SLAPD_OVER_CONSTRAINT */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DDS */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DEREF */ + +/* define for Dynamic Group overlay */ +/* #undef SLAPD_OVER_DYNGROUP */ + +/* define for Dynamic List overlay */ +/* #undef SLAPD_OVER_DYNLIST */ + +/* define for Reverse Group Membership overlay */ +/* #undef SLAPD_OVER_MEMBEROF */ + +/* define for Password Policy overlay */ +/* #undef SLAPD_OVER_PPOLICY */ + +/* define for Proxy Cache overlay */ +/* #undef SLAPD_OVER_PROXYCACHE */ + +/* define for Referential Integrity overlay */ +/* #undef SLAPD_OVER_REFINT */ + +/* define for Return Code overlay */ +/* #undef SLAPD_OVER_RETCODE */ + +/* define for Rewrite/Remap overlay */ +/* #undef SLAPD_OVER_RWM */ + +/* define for Sequential Modify overlay */ +/* #undef SLAPD_OVER_SEQMOD */ + +/* define for ServerSideSort/VLV overlay */ +/* #undef SLAPD_OVER_SSSVLV */ + +/* define for Syncrepl Provider overlay */ +/* #undef SLAPD_OVER_SYNCPROV */ + +/* define for Translucent Proxy overlay */ +/* #undef SLAPD_OVER_TRANSLUCENT */ + +/* define for Attribute Uniqueness overlay */ +/* #undef SLAPD_OVER_UNIQUE */ + +/* define for Value Sorting overlay */ +/* #undef SLAPD_OVER_VALSORT */ + +/* define to support PASSWD backend */ +/* #undef SLAPD_PASSWD */ + +/* define to support PERL backend */ +/* #undef SLAPD_PERL */ + +/* define to support relay backend */ +/* #undef SLAPD_RELAY */ + +/* define to support reverse lookups */ +/* #undef SLAPD_RLOOKUPS */ + +/* define to support SHELL backend */ +/* #undef SLAPD_SHELL */ + +/* define to support SOCK backend */ 
+/* #undef SLAPD_SOCK */ + +/* define to support SASL passwords */ +/* #undef SLAPD_SPASSWD */ + +/* define to support SQL backend */ +/* #undef SLAPD_SQL */ + +/* define to support WiredTiger backend */ +/* #undef SLAPD_WT */ + +/* define to support run-time loadable ACL */ +/* #undef SLAP_DYNACL */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Define to 1 if your declares `struct tm'. */ +/* #undef TM_IN_SYS_TIME */ + +/* set to urandom device */ +#define URANDOM_DEVICE "/dev/urandom" + +/* define to use OpenSSL BIGNUM for MP */ +/* #undef USE_MP_BIGNUM */ + +/* define to use GMP for MP */ +/* #undef USE_MP_GMP */ + +/* define to use 'long' for MP */ +/* #undef USE_MP_LONG */ + +/* define to use 'long long' for MP */ +/* #undef USE_MP_LONG_LONG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to the type of arg 3 for `accept'. */ +#define ber_socklen_t socklen_t + +/* Define to `char *' if does not define. */ +/* #undef caddr_t */ + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define to `int' if does not define. */ +/* #undef mode_t */ + +/* Define to `long' if does not define. */ +/* #undef off_t */ + +/* Define to `int' if does not define. */ +/* #undef pid_t */ + +/* Define to `int' if does not define. */ +/* #undef sig_atomic_t */ + +/* Define to `unsigned' if does not define. */ +/* #undef size_t */ + +/* define to snprintf routine */ +/* #undef snprintf */ + +/* Define like ber_socklen_t if does not define. 
*/ +/* #undef socklen_t */ + +/* Define to `signed int' if does not define. */ +/* #undef ssize_t */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + +/* define as empty if volatile is not supported */ +/* #undef volatile */ + +/* define to snprintf routine */ +/* #undef vsnprintf */ + + +/* begin of portable.h.post */ + +#ifdef _WIN32 +/* don't suck in all of the win32 api */ +# define WIN32_LEAN_AND_MEAN 1 +#endif + +#ifndef LDAP_NEEDS_PROTOTYPES +/* force LDAP_P to always include prototypes */ +#define LDAP_NEEDS_PROTOTYPES 1 +#endif + +#ifndef LDAP_REL_ENG +#if (LDAP_VENDOR_VERSION == 000000) && !defined(LDAP_DEVEL) +#define LDAP_DEVEL +#endif +#if defined(LDAP_DEVEL) && !defined(LDAP_TEST) +#define LDAP_TEST +#endif +#endif + +#ifdef HAVE_STDDEF_H +# include +#endif + +#ifdef HAVE_EBCDIC +/* ASCII/EBCDIC converting replacements for stdio funcs + * vsnprintf and snprintf are used too, but they are already + * checked by the configure script + */ +#define fputs ber_pvt_fputs +#define fgets ber_pvt_fgets +#define printf ber_pvt_printf +#define fprintf ber_pvt_fprintf +#define vfprintf ber_pvt_vfprintf +#define vsprintf ber_pvt_vsprintf +#endif + +#include "ac/fdset.h" + +#include "ldap_cdefs.h" +#include "ldap_features.h" + +#include "ac/assert.h" +#include "ac/localize.h" + +#endif /* _LDAP_PORTABLE_H */ +/* end of portable.h.post */ + diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 64be52d8e30..2864f7fc4da 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -1,7 +1,7 @@ # docker build -t yandex/clickhouse-fasttest . 
FROM ubuntu:20.04 -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=10 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 RUN apt-get update \ && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ @@ -43,20 +43,20 @@ RUN apt-get update \ clang-tidy-${LLVM_VERSION} \ cmake \ curl \ - lsof \ expect \ fakeroot \ - git \ gdb \ + git \ gperf \ lld-${LLVM_VERSION} \ llvm-${LLVM_VERSION} \ + lsof \ moreutils \ ninja-build \ psmisc \ python3 \ - python3-pip \ python3-lxml \ + python3-pip \ python3-requests \ python3-termcolor \ rename \ diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index c8bfce3848d..c21a115289d 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -8,6 +8,9 @@ trap 'kill $(jobs -pr) ||:' EXIT # that we can run the "everything else" stage from the cloned source. stage=${stage:-} +# Compiler version, normally set by Dockerfile +export LLVM_VERSION=${LLVM_VERSION:-11} + # A variable to pass additional flags to CMake. # Here we explicitly default it to nothing so that bash doesn't complain about # it being undefined. 
Also read it as array so that we can pass an empty list @@ -124,22 +127,26 @@ continue function clone_root { - git clone https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt" + git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt" ( cd "$FASTTEST_SOURCE" if [ "$PULL_REQUEST_NUMBER" != "0" ]; then - if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then + if git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then git checkout FETCH_HEAD - echo 'Clonned merge head' + echo "Checked out pull/$PULL_REQUEST_NUMBER/merge ($(git rev-parse FETCH_HEAD))" else - git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/head" + git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/head" git checkout "$COMMIT_SHA" - echo 'Checked out to commit' + echo "Checked out nominal SHA $COMMIT_SHA for PR $PULL_REQUEST_NUMBER" fi else if [ -v COMMIT_SHA ]; then + git fetch --depth 1 origin "$COMMIT_SHA" git checkout "$COMMIT_SHA" + echo "Checked out nominal SHA $COMMIT_SHA for master" + else + echo "Using default repository head $(git rev-parse HEAD)" fi fi ) @@ -181,7 +188,7 @@ function clone_submodules ) git submodule sync - git submodule update --init --recursive "${SUBMODULES_TO_UPDATE[@]}" + git submodule update --depth 1 --init --recursive "${SUBMODULES_TO_UPDATE[@]}" git submodule foreach git reset --hard git submodule foreach git checkout @ -f git submodule foreach git clean -xfd @@ -215,7 +222,7 @@ function run_cmake ( cd "$FASTTEST_BUILD" - cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt" + cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER="clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="clang-${LLVM_VERSION}" 
"${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt" ) } @@ -223,7 +230,7 @@ function build { ( cd "$FASTTEST_BUILD" - time ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt" + time ninja clickhouse-bundle 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt" if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse" fi @@ -420,7 +427,7 @@ case "$stage" in # See the compatibility hacks in `clone_root` stage above. Remove at the same time, # after Nov 1, 2020. cd "$FASTTEST_WORKSPACE" - clone_submodules | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt" + clone_submodules 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt" ;& "run_cmake") run_cmake @@ -431,7 +438,7 @@ case "$stage" in "configure") # The `install_log.txt` is also needed for compatibility with old CI task -- # if there is no log, it will decide that build failed. - configure | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt" + configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt" ;& "run_tests") run_tests diff --git a/docs/README.md b/docs/README.md index 8b3066501bf..a4df023a6ad 100644 --- a/docs/README.md +++ b/docs/README.md @@ -126,7 +126,13 @@ Contribute all new information in English language. Other languages are translat ### Adding a New File -When adding a new file: +When you add a new file, it should end with a link like: + +`[Original article](https://clickhouse.tech/docs/) ` + +and there should be **a new empty line** after it. + +{## When adding a new file: - Make symbolic links for all other languages. 
You can use the following commands: @@ -134,7 +140,7 @@ When adding a new file: $ cd /ClickHouse/clone/directory/docs $ ln -sr en/new/file.md lang/new/file.md ``` - +##} ### Adding a New Language @@ -195,8 +201,11 @@ Templates: - [Function](_description_templates/template-function.md) - [Setting](_description_templates/template-setting.md) +- [Server Setting](_description_templates/template-server-setting.md) - [Database or Table engine](_description_templates/template-engine.md) - [System table](_description_templates/template-system-table.md) +- [Data type](_description_templates/data-type.md) +- [Statement](_description_templates/statement.md) diff --git a/docs/en/engines/database-engines/index.md b/docs/en/engines/database-engines/index.md index 2db11998483..b6892099378 100644 --- a/docs/en/engines/database-engines/index.md +++ b/docs/en/engines/database-engines/index.md @@ -18,4 +18,8 @@ You can also use the following database engines: - [Lazy](../../engines/database-engines/lazy.md) +- [Atomic](../../engines/database-engines/atomic.md) + +- [PostgreSQL](../../engines/database-engines/postgresql.md) + [Original article](https://clickhouse.tech/docs/en/database_engines/) diff --git a/docs/en/engines/database-engines/postgresql.md b/docs/en/engines/database-engines/postgresql.md new file mode 100644 index 00000000000..1fa86b7ac21 --- /dev/null +++ b/docs/en/engines/database-engines/postgresql.md @@ -0,0 +1,138 @@ +--- +toc_priority: 35 +toc_title: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +Allows to connect to databases on a remote [PostgreSQL](https://www.postgresql.org) server. Supports read and write operations (`SELECT` and `INSERT` queries) to exchange data between ClickHouse and PostgreSQL. + +Gives the real-time access to table list and table structure from remote PostgreSQL with the help of `SHOW TABLES` and `DESCRIBE TABLE` queries. + +Supports table structure modifications (`ALTER TABLE ... ADD|DROP COLUMN`). 
If the `use_table_cache` parameter (see the Engine Parameters below) is set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries. + +## Creating a Database {#creating-a-database} + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]); +``` + +**Engine Parameters** + +- `host:port` — PostgreSQL server address. +- `database` — Remote database name. +- `user` — PostgreSQL user. +- `password` — User password. +- `use_table_cache` — Defines if the database table structure is cached or not. Optional. Default value: `0`. + +## Data Types Support {#data_types-support} + +| PostgreSQL | ClickHouse | +|------------------|--------------------------------------------------------------| +| DATE | [Date](../../sql-reference/data-types/date.md) | +| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| REAL | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) | +| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | +| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) | +| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | +| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) | +| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) | +| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) | +| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))| +| ARRAY | [Array](../../sql-reference/data-types/array.md) | + + +## Examples of Use {#examples-of-use} + +Database in ClickHouse, exchanging data with the PostgreSQL server: + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1); +``` + +``` sql +SHOW DATABASES; +``` + 
+``` text +┌─name──────────┐ +│ default │ +│ test_database │ +│ system │ +└───────────────┘ +``` + +``` sql +SHOW TABLES FROM test_database; +``` + +``` text +┌─name───────┐ +│ test_table │ +└────────────┘ +``` + +Reading data from the PostgreSQL table: + +``` sql +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +└────┴───────┘ +``` + +Writing data to the PostgreSQL table: + +``` sql +INSERT INTO test_database.test_table VALUES (3,4); +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────┴───────┘ +``` + +Consider the table structure was modified in PostgreSQL: + +``` sql +postgre> ALTER TABLE test_table ADD COLUMN data Text +``` + +As the `use_table_cache` parameter was set to `1` when the database was created, the table structure in ClickHouse was cached and therefore not modified: + +``` sql +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +└────────┴───────────────────┘ +``` + +After detaching the table and attaching it again, the structure was updated: + +``` sql +DETACH TABLE test_database.test_table; +ATTACH TABLE test_database.test_table; +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +│ data │ Nullable(String) │ +└────────┴───────────────────┘ +``` + +[Original article](https://clickhouse.tech/docs/en/database-engines/postgresql/) diff --git a/docs/en/engines/table-engines/index.md b/docs/en/engines/table-engines/index.md index e60cdf3c899..eb4fc583f88 100644 --- a/docs/en/engines/table-engines/index.md +++ b/docs/en/engines/table-engines/index.md @@ -47,12 +47,17 @@ Engines for communicating with other data storage and processing systems. 
Engines in the family: -- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) -- [MySQL](../../engines/table-engines/integrations/mysql.md#mysql) -- [ODBC](../../engines/table-engines/integrations/odbc.md#table-engine-odbc) -- [JDBC](../../engines/table-engines/integrations/jdbc.md#table-engine-jdbc) -- [HDFS](../../engines/table-engines/integrations/hdfs.md#hdfs) -- [S3](../../engines/table-engines/integrations/s3.md#table-engine-s3) + +- [ODBC](../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../engines/table-engines/integrations/hdfs.md) +- [S3](../../engines/table-engines/integrations/s3.md) +- [Kafka](../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) ### Special Engines {#special-engines} diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index e9e069933e5..88c8973eeab 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 9 toc_title: EmbeddedRocksDB --- diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 0782efe8e72..cf4bb5ecbf7 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +toc_priority: 6 toc_title: HDFS --- diff --git a/docs/en/engines/table-engines/integrations/index.md 
b/docs/en/engines/table-engines/integrations/index.md index 28f38375448..eb1c5411e18 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -1,6 +1,6 @@ --- toc_folder_title: Integrations -toc_priority: 30 +toc_priority: 1 --- # Table Engines for Integrations {#table-engines-for-integrations} @@ -19,5 +19,3 @@ List of supported integrations: - [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md) - [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) - [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md) - -[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/) diff --git a/docs/en/engines/table-engines/integrations/jdbc.md b/docs/en/engines/table-engines/integrations/jdbc.md index edbc5d3ed3e..82efb842ae7 100644 --- a/docs/en/engines/table-engines/integrations/jdbc.md +++ b/docs/en/engines/table-engines/integrations/jdbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 2 +toc_priority: 3 toc_title: JDBC --- diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index 0ec50094a27..2eebf5bdb92 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -1,5 +1,5 @@ --- -toc_priority: 5 +toc_priority: 8 toc_title: Kafka --- diff --git a/docs/en/engines/table-engines/integrations/mongodb.md b/docs/en/engines/table-engines/integrations/mongodb.md index 2fee27ce80d..a378ab03f55 100644 --- a/docs/en/engines/table-engines/integrations/mongodb.md +++ b/docs/en/engines/table-engines/integrations/mongodb.md @@ -1,5 +1,5 @@ --- -toc_priority: 7 +toc_priority: 5 toc_title: MongoDB --- diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index 8b7caa12c91..3847e7a9e0e 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md 
+++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -1,5 +1,5 @@ --- -toc_priority: 3 +toc_priority: 4 toc_title: MySQL --- diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index 99efd870088..26bfb6aeb0d 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 1 +toc_priority: 2 toc_title: ODBC --- diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 8326038407f..ad5bebb3dea 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -1,11 +1,11 @@ --- -toc_priority: 8 +toc_priority: 11 toc_title: PostgreSQL --- # PostgreSQL {#postgresql} -The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server. +The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data that is stored on a remote PostgreSQL server. ## Creating a Table {#creating-a-table} @@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], ... -) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'); +) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); ``` See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. @@ -29,25 +29,51 @@ The table structure can differ from the original PostgreSQL table structure: **Engine Parameters** - `host:port` — PostgreSQL server address. - - `database` — Remote database name. - - `table` — Remote table name. - - `user` — PostgreSQL user. - - `password` — User password. 
+- `schema` — Non-default table schema. Optional. -SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. +## Implementation Details {#implementation-details} -Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server. +`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. + +Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server. All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. -INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. +`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. -PostgreSQL Array types converts into ClickHouse arrays. -Be careful in PostgreSQL an array data created like a type_name[] may contain multi-dimensional arrays of different dimensions in different table rows in same column, but in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. +PostgreSQL `Array` types are converted into ClickHouse arrays. + +!!! info "Note" + Be careful: in PostgreSQL, array data created as `type_name[]` may contain multi-dimensional arrays with different numbers of dimensions in different table rows of the same column. In ClickHouse, however, multidimensional arrays must have the same number of dimensions in all table rows of the same column. + +Replicas priority for PostgreSQL dictionary source is supported. 
The bigger the number in map, the less the priority. The highest priority is `0`. + +In the example below replica `example01-1` has the highest priority: + +```xml + + 5432 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 2 + + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` ## Usage Example {#usage-example} @@ -64,10 +90,10 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | @@ -87,20 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre ``` ``` sql -SELECT * FROM postgresql_table WHERE str IN ('test') +SELECT * FROM postgresql_table WHERE str IN ('test'); ``` ``` text ┌─float_nullable─┬─str──┬─int_id─┐ │ ᴺᵁᴸᴸ │ test │ 1 │ └────────────────┴──────┴────────┘ -1 rows in set. Elapsed: 0.019 sec. ``` +Using Non-default Schema: -## See Also {#see-also} +```text +postgres=# CREATE SCHEMA "nice.schema"; -- [The ‘postgresql’ table function](../../../sql-reference/table-functions/postgresql.md) +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**See Also** + +- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md) - [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/postgresql/) diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index 476192d3969..5fb9ce5b151 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ 
b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 10 toc_title: RabbitMQ --- diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 03340f2d8c9..3d02aa13812 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +toc_priority: 7 toc_title: S3 --- diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 89fcbafe663..0b45488ebf7 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -502,7 +502,15 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa ## max_concurrent_queries {#max-concurrent-queries} -The maximum number of simultaneously processed requests. +The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — Disabled. **Example** @@ -530,6 +538,21 @@ Default value: `0` that means no limit. - [max_concurrent_queries](#max-concurrent-queries) +## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries} + +The minimal number of marks read by the query for applying the [max_concurrent_queries](#max-concurrent-queries) setting. + +Possible values: + +- Positive integer. +- 0 — Disabled. 
+ +**Example** + +``` xml +10 +``` + ## max_connections {#max-connections} The maximum number of inbound connections. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 4da31b44b57..a5c3902f8f2 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1914,7 +1914,7 @@ Default value: `0`. Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key. -By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. +By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. 
Possible values: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index b7129725820..dc0b6e17198 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -69,6 +69,8 @@ Types of sources (`source_type`): - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - [Redis](#dicts-external_dicts_dict_sources-redis) + - [Cassandra](#dicts-external_dicts_dict_sources-cassandra) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) ## Local File {#dicts-external_dicts_dict_sources-local_file} diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 31d09e48e01..e07f28c0f24 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -250,3 +250,53 @@ Result: └───────────────┘ ``` +## bitHammingDistance {#bithammingdistance} + +Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between the bit representations of two integer values. Can be used with [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) functions for detection of semi-duplicate strings. The smaller is the distance, the more likely those strings are the same. + +**Syntax** + +``` sql +bitHammingDistance(int1, int2) +``` + +**Arguments** + +- `int1` — First integer value. [Int64](../../sql-reference/data-types/int-uint.md). +- `int2` — Second integer value. [Int64](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- The Hamming distance. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Examples** + +Query: + +``` sql +SELECT bitHammingDistance(111, 121); +``` + +Result: + +``` text +┌─bitHammingDistance(111, 121)─┐ +│ 3 │ +└──────────────────────────────┘ +``` + +With [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash): + +``` sql +SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat')); +``` + +Result: + +``` text +┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐ +│ 5 │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 945ede4927f..c60067b06af 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -7,6 +7,8 @@ toc_title: Hash Hash functions can be used for the deterministic pseudo-random shuffling of elements. +Simhash is a hash function, which returns close hash values for close (similar) arguments. + ## halfMD5 {#hash-functions-halfmd5} [Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. @@ -482,3 +484,938 @@ Result: - [xxHash](http://cyan4973.github.io/xxHash/). +## ngramSimHash {#ngramsimhash} + +Splits an ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHash(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHash('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1627567969 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive} + +Splits an ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashCaseInsensitive(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
+ +**Example** + +Query: + +``` sql +SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌──────Hash─┐ +│ 562180645 │ +└───────────┘ +``` + +## ngramSimHashUTF8 {#ngramsimhashutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashUTF8(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHashUTF8('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1628157797 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashCaseInsensitiveUTF8(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. 
Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1636742693 │ +└────────────┘ +``` + +## wordShingleSimHash {#wordshinglesimhash} + +Splits an ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHash(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive} + +Splits an ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashCaseInsensitive(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## wordShingleSimHashUTF8 {#wordshinglesimhashutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashUTF8(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## ngramMinHash {#ngramminhash} + +Splits an ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. 
+ +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHash(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHash('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,9054248444481805918) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive} + +Splits an ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. 
+ +**Syntax** + +``` sql +ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (2106263556442004574,13203602793651726206) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashUTF8 {#ngramminhashutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHashUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. 
[UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,6742163577938632877) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple───────────────────────────────────────┐ +│ (12493625717655877135,13203602793651726206) │ +└─────────────────────────────────────────────┘ +``` + +## ngramMinHashArg {#ngramminhasharg} + +Splits an ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHash](#ngramminhash) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +ngramMinHashArg(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArg('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive} + +Splits an ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgUTF8 {#ngramminhashargutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashUTF8](#ngramminhashutf8) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +ngramMinHashArgUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHash {#wordshingleminhash} + +Splits an ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHash(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive} + +Splits an ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashUTF8 {#wordshingleminhashutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashArg {#wordshingleminhasharg} + +Splits an ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHash](#wordshingleminhash) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +wordShingleMinHashArg(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive} + +Splits an ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashUTF8](#wordshingleminhashutf8) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 884e1ef754f..86442835425 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -111,4 +111,55 @@ Result: - [Tuple](../../sql-reference/data-types/tuple.md) -[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-functions/) +## tupleHammingDistance {#tuplehammingdistance} + +Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between two tuples of the same size. + +**Syntax** + +``` sql +tupleHammingDistance(tuple1, tuple2) +``` + +**Arguments** + +- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). + +Tuples should have the same type of the elements. + +**Returned value** + +- The Hamming distance. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Examples** + +Query: + +``` sql +SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance; +``` + +Result: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` + +Can be used with [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) functions for detection of semi-duplicate strings: + +``` sql +SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string); +``` + +Result: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` diff --git a/docs/en/sql-reference/table-functions/index.md b/docs/en/sql-reference/table-functions/index.md index fef30c04c9d..d65a18ab985 100644 --- a/docs/en/sql-reference/table-functions/index.md +++ b/docs/en/sql-reference/table-functions/index.md @@ -21,16 +21,18 @@ You can use table functions in: !!! warning "Warning" You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled. -| Function | Description | -|-----------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| -| [file](../../sql-reference/table-functions/file.md) | Creates a File-engine table. | -| [merge](../../sql-reference/table-functions/merge.md) | Creates a Merge-engine table. | -| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. | -| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a Distributed-engine table. | -| [url](../../sql-reference/table-functions/url.md) | Creates a URL-engine table. 
| -| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a MySQL-engine table. | -| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a PostgreSQL-engine table. | -| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a JDBC-engine table. | -| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a ODBC-engine table. | -| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a HDFS-engine table. | -| [s3](../../sql-reference/table-functions/s3.md) | Creates a S3-engine table. | +| Function | Description | +|------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| +| [file](../../sql-reference/table-functions/file.md) | Creates a [File](../../engines/table-engines/special/file.md)-engine table. | +| [merge](../../sql-reference/table-functions/merge.md) | Creates a [Merge](../../engines/table-engines/special/merge.md)-engine table. | +| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. | +| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md)-engine table. | +| [url](../../sql-reference/table-functions/url.md) | Creates a [Url](../../engines/table-engines/special/url.md)-engine table. | +| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a [MySQL](../../engines/table-engines/integrations/mysql.md)-engine table. | +| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)-engine table. | +| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a [JDBC](../../engines/table-engines/integrations/jdbc.md)-engine table. 
| +| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a [ODBC](../../engines/table-engines/integrations/odbc.md)-engine table. | +| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a [HDFS](../../engines/table-engines/integrations/hdfs.md)-engine table. | +| [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. | + +[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/) diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md index ad5d8a29904..bfb5fdf9be6 100644 --- a/docs/en/sql-reference/table-functions/postgresql.md +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -10,33 +10,17 @@ Allows `SELECT` and `INSERT` queries to be performed on data that is stored on a **Syntax** ``` sql -postgresql('host:port', 'database', 'table', 'user', 'password') +postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) ``` **Arguments** - `host:port` — PostgreSQL server address. - - `database` — Remote database name. - - `table` — Remote table name. - - `user` — PostgreSQL user. - - `password` — User password. - - -SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. - -Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server. - -All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. - -INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. - -PostgreSQL Array types converts into ClickHouse arrays. 
- -Be careful in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. +- `schema` — Non-default table schema. Optional. **Returned Value** @@ -45,6 +29,23 @@ A table object with the same columns as the original PostgreSQL table. !!! info "Note" In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. +## Implementation Details {#implementation-details} + +`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. + +Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server. + +All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. + +`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. + +PostgreSQL Array types are converted into ClickHouse arrays. + +!!! info "Note" + Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. + +Supports replicas priority for PostgreSQL dictionary source. The bigger the number in the map, the lower the priority. The highest priority is `0`. 
+ **Examples** Table in PostgreSQL: @@ -60,10 +61,10 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | @@ -96,9 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p └────────┴──────────────┴───────┴──────┴────────────────┘ ``` +Using Non-default Schema: + +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + **See Also** -- [The ‘PostgreSQL’ table engine](../../engines/table-engines/integrations/postgresql.md) +- [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md) - [Using PostgreSQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/postgresql/) diff --git a/docs/ru/engines/database-engines/index.md b/docs/ru/engines/database-engines/index.md index ec92edd2888..d4fad8f43a9 100644 --- a/docs/ru/engines/database-engines/index.md +++ b/docs/ru/engines/database-engines/index.md @@ -4,7 +4,7 @@ toc_priority: 27 toc_title: "Введение" --- -# Движки баз данных {#dvizhki-baz-dannykh} +# Движки баз данных {#database-engines} Движки баз данных обеспечивают работу с таблицами. 
@@ -18,3 +18,5 @@ toc_title: "Введение" - [Lazy](../../engines/database-engines/lazy.md) +- [PostgreSQL](../../engines/database-engines/postgresql.md) + diff --git a/docs/ru/engines/database-engines/postgresql.md b/docs/ru/engines/database-engines/postgresql.md new file mode 100644 index 00000000000..c11dab6f1aa --- /dev/null +++ b/docs/ru/engines/database-engines/postgresql.md @@ -0,0 +1,138 @@ +--- +toc_priority: 35 +toc_title: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +Позволяет подключаться к БД на удаленном сервере [PostgreSQL](https://www.postgresql.org). Поддерживает операции чтения и записи (запросы `SELECT` и `INSERT`) для обмена данными между ClickHouse и PostgreSQL. + +Позволяет в реальном времени получать от удаленного сервера PostgreSQL информацию о таблицах БД и их структуре с помощью запросов `SHOW TABLES` и `DESCRIBE TABLE`. + +Поддерживает операции изменения структуры таблиц (`ALTER TABLE ... ADD|DROP COLUMN`). Если параметр `use_table_cache` (см. ниже раздел Параметры движка) установлен в значение `1`, структура таблицы кешируется, и изменения в структуре не отслеживаются, но будут обновлены, если выполнить команды `DETACH` и `ATTACH`. + +## Создание БД {#creating-a-database} + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]); +``` + +**Параметры движка** + +- `host:port` — адрес сервера PostgreSQL. +- `database` — имя удаленной БД. +- `user` — пользователь PostgreSQL. +- `password` — пароль пользователя. +- `use_table_cache` — определяет кеширование структуры таблиц БД. Необязательный параметр. Значение по умолчанию: `0`. 
+ +## Поддерживаемые типы данных {#data_types-support} + +| PostgreSQL | ClickHouse | +|------------------|--------------------------------------------------------------| +| DATE | [Date](../../sql-reference/data-types/date.md) | +| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| REAL | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) | +| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | +| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) | +| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | +| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) | +| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) | +| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) | +| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))| +| ARRAY | [Array](../../sql-reference/data-types/array.md) | + + +## Примеры использования {#examples-of-use} + +Обмен данными между БД ClickHouse и сервером PostgreSQL: + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1); +``` + +``` sql +SHOW DATABASES; +``` + +``` text +┌─name──────────┐ +│ default │ +│ test_database │ +│ system │ +└───────────────┘ +``` + +``` sql +SHOW TABLES FROM test_database; +``` + +``` text +┌─name───────┐ +│ test_table │ +└────────────┘ +``` + +Чтение данных из таблицы PostgreSQL: + +``` sql +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +└────┴───────┘ +``` + +Запись данных в таблицу PostgreSQL: + +``` sql +INSERT INTO test_database.test_table VALUES (3,4); +SELECT * FROM test_database.test_table; +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +Пусть структура таблицы была изменена в PostgreSQL: + +``` sql 
+postgre> ALTER TABLE test_table ADD COLUMN data Text +``` + +Поскольку при создании БД параметр `use_table_cache` был установлен в значение `1`, структура таблицы в ClickHouse была кеширована и поэтому не изменилась: + +``` sql +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +└────────┴───────────────────┘ +``` + +После того как таблицу «отцепили» и затем снова «прицепили», структура обновилась: + +``` sql +DETACH TABLE test_database.test_table; +ATTACH TABLE test_database.test_table; +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +│ data │ Nullable(String) │ +└────────┴───────────────────┘ +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/database-engines/postgresql/) diff --git a/docs/ru/engines/table-engines/index.md b/docs/ru/engines/table-engines/index.md index 6c11011a307..a364a3cb972 100644 --- a/docs/ru/engines/table-engines/index.md +++ b/docs/ru/engines/table-engines/index.md @@ -16,7 +16,7 @@ toc_title: "Введение" - Возможно ли многопоточное выполнение запроса. - Параметры репликации данных. -## Семейства движков {#semeistva-dvizhkov} +## Семейства движков {#engine-families} ### MergeTree {#mergetree} @@ -42,18 +42,23 @@ toc_title: "Введение" - [StripeLog](log-family/stripelog.md#stripelog) - [Log](log-family/log.md#log) -### Движки для интеграции {#dvizhki-dlia-integratsii} +### Движки для интеграции {#integration-engines} Движки для связи с другими системами хранения и обработки данных. 
Движки семейства: -- [Kafka](integrations/kafka.md#kafka) -- [MySQL](integrations/mysql.md#mysql) -- [ODBC](integrations/odbc.md#table-engine-odbc) -- [JDBC](integrations/jdbc.md#table-engine-jdbc) +- [ODBC](../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../engines/table-engines/integrations/hdfs.md) +- [Kafka](../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) -### Специальные движки {#spetsialnye-dvizhki} +### Специальные движки {#special-engines} Движки семейства: diff --git a/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md index f66e789a392..5a7909f63b2 100644 --- a/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 9 toc_title: EmbeddedRocksDB --- diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 3d9cb388a01..b56bbfc0788 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +toc_priority: 6 toc_title: HDFS --- diff --git a/docs/ru/engines/table-engines/integrations/jdbc.md b/docs/ru/engines/table-engines/integrations/jdbc.md index e2db6fac0b2..fd7411a258e 100644 --- a/docs/ru/engines/table-engines/integrations/jdbc.md +++ b/docs/ru/engines/table-engines/integrations/jdbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 2 +toc_priority: 3 toc_title: JDBC --- diff --git 
a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index f053b80aebd..19e2850dd51 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -1,5 +1,5 @@ --- -toc_priority: 5 +toc_priority: 8 toc_title: Kafka --- diff --git a/docs/ru/engines/table-engines/integrations/mongodb.md b/docs/ru/engines/table-engines/integrations/mongodb.md index 5ab63494648..97f903bdf89 100644 --- a/docs/ru/engines/table-engines/integrations/mongodb.md +++ b/docs/ru/engines/table-engines/integrations/mongodb.md @@ -1,5 +1,5 @@ --- -toc_priority: 7 +toc_priority: 5 toc_title: MongoDB --- diff --git a/docs/ru/engines/table-engines/integrations/mysql.md b/docs/ru/engines/table-engines/integrations/mysql.md index 9152a57d122..5011c8a93c6 100644 --- a/docs/ru/engines/table-engines/integrations/mysql.md +++ b/docs/ru/engines/table-engines/integrations/mysql.md @@ -1,5 +1,5 @@ --- -toc_priority: 3 +toc_priority: 4 toc_title: MySQL --- diff --git a/docs/ru/engines/table-engines/integrations/odbc.md b/docs/ru/engines/table-engines/integrations/odbc.md index b2faa9b1e9e..669977ff531 100644 --- a/docs/ru/engines/table-engines/integrations/odbc.md +++ b/docs/ru/engines/table-engines/integrations/odbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 1 +toc_priority: 2 toc_title: ODBC --- diff --git a/docs/ru/engines/table-engines/integrations/postgresql.md b/docs/ru/engines/table-engines/integrations/postgresql.md index ecf431830f8..8964b1dbf02 100644 --- a/docs/ru/engines/table-engines/integrations/postgresql.md +++ b/docs/ru/engines/table-engines/integrations/postgresql.md @@ -1,11 +1,11 @@ --- -toc_priority: 8 +toc_priority: 11 toc_title: PostgreSQL --- -# PosgtreSQL {#postgresql} +# PostgreSQL {#postgresql} -Движок PostgreSQL позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере. 
+Движок PostgreSQL позволяет выполнять запросы `SELECT` и `INSERT` для таблиц на удаленном сервере PostgreSQL. ## Создание таблицы {#creating-a-table} @@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], ... -) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'); +) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); ``` Смотрите подробное описание запроса [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query). @@ -29,25 +29,51 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры движка** - `host:port` — адрес сервера PostgreSQL. - - `database` — Имя базы данных на сервере PostgreSQL. - - `table` — Имя таблицы. - - `user` — Имя пользователя PostgreSQL. - - `password` — Пароль пользователя PostgreSQL. +- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент. -SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса. +## Особенности реализации {#implementation-details} -Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера. +Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`. -Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился. +Простые условия для `WHERE`, такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN`, исполняются на стороне PostgreSQL сервера. -INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... 
fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса. +Все операции объединения, агрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того, как запрос к PostgreSQL закончился. + +Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`. PostgreSQL массивы конвертируются в массивы ClickHouse. -Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. + +!!! info "Внимание" + Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. + +При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`. + +В примере ниже реплика `example01-1` имеет более высокий приоритет: + +```xml + + 5432 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 2 + + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` ## Пример использования {#usage-example} @@ -64,17 +90,17 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | (1 row) ``` -Таблица в ClickHouse, получение данных из PostgreSQL таблицы созданной выше: +Таблица в ClickHouse, получение данных из PostgreSQL таблицы, созданной выше: ``` sql CREATE TABLE default.postgresql_table @@ -87,19 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre ``` ``` sql -SELECT * FROM postgresql_table WHERE str IN ('test') +SELECT * FROM postgresql_table WHERE str IN ('test'); ``` ``` text ┌─float_nullable─┬─str──┬─int_id─┐ │ ᴺᵁᴸᴸ │ test │ 1 │ └────────────────┴──────┴────────┘ -1 rows in set. Elapsed: 0.019 sec. ``` +Using Non-default Schema: -## Смотри также {#see-also} +```text +postgres=# CREATE SCHEMA "nice.schema"; -- [Табличная функция ‘postgresql’](../../../sql-reference/table-functions/postgresql.md) -- [Использование PostgreSQL в качестве истояника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**См. 
также** + +- [Табличная функция `postgresql`](../../../sql-reference/table-functions/postgresql.md) +- [Использование PostgreSQL в качестве источника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/postgresql/) diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index b50347f6196..109146d27f4 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -481,7 +481,15 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ## max_concurrent_queries {#max-concurrent-queries} -Максимальное количество одновременно обрабатываемых запросов. +Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). + +!!! info "Примечание" + Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. + +Возможные значения: + +- Положительное целое число. +- 0 — выключена. **Пример** @@ -509,6 +517,21 @@ ClickHouse проверяет условия для `min_part_size` и `min_part - [max_concurrent_queries](#max-concurrent-queries) +## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries} + +Определяет минимальное количество засечек, считываемых запросом для применения настройки [max_concurrent_queries](#max-concurrent-queries). + +Возможные значения: + +- Положительное целое число. +- 0 — выключена. 
+ +**Пример** + +``` xml +<min_marks_to_honor_max_concurrent_queries>10</min_marks_to_honor_max_concurrent_queries> +``` + ## max_connections {#max-connections} Максимальное количество входящих соединений. @@ -1159,4 +1182,3 @@ ClickHouse использует ZooKeeper для хранения метадан ``` - diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index f95dc6657b2..d10ac2ab317 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1792,6 +1792,19 @@ ClickHouse генерирует исключение - [Движок Distributed](../../engines/table-engines/special/distributed.md#distributed) - [Управление распределёнными таблицами](../../sql-reference/statements/system.md#query-language-system-distributed) +## insert_distributed_one_random_shard {#insert_distributed_one_random_shard} + +Включает или отключает режим вставки данных в [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицу в случайный шард при отсутствии ключа шардирования. + +По умолчанию при вставке данных в `Distributed` таблицу с несколькими шардами и при отсутствии ключа шардирования сервер ClickHouse будет отклонять любой запрос на вставку данных. Когда `insert_distributed_one_random_shard = 1`, вставки принимаются, а данные записываются в случайный шард. + +Возможные значения: + +- 0 — если у таблицы несколько шардов, но ключ шардирования отсутствует, вставка данных отклоняется. +- 1 — если ключ шардирования отсутствует, то вставка данных осуществляется в случайный шард среди всех доступных шардов. + +Значение по умолчанию: `0`. + ## insert_shard_id {#insert_shard_id} Если не `0`, указывает, в какой шард [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицы данные будут вставлены синхронно. 
diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index e3816e78547..a7999470330 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -69,6 +69,7 @@ SETTINGS(format_csv_allow_single_quotes = 0) - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - [Redis](#dicts-external_dicts_dict_sources-redis) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) ## Локальный файл {#dicts-external_dicts_dict_sources-local_file} diff --git a/docs/ru/sql-reference/functions/bit-functions.md b/docs/ru/sql-reference/functions/bit-functions.md index 09844685a6c..a5124e67235 100644 --- a/docs/ru/sql-reference/functions/bit-functions.md +++ b/docs/ru/sql-reference/functions/bit-functions.md @@ -240,3 +240,53 @@ SELECT bitCount(333); └───────────────┘ ``` +## bitHammingDistance {#bithammingdistance} + +Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между битовыми представлениями двух целых чисел. Может быть использовано с функциями [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) для проверки двух строк на схожесть. Чем меньше расстояние, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +bitHammingDistance(int1, int2) +``` + +**Аргументы** + +- `int1` — первое целое число. [Int64](../../sql-reference/data-types/int-uint.md). +- `int2` — второе целое число. [Int64](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Расстояние Хэмминга. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT bitHammingDistance(111, 121); +``` + +Результат: + +``` text +┌─bitHammingDistance(111, 121)─┐ +│ 3 │ +└──────────────────────────────┘ +``` + +Используя [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash): + +``` sql +SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat')); +``` + +Результат: + +``` text +┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐ +│ 5 │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/hash-functions.md b/docs/ru/sql-reference/functions/hash-functions.md index 6797f530346..2efff9c3727 100644 --- a/docs/ru/sql-reference/functions/hash-functions.md +++ b/docs/ru/sql-reference/functions/hash-functions.md @@ -7,6 +7,8 @@ toc_title: "Функции хэширования" Функции хэширования могут использоваться для детерминированного псевдослучайного разбрасывания элементов. +Simhash – это хеш-функция, которая для близких значений возвращает близкий хеш. + ## halfMD5 {#hash-functions-halfmd5} [Интерпретирует](../../sql-reference/functions/hash-functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш [MD5](https://ru.wikipedia.org/wiki/MD5) для каждой из них. Затем объединяет хэши, берет первые 8 байт хэша результирующей строки и интерпретирует их как значение типа `UInt64` с big-endian порядком байтов. @@ -484,3 +486,937 @@ SELECT xxHash32('Hello, world!'); - [xxHash](http://cyan4973.github.io/xxHash/). +## ngramSimHash {#ngramsimhash} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHash(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHash('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1627567969 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashCaseInsensitive(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. 
+ +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌──────Hash─┐ +│ 562180645 │ +└───────────┘ +``` + +## ngramSimHashUTF8 {#ngramsimhashutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashUTF8(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashUTF8('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1628157797 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). 
Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashCaseInsensitiveUTF8(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1636742693 │ +└────────────┘ +``` + +## wordShingleSimHash {#wordshinglesimhash} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHash(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. 
+ +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHashCaseInsensitive(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## wordShingleSimHashUTF8 {#wordshinglesimhashutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая. 
+ +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHashUTF8(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. 
+ +**Синтаксис** + +``` sql +wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## ngramMinHash {#ngramminhash} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHash(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. 
[UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHash('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,9054248444481805918) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. 
+ +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (2106263556442004574,13203602793651726206) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashUTF8 {#ngramminhashutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** +``` sql +ngramMinHashUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,6742163577938632877) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHashCaseInsensitiveUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple───────────────────────────────────────┐ +│ (12493625717655877135,13203602793651726206) │ +└─────────────────────────────────────────────┘ +``` + +## ngramMinHashArg {#ngramminhasharg} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHash](#ngramminhash) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +ngramMinHashArg(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArg('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgUTF8 {#ngramminhashargutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashUTF8](#ngramminhashutf8) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHash {#wordshingleminhash} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHash(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashUTF8 {#wordshingleminhashutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashArg {#wordshingleminhasharg} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHash](#wordshingleminhash) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArg(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashUTF8](#wordshingleminhashutf8) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/tuple-functions.md b/docs/ru/sql-reference/functions/tuple-functions.md index a56eac27db2..381743a450b 100644 --- a/docs/ru/sql-reference/functions/tuple-functions.md +++ b/docs/ru/sql-reference/functions/tuple-functions.md @@ -111,3 +111,55 @@ SELECT untuple((* EXCEPT (v2, v3),)) FROM kv; - [Tuple](../../sql-reference/data-types/tuple.md) +## tupleHammingDistance {#tuplehammingdistance} + +Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между двумя кортежами одинакового размера. + +**Синтаксис** + +``` sql +tupleHammingDistance(tuple1, tuple2) +``` + +**Аргументы** + +- `tuple1` — первый кортеж. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple2` — второй кортеж. [Tuple](../../sql-reference/data-types/tuple.md). + +Кортежи должны иметь одинаковый размер и тип элементов. + +**Возвращаемое значение** + +- Расстояние Хэмминга. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance; +``` + +Результат: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` + +Может быть использовано с функциями [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) для проверки строк на совпадение: + +``` sql +SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string); +``` + +Результат: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` diff --git a/docs/ru/sql-reference/statements/alter/column.md b/docs/ru/sql-reference/statements/alter/column.md index f51859b46f6..87fc1c78cd0 100644 --- a/docs/ru/sql-reference/statements/alter/column.md +++ b/docs/ru/sql-reference/statements/alter/column.md @@ -117,7 +117,7 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] [AFTER name_after | F - TTL - Примеры изменения TTL столбца смотрите в разделе [TTL столбца](ttl.md#mergetree-column-ttl). + Примеры изменения TTL столбца смотрите в разделе [TTL столбца](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). Если указано `IF EXISTS`, запрос не возвращает ошибку, если столбца не существует. diff --git a/docs/ru/sql-reference/table-functions/postgresql.md b/docs/ru/sql-reference/table-functions/postgresql.md index a8ed23db8ed..66637276726 100644 --- a/docs/ru/sql-reference/table-functions/postgresql.md +++ b/docs/ru/sql-reference/table-functions/postgresql.md @@ -5,43 +5,46 @@ toc_title: postgresql # postgresql {#postgresql} -Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере. +Позволяет выполнять запросы `SELECT` и `INSERT` над таблицами удаленной БД PostgreSQL. 
**Синтаксис** + ``` sql -postgresql('host:port', 'database', 'table', 'user', 'password') +postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) ``` -**Параметры** +**Аргументы** - `host:port` — адрес сервера PostgreSQL. - - `database` — имя базы данных на удалённом сервере. - - `table` — имя таблицы на удалённом сервере. - - `user` — пользователь PostgreSQL. - - `password` — пароль пользователя. - - -SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса. - -Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера. - -Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился. - -INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса. - -PostgreSQL массивы конвертируются в массивы ClickHouse. -Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. +- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент. **Возвращаемое значение** -Объект таблицы с теми же столбцами, что и в исходной таблице PostgreSQL. +Таблица с теми же столбцами, что и в исходной таблице PostgreSQL. !!! info "Примечание" -В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. See examples below. 
+ В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже. + +## Особенности реализации {#implementation-details} + +Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`. + +Простые условия для `WHERE` такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN` исполняются на стороне PostgreSQL сервера. + +Все операции объединения, агрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился. + +Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`. + +PostgreSQL массивы конвертируются в массивы ClickHouse. + +!!! info "Примечание" + Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым количеством измерений во всех строках таблицы. + +При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`. 
**Примеры** @@ -58,10 +61,10 @@ PRIMARY KEY (int_id)); CREATE TABLE -postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2); +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); INSERT 0 1 -postgresql> select * from test; +postgresql> SELECT * FROM test; int_id | int_nullable | float | str | float_nullable --------+--------------+-------+------+---------------- 1 | | 2 | test | @@ -80,7 +83,7 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p └────────┴──────────────┴───────┴──────┴────────────────┘ ``` -Вставка: +Вставка данных: ```sql INSERT INTO TABLE FUNCTION postgresql('localhost:5432', 'test', 'test', 'postgrsql_user', 'password') (int_id, float) VALUES (2, 3); @@ -94,7 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p └────────┴──────────────┴───────┴──────┴────────────────┘ ``` -**Смотрите также** +Использование схемы, отличной от схемы по умолчанию: -- [Движок таблиц ‘PostgreSQL’](../../sql-reference/table-functions/postgresql.md) +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**См.
также** + +- [Движок таблиц PostgreSQL](../../sql-reference/table-functions/postgresql.md) - [Использование PostgreSQL как источника данных для внешнего словаря](../../sql-reference/table-functions/postgresql.md#dicts-external_dicts_dict_sources-postgresql) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/postgresql/) diff --git a/programs/server/CMakeLists.txt b/programs/server/CMakeLists.txt index 198d9081168..697851b294b 100644 --- a/programs/server/CMakeLists.txt +++ b/programs/server/CMakeLists.txt @@ -42,11 +42,16 @@ if (OS_LINUX) set(RESOURCE_OBJS ${RESOURCE_OBJS} ${RESOURCE_OBJ}) # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - add_custom_command(OUTPUT ${RESOURCE_OBJ} - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} - COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents - ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}) - + # PPC64LE fails to do this with objcopy, use ld or lld instead + if (ARCH_PPC64LE) + add_custom_command(OUTPUT ${RESOURCE_OBJ} + COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${RESOURCE_FILE}) + else() + add_custom_command(OUTPUT ${RESOURCE_OBJ} + COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} + COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents + ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}) + endif() set_source_files_properties(${RESOURCE_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) endforeach(RESOURCE_FILE) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp 
index a163ceba4a2..8455ef3117e 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -521,14 +521,17 @@ void HedgedConnections::processNewReplicaState(HedgedConnectionsFactory::State s void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) { + /// It's important to remove file descriptor from epoll exactly before cancelling packet_receiver, + /// because otherwise another thread can try to receive a packet, get this file descriptor + /// from epoll and resume cancelled packet_receiver. + epoll.remove(replica.packet_receiver->getFileDescriptor()); + epoll.remove(replica.change_replica_timeout.getDescriptor()); + replica.packet_receiver->cancel(); replica.change_replica_timeout.reset(); - epoll.remove(replica.packet_receiver->getFileDescriptor()); --offset_states[fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset].active_connection_count; fd_to_replica_location.erase(replica.packet_receiver->getFileDescriptor()); - - epoll.remove(replica.change_replica_timeout.getDescriptor()); timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor()); --active_connection_count; diff --git a/src/Common/HashTable/HashMap.h b/src/Common/HashTable/HashMap.h index 99dc5414107..c3cd09eccb2 100644 --- a/src/Common/HashTable/HashMap.h +++ b/src/Common/HashTable/HashMap.h @@ -48,7 +48,7 @@ struct HashMapCell value_type value; - HashMapCell() {} + HashMapCell() = default; HashMapCell(const Key & key_, const State &) : value(key_, NoInitTag()) {} HashMapCell(const value_type & value_, const State &) : value(value_) {} @@ -114,8 +114,39 @@ struct HashMapCell static void move(HashMapCell * /* old_location */, HashMapCell * /* new_location */) {} + template + auto & get() & { + if constexpr (I == 0) return value.first; + else if constexpr (I == 1) return value.second; + } + + template + auto const & get() const & { + if constexpr (I == 0) return value.first; + else if constexpr (I 
== 1) return value.second; + } + + template + auto && get() && { + if constexpr (I == 0) return std::move(value.first); + else if constexpr (I == 1) return std::move(value.second); + } + }; +namespace std +{ + + template + struct tuple_size> : std::integral_constant { }; + + template + struct tuple_element<0, HashMapCell> { using type = Key; }; + + template + struct tuple_element<1, HashMapCell> { using type = TMapped; }; +} + template struct HashMapCellWithSavedHash : public HashMapCell { @@ -227,6 +258,19 @@ public: } }; +namespace std +{ + + template + struct tuple_size> : std::integral_constant { }; + + template + struct tuple_element<0, HashMapCellWithSavedHash> { using type = Key; }; + + template + struct tuple_element<1, HashMapCellWithSavedHash> { using type = TMapped; }; +} + template < typename Key, diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 57ad3d46177..b1042332cfa 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -530,6 +530,31 @@ public: this->c_end += bytes_to_copy; } + template + void insertFromItself(iterator from_begin, iterator from_end, TAllocatorParams && ... 
allocator_params) + { + static_assert(memcpy_can_be_used_for_assignment, std::decay_t>); + + /// Convert iterators to indexes because reserve can invalidate iterators + size_t start_index = from_begin - begin(); + size_t end_index = from_end - begin(); + size_t copy_size = end_index - start_index; + + assert(start_index <= end_index); + + size_t required_capacity = this->size() + copy_size; + if (required_capacity > this->capacity()) + this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward(allocator_params)...); + + size_t bytes_to_copy = this->byte_size(copy_size); + if (bytes_to_copy) + { + auto begin = this->c_start + this->byte_size(start_index); + memcpy(this->c_end, reinterpret_cast(&*begin), bytes_to_copy); + this->c_end += bytes_to_copy; + } + } + template void insert_assume_reserved(It1 from_begin, It2 from_end) { diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index c4cf7f11e68..9e81cdddbda 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -35,7 +35,7 @@ std::string signalToErrorMessage(int sig, const siginfo_t & info, const ucontext else error << "Address: " << info.si_addr; -#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) +#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) && !defined(__powerpc__) auto err_mask = context.uc_mcontext.gregs[REG_ERR]; if ((err_mask & 0x02)) error << " Access: write."; @@ -186,6 +186,8 @@ static void * getCallerAddress(const ucontext_t & context) # endif #elif defined(__aarch64__) return reinterpret_cast(context.uc_mcontext.pc); +#elif defined(__powerpc64__) + return reinterpret_cast(context.uc_mcontext.gp_regs[PT_NIP]); #else return nullptr; #endif diff --git a/src/Common/tests/gtest_pod_array.cpp b/src/Common/tests/gtest_pod_array.cpp index 63cf7026757..9cc77b88195 100644 --- a/src/Common/tests/gtest_pod_array.cpp +++ b/src/Common/tests/gtest_pod_array.cpp @@ -33,6 +33,19 @@ 
TEST(Common, PODArrayInsert) EXPECT_EQ(str, std::string(chars.data(), chars.size())); } +TEST(Common, PODArrayInsertFromItself) +{ + { + PaddedPODArray array { 1 }; + + for (size_t i = 0; i < 3; ++i) + array.insertFromItself(array.begin(), array.end()); + + PaddedPODArray expected {1,1,1,1,1,1,1,1}; + ASSERT_EQ(array,expected); + } +} + TEST(Common, PODPushBackRawMany) { PODArray chars; diff --git a/src/DataStreams/NativeBlockOutputStream.cpp b/src/DataStreams/NativeBlockOutputStream.cpp index da68376201f..2a016c9a0c8 100644 --- a/src/DataStreams/NativeBlockOutputStream.cpp +++ b/src/DataStreams/NativeBlockOutputStream.cpp @@ -41,7 +41,7 @@ void NativeBlockOutputStream::flush() } -void NativeBlockOutputStream::writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) +static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) { /** If there are columns-constants - then we materialize them. * (Since the data type does not know how to serialize / deserialize constants.) 
diff --git a/src/DataStreams/NativeBlockOutputStream.h b/src/DataStreams/NativeBlockOutputStream.h index 64ccd267634..c47d7b2f1c3 100644 --- a/src/DataStreams/NativeBlockOutputStream.h +++ b/src/DataStreams/NativeBlockOutputStream.h @@ -30,8 +30,6 @@ public: void write(const Block & block) override; void flush() override; - static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit); - String getContentType() const override { return "application/octet-stream"; } private: diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index e0078da57b7..b4222a7e349 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -567,7 +567,7 @@ void DatabaseAtomic::renameDictionaryInMemoryUnlocked(const StorageID & old_name auto result = external_loader.getLoadResult(toString(old_name.uuid)); if (!result.object) return; - const auto & dict = dynamic_cast(*result.object); + const auto & dict = dynamic_cast(*result.object); dict.updateDictionaryName(new_name); } void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid) diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index d92f0f1897e..55b04f27c58 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -49,7 +49,7 @@ void DatabaseWithDictionaries::attachDictionary(const String & dictionary_name, /// Attach the dictionary as table too. try { - /// TODO Make StorageDictionary an owner of IDictionaryBase objects. + /// TODO Make StorageDictionary an owner of IDictionary objects. /// All DDL operations with dictionaries will work with StorageDictionary table, /// and StorageDictionary will be responsible for loading of DDL dictionaries. 
/// ExternalLoaderDatabaseConfigRepository and other hacks related to ExternalLoader diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index eedf4dd3d87..535e862af40 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -13,7 +13,9 @@ #include #include #include + #include +#include namespace ProfileEvents { @@ -39,7 +41,6 @@ namespace DB namespace ErrorCodes { extern const int CACHE_DICTIONARY_UPDATE_FAIL; - extern const int TYPE_MISMATCH; extern const int UNSUPPORTED_METHOD; } @@ -70,8 +71,6 @@ CacheDictionary::CacheDictionary( { if (!source_ptr->supportsSelectiveLoad()) throw Exception{full_name + ": source cannot be used with CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - setupHierarchicalAttribute(); } template @@ -120,164 +119,6 @@ const IDictionarySource * CacheDictionary::getSource() cons return source_ptr.get(); } -template -void CacheDictionary::toParent(const PaddedPODArray & ids [[maybe_unused]], PaddedPODArray & out [[maybe_unused]]) const -{ - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - { - /// Run update on requested keys before fetch from storage - const auto & attribute_name = hierarchical_attribute->name; - - auto result_type = std::make_shared(); - auto input_column = result_type->createColumn(); - auto & input_column_typed = assert_cast &>(*input_column); - auto & data = input_column_typed.getData(); - data.insert(ids.begin(), ids.end()); - - auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr}); - const auto & result_column_typed = assert_cast &>(*column); - const auto & result_data = result_column_typed.getData(); - - out.assign(result_data); - } - else - throw Exception("Hierarchy is not supported for complex key CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD); -} - - -/// Allow to use single value in same way as array. 
-static inline UInt64 getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline UInt64 getAt(const UInt64 & value, const size_t) -{ - return value; -} - -template -template -void CacheDictionary::isInImpl(const PaddedPODArray & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - /// Transform all children to parents until ancestor id or null_value will be reached. - - size_t out_size = out.size(); - memset(out.data(), 0xFF, out_size); /// 0xFF means "not calculated" - - const auto null_value = hierarchical_attribute->null_value.get(); - - PaddedPODArray children(out_size, 0); - PaddedPODArray parents(child_ids.begin(), child_ids.end()); - - for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - size_t out_idx = 0; - size_t parents_idx = 0; - size_t new_children_idx = 0; - - while (out_idx < out_size) - { - /// Already calculated - if (out[out_idx] != 0xFF) - { - ++out_idx; - continue; - } - - /// No parent - if (parents[parents_idx] == null_value) - { - out[out_idx] = 0; - } - /// Found ancestor - else if (parents[parents_idx] == getAt(ancestor_ids, parents_idx)) - { - out[out_idx] = 1; - } - /// Loop detected - else if (children[new_children_idx] == parents[parents_idx]) - { - out[out_idx] = 1; - } - /// Found intermediate parent, add this value to search at next loop iteration - else - { - children[new_children_idx] = parents[parents_idx]; - ++new_children_idx; - } - - ++out_idx; - ++parents_idx; - } - - if (new_children_idx == 0) - break; - - /// Transform all children to its parents. 
- children.resize(new_children_idx); - parents.resize(new_children_idx); - - toParent(children, parents); - } -} - -template -void CacheDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -template -void CacheDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const UInt64 ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -template -void CacheDictionary::isInConstantVector(const UInt64 child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - /// Special case with single child value. - - const auto null_value = hierarchical_attribute->null_value.get(); - - PaddedPODArray child(1, child_id); - PaddedPODArray parent(1); - std::vector ancestors(1, child_id); - - /// Iteratively find all ancestors for child. - for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - toParent(child, parent); - - if (parent[0] == null_value) - break; - - child[0] = parent[0]; - ancestors.push_back(parent[0]); - } - - /// Assuming short hierarchy, so linear search is Ok. 
- for (size_t i = 0, out_size = out.size(); i < out_size; ++i) - out[i] = std::find(ancestors.begin(), ancestors.end(), ancestor_ids[i]) != ancestors.end(); -} - -template -void CacheDictionary::setupHierarchicalAttribute() -{ - /// TODO: Move this to DictionaryStructure - for (const auto & attribute : dict_struct.attributes) - { - if (attribute.hierarchical) - { - hierarchical_attribute = &attribute; - - if (attribute.underlying_type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } - } -} - template ColumnPtr CacheDictionary::getColumn( const std::string & attribute_name, @@ -296,23 +137,6 @@ Columns CacheDictionary::getColumns( const Columns & key_columns, const DataTypes & key_types, const Columns & default_values_columns) const -{ - if (dictionary_key_type == DictionaryKeyType::complex) - dict_struct.validateKeyTypes(key_types); - - Arena complex_keys_arena; - DictionaryKeysExtractor extractor(key_columns, complex_keys_arena); - auto & keys = extractor.getKeys(); - - return getColumnsImpl(attribute_names, key_columns, keys, default_values_columns); -} - -template -Columns CacheDictionary::getColumnsImpl( - const Strings & attribute_names, - const Columns & key_columns, - const PaddedPODArray & keys, - const Columns & default_values_columns) const { /** * Flow of getColumsImpl @@ -328,6 +152,13 @@ Columns CacheDictionary::getColumnsImpl( * use default value. 
*/ + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + auto keys = extractor.extractAllKeys(); + DictionaryStorageFetchRequest request(dict_struct, attribute_names, default_values_columns); FetchResult result_of_fetch_from_storage; @@ -440,9 +271,10 @@ ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & k if (dictionary_key_type == DictionaryKeyType::complex) dict_struct.validateKeyTypes(key_types); - Arena complex_keys_arena; - DictionaryKeysExtractor extractor(key_columns, complex_keys_arena); - const auto & keys = extractor.getKeys(); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + const auto keys = extractor.extractAllKeys(); /// We make empty request just to fetch if keys exists DictionaryStorageFetchRequest request(dict_struct, {}, {}); @@ -526,6 +358,37 @@ ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & k return result; } +template +ColumnPtr CacheDictionary::getHierarchy( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + else + return nullptr; +} + +template +ColumnUInt8::Ptr CacheDictionary::isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + 
else + return nullptr; +} + template MutableColumns CacheDictionary::aggregateColumnsInOrderOfKeys( const PaddedPODArray & keys, @@ -618,19 +481,18 @@ MutableColumns CacheDictionary::aggregateColumns( template BlockInputStreamPtr CacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - std::shared_ptr stream; + std::shared_ptr stream; { /// Write lock on storage const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; if constexpr (dictionary_key_type == DictionaryKeyType::simple) - stream = std::make_shared(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names); + stream = std::make_shared(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names); else { auto keys = cache_storage_ptr->getCachedComplexKeys(); - stream = std::make_shared(shared_from_this(), max_block_size, keys, column_names); + stream = std::make_shared(shared_from_this(), max_block_size, keys, column_names); } } @@ -660,14 +522,20 @@ void CacheDictionary::update(CacheDictionaryUpdateUnitPtr requested_keys_extractor(update_unit_ptr->key_columns, update_unit_ptr->complex_key_arena); - const auto & requested_keys = requested_keys_extractor.getKeys(); + Arena * complex_key_arena = update_unit_ptr->complex_keys_arena_holder.getComplexKeyArena(); + DictionaryKeysExtractor requested_keys_extractor(update_unit_ptr->key_columns, complex_key_arena); + auto requested_keys = requested_keys_extractor.extractAllKeys(); HashSet not_found_keys; std::vector requested_keys_vector; std::vector requested_complex_key_rows; + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + requested_keys_vector.reserve(requested_keys.size()); + else + requested_complex_key_rows.reserve(requested_keys.size()); + auto & key_index_to_state_from_storage = update_unit_ptr->key_index_to_state; for (size_t i = 0; i < 
key_index_to_state_from_storage.size(); ++i) @@ -727,8 +595,8 @@ void CacheDictionary::update(CacheDictionaryUpdateUnitPtr keys_extractor(key_columns, update_unit_ptr->complex_key_arena); - const auto & keys_extracted_from_block = keys_extractor.getKeys(); + DictionaryKeysExtractor keys_extractor(key_columns, complex_key_arena); + auto keys_extracted_from_block = keys_extractor.extractAllKeys(); for (size_t index_of_attribute = 0; index_of_attribute < fetched_columns_during_update.size(); ++index_of_attribute) { @@ -740,6 +608,7 @@ void CacheDictionary::update(CacheDictionaryUpdateUnitPtrrequested_keys_to_fetched_columns_during_update_index[fetched_key_from_source] = found_keys_size; found_keys_in_source.emplace_back(fetched_key_from_source); diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index 1192db73737..62cd509d006 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -130,33 +130,18 @@ public: std::exception_ptr getLastException() const override; - bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && hierarchical_attribute; } + bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); } - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; - void isInVectorVector( - const PaddedPODArray & child_ids, - const PaddedPODArray & ancestor_ids, - PaddedPODArray & out) const override; - - void isInVectorConstant( - const PaddedPODArray & child_ids, - const UInt64 ancestor_id, PaddedPODArray & out) const override; - - void isInConstantVector( - const UInt64 child_id, - const PaddedPODArray & ancestor_ids, - PaddedPODArray & out) const override; + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const 
DataTypePtr & key_type) const override; private: using FetchResult = std::conditional_t; - Columns getColumnsImpl( - const Strings & attribute_names, - const Columns & key_columns, - const PaddedPODArray & keys, - const Columns & default_values_columns) const; - static MutableColumns aggregateColumnsInOrderOfKeys( const PaddedPODArray & keys, const DictionaryStorageFetchRequest & request, @@ -171,8 +156,6 @@ private: const MutableColumns & fetched_columns_during_update, const HashMap & found_keys_to_fetched_columns_during_update_index); - void setupHierarchicalAttribute(); - void update(CacheDictionaryUpdateUnitPtr update_unit_ptr); /// Update dictionary source pointer if required and return it. Thread safe. @@ -193,9 +176,6 @@ private: return source_ptr; } - template - void isInImpl(const PaddedPODArray & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - const DictionaryStructure dict_struct; /// Dictionary source should be used with mutex @@ -218,8 +198,6 @@ private: /// readers. Surprisingly this lock is also used for last_exception pointer. 
mutable std::shared_mutex rw_lock; - const DictionaryAttribute * hierarchical_attribute = nullptr; - mutable std::exception_ptr last_exception; mutable std::atomic error_count {0}; mutable std::atomic backoff_end_time{std::chrono::system_clock::time_point{}}; diff --git a/src/Dictionaries/CacheDictionaryUpdateQueue.h b/src/Dictionaries/CacheDictionaryUpdateQueue.h index 2e636af6db6..3d27a157752 100644 --- a/src/Dictionaries/CacheDictionaryUpdateQueue.h +++ b/src/Dictionaries/CacheDictionaryUpdateQueue.h @@ -66,8 +66,9 @@ public: HashMap requested_keys_to_fetched_columns_during_update_index; MutableColumns fetched_columns_during_update; + /// Complex keys are serialized in this arena - Arena complex_key_arena; + DictionaryKeysArenaHolder complex_keys_arena_holder; private: template diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.cpp b/src/Dictionaries/ComplexKeyHashedDictionary.cpp deleted file mode 100644 index 4086082e66d..00000000000 --- a/src/Dictionaries/ComplexKeyHashedDictionary.cpp +++ /dev/null @@ -1,594 +0,0 @@ -#include "ComplexKeyHashedDictionary.h" -#include -#include -#include -#include -#include -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" - -namespace DB -{ -namespace ErrorCodes -{ - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; - extern const int DICTIONARY_IS_EMPTY; -} - -ComplexKeyHashedDictionary::ComplexKeyHashedDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_) - : IDictionaryBase(dict_id_) - , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} - , dict_lifetime(dict_lifetime_) - , require_nonempty(require_nonempty_) - , saved_block{std::move(saved_block_)} -{ - createAttributes(); - loadData(); - calculateBytesAllocated(); -} - -ColumnPtr ComplexKeyHashedDictionary::getColumn( - const std::string & 
attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr & default_values_column) const -{ - dict_struct.validateKeyTypes(key_types); - - ColumnPtr result; - - const auto & attribute = getAttribute(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto keys_size = key_columns.front()->size(); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to = nullptr; - if (attribute.is_nullable) - { - col_null_map_to = ColumnUInt8::create(keys_size, false); - vec_null_map_to = &col_null_map_to->getData(); - } - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto attribute_null_value = std::get(attribute.null_values); - AttributeType null_value = static_cast(attribute_null_value); - DictionaryDefaultValueExtractor default_value_extractor(std::move(null_value), default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - auto * out = column.get(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const StringRef value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out->insertData(value.data, value.size); - }, - default_value_extractor); - } - else - { - auto & out = column->getData(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const auto value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out[row] = value; - }, - default_value_extractor); - } - - result = std::move(column); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - if (attribute.is_nullable) - { - result = 
ColumnNullable::create(result, std::move(col_null_map_to)); - } - - return result; -} - -ColumnUInt8::Ptr ComplexKeyHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const -{ - dict_struct.validateKeyTypes(key_types); - - auto size = key_columns.front()->size(); - auto result = ColumnUInt8::create(size); - auto& out = result->getData(); - - const auto & attribute = attributes.front(); - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - - has(attribute, key_columns, out); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -void ComplexKeyHashedDictionary::createAttributes() -{ - const auto size = dict_struct.attributes.size(); - attributes.reserve(size); - - for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) - throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; - } -} - -void ComplexKeyHashedDictionary::blockToAttributes(const Block & block) -{ - /// created upfront to avoid excess allocations - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - const auto rows = block.rows(); - element_count += rows; - - const auto key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto attribute_column_ptrs = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) - { - return block.safeGetByPosition(keys_size + attribute_idx).column; - }); - - for (const auto row_idx : ext::range(0, rows)) - { - /// 
calculate key once per row - const auto key = placeKeysInPool(row_idx, key_column_ptrs, keys, keys_pool); - - auto should_rollback = false; - - for (const auto attribute_idx : ext::range(0, attributes_size)) - { - const auto & attribute_column = *attribute_column_ptrs[attribute_idx]; - auto & attribute = attributes[attribute_idx]; - const auto inserted = setAttributeValue(attribute, key, attribute_column[row_idx]); - if (!inserted) - should_rollback = true; - } - - /// @note on multiple equal keys the mapped value for the first one is stored - if (should_rollback) - keys_pool.rollback(key.size); - } -} - -void ComplexKeyHashedDictionary::updateData() -{ - /// created upfront to avoid excess allocations - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - - if (!saved_block || saved_block->rows() == 0) - { - auto stream = source_ptr->loadUpdatedAll(); - stream->readPrefix(); - - while (const auto block = stream->read()) - { - /// We are using this method to keep saved data if input stream consists of multiple blocks - if (!saved_block) - saved_block = std::make_shared(block.cloneEmpty()); - for (const auto attribute_idx : ext::range(0, keys_size + attributes_size)) - { - const IColumn & update_column = *block.getByPosition(attribute_idx).column.get(); - MutableColumnPtr saved_column = saved_block->getByPosition(attribute_idx).column->assumeMutable(); - saved_column->insertRangeFrom(update_column, 0, update_column.size()); - } - } - stream->readSuffix(); - } - else - { - auto stream = source_ptr->loadUpdatedAll(); - - stream->readPrefix(); - while (Block block = stream->read()) - { - const auto saved_key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t key_idx) { return saved_block->safeGetByPosition(key_idx).column; }); - - const auto update_key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t key_idx) { return 
block.safeGetByPosition(key_idx).column; }); - - Arena temp_key_pool; - ContainerType> update_key_hash; - - for (size_t i = 0; i < block.rows(); ++i) - { - const auto u_key = placeKeysInPool(i, update_key_column_ptrs, keys, temp_key_pool); - update_key_hash[u_key].push_back(i); - } - - const size_t rows = saved_block->rows(); - IColumn::Filter filter(rows); - - for (size_t i = 0; i < saved_block->rows(); ++i) - { - const auto s_key = placeKeysInPool(i, saved_key_column_ptrs, keys, temp_key_pool); - auto * it = update_key_hash.find(s_key); - if (it) - filter[i] = 0; - else - filter[i] = 1; - } - - auto block_columns = block.mutateColumns(); - for (const auto attribute_idx : ext::range(0, keys_size + attributes_size)) - { - auto & column = saved_block->safeGetByPosition(attribute_idx).column; - const auto & filtered_column = column->filter(filter, -1); - - block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size()); - } - - saved_block->setColumns(std::move(block_columns)); - } - stream->readSuffix(); - } - - if (saved_block) - blockToAttributes(*saved_block.get()); -} - -void ComplexKeyHashedDictionary::loadData() -{ - if (!source_ptr->hasUpdateField()) - { - auto stream = source_ptr->loadAll(); - stream->readPrefix(); - - while (const auto block = stream->read()) - blockToAttributes(block); - - stream->readSuffix(); - } - else - updateData(); - - if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; -} - -template -void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(ContainerType) + map_ref.getBufferSizeInBytes(); - bucket_count = map_ref.getBufferSizeInCells(); -} - -template <> -void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & map_ref = 
std::get>(attribute.maps); - bytes_allocated += sizeof(ContainerType) + map_ref.getBufferSizeInBytes(); - bucket_count = map_ref.getBufferSizeInCells(); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); -} - -void ComplexKeyHashedDictionary::calculateBytesAllocated() -{ - bytes_allocated += attributes.size() * sizeof(attributes.front()); - - for (const auto & attribute : attributes) - { - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - addAttributeSize(attribute); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - } - - bytes_allocated += keys_pool.size(); -} - -template -void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.null_values = T(null_value.get()); - attribute.maps.emplace>(); -} - -template <> -void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); - attribute.maps.emplace>(); -} - -ComplexKeyHashedDictionary::Attribute -ComplexKeyHashedDictionary::createAttribute(const DictionaryAttribute & attribute, const Field & null_value) -{ - auto nullable_set = attribute.is_nullable ? 
std::make_unique() : nullptr; - Attribute attr{attribute.underlying_type, attribute.is_nullable, std::move(nullable_set), {}, {}, {}}; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; -} - - -template -void ComplexKeyHashedDictionary::getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto & attr = std::get>(attribute.maps); - - const auto keys_size = key_columns.size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - - const auto rows = key_columns.front()->size(); - for (const auto i : ext::range(0, rows)) - { - /// copy key data to arena so it is contiguous and return StringRef to it - const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); - - const auto it = attr.find(key); - - if (it) - { - set_value(i, static_cast(it->getMapped()), false); - } - else - { - if (attribute.is_nullable && attribute.nullable_set->find(key) != nullptr) - set_value(i, default_value_extractor[i], true); - else - set_value(i, default_value_extractor[i], false); - } - - /// free memory allocated for the key - temporary_keys_pool.rollback(key.size); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - -template -bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value) -{ - auto & map = std::get>(attribute.maps); - const auto pair = map.insert({key, value}); - return pair.second; -} - -template <> -bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const String value) -{ - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - return 
setAttributeValueImpl(attribute, key, StringRef{string_in_arena, value.size()}); -} - -bool ComplexKeyHashedDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value) -{ - bool result = false; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if (attribute.is_nullable) - { - if (value.isNull()) - { - attribute.nullable_set->insert(key); - result = true; - return; - } - else - { - attribute.nullable_set->erase(key); - } - } - - result = setAttributeValueImpl(attribute, key, value.get()); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -const ComplexKeyHashedDictionary::Attribute & ComplexKeyHashedDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -StringRef ComplexKeyHashedDictionary::placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool) -{ - const auto keys_size = key_columns.size(); - size_t sum_keys_size{}; - - const char * block_start = nullptr; - for (size_t j = 0; j < keys_size; ++j) - { - keys[j] = key_columns[j]->serializeValueIntoArena(row, pool, block_start); - sum_keys_size += keys[j].size; - } - - const auto * key_start = block_start; - for (size_t j = 0; j < keys_size; ++j) - { - keys[j].data = key_start; - key_start += keys[j].size; - } - - return {block_start, sum_keys_size}; -} - -template -void ComplexKeyHashedDictionary::has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const -{ - const auto & attr = std::get>(attribute.maps); - const auto keys_size = key_columns.size(); - StringRefs keys(keys_size); - Arena 
temporary_keys_pool; - const auto rows = key_columns.front()->size(); - - for (const auto i : ext::range(0, rows)) - { - /// copy key data to arena so it is contiguous and return StringRef to it - const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); - - const auto it = attr.find(key); - out[i] = static_cast(it); - - if (attribute.is_nullable && !out[i]) - out[i] = attribute.nullable_set->find(key) != nullptr; - - /// free memory allocated for the key - temporary_keys_pool.rollback(key.size); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -std::vector ComplexKeyHashedDictionary::getKeys() const -{ - const Attribute & attribute = attributes.front(); - - std::vector result; - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if constexpr (std::is_same_v) - { - result = getKeys(attribute); - } - else - { - result = getKeys(attribute); - } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -template -std::vector ComplexKeyHashedDictionary::getKeys(const Attribute & attribute) const -{ - const ContainerType & attr = std::get>(attribute.maps); - std::vector keys; - keys.reserve(attr.size()); - for (const auto & key : attr) - keys.push_back(key.getKey()); - - if (attribute.is_nullable) - { - for (const auto & key: *attribute.nullable_set) - keys.push_back(key.getKey()); - } - - return keys; -} - -BlockInputStreamPtr ComplexKeyHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const -{ - using BlockInputStreamType = DictionaryBlockInputStream; - auto vector_keys = getKeys(); - - PaddedPODArray keys; - keys.reserve(vector_keys.size()); - keys.assign(vector_keys.begin(), vector_keys.end()); - - return std::make_shared(shared_from_this(), max_block_size, keys, column_names); -} - -void registerDictionaryComplexKeyHashed(DictionaryFactory & factory) -{ - 
auto create_layout = [=](const std::string &, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - if (!dict_struct.key) - throw Exception{"'key' is required for dictionary of layout 'complex_key_hashed'", ErrorCodes::BAD_ARGUMENTS}; - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); - }; - factory.registerLayout("complex_key_hashed", create_layout, true); -} - -} diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.h b/src/Dictionaries/ComplexKeyHashedDictionary.h deleted file mode 100644 index 091974bbf43..00000000000 --- a/src/Dictionaries/ComplexKeyHashedDictionary.h +++ /dev/null @@ -1,185 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryStructure.h" -#include "DictionaryHelpers.h" - -namespace DB -{ - -class ComplexKeyHashedDictionary final : public IDictionaryBase -{ -public: - ComplexKeyHashedDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_ = nullptr); - - std::string getKeyDescription() const { return key_description; } - - std::string getTypeName() const override { return "ComplexKeyHashed"; } - - size_t getBytesAllocated() const override { return bytes_allocated; } - - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } - - double getHitRate() const 
override { return 1.0; } - - size_t getElementCount() const override { return element_count; } - - double getLoadFactor() const override { return static_cast(element_count) / bucket_count; } - - std::shared_ptr clone() const override - { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); - } - - const IDictionarySource * getSource() const override { return source_ptr.get(); } - - const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } - - const DictionaryStructure & getStructure() const override { return dict_struct; } - - bool isInjective(const std::string & attribute_name) const override - { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; - } - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } - - ColumnPtr getColumn( - const std::string& attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr & default_values_column) const override; - - ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; - -private: - template - using ContainerType = HashMapWithSavedHash; - - using NullableSet = HashSetWithSavedHash; - - struct Attribute final - { - AttributeUnderlyingType type; - bool is_nullable; - std::unique_ptr nullable_set; - - std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - StringRef> - null_values; - std::variant< - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - 
ContainerType> - maps; - std::unique_ptr string_arena; - }; - - void createAttributes(); - - void blockToAttributes(const Block & block); - - void updateData(); - - void loadData(); - - template - void addAttributeSize(const Attribute & attribute); - - void calculateBytesAllocated(); - - template - static void createAttributeImpl(Attribute & attribute, const Field & null_value); - - static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value); - - template - void getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template - static bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value); - - static bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value); - - const Attribute & getAttribute(const std::string & attribute_name) const; - - static StringRef placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool); - - template - void has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const; - - std::vector getKeys() const; - - template - std::vector getKeys(const Attribute & attribute) const; - - const DictionaryStructure dict_struct; - const DictionarySourcePtr source_ptr; - const DictionaryLifetime dict_lifetime; - const bool require_nonempty; - const std::string key_description{dict_struct.getKeyDescription()}; - - std::map attribute_index_by_name; - std::vector attributes; - Arena keys_pool; - - size_t bytes_allocated = 0; - size_t element_count = 0; - size_t bucket_count = 0; - mutable std::atomic query_count{0}; - - BlockPtr saved_block; -}; - -} diff --git a/src/Dictionaries/DictionaryBlockInputStream.cpp b/src/Dictionaries/DictionaryBlockInputStream.cpp new file mode 100644 index 00000000000..433ff211831 --- /dev/null +++ b/src/Dictionaries/DictionaryBlockInputStream.cpp @@ -0,0 
+1,200 @@ +#include "DictionaryBlockInputStream.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, UInt64 max_block_size_, PaddedPODArray && ids_, const Names & column_names_) + : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , ids(std::move(ids_)) + , key_type(DictionaryInputStreamKeyType::Id) +{ +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, + UInt64 max_block_size_, + const PaddedPODArray & keys, + const Names & column_names_) + : DictionaryBlockInputStreamBase(keys.size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , key_type(DictionaryInputStreamKeyType::ComplexKey) +{ + const DictionaryStructure & dictionary_structure = dictionary->getStructure(); + fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns); +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, + UInt64 max_block_size_, + const Columns & data_columns_, + const Names & column_names_, + GetColumnsFunction && get_key_columns_function_, + GetColumnsFunction && get_view_columns_function_) + : DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , data_columns(data_columns_) + , get_key_columns_function(std::move(get_key_columns_function_)) + , get_view_columns_function(std::move(get_view_columns_function_)) + , key_type(DictionaryInputStreamKeyType::Callback) +{ +} + +Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const +{ + /// TODO: Rewrite + switch (key_type) + { + case DictionaryInputStreamKeyType::ComplexKey: + { + Columns columns; + ColumnsWithTypeAndName view_columns; + columns.reserve(key_columns.size()); + for (const auto & key_column : 
key_columns) + { + ColumnPtr column = key_column.column->cut(start, length); + columns.emplace_back(column); + view_columns.emplace_back(column, key_column.type, key_column.name); + } + return fillBlock({}, columns, {}, std::move(view_columns)); + } + + case DictionaryInputStreamKeyType::Id: + { + PaddedPODArray ids_to_fill(ids.begin() + start, ids.begin() + start + length); + return fillBlock(ids_to_fill, {}, {}, {}); + } + + case DictionaryInputStreamKeyType::Callback: + { + Columns columns; + columns.reserve(data_columns.size()); + for (const auto & data_column : data_columns) + columns.push_back(data_column->cut(start, length)); + const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); + const auto & attributes = *dictionaty_structure.key; + ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes); + ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes); + DataTypes types; + columns.clear(); + for (const auto & key_column : keys_with_type_and_name) + { + columns.push_back(key_column.column); + types.push_back(key_column.type); + } + return fillBlock({}, columns, types, std::move(view_with_type_and_name)); + } + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected DictionaryInputStreamKeyType."); +} + +Block DictionaryBlockInputStream::fillBlock( + const PaddedPODArray & ids_to_fill, + const Columns & keys, + const DataTypes & types, + ColumnsWithTypeAndName && view) const +{ + std::unordered_set names(column_names.begin(), column_names.end()); + + DataTypes data_types = types; + ColumnsWithTypeAndName block_columns; + + data_types.reserve(keys.size()); + const DictionaryStructure & dictionary_structure = dictionary->getStructure(); + if (data_types.empty() && dictionary_structure.key) + for (const auto & key : *dictionary_structure.key) + data_types.push_back(key.type); + + for (const auto & column : view) + if (names.find(column.name) != names.end()) 
+ block_columns.push_back(column); + + const DictionaryStructure & structure = dictionary->getStructure(); + ColumnPtr ids_column = getColumnFromIds(ids_to_fill); + + if (structure.id && names.find(structure.id->name) != names.end()) + { + block_columns.emplace_back(ids_column, std::make_shared(), structure.id->name); + } + + auto dictionary_key_type = dictionary->getKeyType(); + + for (const auto idx : ext::range(0, structure.attributes.size())) + { + const DictionaryAttribute & attribute = structure.attributes[idx]; + if (names.find(attribute.name) != names.end()) + { + ColumnPtr column; + + if (dictionary_key_type == DictionaryKeyType::simple) + { + column = dictionary->getColumn( + attribute.name, + attribute.type, + {ids_column}, + {std::make_shared()}, + nullptr /* default_values_column */); + } + else + { + column = dictionary->getColumn( + attribute.name, + attribute.type, + keys, + data_types, + nullptr /* default_values_column*/); + } + + block_columns.emplace_back(column, attribute.type, attribute.name); + } + } + + return Block(block_columns); +} + +ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray & ids_to_fill) +{ + auto column_vector = ColumnVector::create(); + column_vector->getData().assign(ids_to_fill); + return column_vector; +} + +void DictionaryBlockInputStream::fillKeyColumns( + const PaddedPODArray & keys, + size_t start, + size_t size, + const DictionaryStructure & dictionary_structure, + ColumnsWithTypeAndName & result) +{ + MutableColumns columns; + columns.reserve(dictionary_structure.key->size()); + + for (const DictionaryAttribute & attribute : *dictionary_structure.key) + columns.emplace_back(attribute.type->createColumn()); + + for (auto idx : ext::range(start, size)) + { + const auto & key = keys[idx]; + const auto *ptr = key.data; + for (auto & column : columns) + ptr = column->deserializeAndInsertFromArena(ptr); + } + + for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i) + { + const auto 
& dictionary_attribute = (*dictionary_structure.key)[i]; + result.emplace_back(ColumnWithTypeAndName{std::move(columns[i]), dictionary_attribute.type, dictionary_attribute.name}); + } +} + +} diff --git a/src/Dictionaries/DictionaryBlockInputStream.h b/src/Dictionaries/DictionaryBlockInputStream.h index 71615efa7f8..5197df411fa 100644 --- a/src/Dictionaries/DictionaryBlockInputStream.h +++ b/src/Dictionaries/DictionaryBlockInputStream.h @@ -16,27 +16,22 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} /// TODO: Remove this class /* BlockInputStream implementation for external dictionaries * read() returns blocks consisting of the in-memory contents of the dictionaries */ -template class DictionaryBlockInputStream : public DictionaryBlockInputStreamBase { public: DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, - PaddedPODArray && ids, + PaddedPODArray && ids, const Names & column_names); DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, const PaddedPODArray & keys, const Names & column_names); @@ -48,7 +43,7 @@ public: // and get_view_columns_function to get key representation. 
// Now used in trie dictionary, where columns are stored as ip and mask, and are showed as string DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, const Columns & data_columns, const Names & column_names, @@ -61,21 +56,24 @@ protected: Block getBlock(size_t start, size_t length) const override; private: - Block - fillBlock(const PaddedPODArray & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const; + Block fillBlock( + const PaddedPODArray & ids_to_fill, + const Columns & keys, + const DataTypes & types, + ColumnsWithTypeAndName && view) const; - ColumnPtr getColumnFromIds(const PaddedPODArray & ids_to_fill) const; + static ColumnPtr getColumnFromIds(const PaddedPODArray & ids_to_fill); - void fillKeyColumns( + static void fillKeyColumns( const PaddedPODArray & keys, size_t start, size_t size, const DictionaryStructure & dictionary_structure, - ColumnsWithTypeAndName & columns) const; + ColumnsWithTypeAndName & result); - std::shared_ptr dictionary; + std::shared_ptr dictionary; Names column_names; - PaddedPODArray ids; + PaddedPODArray ids; ColumnsWithTypeAndName key_columns; Columns data_columns; @@ -92,200 +90,4 @@ private: DictionaryInputStreamKeyType key_type; }; - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, UInt64 max_block_size_, PaddedPODArray && ids_, const Names & column_names_) - : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , ids(std::move(ids_)) - , key_type(DictionaryInputStreamKeyType::Id) -{ -} - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, - UInt64 max_block_size_, - const PaddedPODArray & keys, - const Names & column_names_) - : DictionaryBlockInputStreamBase(keys.size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , 
key_type(DictionaryInputStreamKeyType::ComplexKey) -{ - const DictionaryStructure & dictionary_structure = dictionary->getStructure(); - fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns); -} - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, - UInt64 max_block_size_, - const Columns & data_columns_, - const Names & column_names_, - GetColumnsFunction && get_key_columns_function_, - GetColumnsFunction && get_view_columns_function_) - : DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , data_columns(data_columns_) - , get_key_columns_function(std::move(get_key_columns_function_)) - , get_view_columns_function(std::move(get_view_columns_function_)) - , key_type(DictionaryInputStreamKeyType::Callback) -{ -} - - -template -Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const -{ - /// TODO: Rewrite - switch (key_type) - { - case DictionaryInputStreamKeyType::ComplexKey: - { - Columns columns; - ColumnsWithTypeAndName view_columns; - columns.reserve(key_columns.size()); - for (const auto & key_column : key_columns) - { - ColumnPtr column = key_column.column->cut(start, length); - columns.emplace_back(column); - view_columns.emplace_back(column, key_column.type, key_column.name); - } - return fillBlock({}, columns, {}, std::move(view_columns)); - } - - case DictionaryInputStreamKeyType::Id: - { - PaddedPODArray ids_to_fill(ids.begin() + start, ids.begin() + start + length); - return fillBlock(ids_to_fill, {}, {}, {}); - } - - case DictionaryInputStreamKeyType::Callback: - { - Columns columns; - columns.reserve(data_columns.size()); - for (const auto & data_column : data_columns) - columns.push_back(data_column->cut(start, length)); - const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); - const auto & attributes = *dictionaty_structure.key; - 
ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes); - ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes); - DataTypes types; - columns.clear(); - for (const auto & key_column : keys_with_type_and_name) - { - columns.push_back(key_column.column); - types.push_back(key_column.type); - } - return fillBlock({}, columns, types, std::move(view_with_type_and_name)); - } - } - - throw Exception("Unexpected DictionaryInputStreamKeyType.", ErrorCodes::LOGICAL_ERROR); -} - -template -Block DictionaryBlockInputStream::fillBlock( - const PaddedPODArray & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const -{ - std::unordered_set names(column_names.begin(), column_names.end()); - - DataTypes data_types = types; - ColumnsWithTypeAndName block_columns; - - data_types.reserve(keys.size()); - const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); - if (data_types.empty() && dictionaty_structure.key) - for (const auto & key : *dictionaty_structure.key) - data_types.push_back(key.type); - - for (const auto & column : view) - if (names.find(column.name) != names.end()) - block_columns.push_back(column); - - const DictionaryStructure & structure = dictionary->getStructure(); - ColumnPtr ids_column = getColumnFromIds(ids_to_fill); - - if (structure.id && names.find(structure.id->name) != names.end()) - { - block_columns.emplace_back(ids_column, std::make_shared(), structure.id->name); - } - - auto dictionary_key_type = dictionary->getKeyType(); - - for (const auto idx : ext::range(0, structure.attributes.size())) - { - const DictionaryAttribute & attribute = structure.attributes[idx]; - if (names.find(attribute.name) != names.end()) - { - ColumnPtr column; - - if (dictionary_key_type == DictionaryKeyType::simple) - { - column = dictionary->getColumn( - attribute.name, - attribute.type, - {ids_column}, - {std::make_shared()}, - 
nullptr /* default_values_column */); - } - else - { - column = dictionary->getColumn( - attribute.name, - attribute.type, - keys, - data_types, - nullptr /* default_values_column*/); - } - - block_columns.emplace_back(column, attribute.type, attribute.name); - } - } - - return Block(block_columns); -} - -template -ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray & ids_to_fill) const -{ - auto column_vector = ColumnVector::create(); - column_vector->getData().reserve(ids_to_fill.size()); - for (UInt64 id : ids_to_fill) - column_vector->insertValue(id); - return column_vector; -} - - -template -void DictionaryBlockInputStream::fillKeyColumns( - const PaddedPODArray & keys, - size_t start, - size_t size, - const DictionaryStructure & dictionary_structure, - ColumnsWithTypeAndName & res) const -{ - MutableColumns columns; - columns.reserve(dictionary_structure.key->size()); - - for (const DictionaryAttribute & attribute : *dictionary_structure.key) - columns.emplace_back(attribute.type->createColumn()); - - for (auto idx : ext::range(start, size)) - { - const auto & key = keys[idx]; - const auto *ptr = key.data; - for (auto & column : columns) - ptr = column->deserializeAndInsertFromArena(ptr); - } - - for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i) - res.emplace_back( - ColumnWithTypeAndName{std::move(columns[i]), (*dictionary_structure.key)[i].type, (*dictionary_structure.key)[i].name}); -} - } diff --git a/src/Dictionaries/DictionaryHelpers.h b/src/Dictionaries/DictionaryHelpers.h index 5fda5f2599e..3e7063bb9ef 100644 --- a/src/Dictionaries/DictionaryHelpers.h +++ b/src/Dictionaries/DictionaryHelpers.h @@ -295,6 +295,28 @@ private: bool use_default_value_from_column = false; }; +template +class DictionaryKeysArenaHolder; + +template <> +class DictionaryKeysArenaHolder +{ +public: + static Arena * getComplexKeyArena() { return nullptr; } +}; + +template <> +class DictionaryKeysArenaHolder +{ +public: + + Arena * 
getComplexKeyArena() { return &complex_key_arena; } + +private: + Arena complex_key_arena; +}; + + template class DictionaryKeysExtractor { @@ -302,67 +324,96 @@ public: using KeyType = std::conditional_t; static_assert(key_type != DictionaryKeyType::range, "Range key type is not supported by DictionaryKeysExtractor"); - explicit DictionaryKeysExtractor(const Columns & key_columns, Arena & existing_arena) + explicit DictionaryKeysExtractor(const Columns & key_columns_, Arena * complex_key_arena_) + : key_columns(key_columns_) + , complex_key_arena(complex_key_arena_) { assert(!key_columns.empty()); if constexpr (key_type == DictionaryKeyType::simple) - keys = getColumnVectorData(key_columns.front()); + { + key_columns[0] = key_columns[0]->convertToFullColumnIfConst(); + + const auto * vector_col = checkAndGetColumn>(key_columns[0].get()); + if (!vector_col) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key expected UInt64"); + } + + keys_size = key_columns.front()->size(); + } + + inline size_t getKeysSize() const + { + return keys_size; + } + + inline size_t getCurrentKeyIndex() const + { + return current_key_index; + } + + inline KeyType extractCurrentKey() + { + assert(current_key_index < keys_size); + + if constexpr (key_type == DictionaryKeyType::simple) + { + const auto & column_vector = static_cast &>(*key_columns[0]); + const auto & data = column_vector.getData(); + + auto key = data[current_key_index]; + ++current_key_index; + return key; + } else - keys = deserializeKeyColumnsInArena(key_columns, existing_arena); - } - - - const PaddedPODArray & getKeys() const - { - return keys; - } - -private: - static PaddedPODArray getColumnVectorData(const ColumnPtr column) - { - PaddedPODArray result; - - auto full_column = column->convertToFullColumnIfConst(); - const auto *vector_col = checkAndGetColumn>(full_column.get()); - - if (!vector_col) - throw Exception{ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key 
expected UInt64"}; - - result.assign(vector_col->getData()); - - return result; - } - - static PaddedPODArray deserializeKeyColumnsInArena(const Columns & key_columns, Arena & temporary_arena) - { - size_t keys_size = key_columns.front()->size(); - - PaddedPODArray result; - result.reserve(keys_size); - - PaddedPODArray temporary_column_data(key_columns.size()); - - for (size_t key_index = 0; key_index < keys_size; ++key_index) { size_t allocated_size_for_columns = 0; const char * block_start = nullptr; - for (size_t column_index = 0; column_index < key_columns.size(); ++column_index) + for (const auto & column : key_columns) { - const auto & column = key_columns[column_index]; - temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_arena, block_start); - allocated_size_for_columns += temporary_column_data[column_index].size; + StringRef serialized_data = column->serializeValueIntoArena(current_key_index, *complex_key_arena, block_start); + allocated_size_for_columns += serialized_data.size; } - result.push_back(StringRef{block_start, allocated_size_for_columns}); + ++current_key_index; + current_complex_key = StringRef{block_start, allocated_size_for_columns}; + return current_complex_key; + } + } + + void rollbackCurrentKey() const + { + if constexpr (key_type == DictionaryKeyType::complex) + complex_key_arena->rollback(current_complex_key.size); + } + + PaddedPODArray extractAllKeys() + { + PaddedPODArray result; + result.reserve(keys_size - current_key_index); + + for (; current_key_index < keys_size;) + { + auto value = extractCurrentKey(); + result.emplace_back(value); } return result; } - PaddedPODArray keys; + void reset() + { + current_key_index = 0; + } +private: + Columns key_columns; + size_t keys_size = 0; + size_t current_key_index = 0; + + KeyType current_complex_key {}; + Arena * complex_key_arena; }; /** @@ -370,9 +421,10 @@ private: * If column is constant parameter backup_storage is used to store values. 
*/ +/// TODO: Remove template static const PaddedPODArray & getColumnVectorData( - const IDictionaryBase * dictionary, + const IDictionary * dictionary, const ColumnPtr column, PaddedPODArray & backup_storage) { diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index d3ee194bf60..806ee0b80e0 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -200,8 +200,21 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration for (size_t i = 0; i < attributes.size(); ++i) { - const auto & attribute_name = attributes[i].name; + const auto & attribute = attributes[i]; + const auto & attribute_name = attribute.name; attribute_name_to_index[attribute_name] = i; + + if (attribute.hierarchical) + { + if (id && attribute.underlying_type != AttributeUnderlyingType::utUInt64) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Hierarchical attribute type for dictionary with simple key must be UInt64. 
Actual ({})", + toString(attribute.underlying_type)); + else if (key) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary with complex key does not support hierarchy"); + + hierarchical_attribute_index = i; + } } if (attributes.empty()) diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h index 2dedb1be0ce..4f03b4ff09e 100644 --- a/src/Dictionaries/DictionaryStructure.h +++ b/src/Dictionaries/DictionaryStructure.h @@ -153,6 +153,8 @@ struct DictionaryStructure final std::unordered_map attribute_name_to_index; std::optional range_min; std::optional range_max; + std::optional hierarchical_attribute_index; + bool has_expressions = false; bool access_to_key_from_attributes = false; diff --git a/src/Dictionaries/DirectDictionary.cpp b/src/Dictionaries/DirectDictionary.cpp index 4cb9e0cd629..96ef259106a 100644 --- a/src/Dictionaries/DirectDictionary.cpp +++ b/src/Dictionaries/DirectDictionary.cpp @@ -1,158 +1,33 @@ #include "DirectDictionary.h" -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include + +#include +#include namespace DB { namespace ErrorCodes { - extern const int TYPE_MISMATCH; extern const int UNSUPPORTED_METHOD; extern const int BAD_ARGUMENTS; } -namespace -{ - - inline UInt64 getAt(const PaddedPODArray & arr, const size_t idx) - { - return arr[idx]; - } - - inline UInt64 getAt(const UInt64 & value, const size_t) - { - return value; - } - -} - template DirectDictionary::DirectDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_) + DictionarySourcePtr source_ptr_) : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} - , saved_block{std::move(saved_block_)} { if (!source_ptr->supportsSelectiveLoad()) throw Exception{full_name + ": source cannot be used with 
DirectDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - setup(); -} - -template -void DirectDictionary::toParent(const PaddedPODArray & ids [[maybe_unused]], PaddedPODArray & out [[maybe_unused]]) const -{ - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - { - const auto & attribute_name = hierarchical_attribute->name; - - auto result_type = std::make_shared(); - auto input_column = result_type->createColumn(); - auto & input_column_typed = assert_cast &>(*input_column); - auto & data = input_column_typed.getData(); - data.insert(ids.begin(), ids.end()); - - auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr}); - const auto & result_column_typed = assert_cast &>(*column); - const auto & result_data = result_column_typed.getData(); - - out.assign(result_data); - } - else - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Hierarchy is not supported for complex key DirectDictionary"); -} - -template -UInt64 DirectDictionary::getValueOrNullByKey(const Key & to_find) const -{ - std::vector required_key = {to_find}; - - auto stream = source_ptr->loadIds(required_key); - stream->readPrefix(); - - bool is_found = false; - UInt64 result = hierarchical_attribute->null_value.template get(); - - while (const auto block = stream->read()) - { - const IColumn & id_column = *block.safeGetByPosition(0).column; - - for (const size_t attribute_idx : ext::range(0, dict_struct.attributes.size())) - { - if (is_found) - break; - - const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column; - - for (const auto row_idx : ext::range(0, id_column.size())) - { - const auto key = id_column[row_idx].get(); - - if (key == to_find && hierarchical_attribute->name == attribute_name_by_index.at(attribute_idx)) - { - result = attribute_column[row_idx].get(); - is_found = true; - break; - } - } - } - } - - stream->readSuffix(); - - return result; -} - -template -template -void 
DirectDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = hierarchical_attribute->null_value.template get(); - const auto rows = out.size(); - - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - id = getValueOrNullByKey(id); - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -template -void DirectDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -template -void DirectDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const UInt64 ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -template -void DirectDictionary::isInConstantVector(const UInt64 child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_id, ancestor_ids, out); } template @@ -166,20 +41,20 @@ ColumnPtr DirectDictionary::getColumn( if constexpr (dictionary_key_type == DictionaryKeyType::complex) dict_struct.validateKeyTypes(key_types); - Arena complex_key_arena; + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + const auto requested_keys = extractor.extractAllKeys(); const DictionaryAttribute & attribute = dict_struct.getAttribute(attribute_name, result_type); DefaultValueProvider default_value_provider(attribute.null_value, default_values_column); - DictionaryKeysExtractor extractor(key_columns, complex_key_arena); - const auto & requested_keys = extractor.getKeys(); - HashMap key_to_fetched_index; key_to_fetched_index.reserve(requested_keys.size()); 
auto fetched_from_storage = attribute.type->createColumn(); + size_t fetched_key_index = 0; - size_t requested_attribute_index = attribute_index_by_name.find(attribute_name)->second; + size_t requested_attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; Columns block_key_columns; size_t dictionary_keys_size = dict_struct.getKeysNames().size(); @@ -191,26 +66,19 @@ ColumnPtr DirectDictionary::getColumn( while (const auto block = stream->read()) { - auto block_columns = block.getColumns(); - /// Split into keys columns and attribute columns for (size_t i = 0; i < dictionary_keys_size; ++i) - { - block_key_columns.emplace_back(*block_columns.begin()); - block_columns.erase(block_columns.begin()); - } + block_key_columns.emplace_back(block.safeGetByPosition(i).column); - DictionaryKeysExtractor block_keys_extractor(block_key_columns, complex_key_arena); - const auto & block_keys = block_keys_extractor.getKeys(); - size_t block_keys_size = block_keys.size(); + DictionaryKeysExtractor block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena()); + auto block_keys = block_keys_extractor.extractAllKeys(); const auto & block_column = block.safeGetByPosition(dictionary_keys_size + requested_attribute_index).column; - fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys_size); + fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys.size()); - for (size_t block_key_index = 0; block_key_index < block_keys_size; ++block_key_index) + for (size_t block_key_index = 0; block_key_index < block_keys.size(); ++block_key_index) { - const auto & block_key = block_keys[block_key_index]; - + auto block_key = block_keys[block_key_index]; key_to_fetched_index[block_key] = fetched_key_index; ++fetched_key_index; } @@ -223,10 +91,10 @@ ColumnPtr DirectDictionary::getColumn( Field value_to_insert; size_t requested_keys_size = requested_keys.size(); + auto result = fetched_from_storage->cloneEmpty(); 
result->reserve(requested_keys_size); - for (size_t requested_key_index = 0; requested_key_index < requested_keys_size; ++requested_key_index) { const auto requested_key = requested_keys[requested_key_index]; @@ -251,10 +119,9 @@ ColumnUInt8::Ptr DirectDictionary::hasKeys(const Columns & if constexpr (dictionary_key_type == DictionaryKeyType::complex) dict_struct.validateKeyTypes(key_types); - Arena complex_key_arena; - - DictionaryKeysExtractor requested_keys_extractor(key_columns, complex_key_arena); - const auto & requested_keys = requested_keys_extractor.getKeys(); + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor requested_keys_extractor(key_columns, arena_holder.getComplexKeyArena()); + auto requested_keys = requested_keys_extractor.extractAllKeys(); size_t requested_keys_size = requested_keys.size(); HashMap requested_key_to_index; @@ -279,25 +146,24 @@ ColumnUInt8::Ptr DirectDictionary::hasKeys(const Columns & while (const auto block = stream->read()) { - auto block_columns = block.getColumns(); - /// Split into keys columns and attribute columns for (size_t i = 0; i < dictionary_keys_size; ++i) - { - block_key_columns.emplace_back(*block_columns.begin()); - block_columns.erase(block_columns.begin()); - } + block_key_columns.emplace_back(block.safeGetByPosition(i).column); - DictionaryKeysExtractor block_keys_extractor(block_key_columns, complex_key_arena); - const auto & block_keys = block_keys_extractor.getKeys(); + DictionaryKeysExtractor block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena()); + size_t block_keys_size = block_keys_extractor.getKeysSize(); - for (const auto & block_key : block_keys) + for (size_t i = 0; i < block_keys_size; ++i) { + auto block_key = block_keys_extractor.extractCurrentKey(); + const auto * it = requested_key_to_index.find(block_key); assert(it); size_t result_data_found_index = it->getMapped(); result_data[result_data_found_index] = true; + + block_keys_extractor.rollbackCurrentKey(); 
} block_key_columns.clear(); @@ -310,6 +176,37 @@ ColumnUInt8::Ptr DirectDictionary::hasKeys(const Columns & return result; } +template +ColumnPtr DirectDictionary::getHierarchy( + ColumnPtr key_column, + const DataTypePtr & key_type) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + else + return nullptr; +} + +template +ColumnUInt8::Ptr DirectDictionary::isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const +{ + if (dictionary_key_type == DictionaryKeyType::simple) + { + auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + else + return nullptr; +} + template BlockInputStreamPtr DirectDictionary::getSourceBlockInputStream( const Columns & key_columns [[maybe_unused]], @@ -342,32 +239,6 @@ BlockInputStreamPtr DirectDictionary::getSourceBlockInputSt return stream; } -template -void DirectDictionary::setup() -{ - /// TODO: Move this to DictionaryStructure - size_t dictionary_attributes_size = dict_struct.attributes.size(); - for (size_t i = 0; i < dictionary_attributes_size; ++i) - { - const auto & attribute = dict_struct.attributes[i]; - attribute_index_by_name[attribute.name] = i; - attribute_name_by_index[i] = attribute.name; - - if (attribute.hierarchical) - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "({}): hierarchical attributes are not supported for complex key direct dictionary", - full_name); - - hierarchical_attribute = &attribute; - - if (attribute.underlying_type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; 
- } - } -} - template BlockInputStreamPtr DirectDictionary::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const { diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index 685fd707ded..6bca6ac6a18 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -18,11 +18,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - template class DirectDictionary final : public IDictionary { @@ -33,8 +28,7 @@ public: DirectDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_ = nullptr); + DictionarySourcePtr source_ptr_); std::string getTypeName() const override { @@ -56,7 +50,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), saved_block); + return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone()); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -67,26 +61,9 @@ public: bool isInjective(const std::string & attribute_name) const override { - auto it = attribute_index_by_name.find(attribute_name); - - if (it == attribute_index_by_name.end()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "({}): no attribute with name ({}) in dictionary", - full_name, - attribute_name); - - return dict_struct.attributes[it->second].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const UInt64 ancestor_id, PaddedPODArray & out) const override; - void 
isInConstantVector(const UInt64 child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - DictionaryKeyType getKeyType() const override { return dictionary_key_type; } ColumnPtr getColumn( @@ -98,30 +75,25 @@ public: ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; + bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: - void setup(); - BlockInputStreamPtr getSourceBlockInputStream(const Columns & key_columns, const PaddedPODArray & requested_keys) const; - UInt64 getValueOrNullByKey(const UInt64 & to_find) const; - - template - void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; - std::unordered_map attribute_index_by_name; - std::unordered_map attribute_name_by_index; - - const DictionaryAttribute * hierarchical_attribute = nullptr; - mutable std::atomic query_count{0}; - - BlockPtr saved_block; }; extern template class DirectDictionary; diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index eb63d716913..2d8d208d76b 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -1,20 +1,22 @@ #include "FlatDictionary.h" #include +#include + #include #include #include #include #include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" +#include +#include +#include namespace DB { namespace ErrorCodes { - extern const int TYPE_MISMATCH; 
extern const int ARGUMENT_OUT_OF_BOUND; extern const int BAD_ARGUMENTS; extern const int DICTIONARY_IS_EMPTY; @@ -24,7 +26,6 @@ namespace ErrorCodes static const auto initial_array_size = 1024; static const auto max_array_size = 500000; - FlatDictionary::FlatDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, @@ -45,69 +46,6 @@ FlatDictionary::FlatDictionary( calculateBytesAllocated(); } - -void FlatDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - DictionaryDefaultValueExtractor extractor(null_value); - - getItemsImpl( - *hierarchical_attribute, - ids, - [&](const size_t row, const UInt64 value) { out[row] = value; }, - extractor); -} - - -/// Allow to use single value in same way as array. -static inline FlatDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline FlatDictionary::Key getAt(const FlatDictionary::Key & value, const size_t) -{ - return value; -} - -template -void FlatDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - const auto & attr = std::get>(hierarchical_attribute->arrays); - const auto rows = out.size(); - - size_t loaded_size = attr.size(); - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id < loaded_size && id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - id = attr[id]; - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - -void FlatDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, 
out); -} - -void FlatDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void FlatDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_id, ancestor_ids, out); -} - ColumnPtr FlatDictionary::getColumn( const std::string & attribute_name, const DataTypePtr & result_type, @@ -117,14 +55,16 @@ ColumnPtr FlatDictionary::getColumn( { ColumnPtr result; - PaddedPODArray backup_storage; + PaddedPODArray backup_storage; const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); auto size = ids.size(); - const auto & attribute = getAttribute(attribute_name); const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + const auto & attribute = attributes[attribute_index]; + auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; @@ -183,10 +123,9 @@ ColumnPtr FlatDictionary::getColumn( return result; } - ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const { - PaddedPODArray backup_storage; + PaddedPODArray backup_storage; const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); auto result = ColumnUInt8::create(ext::size(ids)); @@ -205,24 +144,118 @@ ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const Data return result; } +ColumnPtr FlatDictionary::getHierarchy(ColumnPtr key_column, const DataTypePtr &) const +{ + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 
null_value = std::get(hierarchical_attribute.null_values); + const ContainerType & parent_keys = std::get>(hierarchical_attribute.arrays); + + auto is_key_valid_func = [&, this](auto & key) + { + return key < loaded_ids.size() && loaded_ids[key]; + }; + + auto get_parent_key_func = [&, this](auto & hierarchy_key) + { + std::optional result; + + if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key]) + return result; + + result = parent_keys[hierarchy_key]; + + return result; + }; + + auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_key_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return dictionary_hierarchy_array; +} + +ColumnUInt8::Ptr FlatDictionary::isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr &) const +{ + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + PaddedPODArray keys_in_backup_storage; + const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = std::get(hierarchical_attribute.null_values); + const ContainerType & parent_keys = std::get>(hierarchical_attribute.arrays); + + auto is_key_valid_func = [&, this](auto & key) + { + return key < loaded_ids.size() && loaded_ids[key]; + }; + + auto get_parent_key_func = [&, this](auto & hierarchy_key) + { + std::optional result; + + if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key]) + return result; + + result = parent_keys[hierarchy_key]; + + return result; + }; + + auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_key_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; +} 
+ +ColumnPtr FlatDictionary::getDescendants( + ColumnPtr key_column, + const DataTypePtr &, + size_t level) const +{ + PaddedPODArray keys_backup; + const auto & keys = getColumnVectorData(this, key_column, keys_backup); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + const ContainerType & parent_keys = std::get>(hierarchical_attribute.arrays); + + HashMap> parent_to_child; + + for (size_t i = 0; i < parent_keys.size(); ++i) + { + auto parent_key = parent_keys[i]; + + if (loaded_ids[i]) + parent_to_child[parent_key].emplace_back(static_cast(i)); + } + + auto result = getKeysDescendantsArray(keys, parent_to_child, level); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; +} + void FlatDictionary::createAttributes() { const auto size = dict_struct.attributes.size(); attributes.reserve(size); for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) - { - hierarchical_attribute = &attributes.back(); - - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } - } } void FlatDictionary::blockToAttributes(const Block & block) @@ -271,7 +304,7 @@ void FlatDictionary::updateData() const auto & saved_id_column = *saved_block->safeGetByPosition(0).column; const auto & update_id_column = *block.safeGetByPosition(0).column; - std::unordered_map> update_ids; + std::unordered_map> update_ids; for (size_t row = 0; row < update_id_column.size(); ++row) { const auto id = update_id_column.get64(row); @@ -280,7 +313,7 @@ void FlatDictionary::updateData() const size_t saved_rows = saved_id_column.size(); IColumn::Filter filter(saved_rows); - 
std::unordered_map>::iterator it; + std::unordered_map>::iterator it; for (size_t row = 0; row < saved_id_column.size(); ++row) { @@ -385,7 +418,6 @@ void FlatDictionary::createAttributeImpl(Attribute & attribute, const Fi attribute.arrays.emplace>(initial_array_size, StringRef(string_in_arena, string.size())); } - FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value) { auto nullable_set = attribute.is_nullable ? std::make_optional() : std::optional{}; @@ -408,7 +440,7 @@ FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttrib template void FlatDictionary::getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const { @@ -425,7 +457,7 @@ void FlatDictionary::getItemsImpl( } template -void FlatDictionary::resize(Attribute & attribute, const Key id) +void FlatDictionary::resize(Attribute & attribute, const UInt64 id) { if (id >= max_array_size) throw Exception{full_name + ": identifier should be less than " + toString(max_array_size), ErrorCodes::ARGUMENT_OUT_OF_BOUND}; @@ -440,7 +472,7 @@ void FlatDictionary::resize(Attribute & attribute, const Key id) } template -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T & value) +void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value) { auto & array = std::get>(attribute.arrays); array[id] = value; @@ -448,13 +480,13 @@ void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, } template <> -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const String & value) +void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const String & value) { const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); 
setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()}); } -void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) +void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value) { auto type_call = [&](const auto &dictionary_attribute_type) { @@ -484,21 +516,11 @@ void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, cons callOnDictionaryAttributeType(attribute.type, type_call); } - -const FlatDictionary::Attribute & FlatDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -PaddedPODArray FlatDictionary::getIds() const +PaddedPODArray FlatDictionary::getIds() const { const auto ids_count = ext::size(loaded_ids); - PaddedPODArray ids; + PaddedPODArray ids; ids.reserve(ids_count); for (auto idx : ext::range(0, ids_count)) @@ -509,8 +531,7 @@ PaddedPODArray FlatDictionary::getIds() const BlockInputStreamPtr FlatDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); + return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); } void registerDictionaryFlat(DictionaryFactory & factory) diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index f491eb28641..09721bf1a99 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -59,18 +59,9 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + 
return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } ColumnPtr getColumn( @@ -82,13 +73,27 @@ public: ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; + bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + + ColumnPtr getDescendants( + ColumnPtr key_column, + const DataTypePtr & key_type, + size_t level) const override; + BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: template using ContainerType = PaddedPODArray; - using NullableSet = HashSet>; + using NullableSet = HashSet>; struct Attribute final { @@ -151,24 +156,24 @@ private: template void getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; template - void resize(Attribute & attribute, const Key id); + void resize(Attribute & attribute, const UInt64 id); template - void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value); + void 
setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value); - void setAttributeValue(Attribute & attribute, const Key id, const Field & value); + void setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value); const Attribute & getAttribute(const std::string & attribute_name) const; template void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - PaddedPODArray getIds() const; + PaddedPODArray getIds() const; const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; @@ -177,7 +182,6 @@ private: std::map attribute_index_by_name; std::vector attributes; - const Attribute * hierarchical_attribute = nullptr; std::vector loaded_ids; size_t bytes_allocated = 0; @@ -185,6 +189,7 @@ private: size_t bucket_count = 0; mutable std::atomic query_count{0}; + /// TODO: Remove BlockPtr saved_block; }; diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index 708be7945f1..d45e4ade1cf 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -1,13 +1,18 @@ #include "HashedDictionary.h" + #include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include "ClickHouseDictionarySource.h" + +#include + #include -#include +#include #include #include -#include +#include + +#include +#include +#include namespace { @@ -15,136 +20,74 @@ namespace /// NOTE: Trailing return type is explicitly specified for SFINAE. 
/// google::sparse_hash_map -template auto first(const T & value) -> decltype(value.first) { return value.first; } // NOLINT -template auto second(const T & value) -> decltype(value.second) { return value.second; } // NOLINT +template auto getKeyFromCell(const T & value) -> decltype(value->first) { return value->first; } // NOLINT +template auto getValueFromCell(const T & value) -> decltype(value->second) { return value->second; } // NOLINT /// HashMap -template auto first(const T & value) -> decltype(value.getKey()) { return value.getKey(); } // NOLINT -template auto second(const T & value) -> decltype(value.getMapped()) { return value.getMapped(); } // NOLINT +template auto getKeyFromCell(const T & value) -> decltype(value->getKey()) { return value->getKey(); } // NOLINT +template auto getValueFromCell(const T & value) -> decltype(value->getMapped()) { return value->getMapped(); } // NOLINT } namespace DB { + namespace ErrorCodes { - extern const int TYPE_MISMATCH; extern const int BAD_ARGUMENTS; extern const int DICTIONARY_IS_EMPTY; extern const int UNSUPPORTED_METHOD; } - -HashedDictionary::HashedDictionary( +template +HashedDictionary::HashedDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, - bool sparse_, BlockPtr saved_block_) : IDictionary(dict_id_) , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} + , source_ptr(std::move(source_ptr_)) , dict_lifetime(dict_lifetime_) , require_nonempty(require_nonempty_) - , sparse(sparse_) - , saved_block{std::move(saved_block_)} + , saved_block(std::move(saved_block_)) { createAttributes(); loadData(); calculateBytesAllocated(); } - -void HashedDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - DictionaryDefaultValueExtractor extractor(null_value); - - getItemsImpl( - 
*hierarchical_attribute, - ids, - [&](const size_t row, const UInt64 value) { out[row] = value; }, - extractor); -} - - -/// Allow to use single value in same way as array. -static inline HashedDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline HashedDictionary::Key getAt(const HashedDictionary::Key & value, const size_t) -{ - return value; -} - -template -void HashedDictionary::isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - const auto rows = out.size(); - - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - auto it = attr.find(id); - if (it != std::end(attr)) - id = second(*it); - else - break; - } - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} -template -void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - if (!sparse) - return isInAttrImpl(*std::get>(hierarchical_attribute->maps), child_ids, ancestor_ids, out); - return isInAttrImpl(*std::get>(hierarchical_attribute->sparse_maps), child_ids, ancestor_ids, out); -} - -void HashedDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -void HashedDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void HashedDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const 
-{ - isInImpl(child_id, ancestor_ids, out); -} - -ColumnPtr HashedDictionary::getColumn( +template +ColumnPtr HashedDictionary::getColumn( const std::string & attribute_name, const DataTypePtr & result_type, const Columns & key_columns, - const DataTypes &, + const DataTypes & key_types [[maybe_unused]], const ColumnPtr & default_values_column) const { + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + ColumnPtr result; - PaddedPODArray backup_storage; - const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); - auto size = ids.size(); + const size_t size = extractor.getKeysSize(); - const auto & attribute = getAttribute(attribute_name); const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + auto & attribute = attributes[attribute_index]; + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + if (attribute.is_nullable_set) + { + col_null_map_to = ColumnUInt8::create(size, false); + vec_null_map_to = &col_null_map_to->getData(); + } auto type_call = [&](const auto & dictionary_attribute_type) { @@ -159,24 +102,34 @@ ColumnPtr HashedDictionary::getColumn( auto column = ColumnProvider::getColumn(dictionary_attribute, size); - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { auto * out = column.get(); - getItemsImpl( + getItemsImpl( attribute, - ids, + extractor, [&](const size_t, const StringRef value) { out->insertData(value.data, value.size); }, + [&](const size_t row) + { + out->insertDefault(); + (*vec_null_map_to)[row] = true; + }, default_value_extractor); } else { auto & out = column->getData(); - getItemsImpl( + getItemsImpl( attribute, - ids, + extractor, [&](const 
size_t row, const auto value) { return out[row] = value; }, + [&](const size_t row) + { + out[row] = 0; + (*vec_null_map_to)[row] = true; + }, default_value_extractor); } @@ -185,87 +138,214 @@ ColumnPtr HashedDictionary::getColumn( callOnDictionaryAttributeType(attribute.type, type_call); - if (attribute.nullable_set) - { - ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(size, false); - ColumnUInt8::Container& vec_null_map_to = col_null_map_to->getData(); - - for (size_t row = 0; row < ids.size(); ++row) - { - auto id = ids[row]; - - if (attribute.nullable_set->find(id) != nullptr) - vec_null_map_to[row] = true; - } - + if (attribute.is_nullable_set) result = ColumnNullable::create(result, std::move(col_null_map_to)); + + return result; +} + +template +ColumnUInt8::Ptr HashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const +{ + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + + size_t keys_size = extractor.getKeysSize(); + + auto result = ColumnUInt8::create(keys_size, false); + auto & out = result->getData(); + + if (attributes.empty()) + { + query_count.fetch_add(keys_size, std::memory_order_relaxed); + return result; } - return result; -} - -ColumnUInt8::Ptr HashedDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const -{ - PaddedPODArray backup_storage; - const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); - - size_t ids_count = ext::size(ids); - - auto result = ColumnUInt8::create(ext::size(ids)); - auto& out = result->getData(); - const auto & attribute = attributes.front(); + bool is_attribute_nullable = attribute.is_nullable_set.has_value(); - auto type_call = [&](const auto & dictionary_attribute_type) + getAttributeContainer(0, [&](const auto & container) { - using Type = 
std::decay_t; - using AttributeType = typename Type::AttributeType; - has(attribute, ids, out); - }; + for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index) + { + auto requested_key = extractor.extractCurrentKey(); - callOnDictionaryAttributeType(attribute.type, type_call); + out[requested_key_index] = container.find(requested_key) != container.end(); - query_count.fetch_add(ids_count, std::memory_order_relaxed); + if (is_attribute_nullable && !out[requested_key_index]) + out[requested_key_index] = attribute.is_nullable_set->find(requested_key) != nullptr; + + extractor.rollbackCurrentKey(); + } + }); + + query_count.fetch_add(keys_size, std::memory_order_relaxed); return result; } -void HashedDictionary::createAttributes() +template +ColumnPtr HashedDictionary::getHierarchy(ColumnPtr key_column [[maybe_unused]], const DataTypePtr &) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + + const auto & dictionary_attribute = dict_struct.attributes[hierarchical_attribute_index]; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = dictionary_attribute.null_value.get(); + const CollectionType & parent_keys_map = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&](auto & key) { return parent_keys_map.find(key) != parent_keys_map.end(); }; + + auto get_parent_func = [&](auto & hierarchy_key) + { + std::optional result; + + auto it = parent_keys_map.find(hierarchy_key); + + if (it != parent_keys_map.end()) + result = getValueFromCell(it); + + return result; + }; + + auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_func); + + query_count.fetch_add(keys.size(), 
std::memory_order_relaxed); + + return dictionary_hierarchy_array; + } + else + return nullptr; +} + +template +ColumnUInt8::Ptr HashedDictionary::isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr &) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + PaddedPODArray keys_in_backup_storage; + const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + + const auto & dictionary_attribute = dict_struct.attributes[hierarchical_attribute_index]; + auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = dictionary_attribute.null_value.get(); + const CollectionType & parent_keys_map = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&](auto & key) { return parent_keys_map.find(key) != parent_keys_map.end(); }; + + auto get_parent_func = [&](auto & hierarchy_key) + { + std::optional result; + + auto it = parent_keys_map.find(hierarchy_key); + + if (it != parent_keys_map.end()) + result = getValueFromCell(it); + + return result; + }; + + auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; + } + else + return nullptr; +} + +template +ColumnPtr HashedDictionary::getDescendants( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr &, + size_t level [[maybe_unused]]) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup; + const auto & keys = getColumnVectorData(this, key_column, keys_backup); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; 
+ + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + const CollectionType & parent_keys = std::get>(hierarchical_attribute.container); + + HashMap> parent_to_child; + + for (const auto & [key, value] : parent_keys) + parent_to_child[value].emplace_back(key); + + auto result = getKeysDescendantsArray(keys, parent_to_child, level); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; + } + else + return nullptr; +} + +template +void HashedDictionary::createAttributes() { const auto size = dict_struct.attributes.size(); attributes.reserve(size); - for (const auto & attribute : dict_struct.attributes) + for (const auto & dictionary_attribute : dict_struct.attributes) { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) + auto type_call = [&, this](const auto & dictionary_attribute_type) { - hierarchical_attribute = &attributes.back(); + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } + auto is_nullable_set = dictionary_attribute.is_nullable ? std::make_optional() : std::optional{}; + std::unique_ptr string_arena = std::is_same_v ? 
std::make_unique() : nullptr; + + ValueType default_value; + + if constexpr (std::is_same_v) + { + string_arena = std::make_unique(); + + const auto & string_null_value = dictionary_attribute.null_value.get(); + const size_t string_null_value_size = string_null_value.size(); + + const char * string_in_arena = string_arena->insert(string_null_value.data(), string_null_value_size); + default_value = {string_in_arena, string_null_value_size}; + } + else + default_value = dictionary_attribute.null_value.get>(); + + Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), default_value, CollectionType(), std::move(string_arena)}; + attributes.emplace_back(std::move(attribute)); + }; + + callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call); } } -void HashedDictionary::blockToAttributes(const Block & block) -{ - const auto & id_column = *block.safeGetByPosition(0).column; - - for (const size_t attribute_idx : ext::range(0, attributes.size())) - { - const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column; - auto & attribute = attributes[attribute_idx]; - - for (const auto row_idx : ext::range(0, id_column.size())) - if (setAttributeValue(attribute, id_column[row_idx].get(), attribute_column[row_idx])) - ++element_count; - } -} - -void HashedDictionary::updateData() +template +void HashedDictionary::updateData() { if (!saved_block || saved_block->rows() == 0) { @@ -277,6 +357,7 @@ void HashedDictionary::updateData() /// We are using this to keep saved data if input stream consists of multiple blocks if (!saved_block) saved_block = std::make_shared(block.cloneEmpty()); + for (const auto attribute_idx : ext::range(0, attributes.size() + 1)) { const IColumn & update_column = *block.getByPosition(attribute_idx).column.get(); @@ -288,34 +369,50 @@ void HashedDictionary::updateData() } else { + size_t skip_keys_size_offset = dict_struct.getKeysSize(); + + Columns saved_block_key_columns; + 
saved_block_key_columns.reserve(skip_keys_size_offset); + + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) + saved_block_key_columns.emplace_back(saved_block->safeGetByPosition(i).column); + + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor saved_keys_extractor(saved_block_key_columns, arena_holder.getComplexKeyArena()); + auto saved_keys_extracted_from_block = saved_keys_extractor.extractAllKeys(); + auto stream = source_ptr->loadUpdatedAll(); stream->readPrefix(); while (Block block = stream->read()) { - const auto & saved_id_column = *saved_block->safeGetByPosition(0).column; - const auto & update_id_column = *block.safeGetByPosition(0).column; + /// TODO: Rewrite + Columns block_key_columns; + block_key_columns.reserve(skip_keys_size_offset); - std::unordered_map> update_ids; - for (size_t row = 0; row < update_id_column.size(); ++row) + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) + block_key_columns.emplace_back(block.safeGetByPosition(i).column); + + DictionaryKeysExtractor block_keys_extractor(saved_block_key_columns, arena_holder.getComplexKeyArena()); + auto keys_extracted_from_block = block_keys_extractor.extractAllKeys(); + + absl::flat_hash_map, DefaultHash> update_keys; + for (size_t row = 0; row < keys_extracted_from_block.size(); ++row) { - const auto id = update_id_column.get64(row); - update_ids[id].push_back(row); + auto key = keys_extracted_from_block[row]; + update_keys[key].push_back(row); } - const size_t saved_rows = saved_id_column.size(); - IColumn::Filter filter(saved_rows); - std::unordered_map>::iterator it; + IColumn::Filter filter(saved_keys_extracted_from_block.size()); - for (size_t row = 0; row < saved_id_column.size(); ++row) + for (size_t row = 0; row < saved_keys_extracted_from_block.size(); ++row) { - auto id = saved_id_column.get64(row); - it = update_ids.find(id); - - if (it != 
update_ids.end()) - filter[row] = 0; - else - filter[row] = 1; + auto key = saved_keys_extracted_from_block[row]; + auto it = update_keys.find(key); + filter[row] = (it == update_keys.end()); } auto block_columns = block.mutateColumns(); @@ -323,12 +420,12 @@ void HashedDictionary::updateData() { auto & column = saved_block->safeGetByPosition(attribute_idx).column; const auto & filtered_column = column->filter(filter, -1); - block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size()); } saved_block->setColumns(std::move(block_columns)); } + stream->readSuffix(); } @@ -339,48 +436,154 @@ void HashedDictionary::updateData() } } -template -void HashedDictionary::resize(Attribute & attribute, size_t added_rows) +template +void HashedDictionary::blockToAttributes(const Block & block [[maybe_unused]]) { - if (!sparse) + size_t skip_keys_size_offset = dict_struct.getKeysSize(); + + Columns key_columns; + key_columns.reserve(skip_keys_size_offset); + + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) + key_columns.emplace_back(block.safeGetByPosition(i).column); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor keys_extractor(key_columns, arena_holder.getComplexKeyArena()); + const size_t keys_size = keys_extractor.getKeysSize(); + + Field column_value_to_insert; + + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { - const auto & map_ref = std::get>(attribute.maps); - added_rows += map_ref->size(); - map_ref->reserve(added_rows); - } - else - { - const auto & map_ref = std::get>(attribute.sparse_maps); - added_rows += map_ref->size(); - map_ref->resize(added_rows); + const IColumn & attribute_column = *block.safeGetByPosition(skip_keys_size_offset + attribute_index).column; + auto & attribute = attributes[attribute_index]; + bool attribute_is_nullable = attribute.is_nullable_set.has_value(); + + 
getAttributeContainer(attribute_index, [&](auto & container) + { + using ContainerType = std::decay_t; + using AttributeValueType = typename ContainerType::mapped_type; + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys_extractor.extractCurrentKey(); + + auto it = container.find(key); + bool key_is_nullable_and_already_exists = attribute_is_nullable && attribute.is_nullable_set->find(key) != nullptr; + + if (key_is_nullable_and_already_exists || it != container.end()) + { + keys_extractor.rollbackCurrentKey(); + continue; + } + + if constexpr (std::is_same_v) + key = copyKeyInArena(key); + + attribute_column.get(key_index, column_value_to_insert); + + if (attribute.is_nullable_set && column_value_to_insert.isNull()) + { + attribute.is_nullable_set->insert(key); + keys_extractor.rollbackCurrentKey(); + continue; + } + + if constexpr (std::is_same_v) + { + String & value_to_insert = column_value_to_insert.get(); + size_t value_to_insert_size = value_to_insert.size(); + + const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size); + + StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size}; + container.insert({key, string_in_arena_reference}); + } + else + { + auto value_to_insert = column_value_to_insert.get>(); + container.insert({key, value_to_insert}); + } + + ++element_count; + + keys_extractor.rollbackCurrentKey(); + } + + keys_extractor.reset(); + }); } } -template <> -void HashedDictionary::resize(Attribute & attribute, size_t added_rows) +template +void HashedDictionary::resize(size_t added_rows) { - resize(attribute, added_rows); -} - -void HashedDictionary::resize(size_t added_rows) -{ - if (!added_rows) + if (unlikely(!added_rows)) return; - for (auto & attribute : attributes) + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { - auto type_call = [&](const auto & dictionary_attribute_type) + 
getAttributeContainer(attribute_index, [added_rows](auto & attribute_map) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - resize(attribute, added_rows); - }; + size_t reserve_size = added_rows + attribute_map.size(); - callOnDictionaryAttributeType(attribute.type, type_call); + if constexpr (sparse) + attribute_map.resize(reserve_size); + else + attribute_map.reserve(reserve_size); + }); } } -void HashedDictionary::loadData() +template +template +void HashedDictionary::getItemsImpl( + const Attribute & attribute, + DictionaryKeysExtractor & keys_extractor, + ValueSetter && set_value [[maybe_unused]], + NullableValueSetter && set_nullable_value [[maybe_unused]], + DefaultValueExtractor & default_value_extractor) const +{ + const auto & attribute_container = std::get>(attribute.container); + const size_t keys_size = keys_extractor.getKeysSize(); + + bool is_attribute_nullable = attribute.is_nullable_set.has_value(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys_extractor.extractCurrentKey(); + + const auto it = attribute_container.find(key); + + if (it != attribute_container.end()) + set_value(key_index, getValueFromCell(it)); + else + { + if (is_attribute_nullable && attribute.is_nullable_set->find(key) != nullptr) + set_nullable_value(key_index); + else + set_value(key_index, default_value_extractor[key_index]); + } + + keys_extractor.rollbackCurrentKey(); + } + + query_count.fetch_add(keys_size, std::memory_order_relaxed); +} + +template +StringRef HashedDictionary::copyKeyInArena(StringRef key) +{ + size_t key_size = key.size; + char * place_for_key = complex_key_arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); + StringRef updated_key{place_for_key, key_size}; + return updated_key; +} + +template +void HashedDictionary::loadData() { if (!source_ptr->hasUpdateField()) { @@ -400,263 +603,116 @@ void HashedDictionary::loadData() 
updateData(); if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; + throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, + "({}): dictionary source is empty and 'require_nonempty' property is set.", + full_name); } -template -void HashedDictionary::addAttributeSize(const Attribute & attribute) -{ - if (!sparse) - { - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(CollectionType) + map_ref->getBufferSizeInBytes(); - bucket_count = map_ref->getBufferSizeInCells(); - } - else - { - const auto & map_ref = std::get>(attribute.sparse_maps); - bucket_count = map_ref->bucket_count(); - - /** TODO: more accurate calculation */ - bytes_allocated += sizeof(SparseCollectionType); - bytes_allocated += bucket_count; - bytes_allocated += map_ref->size() * (sizeof(Key) + sizeof(T)); - } -} - -template <> -void HashedDictionary::addAttributeSize(const Attribute & attribute) -{ - addAttributeSize(attribute); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); -} - -void HashedDictionary::calculateBytesAllocated() +template +void HashedDictionary::calculateBytesAllocated() { bytes_allocated += attributes.size() * sizeof(attributes.front()); - for (const auto & attribute : attributes) + for (size_t i = 0; i < attributes.size(); ++i) { - auto type_call = [&](const auto & dictionary_attribute_type) + getAttributeContainer(i, [&](const auto & container) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - addAttributeSize(attribute); - }; + using ContainerType = std::decay_t; + using AttributeValueType = typename ContainerType::mapped_type; - callOnDictionaryAttributeType(attribute.type, type_call); - } -} + bytes_allocated += sizeof(container); -template -void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.null_values = 
T(null_value.get()); - if (!sparse) - attribute.maps = std::make_unique>(); - else - attribute.sparse_maps = std::make_unique>(); -} - -template <> -void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); - - if (!sparse) - attribute.maps = std::make_unique>(); - else - attribute.sparse_maps = std::make_unique>(); -} - -HashedDictionary::Attribute HashedDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value) -{ - auto nullable_set = attribute.is_nullable ? std::make_optional() : std::optional{}; - Attribute attr{attribute.underlying_type, std::move(nullable_set), {}, {}, {}, {}}; - - auto type_call = [&, this](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; -} - - -template -void HashedDictionary::getItemsAttrImpl( - const MapType & attr, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto rows = ext::size(ids); - - for (const auto i : ext::range(0, rows)) - { - const auto it = attr.find(ids[i]); - set_value(i, it != attr.end() ? 
static_cast(second(*it)) : default_value_extractor[i]); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -template -void HashedDictionary::getItemsImpl( - const Attribute & attribute, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - if (!sparse) - return getItemsAttrImpl(*std::get>(attribute.maps), ids, set_value, default_value_extractor); - return getItemsAttrImpl(*std::get>(attribute.sparse_maps), ids, set_value, default_value_extractor); -} - - -template -bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T value) -{ - if (!sparse) - { - auto & map = *std::get>(attribute.maps); - return map.insert({id, value}).second; - } - else - { - auto & map = *std::get>(attribute.sparse_maps); - return map.insert({id, value}).second; - } -} - -template <> -bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const String value) -{ - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - return setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()}); -} - -bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) -{ - bool result = false; - - auto type_call = [&, this](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if (attribute.nullable_set) - { - if (value.isNull()) + if constexpr (sparse || std::is_same_v) { - result = attribute.nullable_set->insert(id).second; - return; + bytes_allocated += container.max_size() * (sizeof(KeyType) + sizeof(AttributeValueType)); + bucket_count = container.bucket_count(); } else { - attribute.nullable_set->erase(id); + bytes_allocated += container.getBufferSizeInBytes(); + bucket_count = container.getBufferSizeInCells(); } - } + }); - result = setAttributeValueImpl(attribute, id, value.get()); - }; - - 
callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -const HashedDictionary::Attribute & HashedDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -template -void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto & attr = *std::get>(attribute.maps); - const auto rows = ext::size(ids); - - for (const auto i : ext::range(0, rows)) - { - out[i] = attr.find(ids[i]) != nullptr; - - if (attribute.nullable_set && !out[i]) - out[i] = attribute.nullable_set->find(ids[i]) != nullptr; + if (attributes[i].string_arena) + bytes_allocated += attributes[i].string_arena->size(); } + + bytes_allocated += complex_key_arena.size(); } -template <> -void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const +template +BlockInputStreamPtr HashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - has(attribute, ids, out); + PaddedPODArray keys; + + if (!attributes.empty()) + { + const auto & attribute = attributes.front(); + + getAttributeContainer(0, [&](auto & container) + { + keys.reserve(container.size()); + + for (const auto & [key, value] : container) + { + (void)(value); + keys.emplace_back(key); + } + + if (attribute.is_nullable_set) + { + const auto & is_nullable_set = *attribute.is_nullable_set; + keys.reserve(is_nullable_set.size()); + + for (auto & node : is_nullable_set) + keys.emplace_back(node.getKey()); + } + }); + } + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return std::make_shared(shared_from_this(), max_block_size, std::move(keys), column_names); + else + return 
std::make_shared(shared_from_this(), max_block_size, keys, column_names); } -template -PaddedPODArray HashedDictionary::getIdsAttrImpl(const AttrType & attr) const +template +template +void HashedDictionary::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) { - PaddedPODArray ids; - ids.reserve(attr.size()); - for (const auto & value : attr) - ids.push_back(first(value)); + assert(attribute_index < attributes.size()); - return ids; -} -template -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const -{ - if (!sparse) - return getIdsAttrImpl(*std::get>(attribute.maps)); - return getIdsAttrImpl(*std::get>(attribute.sparse_maps)); -} - -template <> -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const -{ - return getIds(attribute); -} - -PaddedPODArray HashedDictionary::getIds() const -{ - const auto & attribute = attributes.front(); - PaddedPODArray result; + auto & attribute = attributes[attribute_index]; auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; - /// TODO: Check if order is satisfied - result = getIds(attribute); + using ValueType = DictionaryValueType; - if (attribute.nullable_set) - { - for (const auto& value: *attribute.nullable_set) - result.push_back(value.getKey()); - } + auto & attribute_container = std::get>(attribute.container); + std::forward(get_container_func)(attribute_container); }; callOnDictionaryAttributeType(attribute.type, type_call); - - return result; } -BlockInputStreamPtr HashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const +template +template +void HashedDictionary::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const { - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); + const_cast 
*>(this)->getAttributeContainer(attribute_index, [&](auto & attribute_container) + { + std::forward(get_container_func)(attribute_container); + }); } +template class HashedDictionary; +template class HashedDictionary; +template class HashedDictionary; +template class HashedDictionary; + void registerDictionaryHashed(DictionaryFactory & factory) { auto create_layout = [](const std::string & full_name, @@ -664,10 +720,13 @@ void registerDictionaryHashed(DictionaryFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, DictionarySourcePtr source_ptr, + DictionaryKeyType dictionary_key_type, bool sparse) -> DictionaryPtr { - if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'hashed'", ErrorCodes::UNSUPPORTED_METHOD}; + if (dictionary_key_type == DictionaryKeyType::simple && dict_struct.key) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for simple key hashed dictionary"); + else if (dictionary_key_type == DictionaryKeyType::complex && dict_struct.id) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for complex key hashed dictionary"); if (dict_struct.range_min || dict_struct.range_max) throw Exception{full_name @@ -678,13 +737,34 @@ void registerDictionaryHashed(DictionaryFactory & factory) const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty, sparse); + + if (dictionary_key_type == DictionaryKeyType::simple) + { + if (sparse) + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + else + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + } 
+ else + { + if (sparse) + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + else + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + } }; + using namespace std::placeholders; + factory.registerLayout("hashed", - [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), /* sparse = */ false); }, false); + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::simple, /* sparse = */ false); }, false); factory.registerLayout("sparse_hashed", - [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), /* sparse = */ true); }, false); + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::simple, /* sparse = */ true); }, false); + factory.registerLayout("complex_key_hashed", + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::complex, /* sparse = */ false); }, true); + factory.registerLayout("complex_key_sparse_hashed", + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::complex, /* sparse = */ true); }, true); + } } diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index ab37f1528ca..3882b669324 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -4,17 +4,21 @@ #include #include #include -#include -#include -#include -#include -#include + #include #include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" + +#include +#include +#include + +#include 
+#include + +#include +#include +#include +#include /** This dictionary stores all content in a hash table in memory * (a separate Key -> Value map for each attribute) @@ -24,19 +28,32 @@ namespace DB { +template class HashedDictionary final : public IDictionary { public: + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by hashed dictionary"); + HashedDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, - bool sparse_, BlockPtr saved_block_ = nullptr); - std::string getTypeName() const override { return sparse ? "SparseHashed" : "Hashed"; } + std::string getTypeName() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple && sparse) + return "SparseHashed"; + else if constexpr (dictionary_key_type == DictionaryKeyType::simple && !sparse) + return "Hashed"; + else if constexpr (dictionary_key_type == DictionaryKeyType::complex && sparse) + return "ComplexKeySpareseHashed"; + else + return "ComplexKeyHashed"; + } size_t getBytesAllocated() const override { return bytes_allocated; } @@ -50,7 +67,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block); + return std::make_shared>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -61,14 +78,10 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void 
toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } + DictionaryKeyType getKeyType() const override { return dictionary_key_type; } ColumnPtr getColumn( const std::string& attribute_name, @@ -79,36 +92,52 @@ public: ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; + bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & hierarchy_attribute_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + + ColumnPtr getDescendants( + ColumnPtr key_column, + const DataTypePtr & key_type, + size_t level) const override; BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: template - using CollectionType = HashMap; - template - using CollectionPtrType = std::unique_ptr>; + using CollectionTypeNonSparse = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + HashMap, + HashMapWithSavedHash>>; #if !defined(ARCADIA_BUILD) - template - using SparseCollectionType = google::sparse_hash_map>; + template + using SparseHashMap = google::sparse_hash_map>; #else - template - using SparseCollectionType = google::sparsehash::sparse_hash_map>; + template + using SparseHashMap = google::sparsehash::sparse_hash_map>; 
#endif template - using SparseCollectionPtrType = std::unique_ptr>; + using CollectionTypeSparse = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + SparseHashMap, + SparseHashMap>; - using NullableSet = HashSet>; + template + using CollectionType = std::conditional_t, CollectionTypeNonSparse>; + + using NullableSet = HashSet>; struct Attribute final { AttributeUnderlyingType type; - std::optional nullable_set; + std::optional is_nullable_set; std::variant< UInt8, @@ -127,41 +156,27 @@ private: Float64, StringRef> null_values; + std::variant< - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType> - maps; - std::variant< - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType> - sparse_maps; + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType> + container; + std::unique_ptr string_arena; + }; void createAttributes(); @@ -172,76 +187,47 @@ private: void loadData(); - template - void addAttributeSize(const Attribute & attribute); - void calculateBytesAllocated(); - template - void createAttributeImpl(Attribute & attribute, const Field & null_value); - - Attribute createAttribute(const DictionaryAttribute& attribute, const Field & 
null_value); - - template - void getItemsAttrImpl( - const MapType & attr, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template + template void getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + DictionaryKeysExtractor & keys_extractor, ValueSetter && set_value, + NullableValueSetter && set_nullable_value, DefaultValueExtractor & default_value_extractor) const; - template - bool setAttributeValueImpl(Attribute & attribute, const Key id, const T value); + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func); - bool setAttributeValue(Attribute & attribute, const Key id, const Field & value); + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const; - const Attribute & getAttribute(const std::string & attribute_name) const; - - template - void has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const; - - template - PaddedPODArray getIdsAttrImpl(const AttrType & attr) const; - template - PaddedPODArray getIds(const Attribute & attribute) const; - - PaddedPODArray getIds() const; - - /// Preallocates the hashtable based on query progress - /// (Only while loading all data). 
- /// - /// @see preallocate - template - void resize(Attribute & attribute, size_t added_rows); void resize(size_t added_rows); - template - void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - template - void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; + StringRef copyKeyInArena(StringRef key); const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; const bool require_nonempty; - const bool sparse; - std::map attribute_index_by_name; std::vector attributes; - const Attribute * hierarchical_attribute = nullptr; size_t bytes_allocated = 0; size_t element_count = 0; size_t bucket_count = 0; mutable std::atomic query_count{0}; + /// TODO: Remove BlockPtr saved_block; + Arena complex_key_arena; }; +extern template class HashedDictionary; +extern template class HashedDictionary; + +extern template class HashedDictionary; +extern template class HashedDictionary; + } diff --git a/src/Dictionaries/HierarchyDictionariesUtils.cpp b/src/Dictionaries/HierarchyDictionariesUtils.cpp new file mode 100644 index 00000000000..fffe0d30e0e --- /dev/null +++ b/src/Dictionaries/HierarchyDictionariesUtils.cpp @@ -0,0 +1,156 @@ +#include "HierarchyDictionariesUtils.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNSUPPORTED_METHOD; +} + +namespace +{ + /** In case of cache or direct dictionary we does not have structure with child to parent representation. + * This function build such structure calling getColumn for initial keys to request and for next keys in hierarchy, + * until all keys are requested or result key is null value. + * To distinguish null value key and key that is not present in dictionary, we use special default value column + * with max UInt64 value, if result column key has such value we assume that current key is not presented in dictionary storage. 
+ */ + HashMap getChildToParentHierarchyMapImpl( + const IDictionary * dictionary, + const DictionaryAttribute & hierarchical_attribute, + const PaddedPODArray & initial_keys_to_request, + const DataTypePtr & key_type) + { + UInt64 null_value = hierarchical_attribute.null_value.get(); + + ColumnPtr key_to_request_column = ColumnVector::create(); + auto * key_to_request_column_typed = static_cast *>(key_to_request_column->assumeMutable().get()); + + UInt64 key_not_in_storage_value = std::numeric_limits::max(); + ColumnPtr key_not_in_storage_default_value_column = ColumnVector::create(initial_keys_to_request.size(), key_not_in_storage_value); + + PaddedPODArray & keys_to_request = key_to_request_column_typed->getData(); + keys_to_request.assign(initial_keys_to_request); + + PaddedPODArray next_keys_to_request; + HashSet already_requested_keys; + + HashMap child_to_parent_key; + + while (!keys_to_request.empty()) + { + child_to_parent_key.reserve(child_to_parent_key.size() + keys_to_request.size()); + + auto parent_key_column = dictionary->getColumn( + hierarchical_attribute.name, + hierarchical_attribute.type, + {key_to_request_column}, + {key_type}, + key_not_in_storage_default_value_column); + + const auto * parent_key_column_typed = checkAndGetColumn>(*parent_key_column); + if (!parent_key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Parent key column should be UInt64. 
Actual ({})", + hierarchical_attribute.type->getName()); + + const auto & parent_keys = parent_key_column_typed->getData(); + next_keys_to_request.clear(); + + for (size_t i = 0; i < keys_to_request.size(); ++i) + { + auto key = keys_to_request[i]; + auto parent_key = parent_keys[i]; + + if (parent_key == key_not_in_storage_value) + continue; + + child_to_parent_key[key] = parent_key; + + if (parent_key == null_value || + already_requested_keys.find(parent_key) != nullptr) + continue; + + already_requested_keys.insert(parent_key); + next_keys_to_request.emplace_back(parent_key); + } + + keys_to_request.clear(); + keys_to_request.assign(next_keys_to_request); + } + + return child_to_parent_key; + } +} + +ColumnPtr getKeysHierarchyDefaultImplementation(const IDictionary * dictionary, ColumnPtr key_column, const DataTypePtr & key_type) +{ + key_column = key_column->convertToFullColumnIfConst(); + const auto * key_column_typed = checkAndGetColumn>(*key_column); + if (!key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto & dictionary_structure = dictionary->getStructure(); + size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index; + const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index]; + + const PaddedPODArray & requested_keys = key_column_typed->getData(); + HashMap key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type); + + auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto & key) + { + auto it = key_to_parent_key.find(key); + std::optional result = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return result; + }; + + UInt64 null_value = hierarchical_attribute.null_value.get(); + + auto dictionary_hierarchy_array = getKeysHierarchyArray(requested_keys, null_value, is_key_valid_func, get_parent_key_func); + return dictionary_hierarchy_array; +} + +ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) +{ + key_column = key_column->convertToFullColumnIfConst(); + in_key_column = in_key_column->convertToFullColumnIfConst(); + + const auto * key_column_typed = checkAndGetColumn>(*key_column); + if (!key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto * in_key_column_typed = checkAndGetColumn>(*in_key_column); + if (!in_key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto & dictionary_structure = dictionary->getStructure(); + size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index; + const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index]; + + const PaddedPODArray & requested_keys = key_column_typed->getData(); + HashMap key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type); + + auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto & key) + { + auto it = key_to_parent_key.find(key); + std::optional result = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return result; + }; + + UInt64 null_value = hierarchical_attribute.null_value.get(); + const auto & in_keys = in_key_column_typed->getData(); + + auto result = getKeysIsInHierarchyColumn(requested_keys, in_keys, null_value, is_key_valid_func, get_parent_key_func); + return result; +} + +} diff --git a/src/Dictionaries/HierarchyDictionariesUtils.h b/src/Dictionaries/HierarchyDictionariesUtils.h new file mode 100644 index 00000000000..8b2fe6ef08e --- /dev/null +++ b/src/Dictionaries/HierarchyDictionariesUtils.h @@ -0,0 +1,467 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + +namespace detail +{ + template + struct ElementsAndOffsets + { + PaddedPODArray elements; + PaddedPODArray offsets; + }; + + template + struct IsKeyValidFuncInterface + { + bool operator()(T key [[maybe_unused]]) { return false; } + }; + + template + struct GetParentKeyFuncInterface + { + std::optional operator()(T key [[maybe_unused]]) { return {}; } + }; + + /** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client. + * Hierarchy iteration is stopped if key equals null value, get_parent_key_func returns null optional, or hierarchy depth + * greater or equal than DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH. + * IsKeyValidFunc used for each input hierarchy key, if it returns false result hierarchy for that key will have size 0. + * Hierarchy result is ElementsAndOffsets structure, for each element there is hierarchy array, + * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). + * + * Example: + * id parent_id + * 1 0 + * 2 1 + * 3 1 + * 4 2 + * + * If hierarchy_null_value will be 0. Requested keys [1, 2, 3, 4, 5]. 
+ * Result: [1], [2, 1], [3, 1], [4, 2, 1], [] + * Elements: [1, 2, 1, 3, 1, 4, 2, 1] + * Offsets: [1, 3, 5, 8, 8] + */ + template + ElementsAndOffsets getHierarchy( + const PaddedPODArray & keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_key_func) + { + size_t hierarchy_keys_size = keys.size(); + + PaddedPODArray elements; + elements.reserve(hierarchy_keys_size); + + PaddedPODArray offsets; + offsets.reserve(hierarchy_keys_size); + + struct OffsetInArray + { + size_t offset_index; + size_t array_element_offset; + }; + + HashMap already_processes_keys_to_offset; + already_processes_keys_to_offset.reserve(hierarchy_keys_size); + + for (size_t i = 0; i < hierarchy_keys_size; ++i) + { + auto hierarchy_key = keys[i]; + size_t current_hierarchy_depth = 0; + + bool is_key_valid = std::forward(is_key_valid_func)(hierarchy_key); + + if (!is_key_valid) + { + offsets.emplace_back(elements.size()); + continue; + } + + while (true) + { + const auto * it = already_processes_keys_to_offset.find(hierarchy_key); + + if (it) + { + const auto & index = it->getMapped(); + + size_t offset = index.offset_index; + + bool is_loop = (offset == offsets.size()); + + if (unlikely(is_loop)) + break; + + size_t array_element_offset = index.array_element_offset; + + size_t previous_offset_size = offset > 0 ? 
offsets[offset - 1] : 0; + size_t start_index = previous_offset_size + array_element_offset; + size_t end_index = offsets[offset]; + + elements.insertFromItself(elements.begin() + start_index, elements.begin() + end_index); + break; + } + + if (hierarchy_key == hierarchy_null_value || current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) + break; + + already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth}; + elements.emplace_back(hierarchy_key); + ++current_hierarchy_depth; + + std::optional parent_key = std::forward(get_parent_key_func)(hierarchy_key); + + if (!parent_key.has_value()) + break; + + hierarchy_key = *parent_key; + } + + offsets.emplace_back(elements.size()); + } + + ElementsAndOffsets result = {std::move(elements), std::move(offsets)}; + + return result; + } + + /** Returns array with UInt8 represent if key from in_keys array is in hierarchy of key from keys column. + * If value in result array is 1 that means key from in_keys array is in hierarchy of key from + * keys array with same index, 0 therwise. + * For getting hierarchy implementation uses getKeysHierarchy function. + * + * Not: keys size must be equal to in_keys_size. + */ + template + PaddedPODArray getIsInHierarchy( + const PaddedPODArray & keys, + const PaddedPODArray & in_keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) + { + assert(keys.size() == in_keys.size()); + + PaddedPODArray result; + result.resize_fill(keys.size()); + + detail::ElementsAndOffsets hierarchy = detail::getHierarchy( + keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + std::forward(get_parent_func)); + + auto & offsets = hierarchy.offsets; + auto & elements = hierarchy.elements; + + for (size_t i = 0; i < offsets.size(); ++i) + { + size_t i_elements_start = i > 0 ? 
offsets[i - 1] : 0; + size_t i_elements_end = offsets[i]; + + auto & key_to_find = in_keys[i]; + + const auto * begin = elements.begin() + i_elements_start; + const auto * end = elements.begin() + i_elements_end; + + const auto * it = std::find(begin, end, key_to_find); + + bool contains_key = (it != end); + result[i] = contains_key; + } + + return result; + } + + struct GetAllDescendantsStrategy { size_t level = 0; }; + struct GetDescendantsAtSpecificLevelStrategy { size_t level = 0; }; + + /** Get descendants for keys iterating the hierarchy from parent to child using parent_to_child hash map provided by client. + * GetAllDescendantsStrategy get all descendants for key + * GetDescendantsAtSpecificLevelStrategy get descendants only for specific hierarchy level. + * Hierarchy result is ElementsAndOffsets structure, for each element there is descendants array, + * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). + * + * Example: + * id parent_id + * 1 0 + * 2 1 + * 3 1 + * 4 2 + * + * Example. Strategy GetAllDescendantsStrategy. + * Requested keys [0, 1, 2, 3, 4]. + * Result: [1, 2, 3, 4], [2, 2, 4], [4], [], [] + * Elements: [1, 2, 3, 4, 2, 3, 4, 4] + * Offsets: [4, 7, 8, 8, 8] + * + * Example. Strategy GetDescendantsAtSpecificLevelStrategy with level 1. + * Requested keys [0, 1, 2, 3, 4]. + * Result: [1], [2, 3], [4], [], []; + * Offsets: [1, 3, 4, 4, 4]; + */ + template + ElementsAndOffsets getDescendants( + const PaddedPODArray & keys, + const HashMap> & parent_to_child, + Strategy strategy) + { + /// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants. + /// If strategy is GetDescendantsAtSpecificLevelStrategy we does not use cache strategy. 
+ size_t keys_size = keys.size(); + + PaddedPODArray descendants; + descendants.reserve(keys_size); + + PaddedPODArray descendants_offsets; + descendants_offsets.reserve(keys_size); + + struct Range + { + size_t start_index; + size_t end_index; + }; + + static constexpr Int64 key_range_requires_update = -1; + HashMap already_processed_keys_to_range [[maybe_unused]]; + + if constexpr (std::is_same_v) + already_processed_keys_to_range.reserve(keys_size); + + struct KeyAndDepth + { + KeyType key; + Int64 depth; + }; + + HashSet already_processed_keys_during_loop; + already_processed_keys_during_loop.reserve(keys_size); + + PaddedPODArray next_keys_to_process_stack; + next_keys_to_process_stack.reserve(keys_size); + + Int64 level = static_cast(strategy.level); + + for (size_t i = 0; i < keys_size; ++i) + { + const KeyType & requested_key = keys[i]; + + if (parent_to_child.find(requested_key) == nullptr) + { + descendants_offsets.emplace_back(descendants.size()); + continue; + } + + next_keys_to_process_stack.emplace_back(KeyAndDepth{requested_key, 0}); + + /** To cache range for key without recursive function calls and custom stack we put special + * signaling value on stack key_range_requires_update. + * When we pop such value from stack that means processing descendants for key is finished + * and we can update range with end_index. 
+ */ + while (!next_keys_to_process_stack.empty()) + { + KeyAndDepth key_to_process = next_keys_to_process_stack.back(); + + KeyType key = key_to_process.key; + Int64 depth = key_to_process.depth; + next_keys_to_process_stack.pop_back(); + + if constexpr (std::is_same_v) + { + /// Update end_index for key + if (depth == key_range_requires_update) + { + auto * it = already_processed_keys_to_range.find(key); + assert(it); + + auto & range_to_update = it->getMapped(); + range_to_update.end_index = descendants.size(); + continue; + } + } + + if (unlikely(already_processed_keys_during_loop.find(key) != nullptr)) + { + next_keys_to_process_stack.clear(); + break; + } + + if constexpr (std::is_same_v) + { + const auto * already_processed_it = already_processed_keys_to_range.find(key); + + if (already_processed_it) + { + Range range = already_processed_it->getMapped(); + + if (unlikely(range.start_index > range.end_index)) + { + /// Broken range because there was loop + already_processed_keys_to_range.erase(key); + } + else + { + auto insert_start_iterator = descendants.begin() + range.start_index; + auto insert_end_iterator = descendants.begin() + range.end_index; + descendants.insertFromItself(insert_start_iterator, insert_end_iterator); + continue; + } + } + } + + const auto * it = parent_to_child.find(key); + + if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) + continue; + + if constexpr (std::is_same_v) + { + if (depth > level) + continue; + } + + if constexpr (std::is_same_v) + { + /// Put special signaling value on stack and update cache with range start + size_t range_start_index = descendants.size(); + already_processed_keys_to_range[key].start_index = range_start_index; + next_keys_to_process_stack.emplace_back(KeyAndDepth{key, key_range_requires_update}); + } + + already_processed_keys_during_loop.insert(key); + + ++depth; + + const auto & children = it->getMapped(); + + for (auto child_key : children) + { + /// In case of GetAllDescendantsStrategy we 
add any descendant to result array + /// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level + if (std::is_same_v || depth == level) + descendants.emplace_back(child_key); + + next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth}); + } + } + + already_processed_keys_during_loop.clear(); + + descendants_offsets.emplace_back(descendants.size()); + } + + ElementsAndOffsets result = {std::move(descendants), std::move(descendants_offsets)}; + return result; + } + + /// Converts ElementAndOffsets structure into ArrayColumn + template + ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets) + { + auto elements_column = ColumnVector::create(); + elements_column->getData() = std::move(elements_and_offsets.elements); + + auto offsets_column = ColumnVector::create(); + offsets_column->getData() = std::move(elements_and_offsets.offsets); + + auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column)); + + return column_array; + } +} + +/// Returns hierarchy array column for keys +template +ColumnPtr getKeysHierarchyArray( + const PaddedPODArray & keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) +{ + auto elements_and_offsets = detail::getHierarchy( + keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + std::forward(get_parent_func)); + + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); +} + +/// Returns is in hierarchy column for keys +template +ColumnUInt8::Ptr getKeysIsInHierarchyColumn( + const PaddedPODArray & hierarchy_keys, + const PaddedPODArray & hierarchy_in_keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) +{ + auto is_in_hierarchy_data = detail::getIsInHierarchy( + hierarchy_keys, + hierarchy_in_keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + 
std::forward(get_parent_func)); + + auto result = ColumnUInt8::create(); + result->getData() = std::move(is_in_hierarchy_data); + + return result; +} + +/// Returns descendants array column for keys +template +ColumnPtr getKeysDescendantsArray( + const PaddedPODArray & requested_keys, + const HashMap> & parent_to_child, + size_t level) +{ + if (level == 0) + { + detail::GetAllDescendantsStrategy strategy { .level = level }; + auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy); + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); + } + else + { + detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level }; + auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy); + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); + } +} + +/** Default getHierarchy implementation for dictionaries that does not have structure with child to parent representation. + * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. + * Returns ColumnArray with hierarchy arrays for keys from key_column. + */ +ColumnPtr getKeysHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + const DataTypePtr & key_type); + +/** Default isInHierarchy implementation for dictionaries that does not have structure with child to parent representation. + * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. + * Returns UInt8 column if key from in_key_column is in key hierarchy from key_column. 
+ */ +ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type); + +} diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index 4d51747a652..a7445312409 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -24,8 +24,8 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -struct IDictionaryBase; -using DictionaryPtr = std::unique_ptr; +struct IDictionary; +using DictionaryPtr = std::unique_ptr; /** DictionaryKeyType provides IDictionary client information about * which key type is supported by dictionary. @@ -47,13 +47,11 @@ enum class DictionaryKeyType /** * Base class for Dictionaries implementation. */ -struct IDictionaryBase : public IExternalLoadable +struct IDictionary : public IExternalLoadable { - using Key = UInt64; - - IDictionaryBase(const StorageID & dict_id_) - : dict_id(dict_id_) - , full_name(dict_id.getInternalDictionaryName()) + explicit IDictionary(const StorageID & dictionary_id_) + : dictionary_id(dictionary_id_) + , full_name(dictionary_id.getInternalDictionaryName()) { } @@ -61,14 +59,14 @@ struct IDictionaryBase : public IExternalLoadable StorageID getDictionaryID() const { std::lock_guard lock{name_mutex}; - return dict_id; + return dictionary_id; } void updateDictionaryName(const StorageID & new_name) const { std::lock_guard lock{name_mutex}; - assert(new_name.uuid == dict_id.uuid && dict_id.uuid != UUIDHelpers::Nil); - dict_id = new_name; + assert(new_name.uuid == dictionary_id.uuid && dictionary_id.uuid != UUIDHelpers::Nil); + dictionary_id = new_name; } const std::string & getLoadableName() const override final { return getFullName(); } @@ -80,8 +78,9 @@ struct IDictionaryBase : public IExternalLoadable std::string getDatabaseOrNoDatabaseTag() const { - if (!dict_id.database_name.empty()) - return dict_id.database_name; + if 
(!dictionary_id.database_name.empty()) + return dictionary_id.database_name; + return NO_DATABASE_TAG; } @@ -159,74 +158,65 @@ struct IDictionaryBase : public IExternalLoadable const Columns & key_columns, const DataTypes & key_types) const = 0; + virtual bool hasHierarchy() const { return false; } + + virtual ColumnPtr getHierarchy( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method getHierarchy is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + + virtual ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method isInHierarchy is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + + virtual ColumnPtr getDescendants( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]], + size_t level [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method getDescendants is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + virtual BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const = 0; bool supportUpdates() const override { return true; } bool isModified() const override { - auto source = getSource(); + const auto * source = getSource(); return source && source->isModified(); } virtual std::exception_ptr getLastException() const { return {}; } - std::shared_ptr shared_from_this() + std::shared_ptr shared_from_this() { - return std::static_pointer_cast(IExternalLoadable::shared_from_this()); + return std::static_pointer_cast(IExternalLoadable::shared_from_this()); } - std::shared_ptr shared_from_this() const + std::shared_ptr shared_from_this() const { - return 
std::static_pointer_cast(IExternalLoadable::shared_from_this()); + return std::static_pointer_cast(IExternalLoadable::shared_from_this()); } private: mutable std::mutex name_mutex; - mutable StorageID dict_id; + mutable StorageID dictionary_id; protected: const String full_name; }; -struct IDictionary : IDictionaryBase -{ - IDictionary(const StorageID & dict_id_) : IDictionaryBase(dict_id_) {} - - virtual bool hasHierarchy() const = 0; - - virtual void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const = 0; - - /// TODO: Rewrite - /// Methods for hierarchy. - - virtual void isInVectorVector( - const PaddedPODArray & /*child_ids*/, const PaddedPODArray & /*ancestor_ids*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - virtual void - isInVectorConstant(const PaddedPODArray & /*child_ids*/, const Key /*ancestor_id*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - virtual void - isInConstantVector(const Key /*child_id*/, const PaddedPODArray & /*ancestor_ids*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - void isInConstantConstant(const Key child_id, const Key ancestor_id, UInt8 & out) const - { - PaddedPODArray out_arr(1); - isInVectorConstant(PaddedPODArray(1, child_id), ancestor_id, out_arr); - out = out_arr[0]; - } -}; - } diff --git a/src/Dictionaries/IPAddressDictionary.cpp b/src/Dictionaries/IPAddressDictionary.cpp index 4b51d94f0d8..d66c285bc42 100644 --- a/src/Dictionaries/IPAddressDictionary.cpp +++ b/src/Dictionaries/IPAddressDictionary.cpp @@ -195,7 +195,7 @@ IPAddressDictionary::IPAddressDictionary( DictionarySourcePtr source_ptr_, const 
DictionaryLifetime dict_lifetime_, bool require_nonempty_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) @@ -804,9 +804,6 @@ static auto keyViewGetter() BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - - const bool is_ipv4 = std::get_if(&ip_column) != nullptr; auto get_keys = [is_ipv4](const Columns & columns, const std::vector & dict_attributes) @@ -827,12 +824,12 @@ BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & colum if (is_ipv4) { auto get_view = keyViewGetter, true>(); - return std::make_shared( + return std::make_shared( shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view)); } auto get_view = keyViewGetter(); - return std::make_shared( + return std::make_shared( shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view)); } diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index dcfb26c3c96..cf79caa75fc 100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -20,7 +20,7 @@ namespace DB { -class IPAddressDictionary final : public IDictionaryBase +class IPAddressDictionary final : public IDictionary { public: IPAddressDictionary( diff --git a/src/Dictionaries/PolygonDictionary.cpp b/src/Dictionaries/PolygonDictionary.cpp index 04559d701c9..7046741b3a0 100644 --- a/src/Dictionaries/PolygonDictionary.cpp +++ b/src/Dictionaries/PolygonDictionary.cpp @@ -30,7 +30,7 @@ IPolygonDictionary::IPolygonDictionary( const DictionaryLifetime dict_lifetime_, InputType input_type_, PointType point_type_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr(std::move(source_ptr_)) , 
dict_lifetime(dict_lifetime_) @@ -142,7 +142,6 @@ ColumnPtr IPolygonDictionary::getColumn( callOnDictionaryAttributeType(attribute.underlying_type, type_call); } - query_count.fetch_add(requested_key_points.size(), std::memory_order_relaxed); return result; diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index b82a8b2928f..5974e6461a7 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -24,7 +24,7 @@ namespace bg = boost::geometry; * An implementation should inherit from this base class and preprocess the data upon construction if needed. * It must override the find method of this class which retrieves the polygon containing a single point. */ -class IPolygonDictionary : public IDictionaryBase +class IPolygonDictionary : public IDictionary { public: /** Controls the different types of polygons allowed as input. diff --git a/src/Dictionaries/RangeDictionaryBlockInputStream.h b/src/Dictionaries/RangeDictionaryBlockInputStream.h index 6531f5cba9d..499eea7152f 100644 --- a/src/Dictionaries/RangeDictionaryBlockInputStream.h +++ b/src/Dictionaries/RangeDictionaryBlockInputStream.h @@ -24,7 +24,7 @@ public: using Key = UInt64; RangeDictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, size_t max_block_size, const Names & column_names, PaddedPODArray && ids_to_fill, @@ -49,7 +49,7 @@ private: const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; - std::shared_ptr dictionary; + std::shared_ptr dictionary; NameSet column_names; PaddedPODArray ids; PaddedPODArray start_dates; @@ -59,7 +59,7 @@ private: template RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( - std::shared_ptr dictionary_, + std::shared_ptr dictionary_, size_t max_block_size_, const Names & column_names_, PaddedPODArray && ids_, diff --git a/src/Dictionaries/RangeHashedDictionary.cpp b/src/Dictionaries/RangeHashedDictionary.cpp index 
4196d6ebd72..30395114a8e 100644 --- a/src/Dictionaries/RangeHashedDictionary.cpp +++ b/src/Dictionaries/RangeHashedDictionary.cpp @@ -76,7 +76,7 @@ RangeHashedDictionary::RangeHashedDictionary( DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) @@ -185,10 +185,10 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, con auto range_column_storage_type = std::make_shared(); auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type); - PaddedPODArray key_backup_storage; + PaddedPODArray key_backup_storage; PaddedPODArray range_backup_storage; - const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); + const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); const PaddedPODArray & dates = getColumnVectorData(this, range_column_updated, range_backup_storage); const auto & attribute = attributes.front(); @@ -213,7 +213,7 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, con template ColumnUInt8::Ptr RangeHashedDictionary::hasKeysImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, const PaddedPODArray & dates) const { auto result = ColumnUInt8::create(ids.size()); @@ -388,10 +388,10 @@ void RangeHashedDictionary::getItemsImpl( ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const { - PaddedPODArray key_backup_storage; + PaddedPODArray key_backup_storage; PaddedPODArray range_backup_storage; - const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); + const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); const PaddedPODArray & dates = getColumnVectorData(this, key_columns[1], 
range_backup_storage); const auto & attr = *std::get>(attribute.maps); @@ -436,7 +436,7 @@ void RangeHashedDictionary::getItemsImpl( template -void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value) +void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value) { using ValueType = std::conditional_t, StringRef, T>; auto & map = *std::get>(attribute.maps); @@ -480,7 +480,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const K map.insert({id, Values{std::move(value_to_insert)}}); } -void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value) +void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value) { auto type_call = [&](const auto &dictionary_attribute_type) { @@ -515,7 +515,7 @@ RangeHashedDictionary::getAttributeWithType(const std::string & attribute_name, template void RangeHashedDictionary::getIdsAndDates( - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const { @@ -536,7 +536,7 @@ void RangeHashedDictionary::getIdsAndDates( template void RangeHashedDictionary::getIdsAndDates( const Attribute & attribute, - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const { @@ -567,7 +567,7 @@ void RangeHashedDictionary::getIdsAndDates( template BlockInputStreamPtr RangeHashedDictionary::getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const { - PaddedPODArray ids; + PaddedPODArray ids; PaddedPODArray start_dates; PaddedPODArray end_dates; getIdsAndDates(ids, start_dates, end_dates); diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index f2b24e52dfc..ca2a925df5e 100644 --- 
a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -16,7 +16,7 @@ namespace DB { -class RangeHashedDictionary final : public IDictionaryBase +class RangeHashedDictionary final : public IDictionary { public: RangeHashedDictionary( @@ -160,25 +160,25 @@ private: template ColumnUInt8::Ptr hasKeysImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & ids, const PaddedPODArray & dates) const; template - static void setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value); + static void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value); - static void setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value); + static void setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value); const Attribute & getAttribute(const std::string & attribute_name) const; const Attribute & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const; template - void getIdsAndDates(PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; + void getIdsAndDates(PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; template void getIdsAndDates( const Attribute & attribute, - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; diff --git a/src/Dictionaries/registerDictionaries.cpp b/src/Dictionaries/registerDictionaries.cpp index a7b3c87267d..8d24a6ea979 100644 --- a/src/Dictionaries/registerDictionaries.cpp +++ b/src/Dictionaries/registerDictionaries.cpp @@ -57,7 +57,6 @@ void registerDictionaries() { auto & factory = DictionaryFactory::instance(); registerDictionaryRangeHashed(factory); - registerDictionaryComplexKeyHashed(factory); registerDictionaryTrie(factory); 
registerDictionaryFlat(factory); registerDictionaryHashed(factory); diff --git a/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp b/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp index 20529e91bd3..9fd9dc9b78c 100644 --- a/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp +++ b/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp @@ -1,7 +1,5 @@ #if defined(__linux__) || defined(__FreeBSD__) -#include - #include #include diff --git a/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp b/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp new file mode 100644 index 00000000000..064f57dfe11 --- /dev/null +++ b/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp @@ -0,0 +1,225 @@ +#include + +#include + +#include + +using namespace DB; + +TEST(HierarchyDictionariesUtils, getHierarchy) +{ + { + HashMap child_to_parent; + child_to_parent[1] = 0; + child_to_parent[2] = 1; + child_to_parent[3] = 1; + child_to_parent[4] = 2; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3, 4, 5}; + + auto result = DB::detail::getHierarchy( + keys, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 1, 3, 1, 4, 2, 1}; + PaddedPODArray expected_offsets = {1, 3, 5, 8, 8}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + HashMap child_to_parent; + child_to_parent[1] = 2; + child_to_parent[2] = 1; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3}; + + auto result = DB::detail::getHierarchy( + keys, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 2}; + PaddedPODArray expected_offsets = {2, 3, 3}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } +} + +TEST(HierarchyDictionariesUtils, getIsInHierarchy) +{ + { + HashMap child_to_parent; + child_to_parent[1] = 0; + child_to_parent[2] = 1; + child_to_parent[3] = 1; + child_to_parent[4] = 2; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3, 4, 5}; + PaddedPODArray keys_in = {1, 1, 1, 2, 5}; + + PaddedPODArray actual = DB::detail::getIsInHierarchy( + keys, + keys_in, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + PaddedPODArray expected = {1,1,1,1,0}; + + ASSERT_EQ(actual, expected); + } + { + HashMap child_to_parent; + child_to_parent[1] = 2; + child_to_parent[2] = 1; + + auto is_key_valid_func = [&](auto key) + { + return child_to_parent.find(key) != nullptr; + }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3}; + PaddedPODArray keys_in = {1, 2, 3}; + + PaddedPODArray actual = DB::detail::getIsInHierarchy( + keys, + keys_in, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + PaddedPODArray expected = {1, 1, 0}; + ASSERT_EQ(actual, expected); + } +} + +TEST(HierarchyDictionariesUtils, getDescendants) +{ + { + HashMap> parent_to_child; + parent_to_child[0].emplace_back(1); + parent_to_child[1].emplace_back(2); + parent_to_child[1].emplace_back(3); + parent_to_child[2].emplace_back(4); + + PaddedPODArray keys = {0, 1, 2, 3, 4}; + + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetAllDescendantsStrategy()); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 3, 4, 2, 3, 4, 4}; + PaddedPODArray expected_offsets = {4, 7, 8, 8, 8}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + 
DB::detail::GetDescendantsAtSpecificLevelStrategy{1}); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 3, 4}; + PaddedPODArray expected_offsets = {1, 3, 4, 4, 4}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + } + { + HashMap> parent_to_child; + parent_to_child[1].emplace_back(2); + parent_to_child[2].emplace_back(1); + + PaddedPODArray keys = {1, 2, 3}; + + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetAllDescendantsStrategy()); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {2, 1, 1}; + PaddedPODArray expected_offsets = {2, 3, 3}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetDescendantsAtSpecificLevelStrategy{1}); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {2, 1}; + PaddedPODArray expected_offsets = {1, 2, 2}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + } +} diff --git a/src/Dictionaries/ya.make b/src/Dictionaries/ya.make index 4df58211118..dc58d3f0a14 100644 --- a/src/Dictionaries/ya.make +++ b/src/Dictionaries/ya.make @@ -26,7 +26,7 @@ SRCS( CassandraDictionarySource.cpp CassandraHelpers.cpp ClickHouseDictionarySource.cpp - ComplexKeyHashedDictionary.cpp + DictionaryBlockInputStream.cpp DictionaryBlockInputStreamBase.cpp DictionaryFactory.cpp DictionarySourceFactory.cpp @@ -48,6 +48,7 @@ SRCS( FlatDictionary.cpp HTTPDictionarySource.cpp HashedDictionary.cpp + HierarchyDictionariesUtils.cpp IPAddressDictionary.cpp LibraryDictionarySource.cpp LibraryDictionarySourceExternal.cpp 
diff --git a/src/Functions/FunctionsExternalDictionaries.cpp b/src/Functions/FunctionsExternalDictionaries.cpp index f037a3bd808..6b83f761086 100644 --- a/src/Functions/FunctionsExternalDictionaries.cpp +++ b/src/Functions/FunctionsExternalDictionaries.cpp @@ -24,6 +24,8 @@ void registerFunctionsExternalDictionaries(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); @@ -40,6 +42,7 @@ void registerFunctionsExternalDictionaries(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction>(); factory.registerFunction>(); + factory.registerFunction(); } } diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index 2c322698327..1f8ef60b4af 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -28,16 +29,6 @@ #include #include - -#include -#include -#include -#include -#include -#include -#include -#include - #include #include @@ -49,7 +40,6 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int UNSUPPORTED_METHOD; - extern const int UNKNOWN_TYPE; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; @@ -77,7 +67,7 @@ class FunctionDictHelper public: explicit FunctionDictHelper(const Context & context_) : context(context_) {} - std::shared_ptr getDictionary(const String & dictionary_name) + std::shared_ptr getDictionary(const String & dictionary_name) { auto dict = context.getExternalDictionariesLoader().getDictionary(dictionary_name, context); @@ -90,9 +80,13 @@ public: return dict; } - std::shared_ptr getDictionary(const ColumnWithTypeAndName & column) + std::shared_ptr 
getDictionary(const ColumnPtr & column) { - const auto * dict_name_col = checkAndGetColumnConst(column.column.get()); + const auto * dict_name_col = checkAndGetColumnConst(column.get()); + + if (!dict_name_col) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Expected const String column"); + return getDictionary(dict_name_col->getValue()); } @@ -148,7 +142,6 @@ public: String getName() const override { return name; } -private: size_t getNumberOfArguments() const override { return 0; } bool isVariadic() const override { return true; } @@ -187,7 +180,7 @@ private: if (input_rows_count == 0) return result_type->createColumn(); - auto dictionary = helper.getDictionary(arguments[0]); + auto dictionary = helper.getDictionary(arguments[0].column); auto dictionary_key_type = dictionary->getKeyType(); const ColumnWithTypeAndName & key_column_with_type = arguments[1]; @@ -238,6 +231,7 @@ private: return dictionary->hasKeys({key_column, range_col}, {std::make_shared(), range_col_type}); } +private: mutable FunctionDictHelper helper; }; @@ -302,7 +296,7 @@ public: } if (types.size() > 1) - return std::make_shared(types); + return std::make_shared(types, attribute_names); else return types.front(); } @@ -701,6 +695,163 @@ using FunctionDictGetDecimal64OrDefault = FunctionDictGetOrDefault, NameDictGetDecimal128OrDefault>; using FunctionDictGetStringOrDefault = FunctionDictGetOrDefault; +class FunctionDictGetOrNull final : public IFunction +{ +public: + static constexpr auto name = "dictGetOrNull"; + + static FunctionPtr create(const Context &context) + { + return std::make_shared(context); + } + + explicit FunctionDictGetOrNull(const Context & context_) + : dictionary_get_func_impl(context_) + , dictionary_has_func_impl(context_) + {} + + String getName() const override { return name; } + +private: + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + + bool useDefaultImplementationForConstants() const override 
{ return true; } + + bool useDefaultImplementationForNulls() const override { return false; } + + bool isDeterministic() const override { return false; } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + + bool isInjective(const ColumnsWithTypeAndName & sample_columns) const override + { + return dictionary_get_func_impl.isInjective(sample_columns); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + auto result_type = dictionary_get_func_impl.getReturnTypeImpl(arguments); + + WhichDataType result_data_type(result_type); + if (result_data_type.isTuple()) + { + const auto & data_type_tuple = static_cast(*result_type); + auto elements_types_copy = data_type_tuple.getElements(); + for (auto & element_type : elements_types_copy) + element_type = makeNullable(element_type); + + result_type = std::make_shared(elements_types_copy, data_type_tuple.getElementNames()); + } + else + result_type = makeNullable(result_type); + + return result_type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + /** We call dictHas function to get which map is key presented in dictionary. + For key that presented in dictionary dict has result for that key index value will be 1. Otherwise 0. + We invert result, and then for key that is not presented in dictionary value will be 1. Otherwise 0. + This inverted result will be used as null column map. + After that we call dict get function, by contract for key that are not presented in dictionary we + return default value. + We create nullable column from dict get result column and null column map. + + 2 additional implementation details: + 1. Result from dict get can be tuple if client requested multiple attributes we apply such operation on each result column. + 2. If column is already nullable we merge column null map with null map that we get from dict has. 
+ */ + + auto dict_has_arguments = filterAttributeNameArgumentForDictHas(arguments); + auto is_key_in_dictionary_column = dictionary_has_func_impl.executeImpl(dict_has_arguments, std::make_shared(), input_rows_count); + auto is_key_in_dictionary_column_mutable = is_key_in_dictionary_column->assumeMutable(); + ColumnVector & is_key_in_dictionary_column_typed = assert_cast &>(*is_key_in_dictionary_column_mutable); + PaddedPODArray & is_key_in_dictionary_data = is_key_in_dictionary_column_typed.getData(); + for (auto & key : is_key_in_dictionary_data) + key = !key; + + auto result_type = dictionary_get_func_impl.getReturnTypeImpl(arguments); + auto dictionary_get_result_column = dictionary_get_func_impl.executeImpl(arguments, result_type, input_rows_count); + + ColumnPtr result; + + WhichDataType result_data_type(result_type); + auto dictionary_get_result_column_mutable = dictionary_get_result_column->assumeMutable(); + + if (result_data_type.isTuple()) + { + ColumnTuple & column_tuple = assert_cast(*dictionary_get_result_column_mutable); + + const auto & columns = column_tuple.getColumns(); + size_t tuple_size = columns.size(); + + MutableColumns new_columns(tuple_size); + for (size_t tuple_column_index = 0; tuple_column_index < tuple_size; ++tuple_column_index) + { + auto nullable_column_map = ColumnVector::create(); + auto & nullable_column_map_data = nullable_column_map->getData(); + nullable_column_map_data.assign(is_key_in_dictionary_data); + + auto mutable_column = columns[tuple_column_index]->assumeMutable(); + if (ColumnNullable * nullable_column = typeid_cast(mutable_column.get())) + { + auto & null_map_data = nullable_column->getNullMapData(); + addNullMap(null_map_data, is_key_in_dictionary_data); + new_columns[tuple_column_index] = std::move(mutable_column); + } + else + new_columns[tuple_column_index] = ColumnNullable::create(std::move(mutable_column), std::move(nullable_column_map)); + } + + result = ColumnTuple::create(std::move(new_columns)); + } + 
else + { + if (ColumnNullable * nullable_column = typeid_cast(dictionary_get_result_column_mutable.get())) + { + auto & null_map_data = nullable_column->getNullMapData(); + addNullMap(null_map_data, is_key_in_dictionary_data); + result = std::move(dictionary_get_result_column); + } + else + result = ColumnNullable::create(std::move(dictionary_get_result_column), std::move(is_key_in_dictionary_column_mutable)); + } + + return result; + } + + static void addNullMap(PaddedPODArray & null_map, PaddedPODArray & null_map_to_add) + { + assert(null_map.size() == null_map_to_add.size()); + + for (size_t i = 0; i < null_map.size(); ++i) + null_map[i] = null_map[i] || null_map_to_add[i]; + } + + static ColumnsWithTypeAndName filterAttributeNameArgumentForDictHas(const ColumnsWithTypeAndName & arguments) + { + ColumnsWithTypeAndName dict_has_arguments; + dict_has_arguments.reserve(arguments.size() - 1); + size_t attribute_name_argument_index = 1; + + for (size_t i = 0; i < arguments.size(); ++i) + { + if (i == attribute_name_argument_index) + continue; + + dict_has_arguments.emplace_back(arguments[i]); + } + + return dict_has_arguments; + } + + const FunctionDictGetNoType dictionary_get_func_impl; + const FunctionDictHas dictionary_has_func_impl; +}; /// Functions to work with hierarchies. class FunctionDictGetHierarchy final : public IFunction @@ -727,12 +878,16 @@ private: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception{"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() - + ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected String. 
Actual type ({})", + getName(), + arguments[0]->getName()); if (!WhichDataType(arguments[1]).isUInt64()) - throw Exception{"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName() - + ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})", + getName(), + arguments[1]->getName()); return std::make_shared(std::make_shared()); } @@ -744,109 +899,15 @@ private: if (input_rows_count == 0) return result_type->createColumn(); - auto dict = helper.getDictionary(arguments[0]); - ColumnPtr res; + auto dictionary = helper.getDictionary(arguments[0].column); - /// TODO: Rewrite this - if (!((res = executeDispatch(arguments, result_type, dict)) - || (res = executeDispatch>(arguments, result_type, dict)) - || (res = executeDispatch(arguments, result_type, dict)) - || (res = executeDispatch>(arguments, result_type, dict)))) - throw Exception{"Unsupported dictionary type " + dict->getTypeName(), ErrorCodes::UNKNOWN_TYPE}; + if (!dictionary->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Dictionary ({}) does not support hierarchy", + dictionary->getFullName()); - return res; - } - - template - ColumnPtr executeDispatch(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const std::shared_ptr & dict_ptr) const - { - const auto * dict = typeid_cast(dict_ptr.get()); - if (!dict) - return nullptr; - - if (!dict->hasHierarchy()) - throw Exception{"Dictionary does not have a hierarchy", ErrorCodes::UNSUPPORTED_METHOD}; - - const auto get_hierarchies = [&] (const PaddedPODArray & in, PaddedPODArray & out, PaddedPODArray & offsets) - { - const auto size = in.size(); - - /// copy of `in` array - auto in_array = std::make_unique>(std::begin(in), std::end(in)); - /// used for storing and handling result of ::toParent call - auto out_array = std::make_unique>(size); - 
/// resulting hierarchies - std::vector> hierarchies(size); /// TODO Bad code, poor performance. - - /// total number of non-zero elements, used for allocating all the required memory upfront - size_t total_count = 0; - - while (true) - { - auto all_zeroes = true; - - /// erase zeroed identifiers, store non-zeroed ones - for (const auto i : ext::range(0, size)) - { - const auto id = (*in_array)[i]; - if (0 == id) - continue; - - - auto & hierarchy = hierarchies[i]; - - /// Checking for loop - if (std::find(std::begin(hierarchy), std::end(hierarchy), id) != std::end(hierarchy)) - continue; - - all_zeroes = false; - /// place id at it's corresponding place - hierarchy.push_back(id); - - ++total_count; - } - - if (all_zeroes) - break; - - /// translate all non-zero identifiers at once - dict->toParent(*in_array, *out_array); - - /// we're going to use the `in_array` from this iteration as `out_array` on the next one - std::swap(in_array, out_array); - } - - out.reserve(total_count); - offsets.resize(size); - - for (const auto i : ext::range(0, size)) - { - const auto & ids = hierarchies[i]; - out.insert_assume_reserved(std::begin(ids), std::end(ids)); - offsets[i] = out.size(); - } - }; - - const auto * id_col_untyped = arguments[1].column.get(); - if (const auto * id_col = checkAndGetColumn(id_col_untyped)) - { - const auto & in = id_col->getData(); - auto backend = ColumnUInt64::create(); - auto offsets = ColumnArray::ColumnOffsets::create(); - get_hierarchies(in, backend->getData(), offsets->getData()); - return ColumnArray::create(std::move(backend), std::move(offsets)); - } - else if (const auto * id_col_const = checkAndGetColumnConst>(id_col_untyped)) - { - const PaddedPODArray in(1, id_col_const->getValue()); - auto backend = ColumnUInt64::create(); - auto offsets = ColumnArray::ColumnOffsets::create(); - get_hierarchies(in, backend->getData(), offsets->getData()); - auto array = ColumnArray::create(std::move(backend), std::move(offsets)); - return 
result_type->createColumnConst(id_col_const->size(), (*array)[0].get()); - } - else - throw Exception{"Second argument of function " + getName() + " must be UInt64", ErrorCodes::ILLEGAL_COLUMN}; + ColumnPtr result = dictionary->getHierarchy(arguments[1].column, std::make_shared()); + return result; } mutable FunctionDictHelper helper; @@ -877,16 +938,22 @@ private: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception{"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() - + ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected String. Actual type ({})", + getName(), + arguments[0]->getName()); if (!WhichDataType(arguments[1]).isUInt64()) - throw Exception{"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName() - + ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})", + getName(), + arguments[1]->getName()); if (!WhichDataType(arguments[2]).isUInt64()) - throw Exception{"Illegal type " + arguments[2]->getName() + " of third argument of function " + getName() - + ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of third argument of function ({}). Expected UInt64. 
Actual type ({})", + getName(), + arguments[2]->getName()); return std::make_shared(); } @@ -898,105 +965,163 @@ private: if (input_rows_count == 0) return result_type->createColumn(); - auto dict = helper.getDictionary(arguments[0]); + auto dict = helper.getDictionary(arguments[0].column); - ColumnPtr res; - if (!((res = executeDispatch(arguments, dict)) - || (res = executeDispatch>(arguments, dict)) - || (res = executeDispatch(arguments, dict)) - || (res = executeDispatch>(arguments, dict)))) - throw Exception{"Unsupported dictionary type " + dict->getTypeName(), ErrorCodes::UNKNOWN_TYPE}; + if (!dict->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary ({}) does not support hierarchy", dict->getFullName()); + + ColumnPtr res = dict->isInHierarchy(arguments[1].column, arguments[2].column, std::make_shared()); return res; } - template - ColumnPtr executeDispatch(const ColumnsWithTypeAndName & arguments, const std::shared_ptr & dict_ptr) const + mutable FunctionDictHelper helper; +}; + +class FunctionDictGetChildren final : public IFunction +{ +public: + static constexpr auto name = "dictGetChildren"; + + static FunctionPtr create(const Context & context) { - const auto * dict = typeid_cast(dict_ptr.get()); - if (!dict) - return nullptr; - - if (!dict->hasHierarchy()) - throw Exception{"Dictionary does not have a hierarchy", ErrorCodes::UNSUPPORTED_METHOD}; - - const auto * child_id_col_untyped = arguments[1].column.get(); - const auto * ancestor_id_col_untyped = arguments[2].column.get(); - - if (const auto * child_id_col = checkAndGetColumn(child_id_col_untyped)) - return execute(dict, child_id_col, ancestor_id_col_untyped); - else if (const auto * child_id_col_const = checkAndGetColumnConst>(child_id_col_untyped)) - return execute(dict, child_id_col_const, ancestor_id_col_untyped); - else - throw Exception{"Illegal column " + child_id_col_untyped->getName() - + " of second argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN}; 
+ return std::make_shared(context); } - template - ColumnPtr execute(const DictionaryType * dict, - const ColumnUInt64 * child_id_col, const IColumn * ancestor_id_col_untyped) const + explicit FunctionDictGetChildren(const Context & context_) + : helper(context_) {} + + String getName() const override { return name; } + +private: + size_t getNumberOfArguments() const override { return 2; } + + bool useDefaultImplementationForConstants() const final { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0}; } + bool isDeterministic() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (const auto * ancestor_id_col = checkAndGetColumn(ancestor_id_col_untyped)) - { - auto out = ColumnUInt8::create(); + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected String. Actual type ({})", + getName(), + arguments[0]->getName()); - const auto & child_ids = child_id_col->getData(); - const auto & ancestor_ids = ancestor_id_col->getData(); - auto & data = out->getData(); - const auto size = child_id_col->size(); - data.resize(size); + if (!WhichDataType(arguments[1]).isUInt64()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. 
Actual type ({})", + getName(), + arguments[1]->getName()); - dict->isInVectorVector(child_ids, ancestor_ids, data); - return out; - } - else if (const auto * ancestor_id_col_const = checkAndGetColumnConst>(ancestor_id_col_untyped)) - { - auto out = ColumnUInt8::create(); - - const auto & child_ids = child_id_col->getData(); - const auto ancestor_id = ancestor_id_col_const->getValue(); - auto & data = out->getData(); - const auto size = child_id_col->size(); - data.resize(size); - - dict->isInVectorConstant(child_ids, ancestor_id, data); - return out; - } - else - { - throw Exception{"Illegal column " + ancestor_id_col_untyped->getName() - + " of third argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN}; - } + return std::make_shared(std::make_shared()); } - template - ColumnPtr execute(const DictionaryType * dict, const ColumnConst * child_id_col, const IColumn * ancestor_id_col_untyped) const + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { - if (const auto * ancestor_id_col = checkAndGetColumn(ancestor_id_col_untyped)) + if (input_rows_count == 0) + return result_type->createColumn(); + + auto dictionary = helper.getDictionary(arguments[0].column); + + if (!dictionary->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Dictionary ({}) does not support hierarchy", + dictionary->getFullName()); + + ColumnPtr result = dictionary->getDescendants(arguments[1].column, std::make_shared(), 1); + + return result; + } + + mutable FunctionDictHelper helper; +}; + +class FunctionDictGetDescendants final : public IFunction +{ +public: + static constexpr auto name = "dictGetDescendants"; + + static FunctionPtr create(const Context & context) + { + return std::make_shared(context); + } + + explicit FunctionDictGetDescendants(const Context & context_) + : helper(context_) {} + + String getName() const override { return name; } + +private: + size_t 
getNumberOfArguments() const override { return 0; } + bool isVariadic() const override { return true; } + + bool useDefaultImplementationForConstants() const final { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0}; } + bool isDeterministic() const override { return false; } + + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + size_t arguments_size = arguments.size(); + if (arguments_size < 2 || arguments_size > 3) { - auto out = ColumnUInt8::create(); - - const auto child_id = child_id_col->getValue(); - const auto & ancestor_ids = ancestor_id_col->getData(); - auto & data = out->getData(); - const auto size = child_id_col->size(); - data.resize(size); - - dict->isInConstantVector(child_id, ancestor_ids, data); - return out; + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Illegal arguments size of function ({}). Expects 2 or 3 arguments size. Actual size ({})", + getName(), + arguments_size); } - else if (const auto * ancestor_id_col_const = checkAndGetColumnConst>(ancestor_id_col_untyped)) + + if (!isString(arguments[0])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of first argument of function ({}). Expected const String. Actual type ({})", + getName(), + arguments[0]->getName()); + + if (!WhichDataType(arguments[1]).isUInt64()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})", + getName(), + arguments[1]->getName()); + + if (arguments.size() == 3 && !isUnsignedInteger(arguments[2])) { - const auto child_id = child_id_col->getValue(); - const auto ancestor_id = ancestor_id_col_const->getValue(); - UInt8 res = 0; - - dict->isInConstantConstant(child_id, ancestor_id, res); - return DataTypeUInt8().createColumnConst(child_id_col->size(), res); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of third argument of function ({}). 
Expected const unsigned integer. Actual type ({})", + getName(), + arguments[2]->getName()); } - else - throw Exception{"Illegal column " + ancestor_id_col_untyped->getName() - + " of third argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN}; + + return std::make_shared(std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + if (input_rows_count == 0) + return result_type->createColumn(); + + auto dictionary = helper.getDictionary(arguments[0].column); + + size_t level = 0; + + if (arguments.size() == 3) + { + if (!isColumnConst(*arguments[2].column)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of third argument of function ({}). Expected const unsigned integer.", + getName()); + + level = static_cast(arguments[2].column->get64(0)); + } + + if (!dictionary->hasHierarchy()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Dictionary ({}) does not support hierarchy", + dictionary->getFullName()); + + ColumnPtr res = dictionary->getDescendants(arguments[1].column, std::make_shared(), level); + + return res; } mutable FunctionDictHelper helper; diff --git a/src/Functions/visitParamExtractBool.cpp b/src/Functions/visitParamExtractBool.cpp index 7f989ccbb9e..059115b5b13 100644 --- a/src/Functions/visitParamExtractBool.cpp +++ b/src/Functions/visitParamExtractBool.cpp @@ -19,10 +19,13 @@ struct ExtractBool struct NameVisitParamExtractBool { static constexpr auto name = "visitParamExtractBool"; }; using FunctionVisitParamExtractBool = FunctionsStringSearch, NameVisitParamExtractBool>; +struct NameSimpleJSONExtractBool { static constexpr auto name = "simpleJSONExtractBool"; }; +using FunctionSimpleJSONExtractBool = FunctionsStringSearch, NameSimpleJSONExtractBool>; void registerFunctionVisitParamExtractBool(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git 
a/src/Functions/visitParamExtractFloat.cpp b/src/Functions/visitParamExtractFloat.cpp index b02b0209daf..7a55cff365c 100644 --- a/src/Functions/visitParamExtractFloat.cpp +++ b/src/Functions/visitParamExtractFloat.cpp @@ -9,10 +9,13 @@ namespace DB struct NameVisitParamExtractFloat { static constexpr auto name = "visitParamExtractFloat"; }; using FunctionVisitParamExtractFloat = FunctionsStringSearch>, NameVisitParamExtractFloat>; +struct NameSimpleJSONExtractFloat { static constexpr auto name = "simpleJSONExtractFloat"; }; +using FunctionSimpleJSONExtractFloat = FunctionsStringSearch>, NameSimpleJSONExtractFloat>; void registerFunctionVisitParamExtractFloat(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractInt.cpp b/src/Functions/visitParamExtractInt.cpp index f3f30f566e6..7c2188c10fc 100644 --- a/src/Functions/visitParamExtractInt.cpp +++ b/src/Functions/visitParamExtractInt.cpp @@ -9,10 +9,13 @@ namespace DB struct NameVisitParamExtractInt { static constexpr auto name = "visitParamExtractInt"; }; using FunctionVisitParamExtractInt = FunctionsStringSearch>, NameVisitParamExtractInt>; +struct NameSimpleJSONExtractInt { static constexpr auto name = "simpleJSONExtractInt"; }; +using FunctionSimpleJSONExtractInt = FunctionsStringSearch>, NameSimpleJSONExtractInt>; void registerFunctionVisitParamExtractInt(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractRaw.cpp b/src/Functions/visitParamExtractRaw.cpp index add882f003f..734fe107557 100644 --- a/src/Functions/visitParamExtractRaw.cpp +++ b/src/Functions/visitParamExtractRaw.cpp @@ -59,10 +59,13 @@ struct ExtractRaw struct NameVisitParamExtractRaw { static constexpr auto name = "visitParamExtractRaw"; }; using FunctionVisitParamExtractRaw = FunctionsStringSearchToString, NameVisitParamExtractRaw>; +struct NameSimpleJSONExtractRaw { static constexpr 
auto name = "simpleJSONExtractRaw"; }; +using FunctionSimpleJSONExtractRaw = FunctionsStringSearchToString, NameSimpleJSONExtractRaw>; void registerFunctionVisitParamExtractRaw(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractString.cpp b/src/Functions/visitParamExtractString.cpp index b633a59807e..23f24b9e3b8 100644 --- a/src/Functions/visitParamExtractString.cpp +++ b/src/Functions/visitParamExtractString.cpp @@ -20,10 +20,13 @@ struct ExtractString struct NameVisitParamExtractString { static constexpr auto name = "visitParamExtractString"; }; using FunctionVisitParamExtractString = FunctionsStringSearchToString, NameVisitParamExtractString>; +struct NameSimpleJSONExtractString { static constexpr auto name = "simpleJSONExtractString"; }; +using FunctionSimpleJSONExtractString = FunctionsStringSearchToString, NameSimpleJSONExtractString>; void registerFunctionVisitParamExtractString(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamExtractUInt.cpp b/src/Functions/visitParamExtractUInt.cpp index 5e70eed8253..f5466a63b0d 100644 --- a/src/Functions/visitParamExtractUInt.cpp +++ b/src/Functions/visitParamExtractUInt.cpp @@ -9,10 +9,14 @@ namespace DB struct NameVisitParamExtractUInt { static constexpr auto name = "visitParamExtractUInt"; }; using FunctionVisitParamExtractUInt = FunctionsStringSearch>, NameVisitParamExtractUInt>; +struct NameSimpleJSONExtractUInt { static constexpr auto name = "simpleJSONExtractUInt"; }; +using FunctionSimpleJSONExtractUInt = FunctionsStringSearch>, NameSimpleJSONExtractUInt>; + void registerFunctionVisitParamExtractUInt(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/visitParamHas.cpp b/src/Functions/visitParamHas.cpp index 5fbedfb4995..f4f377f9e8f 100644 --- a/src/Functions/visitParamHas.cpp +++ 
b/src/Functions/visitParamHas.cpp @@ -19,10 +19,13 @@ struct HasParam struct NameVisitParamHas { static constexpr auto name = "visitParamHas"; }; using FunctionVisitParamHas = FunctionsStringSearch, NameVisitParamHas>; +struct NameSimpleJSONHas { static constexpr auto name = "simpleJSONHas"; }; +using FunctionSimpleJSONHas = FunctionsStringSearch, NameSimpleJSONHas>; void registerFunctionVisitParamHas(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Interpreters/ExternalDictionariesLoader.cpp b/src/Interpreters/ExternalDictionariesLoader.cpp index 1632b7cbf78..8df29459b72 100644 --- a/src/Interpreters/ExternalDictionariesLoader.cpp +++ b/src/Interpreters/ExternalDictionariesLoader.cpp @@ -46,13 +46,13 @@ ExternalLoader::LoadablePtr ExternalDictionariesLoader::create( ExternalDictionariesLoader::DictPtr ExternalDictionariesLoader::getDictionary(const std::string & dictionary_name, const Context & context) const { std::string resolved_dictionary_name = resolveDictionaryName(dictionary_name, context.getCurrentDatabase()); - return std::static_pointer_cast(load(resolved_dictionary_name)); + return std::static_pointer_cast(load(resolved_dictionary_name)); } ExternalDictionariesLoader::DictPtr ExternalDictionariesLoader::tryGetDictionary(const std::string & dictionary_name, const Context & context) const { std::string resolved_dictionary_name = resolveDictionaryName(dictionary_name, context.getCurrentDatabase()); - return std::static_pointer_cast(tryLoad(resolved_dictionary_name)); + return std::static_pointer_cast(tryLoad(resolved_dictionary_name)); } diff --git a/src/Interpreters/ExternalDictionariesLoader.h b/src/Interpreters/ExternalDictionariesLoader.h index 0f64715b243..ce5b2512741 100644 --- a/src/Interpreters/ExternalDictionariesLoader.h +++ b/src/Interpreters/ExternalDictionariesLoader.h @@ -15,7 +15,7 @@ class IExternalLoaderConfigRepository; class ExternalDictionariesLoader : public ExternalLoader { 
public: - using DictPtr = std::shared_ptr; + using DictPtr = std::shared_ptr; /// Dictionaries will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. explicit ExternalDictionariesLoader(Context & global_context_); diff --git a/src/Interpreters/IdentifierSemantic.cpp b/src/Interpreters/IdentifierSemantic.cpp index a1fc533eb7f..81bd499ea2e 100644 --- a/src/Interpreters/IdentifierSemantic.cpp +++ b/src/Interpreters/IdentifierSemantic.cpp @@ -209,7 +209,7 @@ IdentifierSemantic::ColumnMatch IdentifierSemantic::canReferColumnToTable(const return canReferColumnToTable(identifier, table_with_columns.table); } -/// Strip qualificators from left side of column name. +/// Strip qualifications from left side of column name. /// Example: 'database.table.name' -> 'name'. void IdentifierSemantic::setColumnShortName(ASTIdentifier & identifier, const DatabaseAndTableWithAlias & db_and_table) { diff --git a/src/Interpreters/PredicateExpressionsOptimizer.cpp b/src/Interpreters/PredicateExpressionsOptimizer.cpp index 00b47be408a..476bdaaceea 100644 --- a/src/Interpreters/PredicateExpressionsOptimizer.cpp +++ b/src/Interpreters/PredicateExpressionsOptimizer.cpp @@ -146,7 +146,7 @@ bool PredicateExpressionsOptimizer::tryRewritePredicatesToTables(ASTs & tables_e break; /// Skip left and right table optimization is_rewrite_tables |= tryRewritePredicatesToTable(tables_element[table_pos], tables_predicates[table_pos], - tables_with_columns[table_pos].columns.getNames()); + tables_with_columns[table_pos]); if (table_element->table_join && isRight(table_element->table_join->as()->kind)) break; /// Skip left table optimization @@ -156,13 +156,13 @@ bool PredicateExpressionsOptimizer::tryRewritePredicatesToTables(ASTs & tables_e return is_rewrite_tables; } -bool PredicateExpressionsOptimizer::tryRewritePredicatesToTable(ASTPtr & table_element, const ASTs & table_predicates, Names && table_columns) const +bool 
PredicateExpressionsOptimizer::tryRewritePredicatesToTable(ASTPtr & table_element, const ASTs & table_predicates, const TableWithColumnNamesAndTypes & table_columns) const { if (!table_predicates.empty()) { auto optimize_final = enable_optimize_predicate_expression_to_final_subquery; auto optimize_with = allow_push_predicate_when_subquery_contains_with; - PredicateRewriteVisitor::Data data(context, table_predicates, std::move(table_columns), optimize_final, optimize_with); + PredicateRewriteVisitor::Data data(context, table_predicates, table_columns, optimize_final, optimize_with); PredicateRewriteVisitor(data).visit(table_element); return data.is_rewrite; diff --git a/src/Interpreters/PredicateExpressionsOptimizer.h b/src/Interpreters/PredicateExpressionsOptimizer.h index 8cceda93164..223ac1e8998 100644 --- a/src/Interpreters/PredicateExpressionsOptimizer.h +++ b/src/Interpreters/PredicateExpressionsOptimizer.h @@ -33,7 +33,8 @@ private: bool tryRewritePredicatesToTables(ASTs & tables_element, const std::vector & tables_predicates); - bool tryRewritePredicatesToTable(ASTPtr & table_element, const ASTs & table_predicates, Names && table_columns) const; + bool tryRewritePredicatesToTable( + ASTPtr & table_element, const ASTs & table_predicates, const TableWithColumnNamesAndTypes & table_columns) const; bool tryMovePredicatesFromHavingToWhere(ASTSelectQuery & select_query); }; diff --git a/src/Interpreters/PredicateRewriteVisitor.cpp b/src/Interpreters/PredicateRewriteVisitor.cpp index 9e6d5543f2f..6f28b9050df 100644 --- a/src/Interpreters/PredicateRewriteVisitor.cpp +++ b/src/Interpreters/PredicateRewriteVisitor.cpp @@ -17,8 +17,8 @@ namespace DB { PredicateRewriteVisitorData::PredicateRewriteVisitorData( - const Context & context_, const ASTs & predicates_, Names && column_names_, bool optimize_final_, bool optimize_with_) - : context(context_), predicates(predicates_), column_names(column_names_), optimize_final(optimize_final_), optimize_with(optimize_with_) + 
const Context & context_, const ASTs & predicates_, const TableWithColumnNamesAndTypes & table_columns_, bool optimize_final_, bool optimize_with_) + : context(context_), predicates(predicates_), table_columns(table_columns_), optimize_final(optimize_final_), optimize_with(optimize_with_) { } @@ -42,7 +42,8 @@ void PredicateRewriteVisitorData::visit(ASTSelectWithUnionQuery & union_select_q void PredicateRewriteVisitorData::visitFirstInternalSelect(ASTSelectQuery & select_query, ASTPtr &) { - is_rewrite |= rewriteSubquery(select_query, column_names, column_names); + /// In this case inner_columns same as outer_columns from table_columns + is_rewrite |= rewriteSubquery(select_query, table_columns.columns.getNames()); } void PredicateRewriteVisitorData::visitOtherInternalSelect(ASTSelectQuery & select_query, ASTPtr &) @@ -65,7 +66,7 @@ void PredicateRewriteVisitorData::visitOtherInternalSelect(ASTSelectQuery & sele const Names & internal_columns = InterpreterSelectQuery( temp_internal_select, context, SelectQueryOptions().analyze()).getSampleBlock().getNames(); - if (rewriteSubquery(*temp_select_query, column_names, internal_columns)) + if (rewriteSubquery(*temp_select_query, internal_columns)) { is_rewrite |= true; select_query.setExpression(ASTSelectQuery::Expression::SELECT, std::move(temp_select_query->refSelect())); @@ -89,7 +90,7 @@ static void cleanAliasAndCollectIdentifiers(ASTPtr & predicate, std::vector identifiers; @@ -106,13 +108,16 @@ bool PredicateRewriteVisitorData::rewriteSubquery(ASTSelectQuery & subquery, con for (const auto & identifier : identifiers) { - const auto & column_name = identifier->shortName(); - const auto & outer_column_iterator = std::find(outer_columns.begin(), outer_columns.end(), column_name); + IdentifierSemantic::setColumnShortName(*identifier, table_columns.table); + const auto & column_name = identifier->name(); /// For lambda functions, we can't always find them in the list of columns /// For example: SELECT * FROM system.one 
WHERE arrayMap(x -> x, [dummy]) = [0] + const auto & outer_column_iterator = std::find(outer_columns.begin(), outer_columns.end(), column_name); if (outer_column_iterator != outer_columns.end()) + { identifier->setShortName(inner_columns[outer_column_iterator - outer_columns.begin()]); + } } /// We only need to push all the predicates to subquery having diff --git a/src/Interpreters/PredicateRewriteVisitor.h b/src/Interpreters/PredicateRewriteVisitor.h index 02c8b9ca422..1132d93a5ec 100644 --- a/src/Interpreters/PredicateRewriteVisitor.h +++ b/src/Interpreters/PredicateRewriteVisitor.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -24,12 +25,13 @@ public: return true; } - PredicateRewriteVisitorData(const Context & context_, const ASTs & predicates_, Names && column_names_, bool optimize_final_, bool optimize_with_); + PredicateRewriteVisitorData(const Context & context_, const ASTs & predicates_, + const TableWithColumnNamesAndTypes & table_columns_, bool optimize_final_, bool optimize_with_); private: const Context & context; const ASTs & predicates; - const Names column_names; + const TableWithColumnNamesAndTypes & table_columns; bool optimize_final; bool optimize_with; @@ -37,7 +39,7 @@ private: void visitOtherInternalSelect(ASTSelectQuery & select_query, ASTPtr &); - bool rewriteSubquery(ASTSelectQuery & subquery, const Names & outer_columns, const Names & inner_columns); + bool rewriteSubquery(ASTSelectQuery & subquery, const Names & inner_columns); }; using PredicateRewriteMatcher = OneTypeMatcher; diff --git a/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.cpp b/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.cpp index 0ebca3661b4..ce7dd1abd51 100644 --- a/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.cpp @@ -80,9 +80,11 @@ namespace DB } - void ParallelFormattingOutputFormat::collectorThreadFunction() + void 
ParallelFormattingOutputFormat::collectorThreadFunction(const ThreadGroupStatusPtr & thread_group) { setThreadName("Collector"); + if (thread_group) + CurrentThread::attachToIfDetached(thread_group); try { @@ -135,9 +137,11 @@ namespace DB } - void ParallelFormattingOutputFormat::formatterThreadFunction(size_t current_unit_number) + void ParallelFormattingOutputFormat::formatterThreadFunction(size_t current_unit_number, const ThreadGroupStatusPtr & thread_group) { setThreadName("Formatter"); + if (thread_group) + CurrentThread::attachToIfDetached(thread_group); try { diff --git a/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.h b/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.h index 584aa364d27..8b9e8293c69 100644 --- a/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.h +++ b/src/Processors/Formats/Impl/ParallelFormattingOutputFormat.h @@ -76,7 +76,10 @@ public: /// Just heuristic. We need one thread for collecting, one thread for receiving chunks /// and n threads for formatting. processing_units.resize(params.max_threads_for_parallel_formatting + 2); - collector_thread = ThreadFromGlobalPool([&] { collectorThreadFunction(); }); + collector_thread = ThreadFromGlobalPool([thread_group = CurrentThread::getGroup(), this] + { + collectorThreadFunction(thread_group); + }); LOG_TRACE(&Poco::Logger::get("ParallelFormattingOutputFormat"), "Parallel formatting is being used"); } @@ -200,14 +203,17 @@ private: void scheduleFormatterThreadForUnitWithNumber(size_t ticket_number) { - pool.scheduleOrThrowOnError([this, ticket_number] { formatterThreadFunction(ticket_number); }); + pool.scheduleOrThrowOnError([this, thread_group = CurrentThread::getGroup(), ticket_number] + { + formatterThreadFunction(ticket_number, thread_group); + }); } /// Collects all temporary buffers into main WriteBuffer. 
- void collectorThreadFunction(); + void collectorThreadFunction(const ThreadGroupStatusPtr & thread_group); /// This function is executed in ThreadPool and the only purpose of it is to format one Chunk into a continuous buffer in memory. - void formatterThreadFunction(size_t current_unit_number); + void formatterThreadFunction(size_t current_unit_number, const ThreadGroupStatusPtr & thread_group); }; } diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp index ccb66259e2e..0db99fc7b0e 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp @@ -66,14 +66,16 @@ void CollapsingSortedAlgorithm::insertRow(RowRef & row) merged_data.insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); } -void CollapsingSortedAlgorithm::insertRows() +std::optional CollapsingSortedAlgorithm::insertRows() { if (count_positive == 0 && count_negative == 0) { /// No input rows have been read. - return; + return {}; } + std::optional res; + if (last_is_positive || count_positive != count_negative) { if (count_positive <= count_negative && !only_positive_sign) @@ -86,6 +88,9 @@ void CollapsingSortedAlgorithm::insertRows() if (count_positive >= count_negative) { + if (merged_data.hasEnoughRows()) + res = merged_data.pull(); + insertRow(last_positive_row); if (out_row_sources_buf) @@ -107,10 +112,16 @@ void CollapsingSortedAlgorithm::insertRows() out_row_sources_buf->write( reinterpret_cast(current_row_sources.data()), current_row_sources.size() * sizeof(RowSourcePart)); + + return res; } IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() { + /// Rare case, which may happen when index_granularity is 1, but we needed to insert 2 rows inside insertRows(). 
+ if (merged_data.hasEnoughRows()) + return Status(merged_data.pull()); + /// Take rows in required order and put them into `merged_data`, while the rows are no more than `max_block_size` while (queue.isValid()) { @@ -132,15 +143,14 @@ IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() setRowRef(last_row, current); bool key_differs = !last_row.hasEqualSortColumnsWith(current_row); - - /// if there are enough rows and the last one is calculated completely - if (key_differs && merged_data.hasEnoughRows()) - return Status(merged_data.pull()); - if (key_differs) { + /// if there are enough rows and the last one is calculated completely + if (merged_data.hasEnoughRows()) + return Status(merged_data.pull()); + /// We write data for the previous primary key. - insertRows(); + auto res = insertRows(); current_row.swap(last_row); @@ -151,6 +161,12 @@ IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() first_negative_pos = 0; last_positive_pos = 0; current_row_sources.resize(0); + + /// Here we can return ready chunk. + /// Next iteration, last_row == current_row, and all the counters are zeroed. + /// So, current_row should be correctly processed. + if (res) + return Status(std::move(*res)); } /// Initially, skip all rows. On insert, unskip "corner" rows. @@ -194,7 +210,15 @@ IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() } } - insertRows(); + if (auto res = insertRows()) + { + /// Queue is empty, and we have inserted all the rows. + /// Set counter to zero so that insertRows() will return immediately next time. 
+ count_positive = 0; + count_negative = 0; + return Status(std::move(*res)); + } + return Status(merged_data.pull(), true); } diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h index 028715f715b..18ebaad5596 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h @@ -66,7 +66,11 @@ private: void reportIncorrectData(); void insertRow(RowRef & row); - void insertRows(); + + /// Insert ready rows into merged_data. We may want to insert 0, 1 or 2 rows. + /// It may happen that 2 rows are going to be inserted, but merged_data has free space only for 1 row. + /// In this case, a Chunk with the ready rows is pulled from merged_data before the second insertion. + std::optional insertRows(); }; } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index ae9358c6159..efda9bbfec3 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1409,22 +1409,49 @@ void TCPHandler::sendData(const Block & block) { initBlockOutput(block); - writeVarUInt(Protocol::Server::Data, *out); - /// Send external table name (empty name is the main table) - writeStringBinary("", *out); + auto prev_bytes_written_out = out->count(); + auto prev_bytes_written_compressed_out = state.maybe_compressed_out->count(); - /// For testing hedged requests - const Settings & settings = query_context->getSettingsRef(); - if (block.rows() > 0 && settings.sleep_in_send_data_ms.totalMilliseconds()) + try { - out->next(); - std::chrono::milliseconds ms(settings.sleep_in_send_data_ms.totalMilliseconds()); - std::this_thread::sleep_for(ms); - } + writeVarUInt(Protocol::Server::Data, *out); + /// Send external table name (empty name is the main table) + writeStringBinary("", *out); - state.block_out->write(block); - state.maybe_compressed_out->next(); - out->next(); + /// For testing hedged requests + const Settings & settings
= query_context->getSettingsRef(); + if (block.rows() > 0 && settings.sleep_in_send_data_ms.totalMilliseconds()) + { + out->next(); + std::chrono::milliseconds ms(settings.sleep_in_send_data_ms.totalMilliseconds()); + std::this_thread::sleep_for(ms); + } + + state.block_out->write(block); + state.maybe_compressed_out->next(); + out->next(); + } + catch (...) + { + /// In case of unsuccessful write, if the buffer with written data was not flushed, + /// we will roll back the write to avoid breaking the protocol. + /// (otherwise the client will not be able to receive the exception after unfinished data + /// as it will expect the continuation of the data). + /// This manifests as a hang on the client side or a message like "Data compressed with different methods". + + if (state.compression == Protocol::Compression::Enable) + { + auto extra_bytes_written_compressed = state.maybe_compressed_out->count() - prev_bytes_written_compressed_out; + if (state.maybe_compressed_out->offset() >= extra_bytes_written_compressed) + state.maybe_compressed_out->position() -= extra_bytes_written_compressed; + } + + auto extra_bytes_written_out = out->count() - prev_bytes_written_out; + if (out->offset() >= extra_bytes_written_out) + out->position() -= extra_bytes_written_out; + + throw; + } } diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index cf8de4456dd..862a3088f89 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -543,11 +543,22 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( static const String TMP_PREFIX = "tmp_fetch_"; String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; + /// We will remove the directory if it already exists. Take precautions.
+ if (tmp_prefix.empty() + || part_name.empty() + || std::string::npos != tmp_prefix.find_first_of("/.") + || std::string::npos != part_name.find_first_of("/.")) + throw Exception("Logical error: tmp_prefix and part_name cannot be empty or contain '.' or '/' characters.", ErrorCodes::LOGICAL_ERROR); + String part_relative_path = String(to_detached ? "detached/" : "") + tmp_prefix + part_name; String part_download_path = data.getRelativeDataPath() + part_relative_path + "/"; if (disk->exists(part_download_path)) - throw Exception("Directory " + fullPath(disk, part_download_path) + " already exists.", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + { + LOG_WARNING(log, "Directory {} already exists, probably result of a failed fetch. Will remove it before fetching part.", + fullPath(disk, part_download_path)); + disk->removeRecursive(part_download_path); + } disk->createDirectories(part_download_path); diff --git a/src/Storages/System/StorageSystemDictionaries.cpp b/src/Storages/System/StorageSystemDictionaries.cpp index cccd23ffbd1..378905b7dc0 100644 --- a/src/Storages/System/StorageSystemDictionaries.cpp +++ b/src/Storages/System/StorageSystemDictionaries.cpp @@ -58,7 +58,7 @@ void StorageSystemDictionaries::fillData(MutableColumns & res_columns, const Con const auto & external_dictionaries = context.getExternalDictionariesLoader(); for (const auto & load_result : external_dictionaries.getLoadResults()) { - const auto dict_ptr = std::dynamic_pointer_cast(load_result.object); + const auto dict_ptr = std::dynamic_pointer_cast(load_result.object); DictionaryStructure dictionary_structure = ExternalDictionariesLoader::getDictionaryStructure(*load_result.config); StorageID dict_id = StorageID::createEmpty(); diff --git a/tests/config/config.d/database_replicated.xml b/tests/config/config.d/database_replicated.xml index ed5845bad48..c2e62f9645a 100644 --- a/tests/config/config.d/database_replicated.xml +++ b/tests/config/config.d/database_replicated.xml @@ -21,6 +21,9 @@ 5000 
10000 + 1000 + 2000 + 4000 trace false diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index d4c3ae06b72..5bc30ab1d6b 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -96,7 +96,7 @@ def test_s3_zero_copy_on_hybrid_storage(cluster): node1.query( """ CREATE TABLE hybrid_test ON CLUSTER test_cluster (id UInt32, value String) - ENGINE=ReplicatedMergeTree('/clickhouse/tables/s3_test', '{}') + ENGINE=ReplicatedMergeTree('/clickhouse/tables/hybrid_test', '{}') ORDER BY id SETTINGS storage_policy='hybrid' """ @@ -131,3 +131,6 @@ def test_s3_zero_copy_on_hybrid_storage(cluster): assert node1.query("SELECT * FROM hybrid_test ORDER BY id FORMAT Values") == "(0,'data'),(1,'data')" assert node2.query("SELECT * FROM hybrid_test ORDER BY id FORMAT Values") == "(0,'data'),(1,'data')" + + node1.query("DROP TABLE IF EXISTS hybrid_test NO DELAY") + node2.query("DROP TABLE IF EXISTS hybrid_test NO DELAY") diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index ca89ebdea0a..50fcdd8d77e 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -253,12 +253,20 @@ def test_rabbitmq_csv_with_delimiter(rabbitmq_cluster): @pytest.mark.timeout(240) def test_rabbitmq_tsv_with_delimiter(rabbitmq_cluster): instance.query(''' + DROP TABLE IF EXISTS test.view; + DROP TABLE IF EXISTS test.consumer; CREATE TABLE test.rabbitmq (key UInt64, value UInt64) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'rabbitmq1:5672', rabbitmq_exchange_name = 'tsv', rabbitmq_format = 'TSV', + rabbitmq_queue_base = 'tsv', rabbitmq_row_delimiter = '\\n'; + CREATE TABLE test.view (key UInt64, value UInt64) + ENGINE = MergeTree() + ORDER BY key; + CREATE MATERIALIZED VIEW test.consumer TO test.view AS + SELECT * FROM test.rabbitmq; ''') 
credentials = pika.PlainCredentials('root', 'clickhouse') @@ -272,13 +280,11 @@ def test_rabbitmq_tsv_with_delimiter(rabbitmq_cluster): for message in messages: channel.basic_publish(exchange='tsv', routing_key='', body=message) - connection.close() - time.sleep(1) result = '' while True: - result += instance.query('SELECT * FROM test.rabbitmq ORDER BY key', ignore_error=True) + result = instance.query('SELECT * FROM test.view ORDER BY key') if rabbitmq_check_result(result): break diff --git a/tests/performance/direct_dictionary.xml b/tests/performance/direct_dictionary.xml index 68b52d917dd..97ecdfe3e95 100644 --- a/tests/performance/direct_dictionary.xml +++ b/tests/performance/direct_dictionary.xml @@ -1,38 +1,17 @@ - CREATE TABLE simple_direct_dictionary_test_table + CREATE TABLE simple_key_direct_dictionary_source_table ( id UInt64, value_int UInt64, value_string String, value_decimal Decimal64(8), value_string_nullable Nullable(String) - ) ENGINE = TinyLog; + ) ENGINE = Memory; - INSERT INTO simple_direct_dictionary_test_table - SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) - FROM system.numbers - LIMIT 100000; - - - - CREATE DICTIONARY simple_direct_dictionary - ( - id UInt64, - value_int UInt64, - value_string String, - value_decimal Decimal64(8), - value_string_nullable Nullable(String) - ) - PRIMARY KEY id - SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_direct_dictionary_test_table')) - LAYOUT(DIRECT()) - - - - CREATE TABLE complex_direct_dictionary_test_table + CREATE TABLE complex_key_direct_dictionary_source_table ( id UInt64, id_key String, @@ -44,14 +23,21 @@ - INSERT INTO complex_direct_dictionary_test_table - SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number) - FROM system.numbers - LIMIT 100000; + CREATE DICTIONARY simple_key_direct_dictionary + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable 
Nullable(String) + ) + PRIMARY KEY id + SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_direct_dictionary_source_table')) + LAYOUT(DIRECT()) - CREATE DICTIONARY complex_direct_dictionary + CREATE DICTIONARY complex_key_direct_dictionary ( id UInt64, id_key String, @@ -61,20 +47,76 @@ value_string_nullable Nullable(String) ) PRIMARY KEY id, id_key - SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_direct_dictionary_test_table')) + SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_key_direct_dictionary_source_table')) LAYOUT(COMPLEX_KEY_DIRECT()) - SELECT dictGet('default.simple_direct_dictionary', 'value_int', number) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.simple_direct_dictionary', 'value_string', number) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.simple_direct_dictionary', 'value_decimal', number) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.simple_direct_dictionary', 'value_string_nullable', number) FROM system.numbers LIMIT 150000; - SELECT dictHas('default.simple_direct_dictionary', number) FROM system.numbers LIMIT 150000; + + INSERT INTO simple_key_direct_dictionary_source_table + SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 100000; + - SELECT dictGet('default.complex_direct_dictionary', 'value_int', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.complex_direct_dictionary', 'value_string', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.complex_direct_dictionary', 'value_decimal', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictGet('default.complex_direct_dictionary', 'value_string_nullable', (number, toString(number))) FROM system.numbers LIMIT 150000; - SELECT dictHas('default.complex_direct_dictionary', (number, toString(number))) FROM system.numbers LIMIT 150000; + + INSERT INTO complex_key_direct_dictionary_source_table + SELECT number, 
toString(number), number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 100000; + + + + + column_name + + 'value_int' + 'value_string' + 'value_decimal' + 'value_string_nullable' + + + + + elements_count + + 25000 + 50000 + 75000 + 100000 + + + + + + SELECT dictGet('default.simple_key_direct_dictionary', {column_name}, number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.simple_key_direct_dictionary', number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + + SELECT dictGet('default.complex_key_direct_dictionary', {column_name}, (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.complex_key_direct_dictionary', (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + DROP TABLE IF EXISTS simple_key_direct_dictionary_source_table; + DROP TABLE IF EXISTS complex_key_direct_dictionary_source_table; + + DROP DICTIONARY IF EXISTS simple_key_direct_dictionary; + DROP DICTIONARY IF EXISTS complex_key_direct_dictionary; diff --git a/tests/performance/flat_dictionary.xml b/tests/performance/flat_dictionary.xml new file mode 100644 index 00000000000..426aa929bbc --- /dev/null +++ b/tests/performance/flat_dictionary.xml @@ -0,0 +1,75 @@ + + + CREATE TABLE simple_key_flat_dictionary_source_table + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory; + + + + CREATE DICTIONARY simple_key_flat_dictionary + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id + SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_flat_dictionary_source_table')) + LAYOUT(FLAT()) + LIFETIME(MIN 0 MAX 1000) + + + + INSERT INTO simple_key_flat_dictionary_source_table + SELECT number, number, 
toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 500000; + + + + + column_name + + 'value_int' + 'value_string' + 'value_decimal' + 'value_string_nullable' + + + + + elements_count + + 250000 + 500000 + 750000 + 1000000 + + + + + + SELECT dictGet('default.simple_key_flat_dictionary', {column_name}, number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + + SELECT dictHas('default.simple_key_flat_dictionary', number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + DROP TABLE IF EXISTS simple_key_flat_dictionary_source_table + + DROP DICTIONARY IF EXISTS simple_key_flat_dictionary + + diff --git a/tests/performance/hashed_dictionary.xml b/tests/performance/hashed_dictionary.xml new file mode 100644 index 00000000000..a38d2f30c23 --- /dev/null +++ b/tests/performance/hashed_dictionary.xml @@ -0,0 +1,124 @@ + + + CREATE TABLE simple_key_hashed_dictionary_source_table + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory; + + + + CREATE TABLE complex_key_hashed_dictionary_source_table + ( + id UInt64, + id_key String, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory; + + + + CREATE DICTIONARY simple_key_hashed_dictionary + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id + SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_hashed_dictionary_source_table')) + LAYOUT(HASHED()) + LIFETIME(MIN 0 MAX 1000); + + + + CREATE DICTIONARY complex_key_hashed_dictionary + ( + id UInt64, + id_key String, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id, id_key + SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_key_hashed_dictionary_source_table')) +
LAYOUT(COMPLEX_KEY_HASHED()) + LIFETIME(MIN 0 MAX 1000); + + + + INSERT INTO simple_key_hashed_dictionary_source_table + SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 5000000; + + + + INSERT INTO complex_key_hashed_dictionary_source_table + SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 5000000; + + + + + column_name + + 'value_int' + 'value_string' + 'value_decimal' + 'value_string_nullable' + + + + + elements_count + + 2500000 + 5000000 + 7500000 + 10000000 + + + + + + SELECT dictGet('default.simple_key_hashed_dictionary', {column_name}, number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.simple_key_hashed_dictionary', number) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + + SELECT dictGet('default.complex_key_hashed_dictionary', {column_name}, (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + SELECT dictHas('default.complex_key_hashed_dictionary', (number, toString(number))) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null; + + + DROP TABLE IF EXISTS simple_key_hashed_dictionary_source_table; + DROP TABLE IF EXISTS complex_key_hashed_dictionary_source_table; + + DROP DICTIONARY IF EXISTS simple_key_hashed_dictionary; + DROP DICTIONARY IF EXISTS complex_key_hashed_dictionary; + + diff --git a/tests/queries/0_stateless/00539_functions_for_working_with_json.reference b/tests/queries/0_stateless/00539_functions_for_working_with_json.reference index c0399f8ab2e..4d3527722a1 100644 --- a/tests/queries/0_stateless/00539_functions_for_working_with_json.reference +++ b/tests/queries/0_stateless/00539_functions_for_working_with_json.reference @@ -13,3 +13,10 @@ test"string "[" ["]", "2", "3"] {"nested" : [1,2,3]} +-1 +0 +0 +-1 +1 +test_string +test"string diff --git 
a/tests/queries/0_stateless/00539_functions_for_working_with_json.sql b/tests/queries/0_stateless/00539_functions_for_working_with_json.sql index 514b5f2e5ea..31853e92262 100644 --- a/tests/queries/0_stateless/00539_functions_for_working_with_json.sql +++ b/tests/queries/0_stateless/00539_functions_for_working_with_json.sql @@ -15,3 +15,11 @@ SELECT visitParamExtractRaw('{"myparam": "{"}', 'myparam'); SELECT visitParamExtractRaw('{"myparam": "["}', 'myparam'); SELECT visitParamExtractRaw('{"myparam": ["]", "2", "3"], "other":123}', 'myparam'); SELECT visitParamExtractRaw('{"myparam": {"nested" : [1,2,3]}, "other":123}', 'myparam'); + +SELECT simpleJSONExtractInt('{"myparam":-1}', 'myparam'); +SELECT simpleJSONExtractUInt('{"myparam":-1}', 'myparam'); +SELECT simpleJSONExtractFloat('{"myparam":null}', 'myparam'); +SELECT simpleJSONExtractFloat('{"myparam":-1}', 'myparam'); +SELECT simpleJSONExtractBool('{"myparam":true}', 'myparam'); +SELECT simpleJSONExtractString('{"myparam":"test_string"}', 'myparam'); +SELECT simpleJSONExtractString('{"myparam":"test\\"string"}', 'myparam'); diff --git a/tests/queries/0_stateless/00597_push_down_predicate.reference b/tests/queries/0_stateless/00597_push_down_predicate.reference index bd1c4791df4..59313c35b81 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate.reference +++ b/tests/queries/0_stateless/00597_push_down_predicate.reference @@ -585,3 +585,15 @@ SEMI LEFT JOIN ) AS r USING (id) WHERE r.id = 1 2000-01-01 1 test string 1 1 2000-01-01 test string 1 1 +SELECT value + t1.value AS expr +FROM +( + SELECT + value, + t1.value + FROM test_00597 AS t0 + ALL FULL OUTER JOIN test_00597 AS t1 USING (date) + WHERE (value + `t1.value`) < 3 +) +WHERE expr < 3 +2 diff --git a/tests/queries/0_stateless/00597_push_down_predicate.sql b/tests/queries/0_stateless/00597_push_down_predicate.sql index ec306ac6792..2e3357241ad 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate.sql +++ 
b/tests/queries/0_stateless/00597_push_down_predicate.sql @@ -135,5 +135,9 @@ SELECT * FROM (SELECT * FROM (SELECT * FROM test_00597) AS a ANY LEFT JOIN (SELE EXPLAIN SYNTAX SELECT * FROM (SELECT * FROM test_00597) ANY INNER JOIN (SELECT * FROM (SELECT * FROM test_00597)) as r USING id WHERE r.id = 1; SELECT * FROM (SELECT * FROM test_00597) ANY INNER JOIN (SELECT * FROM (SELECT * FROM test_00597)) as r USING id WHERE r.id = 1; +-- issue 20497 +EXPLAIN SYNTAX SELECT value + t1.value AS expr FROM (SELECT t0.value, t1.value FROM test_00597 AS t0 FULL JOIN test_00597 AS t1 USING date) WHERE expr < 3; +SELECT value + t1.value AS expr FROM (SELECT t0.value, t1.value FROM test_00597 AS t0 FULL JOIN test_00597 AS t1 USING date) WHERE expr < 3; + DROP TABLE IF EXISTS test_00597; DROP TABLE IF EXISTS test_view_00597; diff --git a/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.reference b/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.reference index e5e283f754b..3a176a17f5a 100644 --- a/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.reference +++ b/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.reference @@ -1,7 +1,7 @@ 4 4 8 -7 +8 ----- 4 1 diff --git a/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.sql b/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.sql index d4c19cbe8f2..58b266f106f 100644 --- a/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.sql +++ b/tests/queries/0_stateless/00926_adaptive_index_granularity_collapsing_merge_tree.sql @@ -58,7 +58,7 @@ OPTIMIZE TABLE four_rows_per_granule FINAL; SELECT COUNT(*) FROM four_rows_per_granule; -SELECT distinct(marks) from system.parts WHERE table = 'four_rows_per_granule' and database=currentDatabase() and active=1; +SELECT sum(marks) from system.parts WHERE table = 'four_rows_per_granule' 
and database=currentDatabase() and active=1; INSERT INTO four_rows_per_granule (p, k, v1, v2, Sign) VALUES ('2018-05-15', 1, 1000, 2000, 1), ('2018-05-16', 2, 3000, 4000, 1), ('2018-05-17', 3, 5000, 6000, 1), ('2018-05-18', 4, 7000, 8000, 1); diff --git a/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference b/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference index 757d2858524..0a2c97efb42 100644 --- a/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference +++ b/tests/queries/0_stateless/01251_dict_is_in_infinite_loop.reference @@ -29,10 +29,10 @@ 1 1 1 -255 -255 0 -255 +0 +0 +0 [11,22] [22,11] [11,22] diff --git a/tests/queries/0_stateless/01764_collapsing_merge_adaptive_granularity.reference b/tests/queries/0_stateless/01764_collapsing_merge_adaptive_granularity.reference new file mode 100644 index 00000000000..0f128a62bbb --- /dev/null +++ b/tests/queries/0_stateless/01764_collapsing_merge_adaptive_granularity.reference @@ -0,0 +1,4 @@ +-8191 8193 +-8191 8193 +0 2 +0 2 diff --git a/tests/queries/0_stateless/01764_collapsing_merge_adaptive_granularity.sql b/tests/queries/0_stateless/01764_collapsing_merge_adaptive_granularity.sql new file mode 100644 index 00000000000..ca6465154ea --- /dev/null +++ b/tests/queries/0_stateless/01764_collapsing_merge_adaptive_granularity.sql @@ -0,0 +1,53 @@ +DROP TABLE IF EXISTS collapsing_table; +SET optimize_on_insert = 0; + +CREATE TABLE collapsing_table +( + key UInt64, + value UInt64, + Sign Int8 +) +ENGINE = CollapsingMergeTree(Sign) +ORDER BY key +SETTINGS + vertical_merge_algorithm_min_rows_to_activate=0, + vertical_merge_algorithm_min_columns_to_activate=0, + min_bytes_for_wide_part = 0; + +INSERT INTO collapsing_table SELECT if(number == 8192, 8191, number), 1, if(number == 8192, +1, -1) FROM numbers(8193); + +SELECT sum(Sign), count() from collapsing_table; + +OPTIMIZE TABLE collapsing_table FINAL; + +SELECT sum(Sign), count() from collapsing_table; + +DROP TABLE IF EXISTS 
collapsing_table; + + +DROP TABLE IF EXISTS collapsing_suspicious_granularity; + +CREATE TABLE collapsing_suspicious_granularity +( + key UInt64, + value UInt64, + Sign Int8 +) +ENGINE = CollapsingMergeTree(Sign) +ORDER BY key +SETTINGS + vertical_merge_algorithm_min_rows_to_activate=0, + vertical_merge_algorithm_min_columns_to_activate=0, + min_bytes_for_wide_part = 0, + index_granularity = 1; + +INSERT INTO collapsing_suspicious_granularity VALUES (1, 1, -1) (1, 1, 1); + +SELECT sum(Sign), count() from collapsing_suspicious_granularity; + +OPTIMIZE TABLE collapsing_suspicious_granularity FINAL; + +SELECT sum(Sign), count() from collapsing_suspicious_granularity; + + +DROP TABLE IF EXISTS collapsing_suspicious_granularity; diff --git a/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.reference b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.reference new file mode 100644 index 00000000000..2cc0a8668a2 --- /dev/null +++ b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.reference @@ -0,0 +1,132 @@ +Dictionary hashed_dictionary_simple_key_simple_attributes +dictGet existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 value_second_1 +2 value_2 value_second_2 +Dictionary sparse_hashed_dictionary_simple_key_simple_attributes +dictGet existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 
+value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 value_second_1 +2 value_2 value_second_2 +Dictionary hashed_dictionary_simple_key_complex_attributes +dictGet existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 \N +2 value_2 value_second_2 +Dictionary sparse_hashed_dictionary_simple_key_complex_attributes +dictGet existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 value_0 value_second_0 +1 value_1 \N +2 value_2 value_second_2 +Dictionary hashed_dictionary_simple_key_hierarchy +dictGet +0 +0 +1 +1 +2 +dictGetHierarchy +[1] +[4,2,1] +Dictionary sparse_hashed_dictionary_simple_key_hierarchy +dictGet +0 +0 +1 +1 +2 +dictGetHierarchy +[1] +[4,2,1] diff --git a/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.sql b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.sql new 
file mode 100644 index 00000000000..7502c6a93bb --- /dev/null +++ b/tests/queries/0_stateless/01765_hashed_dictionary_simple_key.sql @@ -0,0 +1,207 @@ +DROP DATABASE IF EXISTS 01765_db; +CREATE DATABASE 01765_db; + +CREATE TABLE 01765_db.simple_key_simple_attributes_source_table +( + id UInt64, + value_first String, + value_second String +) +ENGINE = TinyLog; + +INSERT INTO 01765_db.simple_key_simple_attributes_source_table VALUES(0, 'value_0', 'value_second_0'); +INSERT INTO 01765_db.simple_key_simple_attributes_source_table VALUES(1, 'value_1', 'value_second_1'); +INSERT INTO 01765_db.simple_key_simple_attributes_source_table VALUES(2, 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01765_db.hashed_dictionary_simple_key_simple_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second String DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 'Dictionary hashed_dictionary_simple_key_simple_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number, 
toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.hashed_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.hashed_dictionary_simple_key_simple_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.hashed_dictionary_simple_key_simple_attributes; + +CREATE DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_simple_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second String DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(SPARSE_HASHED()); + +SELECT 'Dictionary sparse_hashed_dictionary_simple_key_simple_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number, 
toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.sparse_hashed_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.sparse_hashed_dictionary_simple_key_simple_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_simple_attributes; + +DROP TABLE 01765_db.simple_key_simple_attributes_source_table; + +CREATE TABLE 01765_db.simple_key_complex_attributes_source_table +( + id UInt64, + value_first String, + value_second Nullable(String) +) +ENGINE = TinyLog; + +INSERT INTO 01765_db.simple_key_complex_attributes_source_table VALUES(0, 'value_0', 'value_second_0'); +INSERT INTO 01765_db.simple_key_complex_attributes_source_table VALUES(1, 'value_1', NULL); +INSERT INTO 01765_db.simple_key_complex_attributes_source_table VALUES(2, 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01765_db.hashed_dictionary_simple_key_complex_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second Nullable(String) DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 'Dictionary hashed_dictionary_simple_key_complex_attributes'; +SELECT 'dictGet existing value'; +SELECT 
dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first, + dictGet('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.hashed_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.hashed_dictionary_simple_key_complex_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.hashed_dictionary_simple_key_complex_attributes; + +CREATE DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_complex_attributes +( + id UInt64, + value_first String DEFAULT 'value_first_default', + value_second Nullable(String) DEFAULT 'value_second_default' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(SPARSE_HASHED());
+ +SELECT 'Dictionary sparse_hashed_dictionary_simple_key_complex_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first, + dictGet('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first, + dictGetOrDefault('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01765_db.sparse_hashed_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01765_db.sparse_hashed_dictionary_simple_key_complex_attributes ORDER BY id; + +DROP DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_complex_attributes; + +DROP TABLE 01765_db.simple_key_complex_attributes_source_table; + +CREATE TABLE 01765_db.simple_key_hierarchy_table +( + id UInt64, + parent_id UInt64 +) ENGINE = TinyLog(); + +INSERT INTO 
01765_db.simple_key_hierarchy_table VALUES (1, 0); +INSERT INTO 01765_db.simple_key_hierarchy_table VALUES (2, 1); +INSERT INTO 01765_db.simple_key_hierarchy_table VALUES (3, 1); +INSERT INTO 01765_db.simple_key_hierarchy_table VALUES (4, 2); + +CREATE DICTIONARY 01765_db.hashed_dictionary_simple_key_hierarchy +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(HASHED()); + +SELECT 'Dictionary hashed_dictionary_simple_key_hierarchy'; +SELECT 'dictGet'; +SELECT dictGet('01765_db.hashed_dictionary_simple_key_hierarchy', 'parent_id', number) FROM system.numbers LIMIT 5; +SELECT 'dictGetHierarchy'; +SELECT dictGetHierarchy('01765_db.hashed_dictionary_simple_key_hierarchy', toUInt64(1)); +SELECT dictGetHierarchy('01765_db.hashed_dictionary_simple_key_hierarchy', toUInt64(4)); + +DROP DICTIONARY 01765_db.hashed_dictionary_simple_key_hierarchy; + +CREATE DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_hierarchy +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(SPARSE_HASHED()); + +SELECT 'Dictionary sparse_hashed_dictionary_simple_key_hierarchy'; +SELECT 'dictGet'; +SELECT dictGet('01765_db.sparse_hashed_dictionary_simple_key_hierarchy', 'parent_id', number) FROM system.numbers LIMIT 5; +SELECT 'dictGetHierarchy'; +SELECT dictGetHierarchy('01765_db.sparse_hashed_dictionary_simple_key_hierarchy', toUInt64(1)); +SELECT dictGetHierarchy('01765_db.sparse_hashed_dictionary_simple_key_hierarchy', toUInt64(4)); + +DROP DICTIONARY 01765_db.sparse_hashed_dictionary_simple_key_hierarchy; + +DROP TABLE 01765_db.simple_key_hierarchy_table; + +DROP DATABASE 01765_db; diff --git a/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.reference
b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.reference new file mode 100644 index 00000000000..12c210581c2 --- /dev/null +++ b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.reference @@ -0,0 +1,56 @@ +Dictionary hashed_dictionary_complex_key_simple_attributes +dictGet existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 value_second_1 +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 id_key_0 value_0 value_second_0 +1 id_key_1 value_1 value_second_1 +2 id_key_2 value_2 value_second_2 +Dictionary hashed_dictionary_complex_key_complex_attributes +dictGet existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGet with non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +value_first_default value_second_default +dictGetOrDefault existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +dictGetOrDefault non existing value +value_0 value_second_0 +value_1 \N +value_2 value_second_2 +default default +dictHas +1 +1 +1 +0 +select all values as input stream +0 id_key_0 value_0 value_second_0 +1 id_key_1 value_1 \N +2 id_key_2 value_2 value_second_2 diff --git a/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.sql b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.sql new file mode 100644 index 00000000000..de7ab5b5a1a --- /dev/null +++ b/tests/queries/0_stateless/01766_hashed_dictionary_complex_key.sql @@ -0,0 +1,98 @@ +DROP DATABASE IF EXISTS 01766_db; +CREATE DATABASE 01766_db; + +CREATE TABLE 01766_db.complex_key_simple_attributes_source_table 
+( + id UInt64, + id_key String, + value_first String, + value_second String +) +ENGINE = TinyLog; + +INSERT INTO 01766_db.complex_key_simple_attributes_source_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0'); +INSERT INTO 01766_db.complex_key_simple_attributes_source_table VALUES(1, 'id_key_1', 'value_1', 'value_second_1'); +INSERT INTO 01766_db.complex_key_simple_attributes_source_table VALUES(2, 'id_key_2', 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01766_db.hashed_dictionary_complex_key_simple_attributes +( + id UInt64, + id_key String, + value_first String DEFAULT 'value_first_default', + value_second String DEFAULT 'value_second_default' +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_simple_attributes_source_table' DB '01766_db')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_HASHED()); + +SELECT 'Dictionary hashed_dictionary_complex_key_simple_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + 
dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + dictGetOrDefault('01766_db.hashed_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01766_db.hashed_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 01766_db.hashed_dictionary_complex_key_simple_attributes ORDER BY (id, id_key); + +DROP DICTIONARY 01766_db.hashed_dictionary_complex_key_simple_attributes; + +DROP TABLE 01766_db.complex_key_simple_attributes_source_table; + +CREATE TABLE 01766_db.complex_key_complex_attributes_source_table +( + id UInt64, + id_key String, + value_first String, + value_second Nullable(String) +) +ENGINE = TinyLog; + +INSERT INTO 01766_db.complex_key_complex_attributes_source_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0'); +INSERT INTO 01766_db.complex_key_complex_attributes_source_table VALUES(1, 'id_key_1', 'value_1', NULL); +INSERT INTO 01766_db.complex_key_complex_attributes_source_table VALUES(2, 'id_key_2', 'value_2', 'value_second_2'); + +CREATE DICTIONARY 01766_db.hashed_dictionary_complex_key_complex_attributes +( + id UInt64, + id_key String, + + value_first String DEFAULT 'value_first_default', + value_second Nullable(String) DEFAULT 'value_second_default' +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_complex_attributes_source_table' DB 
'01766_db')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_HASHED()); + +SELECT 'Dictionary hashed_dictionary_complex_key_complex_attributes'; +SELECT 'dictGet existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGet with non existing value'; +SELECT dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first, + dictGet('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictGetOrDefault existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 3; +SELECT 'dictGetOrDefault non existing value'; +SELECT dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first, + dictGetOrDefault('01766_db.hashed_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 4; +SELECT 'dictHas'; +SELECT dictHas('01766_db.hashed_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; +SELECT 'select all values as input stream'; +SELECT * FROM 
01766_db.hashed_dictionary_complex_key_complex_attributes ORDER BY (id, id_key); + +DROP DICTIONARY 01766_db.hashed_dictionary_complex_key_complex_attributes; +DROP TABLE 01766_db.complex_key_complex_attributes_source_table; + +DROP DATABASE 01766_db; diff --git a/tests/queries/0_stateless/01778_hierarchical_dictionaries.reference b/tests/queries/0_stateless/01778_hierarchical_dictionaries.reference new file mode 100644 index 00000000000..5fe5f5f1db6 --- /dev/null +++ b/tests/queries/0_stateless/01778_hierarchical_dictionaries.reference @@ -0,0 +1,102 @@ +Flat dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] +Get is in hierarchy +0 +1 +1 +1 +1 +0 +Get children +[1] +[2,3] +[4] +[] +[] +[] +Get all descendants +[1,2,3,4] +[2,3,4] +[4] +[] +[] +[] +Get descendants at first level +[1] +[2,3] +[4] +[] +[] +[] +Hashed dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] +Get is in hierarchy +0 +1 +1 +1 +1 +0 +Get children +[1] +[3,2] +[4] +[] +[] +[] +Get all descendants +[1,3,2,4] +[3,2,4] +[4] +[] +[] +[] +Get descendants at first level +[1] +[3,2] +[4] +[] +[] +[] +Cache dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] +Get is in hierarchy +0 +1 +1 +1 +1 +0 +Direct dictionary +Get hierarchy +[] +[1] +[2,1] +[3,1] +[4,2,1] +[] +Get is in hierarchy +0 +1 +1 +1 +1 +0 diff --git a/tests/queries/0_stateless/01778_hierarchical_dictionaries.sql b/tests/queries/0_stateless/01778_hierarchical_dictionaries.sql new file mode 100644 index 00000000000..f6e1a7c9375 --- /dev/null +++ b/tests/queries/0_stateless/01778_hierarchical_dictionaries.sql @@ -0,0 +1,95 @@ +DROP DATABASE IF EXISTS 01778_db; +CREATE DATABASE 01778_db; + +CREATE TABLE 01778_db.hierarchy_source_table (id UInt64, parent_id UInt64) ENGINE = TinyLog; +INSERT INTO 01778_db.hierarchy_source_table VALUES (1, 0), (2, 1), (3, 1), (4, 2); + +CREATE DICTIONARY 01778_db.hierarchy_flat_dictionary +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 
'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(FLAT()) +LIFETIME(MIN 1 MAX 1000); + +SELECT 'Flat dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_flat_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_flat_dictionary', number, number) FROM system.numbers LIMIT 6; +SELECT 'Get children'; +SELECT dictGetChildren('01778_db.hierarchy_flat_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get all descendants'; +SELECT dictGetDescendants('01778_db.hierarchy_flat_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get descendants at first level'; +SELECT dictGetDescendants('01778_db.hierarchy_flat_dictionary', number, 1) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_flat_dictionary; + +CREATE DICTIONARY 01778_db.hierarchy_hashed_dictionary +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(HASHED()) +LIFETIME(MIN 1 MAX 1000); + +SELECT 'Hashed dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_hashed_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_hashed_dictionary', number, number) FROM system.numbers LIMIT 6; +SELECT 'Get children'; +SELECT dictGetChildren('01778_db.hierarchy_hashed_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get all descendants'; +SELECT dictGetDescendants('01778_db.hierarchy_hashed_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get descendants at first level'; +SELECT dictGetDescendants('01778_db.hierarchy_hashed_dictionary', number, 1) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_hashed_dictionary; + +CREATE DICTIONARY 01778_db.hierarchy_cache_dictionary +( + id UInt64, + 
parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(CACHE(SIZE_IN_CELLS 10)) +LIFETIME(MIN 1 MAX 1000); + +SELECT 'Cache dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_cache_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_cache_dictionary', number, number) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_cache_dictionary; + +CREATE DICTIONARY 01778_db.hierarchy_direct_dictionary +( + id UInt64, + parent_id UInt64 HIERARCHICAL +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hierarchy_source_table' DB '01778_db')) +LAYOUT(DIRECT()); + +SELECT 'Direct dictionary'; + +SELECT 'Get hierarchy'; +SELECT dictGetHierarchy('01778_db.hierarchy_direct_dictionary', number) FROM system.numbers LIMIT 6; +SELECT 'Get is in hierarchy'; +SELECT dictIsIn('01778_db.hierarchy_direct_dictionary', number, number) FROM system.numbers LIMIT 6; + +DROP DICTIONARY 01778_db.hierarchy_direct_dictionary; + +DROP TABLE 01778_db.hierarchy_source_table; +DROP DATABASE 01778_db; diff --git a/tests/queries/0_stateless/01780_dict_get_or_null.reference b/tests/queries/0_stateless/01780_dict_get_or_null.reference new file mode 100644 index 00000000000..4baca9ec91b --- /dev/null +++ b/tests/queries/0_stateless/01780_dict_get_or_null.reference @@ -0,0 +1,18 @@ +Simple key dictionary dictGetOrNull +0 0 \N \N (NULL,NULL) +1 1 First First ('First','First') +2 1 Second \N ('Second',NULL) +3 1 Third Third ('Third','Third') +4 0 \N \N (NULL,NULL) +Complex key dictionary dictGetOrNull +(0,'key') 0 \N \N (NULL,NULL) +(1,'key') 1 First First ('First','First') +(2,'key') 1 Second \N ('Second',NULL) +(3,'key') 1 Third Third ('Third','Third') +(4,'key') 0 \N \N (NULL,NULL) +Range key dictionary dictGetOrNull +(0,'2019-05-20') 0 \N \N 
(NULL,NULL) +(1,'2019-05-20') 1 First First ('First','First') +(2,'2019-05-20') 1 Second \N ('Second',NULL) +(3,'2019-05-20') 1 Third Third ('Third','Third') +(4,'2019-05-20') 0 \N \N (NULL,NULL) diff --git a/tests/queries/0_stateless/01780_dict_get_or_null.sql b/tests/queries/0_stateless/01780_dict_get_or_null.sql new file mode 100644 index 00000000000..f13bcf57d27 --- /dev/null +++ b/tests/queries/0_stateless/01780_dict_get_or_null.sql @@ -0,0 +1,116 @@ +DROP TABLE IF EXISTS simple_key_dictionary_source_table; +CREATE TABLE simple_key_dictionary_source_table +( + id UInt64, + value String, + value_nullable Nullable(String) +) ENGINE = TinyLog; + +INSERT INTO simple_key_dictionary_source_table VALUES (1, 'First', 'First'); +INSERT INTO simple_key_dictionary_source_table VALUES (2, 'Second', NULL); +INSERT INTO simple_key_dictionary_source_table VALUES (3, 'Third', 'Third'); + +DROP DICTIONARY IF EXISTS simple_key_dictionary; +CREATE DICTIONARY simple_key_dictionary +( + id UInt64, + value String, + value_nullable Nullable(String) +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_dictionary_source_table')) +LAYOUT(DIRECT()); + +SELECT 'Simple key dictionary dictGetOrNull'; + +SELECT + number, + dictHas('simple_key_dictionary', number), + dictGetOrNull('simple_key_dictionary', 'value', number), + dictGetOrNull('simple_key_dictionary', 'value_nullable', number), + dictGetOrNull('simple_key_dictionary', ('value', 'value_nullable'), number) +FROM system.numbers LIMIT 5; + +DROP DICTIONARY simple_key_dictionary; +DROP TABLE simple_key_dictionary_source_table; + +DROP TABLE IF EXISTS complex_key_dictionary_source_table; +CREATE TABLE complex_key_dictionary_source_table +( + id UInt64, + id_key String, + value String, + value_nullable Nullable(String) +) ENGINE = TinyLog; + +INSERT INTO complex_key_dictionary_source_table VALUES (1, 'key', 'First', 'First'); +INSERT INTO complex_key_dictionary_source_table VALUES (2, 'key', 'Second', 
NULL); +INSERT INTO complex_key_dictionary_source_table VALUES (3, 'key', 'Third', 'Third'); + +DROP DICTIONARY IF EXISTS complex_key_dictionary; +CREATE DICTIONARY complex_key_dictionary +( + id UInt64, + id_key String, + value String, + value_nullable Nullable(String) +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_dictionary_source_table')) +LAYOUT(COMPLEX_KEY_DIRECT()); + +SELECT 'Complex key dictionary dictGetOrNull'; + +SELECT + (number, 'key'), + dictHas('complex_key_dictionary', (number, 'key')), + dictGetOrNull('complex_key_dictionary', 'value', (number, 'key')), + dictGetOrNull('complex_key_dictionary', 'value_nullable', (number, 'key')), + dictGetOrNull('complex_key_dictionary', ('value', 'value_nullable'), (number, 'key')) +FROM system.numbers LIMIT 5; + +DROP DICTIONARY complex_key_dictionary; +DROP TABLE complex_key_dictionary_source_table; + +DROP TABLE IF EXISTS range_key_dictionary_source_table; +CREATE TABLE range_key_dictionary_source_table +( + key UInt64, + start_date Date, + end_date Date, + value String, + value_nullable Nullable(String) +) +ENGINE = TinyLog(); + +INSERT INTO range_key_dictionary_source_table VALUES(1, toDate('2019-05-20'), toDate('2019-05-20'), 'First', 'First'); +INSERT INTO range_key_dictionary_source_table VALUES(2, toDate('2019-05-20'), toDate('2019-05-20'), 'Second', NULL); +INSERT INTO range_key_dictionary_source_table VALUES(3, toDate('2019-05-20'), toDate('2019-05-20'), 'Third', 'Third'); + +DROP DICTIONARY IF EXISTS range_key_dictionary; +CREATE DICTIONARY range_key_dictionary +( + key UInt64, + start_date Date, + end_date Date, + value String, + value_nullable Nullable(String) +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'range_key_dictionary_source_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start_date MAX end_date); + +SELECT 'Range key dictionary dictGetOrNull'; + +SELECT + (number, toDate('2019-05-20')), + 
dictHas('range_key_dictionary', number, toDate('2019-05-20')), + dictGetOrNull('range_key_dictionary', 'value', number, toDate('2019-05-20')), + dictGetOrNull('range_key_dictionary', 'value_nullable', number, toDate('2019-05-20')), + dictGetOrNull('range_key_dictionary', ('value', 'value_nullable'), number, toDate('2019-05-20')) +FROM system.numbers LIMIT 5; + +DROP DICTIONARY range_key_dictionary; +DROP TABLE range_key_dictionary_source_table; diff --git a/tests/queries/0_stateless/01783_parallel_formatting_memory.reference b/tests/queries/0_stateless/01783_parallel_formatting_memory.reference new file mode 100644 index 00000000000..c5cdc5cf0bb --- /dev/null +++ b/tests/queries/0_stateless/01783_parallel_formatting_memory.reference @@ -0,0 +1 @@ +Code: 241 diff --git a/tests/queries/0_stateless/01783_parallel_formatting_memory.sh b/tests/queries/0_stateless/01783_parallel_formatting_memory.sh new file mode 100755 index 00000000000..0b8cb0bc6be --- /dev/null +++ b/tests/queries/0_stateless/01783_parallel_formatting_memory.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&max_memory_usage=1G" -d "SELECT range(65535) FROM system.one ARRAY JOIN range(65536) AS number" | grep -oF 'Code: 241' diff --git a/tests/queries/0_stateless/01784_parallel_formatting_memory.reference b/tests/queries/0_stateless/01784_parallel_formatting_memory.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01784_parallel_formatting_memory.sql b/tests/queries/0_stateless/01784_parallel_formatting_memory.sql new file mode 100644 index 00000000000..35dc063f895 --- /dev/null +++ b/tests/queries/0_stateless/01784_parallel_formatting_memory.sql @@ -0,0 +1,2 @@ +SET max_memory_usage = '1G'; +SELECT range(65535) FROM system.one ARRAY JOIN range(65536) AS number; -- { serverError 241 } diff --git a/tests/queries/0_stateless/01785_parallel_formatting_memory.reference b/tests/queries/0_stateless/01785_parallel_formatting_memory.reference new file mode 100644 index 00000000000..0ec7fc54b01 --- /dev/null +++ b/tests/queries/0_stateless/01785_parallel_formatting_memory.reference @@ -0,0 +1,2 @@ +Code: 241 +Code: 241 diff --git a/tests/queries/0_stateless/01785_parallel_formatting_memory.sh b/tests/queries/0_stateless/01785_parallel_formatting_memory.sh new file mode 100755 index 00000000000..6d081c61fd3 --- /dev/null +++ b/tests/queries/0_stateless/01785_parallel_formatting_memory.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --compress 0 --max_memory_usage 1G --query "SELECT range(65535) FROM system.one ARRAY JOIN range(65536) AS number" 2>&1 | grep -oF 'Code: 241' | head -n1 +$CLICKHOUSE_CLIENT --compress 1 --max_memory_usage 1G --query "SELECT range(65535) FROM system.one ARRAY JOIN range(65536) AS number" 2>&1 | grep -oF 'Code: 241' | head -n1 diff --git a/tests/queries/1_stateful/00159_parallel_formatting_http.reference b/tests/queries/1_stateful/00159_parallel_formatting_http.reference index 499a0b8a7c7..8eabf5d4f03 100644 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.reference +++ b/tests/queries/1_stateful/00159_parallel_formatting_http.reference @@ -1,12 +1,12 @@ TSV, false -8a984bbbfb127c430f67173f5371c6cb - +6e4ce4996dd0e036d27cb0d2166c8e59 - TSV, true -8a984bbbfb127c430f67173f5371c6cb - +6e4ce4996dd0e036d27cb0d2166c8e59 - CSV, false -ea1c740f03f5dcc43a3044528ad0a98f - +ab6b3616f31e8a952c802ca92562e418 - CSV, true -ea1c740f03f5dcc43a3044528ad0a98f - +ab6b3616f31e8a952c802ca92562e418 - JSONCompactEachRow, false -ba1081a754a06ef6563840b2d8d4d327 - +1651b540b43bd6c62446f4c340bf13c7 - JSONCompactEachRow, true -ba1081a754a06ef6563840b2d8d4d327 - +1651b540b43bd6c62446f4c340bf13c7 - diff --git a/tests/queries/1_stateful/00159_parallel_formatting_http.sh b/tests/queries/1_stateful/00159_parallel_formatting_http.sh index 8fd8c15b7c7..a4e68de6a3f 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.sh +++ b/tests/queries/1_stateful/00159_parallel_formatting_http.sh @@ -10,8 +10,8 @@ FORMATS=('TSV' 'CSV' 'JSONCompactEachRow') for format in "${FORMATS[@]}" do echo "$format, false"; - ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+Format+$format&output_format_parallel_formatting=false" -d' ' | md5sum + ${CLICKHOUSE_CURL} -sS 
"${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+LIMIT+1000000+Format+$format&output_format_parallel_formatting=false" -d' ' | md5sum echo "$format, true"; - ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+Format+$format&output_format_parallel_formatting=true" -d' ' | md5sum + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=SELECT+ClientEventTime+as+a,MobilePhoneModel+as+b,ClientIP6+as+c+FROM+test.hits+ORDER+BY+a,b,c+LIMIT+1000000+Format+$format&output_format_parallel_formatting=true" -d' ' | md5sum done diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index df2090325a3..627b76827a8 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -679,6 +679,19 @@ "live_view", "memory_leak", "memory_limit", - "polygon_dicts" // they use an explicitly specified database + "polygon_dicts", // they use an explicitly specified database + "01658_read_file_to_stringcolumn", + "01721_engine_file_truncate_on_insert", // It's ok to execute in parallel but not several instances of the same test. + "01702_system_query_log", // It's ok to execute in parallel with oter tests but not several instances of the same test. 
+ "01748_dictionary_table_dot", // creates database + "00950_dict_get", + "01683_flat_dictionary", + "01681_cache_dictionary_simple_key", + "01682_cache_dictionary_complex_key", + "01684_ssd_cache_dictionary_simple_key", + "01685_ssd_cache_dictionary_complex_key", + "01760_system_dictionaries", + "01760_polygon_dictionaries", + "01778_hierarchical_dictionaries" ] } diff --git a/tests/testflows/regression.py b/tests/testflows/regression.py index 05fec3ea985..45f1ed64a6c 100755 --- a/tests/testflows/regression.py +++ b/tests/testflows/regression.py @@ -14,10 +14,10 @@ def regression(self, local, clickhouse_binary_path, stress=None, parallel=None): """ args = {"local": local, "clickhouse_binary_path": clickhouse_binary_path, "stress": stress, "parallel": parallel} - Feature(test=load("example.regression", "regression"))(**args) - Feature(test=load("ldap.regression", "regression"))(**args) - Feature(test=load("rbac.regression", "regression"))(**args) - Feature(test=load("aes_encryption.regression", "regression"))(**args) + # Feature(test=load("example.regression", "regression"))(**args) + # Feature(test=load("ldap.regression", "regression"))(**args) + # Feature(test=load("rbac.regression", "regression"))(**args) + # Feature(test=load("aes_encryption.regression", "regression"))(**args) # Feature(test=load("kerberos.regression", "regression"))(**args) if main():