Merge branch 'master' of https://github.com/ClickHouse/ClickHouse into replicas-shards-for-mysql-and-postgres

This commit is contained in:
kssenii 2021-04-03 08:00:09 +00:00
commit 13d1f17d3e
213 changed files with 9335 additions and 3266 deletions

View File

@ -39,6 +39,8 @@ else()
set(RECONFIGURE_MESSAGE_LEVEL STATUS)
endif()
enable_language(C CXX ASM)
include (cmake/arch.cmake)
include (cmake/target.cmake)
include (cmake/tools.cmake)

View File

@ -1,9 +1,9 @@
# This strings autochanged from release_lib.sh:
SET(VERSION_REVISION 54449)
SET(VERSION_REVISION 54450)
SET(VERSION_MAJOR 21)
SET(VERSION_MINOR 4)
SET(VERSION_MINOR 5)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH af2135ef9dc72f16fa4f229b731262c3f0a8bbdc)
SET(VERSION_DESCRIBE v21.4.1.1-prestable)
SET(VERSION_STRING 21.4.1.1)
SET(VERSION_GITHASH 3827789b3d8fd2021952e57e5110343d26daa1a1)
SET(VERSION_DESCRIBE v21.5.1.1-prestable)
SET(VERSION_STRING 21.5.1.1)
# end of autochange

View File

@ -1,4 +1,8 @@
if(ARCH_AMD64 OR ARCH_ARM)
option (ENABLE_BASE64 "Enable base64" ${ENABLE_LIBRARIES})
elseif(ENABLE_BASE64)
message (${RECONFIGURE_MESSAGE_LEVEL} "base64 library is only supported on x86_64 and aarch64")
endif()
if (NOT ENABLE_BASE64)
return()

View File

@ -1,7 +1,7 @@
if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT OS_DARWIN)
if(ARCH_AMD64 AND NOT OS_FREEBSD AND NOT OS_DARWIN)
option(ENABLE_FASTOPS "Enable fast vectorized mathematical functions library by Mikhail Parakhin" ${ENABLE_LIBRARIES})
elseif(ENABLE_FASTOPS)
message (${RECONFIGURE_MESSAGE_LEVEL} "Fastops library is not supported on ARM, FreeBSD and Darwin")
message (${RECONFIGURE_MESSAGE_LEVEL} "Fastops library is supported on x86_64 only, and not FreeBSD or Darwin")
endif()
if(NOT ENABLE_FASTOPS)

View File

@ -1,4 +1,4 @@
if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF)
if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF AND NOT ARCH_PPC64LE)
option(ENABLE_HDFS "Enable HDFS" ${ENABLE_LIBRARIES})
elseif(ENABLE_HDFS OR USE_INTERNAL_HDFS3_LIBRARY)
message (${RECONFIGURE_MESSAGE_LEVEL} "Cannot use HDFS3 with current configuration")

View File

@ -62,6 +62,7 @@ if (NOT OPENLDAP_FOUND AND NOT MISSING_INTERNAL_LDAP_LIBRARY)
if (
( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "x86_64" ) OR
( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "aarch64" ) OR
( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "ppc64le" ) OR
( "${_system_name}" STREQUAL "freebsd" AND "${_system_processor}" STREQUAL "x86_64" ) OR
( "${_system_name}" STREQUAL "darwin" AND "${_system_processor}" STREQUAL "x86_64" )
)

View File

@ -1,7 +1,7 @@
if(NOT OS_FREEBSD AND NOT APPLE AND NOT ARCH_ARM)
if(NOT OS_FREEBSD AND NOT APPLE)
option(ENABLE_S3 "Enable S3" ${ENABLE_LIBRARIES})
elseif(ENABLE_S3 OR USE_INTERNAL_AWS_S3_LIBRARY)
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on ARM, Apple or FreeBSD")
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on Apple or FreeBSD")
endif()
if(NOT ENABLE_S3)

View File

@ -6,7 +6,7 @@ set (DEFAULT_LIBS "-nodefaultlibs")
# We need builtins from Clang's RT even without libcxx - for ubsan+int128.
# See https://bugs.llvm.org/show_bug.cgi?id=16404
if (COMPILER_CLANG AND NOT (CMAKE_CROSSCOMPILING AND ARCH_AARCH64))
execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.builtins-${CMAKE_SYSTEM_PROCESSOR}.a OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE)
else ()
set (BUILTINS_LIBRARY "-lgcc")
endif ()

View File

@ -86,8 +86,3 @@ if (LINKER_NAME)
message(STATUS "Using custom linker by name: ${LINKER_NAME}")
endif ()
if (ARCH_PPC64LE)
if (COMPILER_CLANG OR (COMPILER_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8))
message(FATAL_ERROR "Only gcc-8 or higher is supported for powerpc architecture")
endif ()
endif ()

View File

@ -160,6 +160,12 @@ if (NOT EXTERNAL_BOOST_FOUND)
enable_language(ASM)
SET(ASM_OPTIONS "-x assembler-with-cpp")
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread"))
add_compile_definitions(BOOST_USE_UCONTEXT)
@ -169,39 +175,34 @@ if (NOT EXTERNAL_BOOST_FOUND)
add_compile_definitions(BOOST_USE_TSAN)
endif()
set (SRCS_CONTEXT
set (SRCS_CONTEXT ${SRCS_CONTEXT}
${LIBRARY_DIR}/libs/context/src/fiber.cpp
${LIBRARY_DIR}/libs/context/src/continuation.cpp
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
elseif (ARCH_ARM)
set (SRCS_CONTEXT
endif()
if (ARCH_ARM)
set (SRCS_CONTEXT ${SRCS_CONTEXT}
${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
elseif (ARCH_PPC64LE)
set (SRCS_CONTEXT ${SRCS_CONTEXT}
${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S
)
elseif(OS_DARWIN)
set (SRCS_CONTEXT
set (SRCS_CONTEXT ${SRCS_CONTEXT}
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
else()
set (SRCS_CONTEXT
set (SRCS_CONTEXT ${SRCS_CONTEXT}
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
endif()

View File

@ -97,12 +97,19 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS)
set(TZ_OBJS ${TZ_OBJS} ${TZ_OBJ})
# https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake
# PPC64LE fails to do this with objcopy, use ld or lld instead
if (ARCH_PPC64LE)
add_custom_command(OUTPUT ${TZ_OBJ}
COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}
COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${TZ_OBJ} ${TIMEZONE_ID}
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID})
else()
add_custom_command(OUTPUT ${TZ_OBJ}
COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}
COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS}
--rename-section .data=.rodata,alloc,load,readonly,data,contents ${TIMEZONE_ID} ${TZ_OBJ}
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID})
endif()
set_source_files_properties(${TZ_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true)
endforeach(TIMEZONE)

View File

@ -1,7 +1,7 @@
if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN))
if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN))
if (ENABLE_JEMALLOC)
message (${RECONFIGURE_MESSAGE_LEVEL}
"jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64 or aarch64 on linux or freebsd.")
"jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64 or ppc64le on linux or freebsd.")
endif()
set (ENABLE_JEMALLOC OFF)
else()
@ -107,6 +107,8 @@ if (ARCH_AMD64)
set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64")
elseif (ARCH_ARM)
set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_aarch64")
elseif (ARCH_PPC64LE)
set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_ppc64le")
else ()
message (FATAL_ERROR "internal jemalloc: This arch is not supported")
endif ()

View File

@ -0,0 +1,367 @@
/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */
#ifndef JEMALLOC_INTERNAL_DEFS_H_
#define JEMALLOC_INTERNAL_DEFS_H_
/*
* If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all
* public APIs to be prefixed. This makes it possible, with some care, to use
* multiple allocators simultaneously.
*/
/* #undef JEMALLOC_PREFIX */
/* #undef JEMALLOC_CPREFIX */
/*
* Define overrides for non-standard allocator-related functions if they are
* present on the system.
*/
#define JEMALLOC_OVERRIDE___LIBC_CALLOC
#define JEMALLOC_OVERRIDE___LIBC_FREE
#define JEMALLOC_OVERRIDE___LIBC_MALLOC
#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN
#define JEMALLOC_OVERRIDE___LIBC_REALLOC
#define JEMALLOC_OVERRIDE___LIBC_VALLOC
/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */
/*
* JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs.
* For shared libraries, symbol visibility mechanisms prevent these symbols
* from being exported, but for static libraries, naming collisions are a real
* possibility.
*/
#define JEMALLOC_PRIVATE_NAMESPACE je_
/*
* Hyper-threaded CPUs may need a special instruction inside spin loops in
* order to yield to another virtual CPU.
*/
#define CPU_SPINWAIT
/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
#define HAVE_CPU_SPINWAIT 0
/*
* Number of significant bits in virtual addresses. This may be less than the
* total number of bits in a pointer, e.g. on x64, for which the uppermost 16
* bits are the same as bit 47.
*/
#define LG_VADDR 64
/* Defined if C11 atomics are available. */
#define JEMALLOC_C11_ATOMICS 1
/* Defined if GCC __atomic atomics are available. */
#define JEMALLOC_GCC_ATOMIC_ATOMICS 1
/* and the 8-bit variant support. */
#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1
/* Defined if GCC __sync atomics are available. */
#define JEMALLOC_GCC_SYNC_ATOMICS 1
/* and the 8-bit variant support. */
#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1
/*
* Defined if __builtin_clz() and __builtin_clzl() are available.
*/
#define JEMALLOC_HAVE_BUILTIN_CLZ
/*
* Defined if os_unfair_lock_*() functions are available, as provided by Darwin.
*/
/* #undef JEMALLOC_OS_UNFAIR_LOCK */
/* Defined if syscall(2) is usable. */
#define JEMALLOC_USE_SYSCALL
/*
* Defined if secure_getenv(3) is available.
*/
// #define JEMALLOC_HAVE_SECURE_GETENV
/*
* Defined if issetugid(2) is available.
*/
/* #undef JEMALLOC_HAVE_ISSETUGID */
/* Defined if pthread_atfork(3) is available. */
#define JEMALLOC_HAVE_PTHREAD_ATFORK
/* Defined if pthread_setname_np(3) is available. */
#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP
/*
* Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available.
*/
#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1
/*
* Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available.
*/
#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1
/*
* Defined if mach_absolute_time() is available.
*/
/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */
/*
* Defined if _malloc_thread_cleanup() exists. At least in the case of
* FreeBSD, pthread_key_create() allocates, which if used during malloc
* bootstrapping will cause recursion into the pthreads library. Therefore, if
* _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in
* malloc_tsd.
*/
/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */
/*
* Defined if threaded initialization is known to be safe on this platform.
* Among other things, it must be possible to initialize a mutex without
* triggering allocation in order for threaded allocation to be safe.
*/
#define JEMALLOC_THREADED_INIT
/*
* Defined if the pthreads implementation defines
* _pthread_mutex_init_calloc_cb(), in which case the function is used in order
* to avoid recursive allocation during mutex initialization.
*/
/* #undef JEMALLOC_MUTEX_INIT_CB */
/* Non-empty if the tls_model attribute is supported. */
#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec")))
/*
* JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
* inline functions.
*/
/* #undef JEMALLOC_DEBUG */
/* JEMALLOC_STATS enables statistics calculation. */
#define JEMALLOC_STATS
/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */
/* JEMALLOC_PROF enables allocation profiling. */
/* #undef JEMALLOC_PROF */
/* Use libunwind for profile backtracing if defined. */
/* #undef JEMALLOC_PROF_LIBUNWIND */
/* Use libgcc for profile backtracing if defined. */
/* #undef JEMALLOC_PROF_LIBGCC */
/* Use gcc intrinsics for profile backtracing if defined. */
/* #undef JEMALLOC_PROF_GCC */
/*
* JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage
* segment (DSS).
*/
#define JEMALLOC_DSS
/* Support memory filling (junk/zero). */
#define JEMALLOC_FILL
/* Support utrace(2)-based tracing. */
/* #undef JEMALLOC_UTRACE */
/* Support optional abort() on OOM. */
/* #undef JEMALLOC_XMALLOC */
/* Support lazy locking (avoid locking unless a second thread is launched). */
/* #undef JEMALLOC_LAZY_LOCK */
/*
* Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
* classes).
*/
/* #undef LG_QUANTUM */
/* One page is 2^LG_PAGE bytes. */
#define LG_PAGE 16
/*
* One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the
* system does not explicitly support huge pages; system calls that require
* explicit huge page support are separately configured.
*/
#define LG_HUGEPAGE 21
/*
* If defined, adjacent virtual memory mappings with identical attributes
* automatically coalesce, and they fragment when changes are made to subranges.
* This is the normal order of things for mmap()/munmap(), but on Windows
* VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e.
* mappings do *not* coalesce/fragment.
*/
#define JEMALLOC_MAPS_COALESCE
/*
* If defined, retain memory for later reuse by default rather than using e.g.
* munmap() to unmap freed extents. This is enabled on 64-bit Linux because
* common sequences of mmap()/munmap() calls will cause virtual memory map
* holes.
*/
#define JEMALLOC_RETAIN
/* TLS is used to map arenas and magazine caches to threads. */
#define JEMALLOC_TLS
/*
* Used to mark unreachable code to quiet "end of non-void" compiler warnings.
* Don't use this directly; instead use unreachable() from util.h
*/
#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable
/*
* ffs*() functions to use for bitmapping. Don't use these directly; instead,
* use ffs_*() from util.h.
*/
#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll
#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl
#define JEMALLOC_INTERNAL_FFS __builtin_ffs
/*
* popcount*() functions to use for bitmapping.
*/
#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl
#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount
/*
* If defined, explicitly attempt to more uniformly distribute large allocation
* pointer alignments across all cache indices.
*/
#define JEMALLOC_CACHE_OBLIVIOUS
/*
* If defined, enable logging facilities. We make this a configure option to
* avoid taking extra branches everywhere.
*/
/* #undef JEMALLOC_LOG */
/*
* If defined, use readlinkat() (instead of readlink()) to follow
* /etc/malloc_conf.
*/
/* #undef JEMALLOC_READLINKAT */
/*
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
*/
/* #undef JEMALLOC_ZONE */
/*
* Methods for determining whether the OS overcommits.
* JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's
* /proc/sys/vm.overcommit_memory file.
* JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl.
*/
/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */
#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/* Defined if madvise(2) is available. */
#define JEMALLOC_HAVE_MADVISE
/*
* Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE
* arguments to madvise(2).
*/
#define JEMALLOC_HAVE_MADVISE_HUGE
/*
* Methods for purging unused pages differ between operating systems.
*
* madvise(..., MADV_FREE) : This marks pages as being unused, such that they
* will be discarded rather than swapped out.
* madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is
* defined, this immediately discards pages,
* such that new pages will be demand-zeroed if
* the address region is later touched;
* otherwise this behaves similarly to
* MADV_FREE, though typically with higher
* system overhead.
*/
#define JEMALLOC_PURGE_MADVISE_FREE
#define JEMALLOC_PURGE_MADVISE_DONTNEED
#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */
/* #undef JEMALLOC_DEFINE_MADVISE_FREE */
/*
* Defined if MADV_DO[NT]DUMP is supported as an argument to madvise.
*/
#define JEMALLOC_MADVISE_DONTDUMP
/*
* Defined if transparent huge pages (THPs) are supported via the
* MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled.
*/
/* #undef JEMALLOC_THP */
/* Define if operating system has alloca.h header. */
#define JEMALLOC_HAS_ALLOCA_H 1
/* C99 restrict keyword supported. */
#define JEMALLOC_HAS_RESTRICT 1
/* For use by hash code. */
/* #undef JEMALLOC_BIG_ENDIAN */
/* sizeof(int) == 2^LG_SIZEOF_INT. */
#define LG_SIZEOF_INT 2
/* sizeof(long) == 2^LG_SIZEOF_LONG. */
#define LG_SIZEOF_LONG 3
/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */
#define LG_SIZEOF_LONG_LONG 3
/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */
#define LG_SIZEOF_INTMAX_T 3
/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */
#define JEMALLOC_GLIBC_MALLOC_HOOK
/* glibc memalign hook. */
#define JEMALLOC_GLIBC_MEMALIGN_HOOK
/* pthread support */
#define JEMALLOC_HAVE_PTHREAD
/* dlsym() support */
#define JEMALLOC_HAVE_DLSYM
/* Adaptive mutex support in pthreads. */
#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP
/* GNU specific sched_getcpu support */
#define JEMALLOC_HAVE_SCHED_GETCPU
/* GNU specific sched_setaffinity support */
#define JEMALLOC_HAVE_SCHED_SETAFFINITY
/*
* If defined, all the features necessary for background threads are present.
*/
#define JEMALLOC_BACKGROUND_THREAD 1
/*
* If defined, jemalloc symbols are not exported (doesn't work when
* JEMALLOC_PREFIX is not defined).
*/
/* #undef JEMALLOC_EXPORT */
/* config.malloc_conf options string. */
#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@"
/* If defined, jemalloc takes the malloc/free/etc. symbol names. */
#define JEMALLOC_IS_MALLOC 1
/*
* Defined if strerror_r returns char * if _GNU_SOURCE is defined.
*/
#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE
/* Performs additional safety checks when defined. */
/* #undef JEMALLOC_OPT_SAFETY_CHECKS */
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */

View File

@ -1,9 +1,7 @@
if (NOT ARCH_ARM)
if(ARCH_AMD64)
option (ENABLE_CPUID "Enable libcpuid library (only internal)" ${ENABLE_LIBRARIES})
endif()
if (ARCH_ARM AND ENABLE_CPUID)
message (${RECONFIGURE_MESSAGE_LEVEL} "cpuid is not supported on ARM")
elseif(ENABLE_CPUID)
message (${RECONFIGURE_MESSAGE_LEVEL} "libcpuid is only supported on x86_64")
set (ENABLE_CPUID 0)
endif()

View File

@ -0,0 +1,63 @@
/* include/lber_types.h. Generated from lber_types.hin by configure. */
/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
*
* Copyright 1998-2020 The OpenLDAP Foundation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
/*
* LBER types
*/
#ifndef _LBER_TYPES_H
#define _LBER_TYPES_H
#include <ldap_cdefs.h>
LDAP_BEGIN_DECL
/* LBER boolean, enum, integers (32 bits or larger) */
#define LBER_INT_T int
/* LBER tags (32 bits or larger) */
#define LBER_TAG_T long
/* LBER socket descriptor */
#define LBER_SOCKET_T int
/* LBER lengths (32 bits or larger) */
#define LBER_LEN_T long
/* ------------------------------------------------------------ */
/* booleans, enumerations, and integers */
typedef LBER_INT_T ber_int_t;
/* signed and unsigned versions */
typedef signed LBER_INT_T ber_sint_t;
typedef unsigned LBER_INT_T ber_uint_t;
/* tags */
typedef unsigned LBER_TAG_T ber_tag_t;
/* "socket" descriptors */
typedef LBER_SOCKET_T ber_socket_t;
/* lengths */
typedef unsigned LBER_LEN_T ber_len_t;
/* signed lengths */
typedef signed LBER_LEN_T ber_slen_t;
LDAP_END_DECL
#endif /* _LBER_TYPES_H */

View File

@ -0,0 +1,74 @@
/* include/ldap_config.h. Generated from ldap_config.hin by configure. */
/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
*
* Copyright 1998-2020 The OpenLDAP Foundation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
/*
* This file works in conjunction with OpenLDAP configure system.
* If you do no like the values below, adjust your configure options.
*/
#ifndef _LDAP_CONFIG_H
#define _LDAP_CONFIG_H
/* directory separator */
#ifndef LDAP_DIRSEP
#ifndef _WIN32
#define LDAP_DIRSEP "/"
#else
#define LDAP_DIRSEP "\\"
#endif
#endif
/* directory for temporary files */
#if defined(_WIN32)
# define LDAP_TMPDIR "C:\\." /* we don't have much of a choice */
#elif defined( _P_tmpdir )
# define LDAP_TMPDIR _P_tmpdir
#elif defined( P_tmpdir )
# define LDAP_TMPDIR P_tmpdir
#elif defined( _PATH_TMPDIR )
# define LDAP_TMPDIR _PATH_TMPDIR
#else
# define LDAP_TMPDIR LDAP_DIRSEP "tmp"
#endif
/* directories */
#ifndef LDAP_BINDIR
#define LDAP_BINDIR "/tmp/ldap-prefix/bin"
#endif
#ifndef LDAP_SBINDIR
#define LDAP_SBINDIR "/tmp/ldap-prefix/sbin"
#endif
#ifndef LDAP_DATADIR
#define LDAP_DATADIR "/tmp/ldap-prefix/share/openldap"
#endif
#ifndef LDAP_SYSCONFDIR
#define LDAP_SYSCONFDIR "/tmp/ldap-prefix/etc/openldap"
#endif
#ifndef LDAP_LIBEXECDIR
#define LDAP_LIBEXECDIR "/tmp/ldap-prefix/libexec"
#endif
#ifndef LDAP_MODULEDIR
#define LDAP_MODULEDIR "/tmp/ldap-prefix/libexec/openldap"
#endif
#ifndef LDAP_RUNDIR
#define LDAP_RUNDIR "/tmp/ldap-prefix/var"
#endif
#ifndef LDAP_LOCALEDIR
#define LDAP_LOCALEDIR ""
#endif
#endif /* _LDAP_CONFIG_H */

View File

@ -0,0 +1,61 @@
/* include/ldap_features.h. Generated from ldap_features.hin by configure. */
/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
*
* Copyright 1998-2020 The OpenLDAP Foundation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
/*
* LDAP Features
*/
#ifndef _LDAP_FEATURES_H
#define _LDAP_FEATURES_H 1
/* OpenLDAP API version macros */
#define LDAP_VENDOR_VERSION 20501
#define LDAP_VENDOR_VERSION_MAJOR 2
#define LDAP_VENDOR_VERSION_MINOR 5
#define LDAP_VENDOR_VERSION_PATCH X
/*
** WORK IN PROGRESS!
**
** OpenLDAP reentrancy/thread-safeness should be dynamically
** checked using ldap_get_option().
**
** The -lldap implementation is not thread-safe.
**
** The -lldap_r implementation is:
** LDAP_API_FEATURE_THREAD_SAFE (basic thread safety)
** but also be:
** LDAP_API_FEATURE_SESSION_THREAD_SAFE
** LDAP_API_FEATURE_OPERATION_THREAD_SAFE
**
** The preprocessor flag LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE
** can be used to determine if -lldap_r is available at compile
** time. You must define LDAP_THREAD_SAFE if and only if you
** link with -lldap_r.
**
** If you fail to define LDAP_THREAD_SAFE when linking with
** -lldap_r or define LDAP_THREAD_SAFE when linking with -lldap,
** provided header definitions and declarations may be incorrect.
**
*/
/* is -lldap_r available or not */
#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1
/* LDAP v2 Referrals */
/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */
#endif /* LDAP_FEATURES */

File diff suppressed because it is too large Load Diff

4
debian/changelog vendored
View File

@ -1,5 +1,5 @@
clickhouse (21.4.1.1) unstable; urgency=low
clickhouse (21.5.1.1) unstable; urgency=low
* Modified source code
-- clickhouse-release <clickhouse-release@yandex-team.ru> Sat, 06 Mar 2021 14:43:27 +0300
-- clickhouse-release <clickhouse-release@yandex-team.ru> Fri, 02 Apr 2021 18:34:26 +0300

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.4.1.*
ARG version=21.5.1.*
RUN apt-get update \
&& apt-get install --yes --no-install-recommends \

View File

@ -1,7 +1,7 @@
FROM ubuntu:20.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.4.1.*
ARG version=21.5.1.*
ARG gosu_ver=1.10
# set non-empty deb_location_url url to create a docker image

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.4.1.*
ARG version=21.5.1.*
RUN apt-get update && \
apt-get install -y apt-transport-https dirmngr && \

View File

@ -1,7 +1,7 @@
# docker build -t yandex/clickhouse-fasttest .
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=10
ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11
RUN apt-get update \
&& apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \
@ -43,20 +43,20 @@ RUN apt-get update \
clang-tidy-${LLVM_VERSION} \
cmake \
curl \
lsof \
expect \
fakeroot \
git \
gdb \
git \
gperf \
lld-${LLVM_VERSION} \
llvm-${LLVM_VERSION} \
lsof \
moreutils \
ninja-build \
psmisc \
python3 \
python3-pip \
python3-lxml \
python3-pip \
python3-requests \
python3-termcolor \
rename \

View File

@ -8,6 +8,9 @@ trap 'kill $(jobs -pr) ||:' EXIT
# that we can run the "everything else" stage from the cloned source.
stage=${stage:-}
# Compiler version, normally set by Dockerfile
export LLVM_VERSION=${LLVM_VERSION:-11}
# A variable to pass additional flags to CMake.
# Here we explicitly default it to nothing so that bash doesn't complain about
# it being undefined. Also read it as array so that we can pass an empty list
@ -124,22 +127,26 @@ continue
function clone_root
{
git clone https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt"
git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt"
(
cd "$FASTTEST_SOURCE"
if [ "$PULL_REQUEST_NUMBER" != "0" ]; then
if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then
if git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then
git checkout FETCH_HEAD
echo 'Clonned merge head'
echo "Checked out pull/$PULL_REQUEST_NUMBER/merge ($(git rev-parse FETCH_HEAD))"
else
git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/head"
git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/head"
git checkout "$COMMIT_SHA"
echo 'Checked out to commit'
echo "Checked out nominal SHA $COMMIT_SHA for PR $PULL_REQUEST_NUMBER"
fi
else
if [ -v COMMIT_SHA ]; then
git fetch --depth 1 origin "$COMMIT_SHA"
git checkout "$COMMIT_SHA"
echo "Checked out nominal SHA $COMMIT_SHA for master"
else
echo "Using default repository head $(git rev-parse HEAD)"
fi
fi
)
@ -181,7 +188,7 @@ function clone_submodules
)
git submodule sync
git submodule update --init --recursive "${SUBMODULES_TO_UPDATE[@]}"
git submodule update --depth 1 --init --recursive "${SUBMODULES_TO_UPDATE[@]}"
git submodule foreach git reset --hard
git submodule foreach git checkout @ -f
git submodule foreach git clean -xfd
@ -215,7 +222,7 @@ function run_cmake
(
cd "$FASTTEST_BUILD"
cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt"
cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER="clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="clang-${LLVM_VERSION}" "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt"
)
}
@ -223,7 +230,7 @@ function build
{
(
cd "$FASTTEST_BUILD"
time ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt"
time ninja clickhouse-bundle 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt"
if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then
cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse"
fi
@ -420,7 +427,7 @@ case "$stage" in
# See the compatibility hacks in `clone_root` stage above. Remove at the same time,
# after Nov 1, 2020.
cd "$FASTTEST_WORKSPACE"
clone_submodules | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt"
clone_submodules 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt"
;&
"run_cmake")
run_cmake
@ -431,7 +438,7 @@ case "$stage" in
"configure")
# The `install_log.txt` is also needed for compatibility with old CI task --
# if there is no log, it will decide that build failed.
configure | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
;&
"run_tests")
run_tests

View File

@ -74,12 +74,17 @@ function run_tests()
ADDITIONAL_OPTIONS+=('--order=random')
ADDITIONAL_OPTIONS+=('--skip')
ADDITIONAL_OPTIONS+=('00000_no_tests_to_skip')
ADDITIONAL_OPTIONS+=('--jobs')
ADDITIONAL_OPTIONS+=('4')
# Note that flaky check must be ran in parallel, but for now we run
# everything in parallel except DatabaseReplicated. See below.
fi
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
ADDITIONAL_OPTIONS+=('--replicated-database')
else
# Too many tests fail for DatabaseReplicated in parallel. All other
# configurations are OK.
ADDITIONAL_OPTIONS+=('--jobs')
ADDITIONAL_OPTIONS+=('8')
fi
clickhouse-test --testname --shard --zookeeper --hung-check --print-time \

View File

@ -126,7 +126,13 @@ Contribute all new information in English language. Other languages are translat
### Adding a New File
When adding a new file:
When you add a new file, it should end with a link like:
`[Original article](https://clickhouse.tech/docs/<path-to-the-page>) <!--hide-->`
and there should be **a new empty line** after it.
{## When adding a new file:
- Make symbolic links for all other languages. You can use the following commands:
@ -134,7 +140,7 @@ When adding a new file:
$ cd /ClickHouse/clone/directory/docs
$ ln -sr en/new/file.md lang/new/file.md
```
##}
<a name="adding-a-new-language"/>
### Adding a New Language
@ -195,8 +201,11 @@ Templates:
- [Function](_description_templates/template-function.md)
- [Setting](_description_templates/template-setting.md)
- [Server Setting](_description_templates/template-server-setting.md)
- [Database or Table engine](_description_templates/template-engine.md)
- [System table](_description_templates/template-system-table.md)
- [Data type](_description_templates/data-type.md)
- [Statement](_description_templates/statement.md)
<a name="how-to-build-docs"/>

View File

@ -5,43 +5,77 @@ toc_title: Build on Mac OS X
# How to Build ClickHouse on Mac OS X {#how-to-build-clickhouse-on-mac-os-x}
Build should work on Mac OS X 10.15 (Catalina).
Build should work on x86_64 (Intel) based macOS 10.15 (Catalina) and higher with recent Xcode's native AppleClang, or Homebrew's vanilla Clang or GCC compilers.
## Install Homebrew {#install-homebrew}
``` bash
$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
$ /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
```
## Install Xcode and Command Line Tools {#install-xcode-and-command-line-tools}
Install the latest [Xcode](https://apps.apple.com/am/app/xcode/id497799835?mt=12) from App Store.
Open it at least once to accept the end-user license agreement and automatically install the required components.
Then, make sure that the latest Comman Line Tools are installed and selected in the system:
``` bash
$ sudo rm -rf /Library/Developer/CommandLineTools
$ sudo xcode-select --install
```
Reboot.
## Install Required Compilers, Tools, and Libraries {#install-required-compilers-tools-and-libraries}
``` bash
$ brew install cmake ninja libtool gettext llvm
$ brew update
$ brew install cmake ninja libtool gettext llvm gcc
```
## Checkout ClickHouse Sources {#checkout-clickhouse-sources}
``` bash
$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git
```
or
``` bash
$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git
$ cd ClickHouse
$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git # or https://github.com/ClickHouse/ClickHouse.git
```
## Build ClickHouse {#build-clickhouse}
> Please note: ClickHouse doesn't support build with native Apple Clang compiler, we need use clang from LLVM.
To build using Xcode's native AppleClang compiler:
``` bash
$ cd ClickHouse
$ rm -rf build
$ mkdir build
$ cd build
$ cmake .. -DCMAKE_C_COMPILER=`brew --prefix llvm`/bin/clang -DCMAKE_CXX_COMPILER=`brew --prefix llvm`/bin/clang++ -DCMAKE_PREFIX_PATH=`brew --prefix llvm`
$ ninja
$ cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_JEMALLOC=OFF ..
$ cmake --build . --config RelWithDebInfo
$ cd ..
```
To build using Homebrew's vanilla Clang compiler:
``` bash
$ cd ClickHouse
$ rm -rf build
$ mkdir build
$ cd build
$ cmake -DCMAKE_C_COMPILER=$(brew --prefix llvm)/bin/clang -DCMAKE_CXX_COMPILER==$(brew --prefix llvm)/bin/clang++ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_JEMALLOC=OFF ..
$ cmake --build . --config RelWithDebInfo
$ cd ..
```
To build using Homebrew's vanilla GCC compiler:
``` bash
$ cd ClickHouse
$ rm -rf build
$ mkdir build
$ cd build
$ cmake -DCMAKE_C_COMPILER=$(brew --prefix gcc)/bin/gcc-10 -DCMAKE_CXX_COMPILER=$(brew --prefix gcc)/bin/g++-10 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_JEMALLOC=OFF ..
$ cmake --build . --config RelWithDebInfo
$ cd ..
```

View File

@ -18,4 +18,8 @@ You can also use the following database engines:
- [Lazy](../../engines/database-engines/lazy.md)
- [Atomic](../../engines/database-engines/atomic.md)
- [PostgreSQL](../../engines/database-engines/postgresql.md)
[Original article](https://clickhouse.tech/docs/en/database_engines/) <!--hide-->

View File

@ -0,0 +1,138 @@
---
toc_priority: 35
toc_title: PostgreSQL
---
# PostgreSQL {#postgresql}
Allows to connect to databases on a remote [PostgreSQL](https://www.postgresql.org) server. Supports read and write operations (`SELECT` and `INSERT` queries) to exchange data between ClickHouse and PostgreSQL.
Gives the real-time access to table list and table structure from remote PostgreSQL with the help of `SHOW TABLES` and `DESCRIBE TABLE` queries.
Supports table structure modifications (`ALTER TABLE ... ADD|DROP COLUMN`). If `use_table_cache` parameter (see the Engine Parameters below) it set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries.
## Creating a Database {#creating-a-database}
``` sql
CREATE DATABASE test_database
ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]);
```
**Engine Parameters**
- `host:port` — PostgreSQL server address.
- `database` — Remote database name.
- `user` — PostgreSQL user.
- `password` — User password.
- `use_table_cache` — Defines if the database table structure is cached or not. Optional. Default value: `0`.
## Data Types Support {#data_types-support}
| PostgerSQL | ClickHouse |
|------------------|--------------------------------------------------------------|
| DATE | [Date](../../sql-reference/data-types/date.md) |
| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) |
| REAL | [Float32](../../sql-reference/data-types/float.md) |
| DOUBLE | [Float64](../../sql-reference/data-types/float.md) |
| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) |
| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) |
| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) |
| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) |
| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) |
| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) |
| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) |
| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))|
| ARRAY | [Array](../../sql-reference/data-types/array.md) |
## Examples of Use {#examples-of-use}
Database in ClickHouse, exchanging data with the PostgreSQL server:
``` sql
CREATE DATABASE test_database
ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1);
```
``` sql
SHOW DATABASES;
```
``` text
┌─name──────────┐
│ default │
│ test_database │
│ system │
└───────────────┘
```
``` sql
SHOW TABLES FROM test_database;
```
``` text
┌─name───────┐
│ test_table │
└────────────┘
```
Reading data from the PostgreSQL table:
``` sql
SELECT * FROM test_database.test_table;
```
``` text
┌─id─┬─value─┐
│ 1 │ 2 │
└────┴───────┘
```
Writing data to the PostgreSQL table:
``` sql
INSERT INTO test_database.test_table VALUES (3,4);
SELECT * FROM test_database.test_table;
```
``` text
┌─int_id─┬─value─┐
│ 1 │ 2 │
│ 3 │ 4 │
└────────┴───────┘
```
Consider the table structure was modified in PostgreSQL:
``` sql
postgre> ALTER TABLE test_table ADD COLUMN data Text
```
As the `use_table_cache` parameter was set to `1` when the database was created, the table structure in ClickHouse was cached and therefore not modified:
``` sql
DESCRIBE TABLE test_database.test_table;
```
``` text
┌─name───┬─type──────────────┐
│ id │ Nullable(Integer) │
│ value │ Nullable(Integer) │
└────────┴───────────────────┘
```
After detaching the table and attaching it again, the structure was updated:
``` sql
DETACH TABLE test_database.test_table;
ATTACH TABLE test_database.test_table;
DESCRIBE TABLE test_database.test_table;
```
``` text
┌─name───┬─type──────────────┐
│ id │ Nullable(Integer) │
│ value │ Nullable(Integer) │
│ data │ Nullable(String) │
└────────┴───────────────────┘
```
[Original article](https://clickhouse.tech/docs/en/database-engines/postgresql/) <!--hide-->

View File

@ -47,12 +47,17 @@ Engines for communicating with other data storage and processing systems.
Engines in the family:
- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka)
- [MySQL](../../engines/table-engines/integrations/mysql.md#mysql)
- [ODBC](../../engines/table-engines/integrations/odbc.md#table-engine-odbc)
- [JDBC](../../engines/table-engines/integrations/jdbc.md#table-engine-jdbc)
- [HDFS](../../engines/table-engines/integrations/hdfs.md#hdfs)
- [S3](../../engines/table-engines/integrations/s3.md#table-engine-s3)
- [ODBC](../../engines/table-engines/integrations/odbc.md)
- [JDBC](../../engines/table-engines/integrations/jdbc.md)
- [MySQL](../../engines/table-engines/integrations/mysql.md)
- [MongoDB](../../engines/table-engines/integrations/mongodb.md)
- [HDFS](../../engines/table-engines/integrations/hdfs.md)
- [S3](../../engines/table-engines/integrations/s3.md)
- [Kafka](../../engines/table-engines/integrations/kafka.md)
- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md)
- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md)
- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)
### Special Engines {#special-engines}

View File

@ -1,5 +1,5 @@
---
toc_priority: 6
toc_priority: 9
toc_title: EmbeddedRocksDB
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 4
toc_priority: 6
toc_title: HDFS
---

View File

@ -1,6 +1,6 @@
---
toc_folder_title: Integrations
toc_priority: 30
toc_priority: 1
---
# Table Engines for Integrations {#table-engines-for-integrations}
@ -19,5 +19,3 @@ List of supported integrations:
- [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md)
- [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md)
- [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md)
[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/) <!--hide-->

View File

@ -1,5 +1,5 @@
---
toc_priority: 2
toc_priority: 3
toc_title: JDBC
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 5
toc_priority: 8
toc_title: Kafka
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 7
toc_priority: 5
toc_title: MongoDB
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 3
toc_priority: 4
toc_title: MySQL
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 1
toc_priority: 2
toc_title: ODBC
---

View File

@ -1,11 +1,11 @@
---
toc_priority: 8
toc_priority: 11
toc_title: PostgreSQL
---
# PostgreSQL {#postgresql}
The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server.
The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data that is stored on a remote PostgreSQL server.
## Creating a Table {#creating-a-table}
@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1],
name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2],
...
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password');
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]);
```
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query.
@ -29,25 +29,51 @@ The table structure can differ from the original PostgreSQL table structure:
**Engine Parameters**
- `host:port` — PostgreSQL server address.
- `database` — Remote database name.
- `table` — Remote table name.
- `user` — PostgreSQL user.
- `password` — User password.
- `schema` — Non-default table schema. Optional.
SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
## Implementation Details {#implementation-details}
Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server.
`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server.
All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes.
INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
PostgreSQL Array types converts into ClickHouse arrays.
Be careful in PostgreSQL an array data created like a type_name[] may contain multi-dimensional arrays of different dimensions in different table rows in same column, but in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column.
PostgreSQL `Array` types are converted into ClickHouse arrays.
!!! info "Note"
Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column.
Replicas priority for PostgreSQL dictionary source is supported. The bigger the number in map, the less the priority. The highest priority is `0`.
In the example below replica `example01-1` has the highest priority:
```xml
<postgresql>
<port>5432</port>
<user>clickhouse</user>
<password>qwerty</password>
<replica>
<host>example01-1</host>
<priority>1</priority>
</replica>
<replica>
<host>example01-2</host>
<priority>2</priority>
</replica>
<db>db_name</db>
<table>table_name</table>
<where>id=10</where>
<invalidate_query>SQL_QUERY</invalidate_query>
</postgresql>
</source>
```
## Usage Example {#usage-example}
@ -64,10 +90,10 @@ PRIMARY KEY (int_id));
CREATE TABLE
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
INSERT 0 1
postgresql> select * from test;
postgresql> SELECT * FROM test;
int_id | int_nullable | float | str | float_nullable
--------+--------------+-------+------+----------------
1 | | 2 | test |
@ -87,20 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre
```
``` sql
SELECT * FROM postgresql_table WHERE str IN ('test')
SELECT * FROM postgresql_table WHERE str IN ('test');
```
``` text
┌─float_nullable─┬─str──┬─int_id─┐
│ ᴺᵁᴸᴸ │ test │ 1 │
└────────────────┴──────┴────────┘
1 rows in set. Elapsed: 0.019 sec.
```
Using Non-default Schema:
## See Also {#see-also}
```text
postgres=# CREATE SCHEMA "nice.schema";
- [The postgresql table function](../../../sql-reference/table-functions/postgresql.md)
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
```
```sql
CREATE TABLE pg_table_schema_with_dots (a UInt32)
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
```
**See Also**
- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md)
- [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/postgresql/) <!--hide-->

View File

@ -1,5 +1,5 @@
---
toc_priority: 6
toc_priority: 10
toc_title: RabbitMQ
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 4
toc_priority: 7
toc_title: S3
---

View File

@ -502,7 +502,15 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa
## max_concurrent_queries {#max-concurrent-queries}
The maximum number of simultaneously processed requests.
The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries).
!!! info "Note"
These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
Possible values:
- Positive integer.
- 0 — Disabled.
**Example**
@ -530,6 +538,21 @@ Default value: `0` that means no limit.
- [max_concurrent_queries](#max-concurrent-queries)
## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries}
The minimal number of marks read by the query for applying the [max_concurrent_queries](#max-concurrent-queries) setting.
Possible values:
- Positive integer.
- 0 — Disabled.
**Example**
``` xml
<min_marks_to_honor_max_concurrent_queries>10</min_marks_to_honor_max_concurrent_queries>
```
## max_connections {#max-connections}
The maximum number of inbound connections.

View File

@ -1914,7 +1914,7 @@ Default value: `0`.
Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key.
By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards.
By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards.
Possible values:

View File

@ -14,7 +14,17 @@ Columns:
- `node_name` ([String](../../sql-reference/data-types/string.md)) — Node name in ZooKeeper.
- `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue: `GET_PARTS`, `MERGE_PARTS`, `DETACH_PARTS`, `DROP_PARTS`, or `MUTATE_PARTS`.
- `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue, one of:
- `GET_PART` - Get the part from another replica.
- `ATTACH_PART` - Attach the part, possibly from our own replica (if found in `detached` folder).
You may think of it as a `GET_PART` with some optimisations as they're nearly identical.
- `MERGE_PARTS` - Merge the parts.
- `DROP_RANGE` - Delete the parts in the specified partition in the specified number range.
- `CLEAR_COLUMN` - NOTE: Deprecated. Drop specific column from specified partition.
- `CLEAR_INDEX` - NOTE: Deprecated. Drop specific index from specified partition.
- `REPLACE_RANGE` - Drop certain range of partitions and replace them by new ones
- `MUTATE_PART` - Apply one or several mutations to the part.
- `ALTER_METADATA` - Apply alter modification according to global /metadata and /columns paths
- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution.

View File

@ -69,6 +69,8 @@ Types of sources (`source_type`):
- [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse)
- [MongoDB](#dicts-external_dicts_dict_sources-mongodb)
- [Redis](#dicts-external_dicts_dict_sources-redis)
- [Cassandra](#dicts-external_dicts_dict_sources-cassandra)
- [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql)
## Local File {#dicts-external_dicts_dict_sources-local_file}

View File

@ -1229,7 +1229,7 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5,
└────────────────────────────────────┘
```
Note that the `arrayReverseFilter` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it cant be omitted.
Note that the `arrayReverseFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it cant be omitted.
## arraySplit(func, arr1, …) {#array-split}

View File

@ -250,3 +250,53 @@ Result:
└───────────────┘
```
## bitHammingDistance {#bithammingdistance}
Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between the bit representations of two integer values. Can be used with [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) functions for detection of semi-duplicate strings. The smaller is the distance, the more likely those strings are the same.
**Syntax**
``` sql
bitHammingDistance(int1, int2)
```
**Arguments**
- `int1` — First integer value. [Int64](../../sql-reference/data-types/int-uint.md).
- `int2` — Second integer value. [Int64](../../sql-reference/data-types/int-uint.md).
**Returned value**
- The Hamming distance.
Type: [UInt8](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
SELECT bitHammingDistance(111, 121);
```
Result:
``` text
┌─bitHammingDistance(111, 121)─┐
│ 3 │
└──────────────────────────────┘
```
With [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash):
``` sql
SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'));
```
Result:
``` text
┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐
│ 5 │
└──────────────────────────────────────────────────────────────────────────────┘
```

View File

@ -7,6 +7,8 @@ toc_title: Hash
Hash functions can be used for the deterministic pseudo-random shuffling of elements.
Simhash is a hash function, which returns close hash values for close (similar) arguments.
## halfMD5 {#hash-functions-halfmd5}
[Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order.
@ -482,3 +484,938 @@ Result:
- [xxHash](http://cyan4973.github.io/xxHash/).
## ngramSimHash {#ngramsimhash}
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
ngramSimHash(string[, ngramsize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT ngramSimHash('ClickHouse') AS Hash;
```
Result:
``` text
┌───────Hash─┐
│ 1627567969 │
└────────────┘
```
## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive}
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
ngramSimHashCaseInsensitive(string[, ngramsize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash;
```
Result:
``` text
┌──────Hash─┐
│ 562180645 │
└───────────┘
```
## ngramSimHashUTF8 {#ngramsimhashutf8}
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
ngramSimHashUTF8(string[, ngramsize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT ngramSimHashUTF8('ClickHouse') AS Hash;
```
Result:
``` text
┌───────Hash─┐
│ 1628157797 │
└────────────┘
```
## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8}
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
ngramSimHashCaseInsensitiveUTF8(string[, ngramsize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash;
```
Result:
``` text
┌───────Hash─┐
│ 1636742693 │
└────────────┘
```
## wordShingleSimHash {#wordshinglesimhash}
Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
wordShingleSimHash(string[, shinglesize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Result:
``` text
┌───────Hash─┐
│ 2328277067 │
└────────────┘
```
## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive}
Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
wordShingleSimHashCaseInsensitive(string[, shinglesize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Result:
``` text
┌───────Hash─┐
│ 2194812424 │
└────────────┘
```
## wordShingleSimHashUTF8 {#wordshinglesimhashutf8}
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
wordShingleSimHashUTF8(string[, shinglesize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optinal. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Result:
``` text
┌───────Hash─┐
│ 2328277067 │
└────────────┘
```
## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8}
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive.
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
**Syntax**
``` sql
wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Hash value.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Result:
``` text
┌───────Hash─┐
│ 2194812424 │
└────────────┘
```
## ngramMinHash {#ngramminhash}
Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
ngramMinHash(string[, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT ngramMinHash('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple──────────────────────────────────────┐
│ (18333312859352735453,9054248444481805918) │
└────────────────────────────────────────────┘
```
## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive}
Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
ngramMinHashCaseInsensitive(string[, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple──────────────────────────────────────┐
│ (2106263556442004574,13203602793651726206) │
└────────────────────────────────────────────┘
```
## ngramMinHashUTF8 {#ngramminhashutf8}
Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
ngramMinHashUTF8(string[, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT ngramMinHashUTF8('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple──────────────────────────────────────┐
│ (18333312859352735453,6742163577938632877) │
└────────────────────────────────────────────┘
```
## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8}
Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple───────────────────────────────────────┐
│ (12493625717655877135,13203602793651726206) │
└─────────────────────────────────────────────┘
```
## ngramMinHashArg {#ngramminhasharg}
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHash](#ngramminhash) function with the same input. Is case sensitive.
**Syntax**
``` sql
ngramMinHashArg(string[, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` n-grams each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT ngramMinHashArg('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive}
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) function with the same input. Is case insensitive.
**Syntax**
``` sql
ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` n-grams each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## ngramMinHashArgUTF8 {#ngramminhashargutf8}
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashUTF8](#ngramminhashutf8) function with the same input. Is case sensitive.
**Syntax**
``` sql
ngramMinHashArgUTF8(string[, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` n-grams each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8}
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) function with the same input. Is case insensitive.
**Syntax**
``` sql
ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` n-grams each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHash {#wordshingleminhash}
Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
wordShingleMinHash(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Result:
``` text
┌─Tuple──────────────────────────────────────┐
│ (16452112859864147620,5844417301642981317) │
└────────────────────────────────────────────┘
```
## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive}
Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────┐
│ (3065874883688416519,1634050779997673240) │
└───────────────────────────────────────────┘
```
## wordShingleMinHashUTF8 {#wordshingleminhashutf8}
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
wordShingleMinHashUTF8(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Result:
``` text
┌─Tuple──────────────────────────────────────┐
│ (16452112859864147620,5844417301642981317) │
└────────────────────────────────────────────┘
```
## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8}
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
**Syntax**
``` sql
wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two hashes — the minimum and the maximum.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Example**
Query:
``` sql
SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────┐
│ (3065874883688416519,1634050779997673240) │
└───────────────────────────────────────────┘
```
## wordShingleMinHashArg {#wordshingleminhasharg}
Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordshingleMinHash](#wordshingleminhash) function with the same input. Is case sensitive.
**Syntax**
``` sql
wordShingleMinHashArg(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` word shingles each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────┐
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
└───────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive}
Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) function with the same input. Is case insensitive.
**Syntax**
``` sql
wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` word shingles each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Result:
``` text
┌─Tuple──────────────────────────────────────────────────────────────────┐
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
└────────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8}
Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashUTF8](#wordshingleminhashutf8) function with the same input. Is case sensitive.
**Syntax**
``` sql
wordShingleMinHashArgUTF8(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` word shingles each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Result:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────┐
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
└───────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8}
Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) function with the same input. Is case insensitive.
**Syntax**
``` sql
wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum])
```
**Arguments**
- `string` — String. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Returned value**
- Tuple with two tuples with `hashnum` word shingles each.
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Example**
Query:
``` sql
SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Result:
``` text
┌─Tuple──────────────────────────────────────────────────────────────────┐
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
└────────────────────────────────────────────────────────────────────────┘
```

View File

@ -111,4 +111,55 @@ Result:
- [Tuple](../../sql-reference/data-types/tuple.md)
[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-functions/) <!--hide-->
## tupleHammingDistance {#tuplehammingdistance}
Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between two tuples of the same size.
**Syntax**
``` sql
tupleHammingDistance(tuple1, tuple2)
```
**Arguments**
- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md).
- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md).
Tuples should have the same type of the elements.
**Returned value**
- The Hamming distance.
Type: [UInt8](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance;
```
Result:
``` text
┌─HammingDistance─┐
│ 2 │
└─────────────────┘
```
Can be used with [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) functions for detection of semi-duplicate strings:
``` sql
SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string);
```
Result:
``` text
┌─HammingDistance─┐
│ 2 │
└─────────────────┘
```

View File

@ -40,7 +40,7 @@ Read about setting the partition expression in a section [How to specify the par
After the query is executed, you can do whatever you want with the data in the `detached` directory — delete it from the file system, or just leave it.
This query is replicated it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replica.
This query is replicated it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replicas (as multiple leaders are allowed).
## DROP PARTITION\|PART {#alter_drop-partition}
@ -85,9 +85,15 @@ ALTER TABLE visits ATTACH PART 201901_2_2_0;
Read more about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr).
This query is replicated. The replica-initiator checks whether there is data in the `detached` directory. If data exists, the query checks its integrity. If everything is correct, the query adds the data to the table. All other replicas download the data from the replica-initiator.
This query is replicated. The replica-initiator checks whether there is data in the `detached` directory.
If data exists, the query checks its integrity. If everything is correct, the query adds the data to the table.
So you can put data to the `detached` directory on one replica, and use the `ALTER ... ATTACH` query to add it to the table on all replicas.
If the non-initiator replica, receiving the attach command, finds the part with the correct checksums in its own
`detached` folder, it attaches the data without fetching it from other replicas.
If there is no part with the correct checksums, the data is downloaded from any replica having the part.
You can put data to the `detached` directory on one replica and use the `ALTER ... ATTACH` query to add it to the
table on all replicas.
## ATTACH PARTITION FROM {#alter_attach-partition-from}
@ -95,7 +101,8 @@ So you can put data to the `detached` directory on one replica, and use the `ALT
ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1
```
This query copies the data partition from the `table1` to `table2` adds data to exsisting in the `table2`. Note that data wont be deleted from `table1`.
This query copies the data partition from the `table1` to `table2`.
Note that data won't be deleted neither from `table1` nor from `table2`.
For the query to run successfully, the following conditions must be met:

View File

@ -264,6 +264,10 @@ Wait until a `ReplicatedMergeTree` table will be synced with other replicas in a
SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name
```
After running this statement the `[db.]replicated_merge_tree_family_table_name` fetches commands from
the common replicated log into its own replication queue, and then the query waits till the replica processes all
of the fetched commands.
### RESTART REPLICA {#query_language-system-restart-replica}
Provides possibility to reinitialize Zookeeper sessions state for `ReplicatedMergeTree` table, will compare current state with Zookeeper as source of true and add tasks to Zookeeper queue if needed

View File

@ -22,15 +22,17 @@ You can use table functions in:
You cant use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled.
| Function | Description |
|-----------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
| [file](../../sql-reference/table-functions/file.md) | Creates a File-engine table. |
| [merge](../../sql-reference/table-functions/merge.md) | Creates a Merge-engine table. |
|------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
| [file](../../sql-reference/table-functions/file.md) | Creates a [File](../../engines/table-engines/special/file.md)-engine table. |
| [merge](../../sql-reference/table-functions/merge.md) | Creates a [Merge](../../engines/table-engines/special/merge.md)-engine table. |
| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. |
| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a Distributed-engine table. |
| [url](../../sql-reference/table-functions/url.md) | Creates a URL-engine table. |
| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a MySQL-engine table. |
| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a PostgreSQL-engine table. |
| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a JDBC-engine table. |
| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a ODBC-engine table. |
| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a HDFS-engine table. |
| [s3](../../sql-reference/table-functions/s3.md) | Creates a S3-engine table. |
| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md)-engine table. |
| [url](../../sql-reference/table-functions/url.md) | Creates a [Url](../../engines/table-engines/special/url.md)-engine table. |
| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a [MySQL](../../engines/table-engines/integrations/mysql.md)-engine table. |
| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)-engine table. |
| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a [JDBC](../../engines/table-engines/integrations/jdbc.md)-engine table. |
| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a [ODBC](../../engines/table-engines/integrations/odbc.md)-engine table. |
| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a [HDFS](../../engines/table-engines/integrations/hdfs.md)-engine table. |
| [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. |
[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/) <!--hide-->

View File

@ -10,33 +10,17 @@ Allows `SELECT` and `INSERT` queries to be performed on data that is stored on a
**Syntax**
``` sql
postgresql('host:port', 'database', 'table', 'user', 'password')
postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`])
```
**Arguments**
- `host:port` — PostgreSQL server address.
- `database` — Remote database name.
- `table` — Remote table name.
- `user` — PostgreSQL user.
- `password` — User password.
SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server.
All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes.
INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
PostgreSQL Array types converts into ClickHouse arrays.
Be careful in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows.
- `schema` — Non-default table schema. Optional.
**Returned Value**
@ -45,6 +29,23 @@ A table object with the same columns as the original PostgreSQL table.
!!! info "Note"
In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below.
## Implementation Details {#implementation-details}
`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server.
All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes.
`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
PostgreSQL Array types converts into ClickHouse arrays.
!!! info "Note"
Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows.
Supports replicas priority for PostgreSQL dictionary source. The bigger the number in map, the less the priority. The highest priority is `0`.
**Examples**
Table in PostgreSQL:
@ -60,10 +61,10 @@ PRIMARY KEY (int_id));
CREATE TABLE
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
INSERT 0 1
postgresql> select * from test;
postgresql> SELECT * FROM test;
int_id | int_nullable | float | str | float_nullable
--------+--------------+-------+------+----------------
1 | | 2 | test |
@ -96,9 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p
└────────┴──────────────┴───────┴──────┴────────────────┘
```
Using Non-default Schema:
```text
postgres=# CREATE SCHEMA "nice.schema";
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
```
```sql
CREATE TABLE pg_table_schema_with_dots (a UInt32)
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
```
**See Also**
- [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md)
- [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md)
- [Using PostgreSQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/postgresql/) <!--hide-->

View File

@ -4,7 +4,7 @@ toc_priority: 27
toc_title: "Введение"
---
# Движки баз данных {#dvizhki-baz-dannykh}
# Движки баз данных {#database-engines}
Движки баз данных обеспечивают работу с таблицами.
@ -18,3 +18,5 @@ toc_title: "Введение"
- [Lazy](../../engines/database-engines/lazy.md)
- [PostgreSQL](../../engines/database-engines/postgresql.md)

View File

@ -0,0 +1,138 @@
---
toc_priority: 35
toc_title: PostgreSQL
---
# PostgreSQL {#postgresql}
Позволяет подключаться к БД на удаленном сервере [PostgreSQL](https://www.postgresql.org). Поддерживает операции чтения и записи (запросы `SELECT` и `INSERT`) для обмена данными между ClickHouse и PostgreSQL.
Позволяет в реальном времени получать от удаленного сервера PostgreSQL информацию о таблицах БД и их структуре с помощью запросов `SHOW TABLES` и `DESCRIBE TABLE`.
Поддерживает операции изменения структуры таблиц (`ALTER TABLE ... ADD|DROP COLUMN`). Если параметр `use_table_cache` (см. ниже раздел Параметры движка) установлен в значение `1`, структура таблицы кешируется, и изменения в структуре не отслеживаются, но будут обновлены, если выполнить команды `DETACH` и `ATTACH`.
## Создание БД {#creating-a-database}
``` sql
CREATE DATABASE test_database
ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]);
```
**Параметры движка**
- `host:port` — адрес сервера PostgreSQL.
- `database` — имя удаленной БД.
- `user` — пользователь PostgreSQL.
- `password` — пароль пользователя.
- `use_table_cache` — определяет кеширование структуры таблиц БД. Необязательный параметр. Значение по умолчанию: `0`.
## Поддерживаемые типы данных {#data_types-support}
| PostgerSQL | ClickHouse |
|------------------|--------------------------------------------------------------|
| DATE | [Date](../../sql-reference/data-types/date.md) |
| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) |
| REAL | [Float32](../../sql-reference/data-types/float.md) |
| DOUBLE | [Float64](../../sql-reference/data-types/float.md) |
| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) |
| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) |
| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) |
| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) |
| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) |
| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) |
| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) |
| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))|
| ARRAY | [Array](../../sql-reference/data-types/array.md) |
## Примеры использования {#examples-of-use}
Обмен данными между БД ClickHouse и сервером PostgreSQL:
``` sql
CREATE DATABASE test_database
ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1);
```
``` sql
SHOW DATABASES;
```
``` text
┌─name──────────┐
│ default │
│ test_database │
│ system │
└───────────────┘
```
``` sql
SHOW TABLES FROM test_database;
```
``` text
┌─name───────┐
│ test_table │
└────────────┘
```
Чтение данных из таблицы PostgreSQL:
``` sql
SELECT * FROM test_database.test_table;
```
``` text
┌─id─┬─value─┐
│ 1 │ 2 │
└────┴───────┘
```
Запись данных в таблицу PostgreSQL:
``` sql
INSERT INTO test_database.test_table VALUES (3,4);
SELECT * FROM test_database.test_table;
```
``` text
┌─int_id─┬─value─┐
│ 1 │ 2 │
│ 3 │ 4 │
└────────┴───────┘
```
Пусть структура таблицы была изменена в PostgreSQL:
``` sql
postgre> ALTER TABLE test_table ADD COLUMN data Text
```
Поскольку при создании БД параметр `use_table_cache` был установлен в значение `1`, структура таблицы в ClickHouse была кеширована и поэтому не изменилась:
``` sql
DESCRIBE TABLE test_database.test_table;
```
``` text
┌─name───┬─type──────────────┐
│ id │ Nullable(Integer) │
│ value │ Nullable(Integer) │
└────────┴───────────────────┘
```
После того как таблицу «отцепили» и затем снова «прицепили», структура обновилась:
``` sql
DETACH TABLE test_database.test_table;
ATTACH TABLE test_database.test_table;
DESCRIBE TABLE test_database.test_table;
```
``` text
┌─name───┬─type──────────────┐
│ id │ Nullable(Integer) │
│ value │ Nullable(Integer) │
│ data │ Nullable(String) │
└────────┴───────────────────┘
```
[Оригинальная статья](https://clickhouse.tech/docs/ru/database-engines/postgresql/) <!--hide-->

View File

@ -16,7 +16,7 @@ toc_title: "Введение"
- Возможно ли многопоточное выполнение запроса.
- Параметры репликации данных.
## Семейства движков {#semeistva-dvizhkov}
## Семейства движков {#engine-families}
### MergeTree {#mergetree}
@ -42,18 +42,23 @@ toc_title: "Введение"
- [StripeLog](log-family/stripelog.md#stripelog)
- [Log](log-family/log.md#log)
### Движки для интеграции {#dvizhki-dlia-integratsii}
### Движки для интеграции {#integration-engines}
Движки для связи с другими системами хранения и обработки данных.
Движки семейства:
- [Kafka](integrations/kafka.md#kafka)
- [MySQL](integrations/mysql.md#mysql)
- [ODBC](integrations/odbc.md#table-engine-odbc)
- [JDBC](integrations/jdbc.md#table-engine-jdbc)
- [ODBC](../../engines/table-engines/integrations/odbc.md)
- [JDBC](../../engines/table-engines/integrations/jdbc.md)
- [MySQL](../../engines/table-engines/integrations/mysql.md)
- [MongoDB](../../engines/table-engines/integrations/mongodb.md)
- [HDFS](../../engines/table-engines/integrations/hdfs.md)
- [Kafka](../../engines/table-engines/integrations/kafka.md)
- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md)
- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md)
- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)
### Специальные движки {#spetsialnye-dvizhki}
### Специальные движки {#special-engines}
Движки семейства:

View File

@ -1,5 +1,5 @@
---
toc_priority: 6
toc_priority: 9
toc_title: EmbeddedRocksDB
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 4
toc_priority: 6
toc_title: HDFS
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 2
toc_priority: 3
toc_title: JDBC
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 5
toc_priority: 8
toc_title: Kafka
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 7
toc_priority: 5
toc_title: MongoDB
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 3
toc_priority: 4
toc_title: MySQL
---

View File

@ -1,5 +1,5 @@
---
toc_priority: 1
toc_priority: 2
toc_title: ODBC
---

View File

@ -1,11 +1,11 @@
---
toc_priority: 8
toc_priority: 11
toc_title: PostgreSQL
---
# PosgtreSQL {#postgresql}
#PostgreSQL {#postgresql}
Движок PostgreSQL позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере.
Движок PostgreSQL позволяет выполнять запросы `SELECT` и `INSERT` для таблиц на удаленном сервере PostgreSQL.
## Создание таблицы {#creating-a-table}
@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1],
name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2],
...
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password');
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]);
```
Смотрите подробное описание запроса [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query).
@ -29,25 +29,51 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
**Параметры движка**
- `host:port` — адрес сервера PostgreSQL.
- `database` — Имя базы данных на сервере PostgreSQL.
- `table` — Имя таблицы.
- `user` — Имя пользователя PostgreSQL.
- `password` — Пароль пользователя PostgreSQL.
- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент.
SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса.
## Особенности реализации {#implementation-details}
Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера.
Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`.
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился.
Простые условия для `WHERE`, такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN`, исполняются на стороне PostgreSQL сервера.
INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса.
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того, как запрос к PostgreSQL закончился.
Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`.
PostgreSQL массивы конвертируются в массивы ClickHouse.
Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
!!! info "Внимание"
Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`.
В примере ниже реплика `example01-1` имеет более высокий приоритет:
```xml
<postgresql>
<port>5432</port>
<user>clickhouse</user>
<password>qwerty</password>
<replica>
<host>example01-1</host>
<priority>1</priority>
</replica>
<replica>
<host>example01-2</host>
<priority>2</priority>
</replica>
<db>db_name</db>
<table>table_name</table>
<where>id=10</where>
<invalidate_query>SQL_QUERY</invalidate_query>
</postgresql>
</source>
```
## Пример использования {#usage-example}
@ -64,17 +90,17 @@ PRIMARY KEY (int_id));
CREATE TABLE
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
INSERT 0 1
postgresql> select * from test;
postgresql> SELECT * FROM test;
int_id | int_nullable | float | str | float_nullable
--------+--------------+-------+------+----------------
1 | | 2 | test |
(1 row)
```
Таблица в ClickHouse, получение данных из PostgreSQL таблицы созданной выше:
Таблица в ClickHouse, получение данных из PostgreSQL таблицы, созданной выше:
``` sql
CREATE TABLE default.postgresql_table
@ -87,19 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre
```
``` sql
SELECT * FROM postgresql_table WHERE str IN ('test')
SELECT * FROM postgresql_table WHERE str IN ('test');
```
``` text
┌─float_nullable─┬─str──┬─int_id─┐
│ ᴺᵁᴸᴸ │ test │ 1 │
└────────────────┴──────┴────────┘
1 rows in set. Elapsed: 0.019 sec.
```
Using Non-default Schema:
## Смотри также {#see-also}
```text
postgres=# CREATE SCHEMA "nice.schema";
- [Табличная функция postgresql](../../../sql-reference/table-functions/postgresql.md)
- [Использование PostgreSQL в качестве истояника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
```
```sql
CREATE TABLE pg_table_schema_with_dots (a UInt32)
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
```
**См. также**
- [Табличная функция `postgresql`](../../../sql-reference/table-functions/postgresql.md)
- [Использование PostgreSQL в качестве источника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/postgresql/) <!--hide-->

View File

@ -481,7 +481,15 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
## max_concurrent_queries {#max-concurrent-queries}
Максимальное количество одновременно обрабатываемых запросов.
Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries).
!!! info "Примечание"
Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений.
Возможные значения:
- Положительное целое число.
- 0 — выключена.
**Пример**
@ -509,6 +517,21 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
- [max_concurrent_queries](#max-concurrent-queries)
## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries}
Определяет минимальное количество засечек, считываемых запросом для применения настройки [max_concurrent_queries](#max-concurrent-queries).
Возможные значения:
- Положительное целое число.
- 0 — выключена.
**Пример**
``` xml
<min_marks_to_honor_max_concurrent_queries>10</min_marks_to_honor_max_concurrent_queries>
```
## max_connections {#max-connections}
Максимальное количество входящих соединений.
@ -1159,4 +1182,3 @@ ClickHouse использует ZooKeeper для хранения метадан
</roles>
</ldap>
```

View File

@ -1792,6 +1792,19 @@ ClickHouse генерирует исключение
- [Движок Distributed](../../engines/table-engines/special/distributed.md#distributed)
- [Управление распределёнными таблицами](../../sql-reference/statements/system.md#query-language-system-distributed)
## insert_distributed_one_random_shard {#insert_distributed_one_random_shard}
Включает или отключает режим вставки данных в [Distributed](../../engines/table-engines/special/distributed.md#distributed)) таблицу в случайный шард при отсутствии ключ шардирования.
По умолчанию при вставке данных в `Distributed` таблицу с несколькими шардами и при отсутствии ключа шардирования сервер ClickHouse будет отклонять любой запрос на вставку данных. Когда `insert_distributed_one_random_shard = 1`, вставки принимаются, а данные записываются в случайный шард.
Возможные значения:
- 0 — если у таблицы несколько шардов, но ключ шардирования отсутствует, вставка данных отклоняется.
- 1 — если ключ шардирования отсутствует, то вставка данных осуществляется в случайный шард среди всех доступных шардов.
Значение по умолчанию: `0`.
## insert_shard_id {#insert_shard_id}
Если не `0`, указывает, в какой шард [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицы данные будут вставлены синхронно.

View File

@ -69,6 +69,7 @@ SETTINGS(format_csv_allow_single_quotes = 0)
- [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse)
- [MongoDB](#dicts-external_dicts_dict_sources-mongodb)
- [Redis](#dicts-external_dicts_dict_sources-redis)
- [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql)
## Локальный файл {#dicts-external_dicts_dict_sources-local_file}

View File

@ -1111,6 +1111,78 @@ SELECT
Функция `arrayFilter` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен.
## arrayFill(func, arr1, …) {#array-fill}
Перебирает `arr1` от первого элемента к последнему и заменяет `arr1[i]` на `arr1[i - 1]`, если `func` вернула 0. Первый элемент `arr1` остаётся неизменным.
Примеры:
``` sql
SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res
```
``` text
┌─res──────────────────────────────┐
│ [1,1,3,11,12,12,12,5,6,14,14,14] │
└──────────────────────────────────┘
```
Функция `arrayFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен.
## arrayReverseFill(func, arr1, …) {#array-reverse-fill}
Перебирает `arr1` от последнего элемента к первому и заменяет `arr1[i]` на `arr1[i + 1]`, если `func` вернула 0. Последний элемент `arr1` остаётся неизменным.
Примеры:
``` sql
SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res
```
``` text
┌─res────────────────────────────────┐
│ [1,3,3,11,12,5,5,5,6,14,NULL,NULL] │
└────────────────────────────────────┘
```
Функция `arrayReverseFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен.
## arraySplit(func, arr1, …) {#array-split}
Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в левую часть. Массив не разбивается по первому элементу.
Примеры:
``` sql
SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res
```
``` text
┌─res─────────────┐
│ [[1,2,3],[4,5]] │
└─────────────────┘
```
Функция `arraySplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен.
## arrayReverseSplit(func, arr1, …) {#array-reverse-split}
Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в правую часть. Массив не разбивается по последнему элементу.
Примеры:
``` sql
SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res
```
``` text
┌─res───────────────┐
│ [[1],[2,3,4],[5]] │
└───────────────────┘
```
Функция `arrayReverseSplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен.
## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1}
Возвращает 1, если существует хотя бы один элемент массива `arr`, для которого функция func возвращает не 0. Иначе возвращает 0.

View File

@ -240,3 +240,53 @@ SELECT bitCount(333);
└───────────────┘
```
## bitHammingDistance {#bithammingdistance}
Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между битовыми представлениями двух целых чисел. Может быть использовано с функциями [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) для проверки двух строк на схожесть. Чем меньше расстояние, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
bitHammingDistance(int1, int2)
```
**Аргументы**
- `int1` — первое целое число. [Int64](../../sql-reference/data-types/int-uint.md).
- `int2` — второе целое число. [Int64](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Расстояние Хэмминга.
Тип: [UInt8](../../sql-reference/data-types/int-uint.md).
**Примеры**
Запрос:
``` sql
SELECT bitHammingDistance(111, 121);
```
Результат:
``` text
┌─bitHammingDistance(111, 121)─┐
│ 3 │
└──────────────────────────────┘
```
Используя [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash):
``` sql
SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'));
```
Результат:
``` text
┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐
│ 5 │
└──────────────────────────────────────────────────────────────────────────────┘
```

View File

@ -7,6 +7,8 @@ toc_title: "Функции хэширования"
Функции хэширования могут использоваться для детерминированного псевдослучайного разбрасывания элементов.
Simhash это хеш-функция, которая для близких значений возвращает близкий хеш.
## halfMD5 {#hash-functions-halfmd5}
[Интерпретирует](../../sql-reference/functions/hash-functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш [MD5](https://ru.wikipedia.org/wiki/MD5) для каждой из них. Затем объединяет хэши, берет первые 8 байт хэша результирующей строки и интерпретирует их как значение типа `UInt64` с big-endian порядком байтов.
@ -484,3 +486,937 @@ SELECT xxHash32('Hello, world!');
- [xxHash](http://cyan4973.github.io/xxHash/).
## ngramSimHash {#ngramsimhash}
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
ngramSimHash(string[, ngramsize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT ngramSimHash('ClickHouse') AS Hash;
```
Результат:
``` text
┌───────Hash─┐
│ 1627567969 │
└────────────┘
```
## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive}
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
ngramSimHashCaseInsensitive(string[, ngramsize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash;
```
Результат:
``` text
┌──────Hash─┐
│ 562180645 │
└───────────┘
```
## ngramSimHashUTF8 {#ngramsimhashutf8}
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
ngramSimHashUTF8(string[, ngramsize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT ngramSimHashUTF8('ClickHouse') AS Hash;
```
Результат:
``` text
┌───────Hash─┐
│ 1628157797 │
└────────────┘
```
## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8}
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
ngramSimHashCaseInsensitiveUTF8(string[, ngramsize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash;
```
Результат:
``` text
┌───────Hash─┐
│ 1636742693 │
└────────────┘
```
## wordShingleSimHash {#wordshinglesimhash}
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
wordShingleSimHash(string[, shinglesize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Результат:
``` text
┌───────Hash─┐
│ 2328277067 │
└────────────┘
```
## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive}
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
wordShingleSimHashCaseInsensitive(string[, shinglesize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Результат:
``` text
┌───────Hash─┐
│ 2194812424 │
└────────────┘
```
## wordShingleSimHashUTF8 {#wordshinglesimhashutf8}
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
wordShingleSimHashUTF8(string[, shinglesize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Результат:
``` text
┌───────Hash─┐
│ 2328277067 │
└────────────┘
```
## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8}
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
**Синтаксис**
``` sql
wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Значение хеш-функции от строки.
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
```
Результат:
``` text
┌───────Hash─┐
│ 2194812424 │
└────────────┘
```
## ngramMinHash {#ngramminhash}
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
ngramMinHash(string[, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT ngramMinHash('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple──────────────────────────────────────┐
│ (18333312859352735453,9054248444481805918) │
└────────────────────────────────────────────┘
```
## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive}
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
ngramMinHashCaseInsensitive(string[, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple──────────────────────────────────────┐
│ (2106263556442004574,13203602793651726206) │
└────────────────────────────────────────────┘
```
## ngramMinHashUTF8 {#ngramminhashutf8}
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
ngramMinHashUTF8(string[, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT ngramMinHashUTF8('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple──────────────────────────────────────┐
│ (18333312859352735453,6742163577938632877) │
└────────────────────────────────────────────┘
```
## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8}
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple───────────────────────────────────────┐
│ (12493625717655877135,13203602793651726206) │
└─────────────────────────────────────────────┘
```
## ngramMinHashArg {#ngramminhasharg}
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHash](#ngramminhash) с теми же входными данными. Функция регистрозависимая.
**Синтаксис**
``` sql
ngramMinHashArg(string[, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT ngramMinHashArg('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive}
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая.
**Синтаксис**
``` sql
ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## ngramMinHashArgUTF8 {#ngramminhashargutf8}
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashUTF8](#ngramminhashutf8) с теми же входными данными. Функция регистрозависимая.
**Синтаксис**
``` sql
ngramMinHashArgUTF8(string[, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8}
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая.
**Синтаксис**
``` sql
ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │
└───────────────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHash {#wordshingleminhash}
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
wordShingleMinHash(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Результат:
``` text
┌─Tuple──────────────────────────────────────┐
│ (16452112859864147620,5844417301642981317) │
└────────────────────────────────────────────┘
```
## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive}
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────┐
│ (3065874883688416519,1634050779997673240) │
└───────────────────────────────────────────┘
```
## wordShingleMinHashUTF8 {#wordshingleminhashutf8}
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
wordShingleMinHashUTF8(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Результат:
``` text
┌─Tuple──────────────────────────────────────┐
│ (16452112859864147620,5844417301642981317) │
└────────────────────────────────────────────┘
```
## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8}
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
**Синтаксис**
``` sql
wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж с двумя хешами — минимальным и максимальным.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────┐
│ (3065874883688416519,1634050779997673240) │
└───────────────────────────────────────────┘
```
## wordShingleMinHashArg {#wordshingleminhasharg}
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordshingleMinHash](#wordshingleminhash) с теми же входными данными. Функция регистрозависимая.
**Синтаксис**
``` sql
wordShingleMinHashArg(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────┐
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
└───────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive}
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая.
**Синтаксис**
``` sql
wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Результат:
``` text
┌─Tuple──────────────────────────────────────────────────────────────────┐
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
└────────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8}
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashUTF8](#wordshingleminhashutf8) с теми же входными данными. Функция регистрозависимая.
**Синтаксис**
``` sql
wordShingleMinHashArgUTF8(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Результат:
``` text
┌─Tuple─────────────────────────────────────────────────────────────────┐
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
└───────────────────────────────────────────────────────────────────────┘
```
## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8}
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая.
**Синтаксис**
``` sql
wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum])
```
**Аргументы**
- `string` — строка. [String](../../sql-reference/data-types/string.md).
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
**Возвращаемое значение**
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
**Пример**
Запрос:
``` sql
SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
```
Результат:
``` text
┌─Tuple──────────────────────────────────────────────────────────────────┐
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
└────────────────────────────────────────────────────────────────────────┘
```

View File

@ -111,3 +111,55 @@ SELECT untuple((* EXCEPT (v2, v3),)) FROM kv;
- [Tuple](../../sql-reference/data-types/tuple.md)
## tupleHammingDistance {#tuplehammingdistance}
Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между двумя кортежами одинакового размера.
**Синтаксис**
``` sql
tupleHammingDistance(tuple1, tuple2)
```
**Аргументы**
- `tuple1` — первый кортеж. [Tuple](../../sql-reference/data-types/tuple.md).
- `tuple2` — второй кортеж. [Tuple](../../sql-reference/data-types/tuple.md).
Кортежи должны иметь одинаковый размер и тип элементов.
**Возвращаемое значение**
- Расстояние Хэмминга.
Тип: [UInt8](../../sql-reference/data-types/int-uint.md).
**Примеры**
Запрос:
``` sql
SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance;
```
Результат:
``` text
┌─HammingDistance─┐
│ 2 │
└─────────────────┘
```
Может быть использовано с функциями [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) для проверки строк на совпадение:
``` sql
SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string);
```
Результат:
``` text
┌─HammingDistance─┐
│ 2 │
└─────────────────┘
```

View File

@ -117,7 +117,7 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] [AFTER name_after | F
- TTL
Примеры изменения TTL столбца смотрите в разделе [TTL столбца](ttl.md#mergetree-column-ttl).
Примеры изменения TTL столбца смотрите в разделе [TTL столбца](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl).
Если указано `IF EXISTS`, запрос не возвращает ошибку, если столбца не существует.

View File

@ -5,43 +5,46 @@ toc_title: postgresql
# postgresql {#postgresql}
Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере.
Позволяет выполнять запросы `SELECT` и `INSERT` над таблицами удаленной БД PostgreSQL.
**Синтаксис**
``` sql
postgresql('host:port', 'database', 'table', 'user', 'password')
postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`])
```
**Параметры**
**Аргументы**
- `host:port` — адрес сервера PostgreSQL.
- `database` — имя базы данных на удалённом сервере.
- `table` — имя таблицы на удалённом сервере.
- `user` — пользователь PostgreSQL.
- `password` — пароль пользователя.
SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса.
Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера.
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился.
INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса.
PostgreSQL массивы конвертируются в массивы ClickHouse.
Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент.
**Возвращаемое значение**
Объект таблицы с теми же столбцами, что и в исходной таблице PostgreSQL.
Таблица с теми же столбцами, что и в исходной таблице PostgreSQL.
!!! info "Примечание"
В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. See examples below.
В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже.
## Особенности реализации {#implementation-details}
Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`.
Простые условия для `WHERE` такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN` исполняются на стороне PostgreSQL сервера.
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился.
Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`.
PostgreSQL массивы конвертируются в массивы ClickHouse.
!!! info "Примечание"
Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`.
**Примеры**
@ -58,10 +61,10 @@ PRIMARY KEY (int_id));
CREATE TABLE
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
INSERT 0 1
postgresql> select * from test;
postgresql> SELECT * FROM test;
int_id | int_nullable | float | str | float_nullable
--------+--------------+-------+------+----------------
1 | | 2 | test |
@ -80,7 +83,7 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p
└────────┴──────────────┴───────┴──────┴────────────────┘
```
Вставка:
Вставка данных:
```sql
INSERT INTO TABLE FUNCTION postgresql('localhost:5432', 'test', 'test', 'postgrsql_user', 'password') (int_id, float) VALUES (2, 3);
@ -94,7 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p
└────────┴──────────────┴───────┴──────┴────────────────┘
```
**Смотрите также**
Using Non-default Schema:
- [Движок таблиц PostgreSQL](../../sql-reference/table-functions/postgresql.md)
```text
postgres=# CREATE SCHEMA "nice.schema";
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
```
```sql
CREATE TABLE pg_table_schema_with_dots (a UInt32)
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
```
**См. также**
- [Движок таблиц PostgreSQL](../../sql-reference/table-functions/postgresql.md)
- [Использование PostgreSQL как источника данных для внешнего словаря](../../sql-reference/table-functions/postgresql.md#dicts-external_dicts_dict_sources-postgresql)
[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/postgresql/) <!--hide-->

View File

@ -42,11 +42,16 @@ if (OS_LINUX)
set(RESOURCE_OBJS ${RESOURCE_OBJS} ${RESOURCE_OBJ})
# https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake
# PPC64LE fails to do this with objcopy, use ld or lld instead
if (ARCH_PPC64LE)
add_custom_command(OUTPUT ${RESOURCE_OBJ}
COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${RESOURCE_FILE})
else()
add_custom_command(OUTPUT ${RESOURCE_OBJ}
COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}
COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents
${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ})
endif()
set_source_files_properties(${RESOURCE_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true)
endforeach(RESOURCE_FILE)

View File

@ -521,14 +521,17 @@ void HedgedConnections::processNewReplicaState(HedgedConnectionsFactory::State s
void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect)
{
/// It's important to remove file descriptor from epoll exactly before cancelling packet_receiver,
/// because otherwise another thread can try to receive a packet, get this file descriptor
/// from epoll and resume cancelled packet_receiver.
epoll.remove(replica.packet_receiver->getFileDescriptor());
epoll.remove(replica.change_replica_timeout.getDescriptor());
replica.packet_receiver->cancel();
replica.change_replica_timeout.reset();
epoll.remove(replica.packet_receiver->getFileDescriptor());
--offset_states[fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset].active_connection_count;
fd_to_replica_location.erase(replica.packet_receiver->getFileDescriptor());
epoll.remove(replica.change_replica_timeout.getDescriptor());
timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor());
--active_connection_count;

View File

@ -48,7 +48,7 @@ struct HashMapCell
value_type value;
HashMapCell() {}
HashMapCell() = default;
HashMapCell(const Key & key_, const State &) : value(key_, NoInitTag()) {}
HashMapCell(const value_type & value_, const State &) : value(value_) {}
@ -114,8 +114,39 @@ struct HashMapCell
static void move(HashMapCell * /* old_location */, HashMapCell * /* new_location */) {}
template <size_t I>
auto & get() & {
if constexpr (I == 0) return value.first;
else if constexpr (I == 1) return value.second;
}
template <size_t I>
auto const & get() const & {
if constexpr (I == 0) return value.first;
else if constexpr (I == 1) return value.second;
}
template <size_t I>
auto && get() && {
if constexpr (I == 0) return std::move(value.first);
else if constexpr (I == 1) return std::move(value.second);
}
};
namespace std
{
template <typename Key, typename TMapped, typename Hash, typename TState>
struct tuple_size<HashMapCell<Key, TMapped, Hash, TState>> : std::integral_constant<size_t, 2> { };
template <typename Key, typename TMapped, typename Hash, typename TState>
struct tuple_element<0, HashMapCell<Key, TMapped, Hash, TState>> { using type = Key; };
template <typename Key, typename TMapped, typename Hash, typename TState>
struct tuple_element<1, HashMapCell<Key, TMapped, Hash, TState>> { using type = TMapped; };
}
template <typename Key, typename TMapped, typename Hash, typename TState = HashTableNoState>
struct HashMapCellWithSavedHash : public HashMapCell<Key, TMapped, Hash, TState>
{
@ -227,6 +258,19 @@ public:
}
};
namespace std
{
template <typename Key, typename TMapped, typename Hash, typename TState>
struct tuple_size<HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> : std::integral_constant<size_t, 2> { };
template <typename Key, typename TMapped, typename Hash, typename TState>
struct tuple_element<0, HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> { using type = Key; };
template <typename Key, typename TMapped, typename Hash, typename TState>
struct tuple_element<1, HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> { using type = TMapped; };
}
template <
typename Key,

View File

@ -530,6 +530,31 @@ public:
this->c_end += bytes_to_copy;
}
template <typename ... TAllocatorParams>
void insertFromItself(iterator from_begin, iterator from_end, TAllocatorParams && ... allocator_params)
{
static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
/// Convert iterators to indexes because reserve can invalidate iterators
size_t start_index = from_begin - begin();
size_t end_index = from_end - begin();
size_t copy_size = end_index - start_index;
assert(start_index <= end_index);
size_t required_capacity = this->size() + copy_size;
if (required_capacity > this->capacity())
this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward<TAllocatorParams>(allocator_params)...);
size_t bytes_to_copy = this->byte_size(copy_size);
if (bytes_to_copy)
{
auto begin = this->c_start + this->byte_size(start_index);
memcpy(this->c_end, reinterpret_cast<const void *>(&*begin), bytes_to_copy);
this->c_end += bytes_to_copy;
}
}
template <typename It1, typename It2>
void insert_assume_reserved(It1 from_begin, It2 from_end)
{

View File

@ -35,7 +35,7 @@ std::string signalToErrorMessage(int sig, const siginfo_t & info, const ucontext
else
error << "Address: " << info.si_addr;
#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__)
#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) && !defined(__powerpc__)
auto err_mask = context.uc_mcontext.gregs[REG_ERR];
if ((err_mask & 0x02))
error << " Access: write.";
@ -186,6 +186,8 @@ static void * getCallerAddress(const ucontext_t & context)
# endif
#elif defined(__aarch64__)
return reinterpret_cast<void *>(context.uc_mcontext.pc);
#elif defined(__powerpc64__)
return reinterpret_cast<void *>(context.uc_mcontext.gp_regs[PT_NIP]);
#else
return nullptr;
#endif

View File

@ -33,6 +33,19 @@ TEST(Common, PODArrayInsert)
EXPECT_EQ(str, std::string(chars.data(), chars.size()));
}
TEST(Common, PODArrayInsertFromItself)
{
{
PaddedPODArray<UInt64> array { 1 };
for (size_t i = 0; i < 3; ++i)
array.insertFromItself(array.begin(), array.end());
PaddedPODArray<UInt64> expected {1,1,1,1,1,1,1,1};
ASSERT_EQ(array,expected);
}
}
TEST(Common, PODPushBackRawMany)
{
PODArray<char> chars;

View File

@ -228,6 +228,7 @@ class IColumn;
M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \
M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout", 0) \
M(Seconds, http_receive_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP receive timeout", 0) \
M(UInt64, http_max_uri_size, 16384, "Maximum URI length of HTTP request", 0) \
M(Bool, optimize_throw_if_noop, false, "If setting is enabled and OPTIMIZE query didn't actually assign a merge then an explanatory exception is thrown", 0) \
M(Bool, use_index_for_in_with_subqueries, true, "Try using an index if there is a subquery or a table expression on the right side of the IN operator.", 0) \
M(Bool, joined_subquery_requires_alias, true, "Force joined subqueries and table functions to have aliases for correct name qualification.", 0) \
@ -547,7 +548,7 @@ struct Settings : public BaseSettings<SettingsTraits>
{
/// For initialization from empty initializer-list to be "value initialization", not "aggregate initialization" in C++14.
/// http://en.cppreference.com/w/cpp/language/aggregate_initialization
Settings() {}
Settings() = default;
/** Set multiple settings from "profile" (in server configuration file (users.xml), profiles contain groups of multiple settings).
* The profile can also be set using the `set` functions, like the profile setting.

View File

@ -567,7 +567,7 @@ void DatabaseAtomic::renameDictionaryInMemoryUnlocked(const StorageID & old_name
auto result = external_loader.getLoadResult(toString(old_name.uuid));
if (!result.object)
return;
const auto & dict = dynamic_cast<const IDictionaryBase &>(*result.object);
const auto & dict = dynamic_cast<const IDictionary &>(*result.object);
dict.updateDictionaryName(new_name);
}
void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid)

View File

@ -49,7 +49,7 @@ void DatabaseWithDictionaries::attachDictionary(const String & dictionary_name,
/// Attach the dictionary as table too.
try
{
/// TODO Make StorageDictionary an owner of IDictionaryBase objects.
/// TODO Make StorageDictionary an owner of IDictionary objects.
/// All DDL operations with dictionaries will work with StorageDictionary table,
/// and StorageDictionary will be responsible for loading of DDL dictionaries.
/// ExternalLoaderDatabaseConfigRepository and other hacks related to ExternalLoader

View File

@ -13,7 +13,9 @@
#include <Common/HashTable/HashSet.h>
#include <Common/ProfileEvents.h>
#include <Common/ProfilingScopedRWLock.h>
#include <Dictionaries/DictionaryBlockInputStream.h>
#include <Dictionaries/HierarchyDictionariesUtils.h>
namespace ProfileEvents
{
@ -39,7 +41,6 @@ namespace DB
namespace ErrorCodes
{
extern const int CACHE_DICTIONARY_UPDATE_FAIL;
extern const int TYPE_MISMATCH;
extern const int UNSUPPORTED_METHOD;
}
@ -70,8 +71,6 @@ CacheDictionary<dictionary_key_type>::CacheDictionary(
{
if (!source_ptr->supportsSelectiveLoad())
throw Exception{full_name + ": source cannot be used with CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD};
setupHierarchicalAttribute();
}
template <DictionaryKeyType dictionary_key_type>
@ -120,164 +119,6 @@ const IDictionarySource * CacheDictionary<dictionary_key_type>::getSource() cons
return source_ptr.get();
}
template <DictionaryKeyType dictionary_key_type>
void CacheDictionary<dictionary_key_type>::toParent(const PaddedPODArray<UInt64> & ids [[maybe_unused]], PaddedPODArray<UInt64> & out [[maybe_unused]]) const
{
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
{
/// Run update on requested keys before fetch from storage
const auto & attribute_name = hierarchical_attribute->name;
auto result_type = std::make_shared<DataTypeUInt64>();
auto input_column = result_type->createColumn();
auto & input_column_typed = assert_cast<ColumnVector<UInt64> &>(*input_column);
auto & data = input_column_typed.getData();
data.insert(ids.begin(), ids.end());
auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr});
const auto & result_column_typed = assert_cast<const ColumnVector<UInt64> &>(*column);
const auto & result_data = result_column_typed.getData();
out.assign(result_data);
}
else
throw Exception("Hierarchy is not supported for complex key CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD);
}
/// Allow to use single value in same way as array.
static inline UInt64 getAt(const PaddedPODArray<UInt64> & arr, const size_t idx)
{
return arr[idx];
}
static inline UInt64 getAt(const UInt64 & value, const size_t)
{
return value;
}
template <DictionaryKeyType dictionary_key_type>
template <typename AncestorType>
void CacheDictionary<dictionary_key_type>::isInImpl(const PaddedPODArray<Key> & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
/// Transform all children to parents until ancestor id or null_value will be reached.
size_t out_size = out.size();
memset(out.data(), 0xFF, out_size); /// 0xFF means "not calculated"
const auto null_value = hierarchical_attribute->null_value.get<UInt64>();
PaddedPODArray<Key> children(out_size, 0);
PaddedPODArray<Key> parents(child_ids.begin(), child_ids.end());
for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
{
size_t out_idx = 0;
size_t parents_idx = 0;
size_t new_children_idx = 0;
while (out_idx < out_size)
{
/// Already calculated
if (out[out_idx] != 0xFF)
{
++out_idx;
continue;
}
/// No parent
if (parents[parents_idx] == null_value)
{
out[out_idx] = 0;
}
/// Found ancestor
else if (parents[parents_idx] == getAt(ancestor_ids, parents_idx))
{
out[out_idx] = 1;
}
/// Loop detected
else if (children[new_children_idx] == parents[parents_idx])
{
out[out_idx] = 1;
}
/// Found intermediate parent, add this value to search at next loop iteration
else
{
children[new_children_idx] = parents[parents_idx];
++new_children_idx;
}
++out_idx;
++parents_idx;
}
if (new_children_idx == 0)
break;
/// Transform all children to its parents.
children.resize(new_children_idx);
parents.resize(new_children_idx);
toParent(children, parents);
}
}
template <DictionaryKeyType dictionary_key_type>
void CacheDictionary<dictionary_key_type>::isInVectorVector(
const PaddedPODArray<UInt64> & child_ids, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_ids, ancestor_ids, out);
}
template <DictionaryKeyType dictionary_key_type>
void CacheDictionary<dictionary_key_type>::isInVectorConstant(const PaddedPODArray<UInt64> & child_ids, const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_ids, ancestor_id, out);
}
template <DictionaryKeyType dictionary_key_type>
void CacheDictionary<dictionary_key_type>::isInConstantVector(const UInt64 child_id, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
/// Special case with single child value.
const auto null_value = hierarchical_attribute->null_value.get<UInt64>();
PaddedPODArray<Key> child(1, child_id);
PaddedPODArray<Key> parent(1);
std::vector<Key> ancestors(1, child_id);
/// Iteratively find all ancestors for child.
for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
{
toParent(child, parent);
if (parent[0] == null_value)
break;
child[0] = parent[0];
ancestors.push_back(parent[0]);
}
/// Assuming short hierarchy, so linear search is Ok.
for (size_t i = 0, out_size = out.size(); i < out_size; ++i)
out[i] = std::find(ancestors.begin(), ancestors.end(), ancestor_ids[i]) != ancestors.end();
}
template <DictionaryKeyType dictionary_key_type>
void CacheDictionary<dictionary_key_type>::setupHierarchicalAttribute()
{
/// TODO: Move this to DictionaryStructure
for (const auto & attribute : dict_struct.attributes)
{
if (attribute.hierarchical)
{
hierarchical_attribute = &attribute;
if (attribute.underlying_type != AttributeUnderlyingType::utUInt64)
throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH};
}
}
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr CacheDictionary<dictionary_key_type>::getColumn(
const std::string & attribute_name,
@ -296,23 +137,6 @@ Columns CacheDictionary<dictionary_key_type>::getColumns(
const Columns & key_columns,
const DataTypes & key_types,
const Columns & default_values_columns) const
{
if (dictionary_key_type == DictionaryKeyType::complex)
dict_struct.validateKeyTypes(key_types);
Arena complex_keys_arena;
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, complex_keys_arena);
auto & keys = extractor.getKeys();
return getColumnsImpl(attribute_names, key_columns, keys, default_values_columns);
}
template <DictionaryKeyType dictionary_key_type>
Columns CacheDictionary<dictionary_key_type>::getColumnsImpl(
const Strings & attribute_names,
const Columns & key_columns,
const PaddedPODArray<KeyType> & keys,
const Columns & default_values_columns) const
{
/**
* Flow of getColumsImpl
@ -328,6 +152,13 @@ Columns CacheDictionary<dictionary_key_type>::getColumnsImpl(
* use default value.
*/
if (dictionary_key_type == DictionaryKeyType::complex)
dict_struct.validateKeyTypes(key_types);
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
auto keys = extractor.extractAllKeys();
DictionaryStorageFetchRequest request(dict_struct, attribute_names, default_values_columns);
FetchResult result_of_fetch_from_storage;
@ -440,9 +271,10 @@ ColumnUInt8::Ptr CacheDictionary<dictionary_key_type>::hasKeys(const Columns & k
if (dictionary_key_type == DictionaryKeyType::complex)
dict_struct.validateKeyTypes(key_types);
Arena complex_keys_arena;
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, complex_keys_arena);
const auto & keys = extractor.getKeys();
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
const auto keys = extractor.extractAllKeys();
/// We make empty request just to fetch if keys exists
DictionaryStorageFetchRequest request(dict_struct, {}, {});
@ -526,6 +358,37 @@ ColumnUInt8::Ptr CacheDictionary<dictionary_key_type>::hasKeys(const Columns & k
return result;
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr CacheDictionary<dictionary_key_type>::getHierarchy(
ColumnPtr key_column [[maybe_unused]],
const DataTypePtr & key_type [[maybe_unused]]) const
{
if (dictionary_key_type == DictionaryKeyType::simple)
{
auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type);
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
return result;
}
else
return nullptr;
}
template <DictionaryKeyType dictionary_key_type>
ColumnUInt8::Ptr CacheDictionary<dictionary_key_type>::isInHierarchy(
ColumnPtr key_column [[maybe_unused]],
ColumnPtr in_key_column [[maybe_unused]],
const DataTypePtr & key_type [[maybe_unused]]) const
{
if (dictionary_key_type == DictionaryKeyType::simple)
{
auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type);
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
return result;
}
else
return nullptr;
}
template <DictionaryKeyType dictionary_key_type>
MutableColumns CacheDictionary<dictionary_key_type>::aggregateColumnsInOrderOfKeys(
const PaddedPODArray<KeyType> & keys,
@ -618,19 +481,18 @@ MutableColumns CacheDictionary<dictionary_key_type>::aggregateColumns(
template <DictionaryKeyType dictionary_key_type>
BlockInputStreamPtr CacheDictionary<dictionary_key_type>::getBlockInputStream(const Names & column_names, size_t max_block_size) const
{
using BlockInputStreamType = DictionaryBlockInputStream<Key>;
std::shared_ptr<BlockInputStreamType> stream;
std::shared_ptr<DictionaryBlockInputStream> stream;
{
/// Write lock on storage
const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
stream = std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names);
stream = std::make_shared<DictionaryBlockInputStream>(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names);
else
{
auto keys = cache_storage_ptr->getCachedComplexKeys();
stream = std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, keys, column_names);
stream = std::make_shared<DictionaryBlockInputStream>(shared_from_this(), max_block_size, keys, column_names);
}
}
@ -660,14 +522,20 @@ void CacheDictionary<dictionary_key_type>::update(CacheDictionaryUpdateUnitPtr<d
size_t found_keys_size = 0;
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(update_unit_ptr->key_columns, update_unit_ptr->complex_key_arena);
const auto & requested_keys = requested_keys_extractor.getKeys();
Arena * complex_key_arena = update_unit_ptr->complex_keys_arena_holder.getComplexKeyArena();
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(update_unit_ptr->key_columns, complex_key_arena);
auto requested_keys = requested_keys_extractor.extractAllKeys();
HashSet<KeyType> not_found_keys;
std::vector<UInt64> requested_keys_vector;
std::vector<size_t> requested_complex_key_rows;
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
requested_keys_vector.reserve(requested_keys.size());
else
requested_complex_key_rows.reserve(requested_keys.size());
auto & key_index_to_state_from_storage = update_unit_ptr->key_index_to_state;
for (size_t i = 0; i < key_index_to_state_from_storage.size(); ++i)
@ -727,8 +595,8 @@ void CacheDictionary<dictionary_key_type>::update(CacheDictionaryUpdateUnitPtr<d
block_columns.erase(block_columns.begin());
}
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, update_unit_ptr->complex_key_arena);
const auto & keys_extracted_from_block = keys_extractor.getKeys();
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, complex_key_arena);
auto keys_extracted_from_block = keys_extractor.extractAllKeys();
for (size_t index_of_attribute = 0; index_of_attribute < fetched_columns_during_update.size(); ++index_of_attribute)
{
@ -740,6 +608,7 @@ void CacheDictionary<dictionary_key_type>::update(CacheDictionaryUpdateUnitPtr<d
for (size_t i = 0; i < keys_extracted_from_block.size(); ++i)
{
auto fetched_key_from_source = keys_extracted_from_block[i];
not_found_keys.erase(fetched_key_from_source);
update_unit_ptr->requested_keys_to_fetched_columns_during_update_index[fetched_key_from_source] = found_keys_size;
found_keys_in_source.emplace_back(fetched_key_from_source);

View File

@ -130,33 +130,18 @@ public:
std::exception_ptr getLastException() const override;
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && hierarchical_attribute; }
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); }
void toParent(const PaddedPODArray<UInt64> & ids, PaddedPODArray<UInt64> & out) const override;
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override;
void isInVectorVector(
const PaddedPODArray<UInt64> & child_ids,
const PaddedPODArray<UInt64> & ancestor_ids,
PaddedPODArray<UInt8> & out) const override;
void isInVectorConstant(
const PaddedPODArray<UInt64> & child_ids,
const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const override;
void isInConstantVector(
const UInt64 child_id,
const PaddedPODArray<UInt64> & ancestor_ids,
PaddedPODArray<UInt8> & out) const override;
ColumnUInt8::Ptr isInHierarchy(
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
private:
using FetchResult = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, SimpleKeysStorageFetchResult, ComplexKeysStorageFetchResult>;
Columns getColumnsImpl(
const Strings & attribute_names,
const Columns & key_columns,
const PaddedPODArray<KeyType> & keys,
const Columns & default_values_columns) const;
static MutableColumns aggregateColumnsInOrderOfKeys(
const PaddedPODArray<KeyType> & keys,
const DictionaryStorageFetchRequest & request,
@ -171,8 +156,6 @@ private:
const MutableColumns & fetched_columns_during_update,
const HashMap<KeyType, size_t> & found_keys_to_fetched_columns_during_update_index);
void setupHierarchicalAttribute();
void update(CacheDictionaryUpdateUnitPtr<dictionary_key_type> update_unit_ptr);
/// Update dictionary source pointer if required and return it. Thread safe.
@ -193,9 +176,6 @@ private:
return source_ptr;
}
template <typename AncestorType>
void isInImpl(const PaddedPODArray<Key> & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
const DictionaryStructure dict_struct;
/// Dictionary source should be used with mutex
@ -218,8 +198,6 @@ private:
/// readers. Surprisingly this lock is also used for last_exception pointer.
mutable std::shared_mutex rw_lock;
const DictionaryAttribute * hierarchical_attribute = nullptr;
mutable std::exception_ptr last_exception;
mutable std::atomic<size_t> error_count {0};
mutable std::atomic<std::chrono::system_clock::time_point> backoff_end_time{std::chrono::system_clock::time_point{}};

View File

@ -66,8 +66,9 @@ public:
HashMap<KeyType, size_t> requested_keys_to_fetched_columns_during_update_index;
MutableColumns fetched_columns_during_update;
/// Complex keys are serialized in this arena
Arena complex_key_arena;
DictionaryKeysArenaHolder<dictionary_key_type> complex_keys_arena_holder;
private:
template <DictionaryKeyType>

View File

@ -1,594 +0,0 @@
#include "ComplexKeyHashedDictionary.h"
#include <ext/map.h>
#include <ext/range.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h>
#include <Functions/FunctionHelpers.h>
#include <DataTypes/DataTypesDecimal.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
}
ComplexKeyHashedDictionary::ComplexKeyHashedDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
BlockPtr saved_block_)
: IDictionaryBase(dict_id_)
, dict_struct(dict_struct_)
, source_ptr{std::move(source_ptr_)}
, dict_lifetime(dict_lifetime_)
, require_nonempty(require_nonempty_)
, saved_block{std::move(saved_block_)}
{
createAttributes();
loadData();
calculateBytesAllocated();
}
ColumnPtr ComplexKeyHashedDictionary::getColumn(
const std::string & attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnPtr & default_values_column) const
{
dict_struct.validateKeyTypes(key_types);
ColumnPtr result;
const auto & attribute = getAttribute(attribute_name);
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
auto keys_size = key_columns.front()->size();
ColumnUInt8::MutablePtr col_null_map_to;
ColumnUInt8::Container * vec_null_map_to = nullptr;
if (attribute.is_nullable)
{
col_null_map_to = ColumnUInt8::create(keys_size, false);
vec_null_map_to = &col_null_map_to->getData();
}
auto type_call = [&](const auto &dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
using ColumnProvider = DictionaryAttributeColumnProvider<AttributeType>;
const auto attribute_null_value = std::get<ValueType>(attribute.null_values);
AttributeType null_value = static_cast<AttributeType>(attribute_null_value);
DictionaryDefaultValueExtractor<AttributeType> default_value_extractor(std::move(null_value), default_values_column);
auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size);
if constexpr (std::is_same_v<AttributeType, String>)
{
auto * out = column.get();
getItemsImpl<StringRef, StringRef>(
attribute,
key_columns,
[&](const size_t row, const StringRef value, bool is_null)
{
if (attribute.is_nullable)
(*vec_null_map_to)[row] = is_null;
out->insertData(value.data, value.size);
},
default_value_extractor);
}
else
{
auto & out = column->getData();
getItemsImpl<AttributeType, AttributeType>(
attribute,
key_columns,
[&](const size_t row, const auto value, bool is_null)
{
if (attribute.is_nullable)
(*vec_null_map_to)[row] = is_null;
out[row] = value;
},
default_value_extractor);
}
result = std::move(column);
};
callOnDictionaryAttributeType(attribute.type, type_call);
if (attribute.is_nullable)
{
result = ColumnNullable::create(result, std::move(col_null_map_to));
}
return result;
}
ColumnUInt8::Ptr ComplexKeyHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
{
dict_struct.validateKeyTypes(key_types);
auto size = key_columns.front()->size();
auto result = ColumnUInt8::create(size);
auto& out = result->getData();
const auto & attribute = attributes.front();
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
has<ValueType>(attribute, key_columns, out);
};
callOnDictionaryAttributeType(attribute.type, type_call);
return result;
}
void ComplexKeyHashedDictionary::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttribute(attribute, attribute.null_value));
if (attribute.hierarchical)
throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
ErrorCodes::TYPE_MISMATCH};
}
}
void ComplexKeyHashedDictionary::blockToAttributes(const Block & block)
{
/// created upfront to avoid excess allocations
const auto keys_size = dict_struct.key->size();
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
const auto rows = block.rows();
element_count += rows;
const auto key_column_ptrs = ext::map<Columns>(
ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; });
const auto attribute_column_ptrs = ext::map<Columns>(ext::range(0, attributes_size), [&](const size_t attribute_idx)
{
return block.safeGetByPosition(keys_size + attribute_idx).column;
});
for (const auto row_idx : ext::range(0, rows))
{
/// calculate key once per row
const auto key = placeKeysInPool(row_idx, key_column_ptrs, keys, keys_pool);
auto should_rollback = false;
for (const auto attribute_idx : ext::range(0, attributes_size))
{
const auto & attribute_column = *attribute_column_ptrs[attribute_idx];
auto & attribute = attributes[attribute_idx];
const auto inserted = setAttributeValue(attribute, key, attribute_column[row_idx]);
if (!inserted)
should_rollback = true;
}
/// @note on multiple equal keys the mapped value for the first one is stored
if (should_rollback)
keys_pool.rollback(key.size);
}
}
void ComplexKeyHashedDictionary::updateData()
{
/// created upfront to avoid excess allocations
const auto keys_size = dict_struct.key->size();
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
if (!saved_block || saved_block->rows() == 0)
{
auto stream = source_ptr->loadUpdatedAll();
stream->readPrefix();
while (const auto block = stream->read())
{
/// We are using this method to keep saved data if input stream consists of multiple blocks
if (!saved_block)
saved_block = std::make_shared<DB::Block>(block.cloneEmpty());
for (const auto attribute_idx : ext::range(0, keys_size + attributes_size))
{
const IColumn & update_column = *block.getByPosition(attribute_idx).column.get();
MutableColumnPtr saved_column = saved_block->getByPosition(attribute_idx).column->assumeMutable();
saved_column->insertRangeFrom(update_column, 0, update_column.size());
}
}
stream->readSuffix();
}
else
{
auto stream = source_ptr->loadUpdatedAll();
stream->readPrefix();
while (Block block = stream->read())
{
const auto saved_key_column_ptrs = ext::map<Columns>(
ext::range(0, keys_size), [&](const size_t key_idx) { return saved_block->safeGetByPosition(key_idx).column; });
const auto update_key_column_ptrs = ext::map<Columns>(
ext::range(0, keys_size), [&](const size_t key_idx) { return block.safeGetByPosition(key_idx).column; });
Arena temp_key_pool;
ContainerType<std::vector<size_t>> update_key_hash;
for (size_t i = 0; i < block.rows(); ++i)
{
const auto u_key = placeKeysInPool(i, update_key_column_ptrs, keys, temp_key_pool);
update_key_hash[u_key].push_back(i);
}
const size_t rows = saved_block->rows();
IColumn::Filter filter(rows);
for (size_t i = 0; i < saved_block->rows(); ++i)
{
const auto s_key = placeKeysInPool(i, saved_key_column_ptrs, keys, temp_key_pool);
auto * it = update_key_hash.find(s_key);
if (it)
filter[i] = 0;
else
filter[i] = 1;
}
auto block_columns = block.mutateColumns();
for (const auto attribute_idx : ext::range(0, keys_size + attributes_size))
{
auto & column = saved_block->safeGetByPosition(attribute_idx).column;
const auto & filtered_column = column->filter(filter, -1);
block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size());
}
saved_block->setColumns(std::move(block_columns));
}
stream->readSuffix();
}
if (saved_block)
blockToAttributes(*saved_block.get());
}
void ComplexKeyHashedDictionary::loadData()
{
if (!source_ptr->hasUpdateField())
{
auto stream = source_ptr->loadAll();
stream->readPrefix();
while (const auto block = stream->read())
blockToAttributes(block);
stream->readSuffix();
}
else
updateData();
if (require_nonempty && 0 == element_count)
throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY};
}
template <typename T>
void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute)
{
const auto & map_ref = std::get<ContainerType<T>>(attribute.maps);
bytes_allocated += sizeof(ContainerType<T>) + map_ref.getBufferSizeInBytes();
bucket_count = map_ref.getBufferSizeInCells();
}
template <>
void ComplexKeyHashedDictionary::addAttributeSize<String>(const Attribute & attribute)
{
const auto & map_ref = std::get<ContainerType<StringRef>>(attribute.maps);
bytes_allocated += sizeof(ContainerType<StringRef>) + map_ref.getBufferSizeInBytes();
bucket_count = map_ref.getBufferSizeInCells();
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
}
void ComplexKeyHashedDictionary::calculateBytesAllocated()
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
addAttributeSize<AttributeType>(attribute);
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
bytes_allocated += keys_pool.size();
}
template <typename T>
void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value)
{
attribute.null_values = T(null_value.get<T>());
attribute.maps.emplace<ContainerType<T>>();
}
template <>
void ComplexKeyHashedDictionary::createAttributeImpl<String>(Attribute & attribute, const Field & null_value)
{
attribute.string_arena = std::make_unique<Arena>();
const String & string = null_value.get<String>();
const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size());
attribute.null_values.emplace<StringRef>(string_in_arena, string.size());
attribute.maps.emplace<ContainerType<StringRef>>();
}
ComplexKeyHashedDictionary::Attribute
ComplexKeyHashedDictionary::createAttribute(const DictionaryAttribute & attribute, const Field & null_value)
{
auto nullable_set = attribute.is_nullable ? std::make_unique<NullableSet>() : nullptr;
Attribute attr{attribute.underlying_type, attribute.is_nullable, std::move(nullable_set), {}, {}, {}};
auto type_call = [&](const auto &dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
createAttributeImpl<AttributeType>(attr, null_value);
};
callOnDictionaryAttributeType(attribute.underlying_type, type_call);
return attr;
}
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
void ComplexKeyHashedDictionary::getItemsImpl(
const Attribute & attribute,
const Columns & key_columns,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const
{
const auto & attr = std::get<ContainerType<AttributeType>>(attribute.maps);
const auto keys_size = key_columns.size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
const auto rows = key_columns.front()->size();
for (const auto i : ext::range(0, rows))
{
/// copy key data to arena so it is contiguous and return StringRef to it
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
const auto it = attr.find(key);
if (it)
{
set_value(i, static_cast<OutputType>(it->getMapped()), false);
}
else
{
if (attribute.is_nullable && attribute.nullable_set->find(key) != nullptr)
set_value(i, default_value_extractor[i], true);
else
set_value(i, default_value_extractor[i], false);
}
/// free memory allocated for the key
temporary_keys_pool.rollback(key.size);
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename T>
bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value)
{
auto & map = std::get<ContainerType<T>>(attribute.maps);
const auto pair = map.insert({key, value});
return pair.second;
}
template <>
bool ComplexKeyHashedDictionary::setAttributeValueImpl<String>(Attribute & attribute, const StringRef key, const String value)
{
const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size());
return setAttributeValueImpl<StringRef>(attribute, key, StringRef{string_in_arena, value.size()});
}
bool ComplexKeyHashedDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value)
{
bool result = false;
auto type_call = [&](const auto &dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
if (attribute.is_nullable)
{
if (value.isNull())
{
attribute.nullable_set->insert(key);
result = true;
return;
}
else
{
attribute.nullable_set->erase(key);
}
}
result = setAttributeValueImpl<AttributeType>(attribute, key, value.get<AttributeType>());
};
callOnDictionaryAttributeType(attribute.type, type_call);
return result;
}
const ComplexKeyHashedDictionary::Attribute & ComplexKeyHashedDictionary::getAttribute(const std::string & attribute_name) const
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
return attributes[it->second];
}
StringRef ComplexKeyHashedDictionary::placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool)
{
const auto keys_size = key_columns.size();
size_t sum_keys_size{};
const char * block_start = nullptr;
for (size_t j = 0; j < keys_size; ++j)
{
keys[j] = key_columns[j]->serializeValueIntoArena(row, pool, block_start);
sum_keys_size += keys[j].size;
}
const auto * key_start = block_start;
for (size_t j = 0; j < keys_size; ++j)
{
keys[j].data = key_start;
key_start += keys[j].size;
}
return {block_start, sum_keys_size};
}
template <typename T>
void ComplexKeyHashedDictionary::has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray<UInt8> & out) const
{
const auto & attr = std::get<ContainerType<T>>(attribute.maps);
const auto keys_size = key_columns.size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
const auto rows = key_columns.front()->size();
for (const auto i : ext::range(0, rows))
{
/// copy key data to arena so it is contiguous and return StringRef to it
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
const auto it = attr.find(key);
out[i] = static_cast<bool>(it);
if (attribute.is_nullable && !out[i])
out[i] = attribute.nullable_set->find(key) != nullptr;
/// free memory allocated for the key
temporary_keys_pool.rollback(key.size);
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
std::vector<StringRef> ComplexKeyHashedDictionary::getKeys() const
{
const Attribute & attribute = attributes.front();
std::vector<StringRef> result;
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
if constexpr (std::is_same_v<AttributeType, String>)
{
result = getKeys<StringRef>(attribute);
}
else
{
result = getKeys<AttributeType>(attribute);
}
};
callOnDictionaryAttributeType(attribute.type, type_call);
return result;
}
template <typename T>
std::vector<StringRef> ComplexKeyHashedDictionary::getKeys(const Attribute & attribute) const
{
const ContainerType<T> & attr = std::get<ContainerType<T>>(attribute.maps);
std::vector<StringRef> keys;
keys.reserve(attr.size());
for (const auto & key : attr)
keys.push_back(key.getKey());
if (attribute.is_nullable)
{
for (const auto & key: *attribute.nullable_set)
keys.push_back(key.getKey());
}
return keys;
}
BlockInputStreamPtr ComplexKeyHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
{
using BlockInputStreamType = DictionaryBlockInputStream<UInt64>;
auto vector_keys = getKeys();
PaddedPODArray<StringRef> keys;
keys.reserve(vector_keys.size());
keys.assign(vector_keys.begin(), vector_keys.end());
return std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, keys, column_names);
}
void registerDictionaryComplexKeyHashed(DictionaryFactory & factory)
{
auto create_layout = [=](const std::string &,
const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
DictionarySourcePtr source_ptr) -> DictionaryPtr
{
if (!dict_struct.key)
throw Exception{"'key' is required for dictionary of layout 'complex_key_hashed'", ErrorCodes::BAD_ARGUMENTS};
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
return std::make_unique<ComplexKeyHashedDictionary>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
};
factory.registerLayout("complex_key_hashed", create_layout, true);
}
}

View File

@ -1,185 +0,0 @@
#pragma once
#include <atomic>
#include <memory>
#include <variant>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Common/Arena.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <Core/Block.h>
#include <common/StringRef.h>
#include <ext/range.h>
#include "IDictionary.h"
#include "IDictionarySource.h"
#include "DictionaryStructure.h"
#include "DictionaryHelpers.h"
namespace DB
{
class ComplexKeyHashedDictionary final : public IDictionaryBase
{
public:
ComplexKeyHashedDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
BlockPtr saved_block_ = nullptr);
std::string getKeyDescription() const { return key_description; }
std::string getTypeName() const override { return "ComplexKeyHashed"; }
size_t getBytesAllocated() const override { return bytes_allocated; }
size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }
double getHitRate() const override { return 1.0; }
size_t getElementCount() const override { return element_count; }
double getLoadFactor() const override { return static_cast<double>(element_count) / bucket_count; }
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<ComplexKeyHashedDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block);
}
const IDictionarySource * getSource() const override { return source_ptr.get(); }
const DictionaryLifetime & getLifetime() const override { return dict_lifetime; }
const DictionaryStructure & getStructure() const override { return dict_struct; }
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
}
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; }
ColumnPtr getColumn(
const std::string& attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnPtr & default_values_column) const override;
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:
template <typename Value>
using ContainerType = HashMapWithSavedHash<StringRef, Value, StringRefHash>;
using NullableSet = HashSetWithSavedHash<StringRef, StringRefHash>;
struct Attribute final
{
AttributeUnderlyingType type;
bool is_nullable;
std::unique_ptr<NullableSet> nullable_set;
std::variant<
UInt8,
UInt16,
UInt32,
UInt64,
UInt128,
Int8,
Int16,
Int32,
Int64,
Decimal32,
Decimal64,
Decimal128,
Float32,
Float64,
StringRef>
null_values;
std::variant<
ContainerType<UInt8>,
ContainerType<UInt16>,
ContainerType<UInt32>,
ContainerType<UInt64>,
ContainerType<UInt128>,
ContainerType<Int8>,
ContainerType<Int16>,
ContainerType<Int32>,
ContainerType<Int64>,
ContainerType<Decimal32>,
ContainerType<Decimal64>,
ContainerType<Decimal128>,
ContainerType<Float32>,
ContainerType<Float64>,
ContainerType<StringRef>>
maps;
std::unique_ptr<Arena> string_arena;
};
void createAttributes();
void blockToAttributes(const Block & block);
void updateData();
void loadData();
template <typename T>
void addAttributeSize(const Attribute & attribute);
void calculateBytesAllocated();
template <typename T>
static void createAttributeImpl(Attribute & attribute, const Field & null_value);
static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value);
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
void getItemsImpl(
const Attribute & attribute,
const Columns & key_columns,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;
template <typename T>
static bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value);
static bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value);
const Attribute & getAttribute(const std::string & attribute_name) const;
static StringRef placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool);
template <typename T>
void has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray<UInt8> & out) const;
std::vector<StringRef> getKeys() const;
template <typename T>
std::vector<StringRef> getKeys(const Attribute & attribute) const;
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
const bool require_nonempty;
const std::string key_description{dict_struct.getKeyDescription()};
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;
Arena keys_pool;
size_t bytes_allocated = 0;
size_t element_count = 0;
size_t bucket_count = 0;
mutable std::atomic<size_t> query_count{0};
BlockPtr saved_block;
};
}

View File

@ -0,0 +1,200 @@
#include "DictionaryBlockInputStream.h"
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
DictionaryBlockInputStream::DictionaryBlockInputStream(
std::shared_ptr<const IDictionary> dictionary_, UInt64 max_block_size_, PaddedPODArray<UInt64> && ids_, const Names & column_names_)
: DictionaryBlockInputStreamBase(ids_.size(), max_block_size_)
, dictionary(dictionary_)
, column_names(column_names_)
, ids(std::move(ids_))
, key_type(DictionaryInputStreamKeyType::Id)
{
}
DictionaryBlockInputStream::DictionaryBlockInputStream(
std::shared_ptr<const IDictionary> dictionary_,
UInt64 max_block_size_,
const PaddedPODArray<StringRef> & keys,
const Names & column_names_)
: DictionaryBlockInputStreamBase(keys.size(), max_block_size_)
, dictionary(dictionary_)
, column_names(column_names_)
, key_type(DictionaryInputStreamKeyType::ComplexKey)
{
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns);
}
DictionaryBlockInputStream::DictionaryBlockInputStream(
std::shared_ptr<const IDictionary> dictionary_,
UInt64 max_block_size_,
const Columns & data_columns_,
const Names & column_names_,
GetColumnsFunction && get_key_columns_function_,
GetColumnsFunction && get_view_columns_function_)
: DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_)
, dictionary(dictionary_)
, column_names(column_names_)
, data_columns(data_columns_)
, get_key_columns_function(std::move(get_key_columns_function_))
, get_view_columns_function(std::move(get_view_columns_function_))
, key_type(DictionaryInputStreamKeyType::Callback)
{
}
Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const
{
/// TODO: Rewrite
switch (key_type)
{
case DictionaryInputStreamKeyType::ComplexKey:
{
Columns columns;
ColumnsWithTypeAndName view_columns;
columns.reserve(key_columns.size());
for (const auto & key_column : key_columns)
{
ColumnPtr column = key_column.column->cut(start, length);
columns.emplace_back(column);
view_columns.emplace_back(column, key_column.type, key_column.name);
}
return fillBlock({}, columns, {}, std::move(view_columns));
}
case DictionaryInputStreamKeyType::Id:
{
PaddedPODArray<UInt64> ids_to_fill(ids.begin() + start, ids.begin() + start + length);
return fillBlock(ids_to_fill, {}, {}, {});
}
case DictionaryInputStreamKeyType::Callback:
{
Columns columns;
columns.reserve(data_columns.size());
for (const auto & data_column : data_columns)
columns.push_back(data_column->cut(start, length));
const DictionaryStructure & dictionaty_structure = dictionary->getStructure();
const auto & attributes = *dictionaty_structure.key;
ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes);
ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes);
DataTypes types;
columns.clear();
for (const auto & key_column : keys_with_type_and_name)
{
columns.push_back(key_column.column);
types.push_back(key_column.type);
}
return fillBlock({}, columns, types, std::move(view_with_type_and_name));
}
}
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected DictionaryInputStreamKeyType.");
}
Block DictionaryBlockInputStream::fillBlock(
const PaddedPODArray<UInt64> & ids_to_fill,
const Columns & keys,
const DataTypes & types,
ColumnsWithTypeAndName && view) const
{
std::unordered_set<std::string> names(column_names.begin(), column_names.end());
DataTypes data_types = types;
ColumnsWithTypeAndName block_columns;
data_types.reserve(keys.size());
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
if (data_types.empty() && dictionary_structure.key)
for (const auto & key : *dictionary_structure.key)
data_types.push_back(key.type);
for (const auto & column : view)
if (names.find(column.name) != names.end())
block_columns.push_back(column);
const DictionaryStructure & structure = dictionary->getStructure();
ColumnPtr ids_column = getColumnFromIds(ids_to_fill);
if (structure.id && names.find(structure.id->name) != names.end())
{
block_columns.emplace_back(ids_column, std::make_shared<DataTypeUInt64>(), structure.id->name);
}
auto dictionary_key_type = dictionary->getKeyType();
for (const auto idx : ext::range(0, structure.attributes.size()))
{
const DictionaryAttribute & attribute = structure.attributes[idx];
if (names.find(attribute.name) != names.end())
{
ColumnPtr column;
if (dictionary_key_type == DictionaryKeyType::simple)
{
column = dictionary->getColumn(
attribute.name,
attribute.type,
{ids_column},
{std::make_shared<DataTypeUInt64>()},
nullptr /* default_values_column */);
}
else
{
column = dictionary->getColumn(
attribute.name,
attribute.type,
keys,
data_types,
nullptr /* default_values_column*/);
}
block_columns.emplace_back(column, attribute.type, attribute.name);
}
}
return Block(block_columns);
}
ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray<UInt64> & ids_to_fill)
{
auto column_vector = ColumnVector<UInt64>::create();
column_vector->getData().assign(ids_to_fill);
return column_vector;
}
void DictionaryBlockInputStream::fillKeyColumns(
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t size,
const DictionaryStructure & dictionary_structure,
ColumnsWithTypeAndName & result)
{
MutableColumns columns;
columns.reserve(dictionary_structure.key->size());
for (const DictionaryAttribute & attribute : *dictionary_structure.key)
columns.emplace_back(attribute.type->createColumn());
for (auto idx : ext::range(start, size))
{
const auto & key = keys[idx];
const auto *ptr = key.data;
for (auto & column : columns)
ptr = column->deserializeAndInsertFromArena(ptr);
}
for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i)
{
const auto & dictionary_attribute = (*dictionary_structure.key)[i];
result.emplace_back(ColumnWithTypeAndName{std::move(columns[i]), dictionary_attribute.type, dictionary_attribute.name});
}
}
}

View File

@ -16,27 +16,22 @@
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// TODO: Remove this class
/* BlockInputStream implementation for external dictionaries
* read() returns blocks consisting of the in-memory contents of the dictionaries
*/
template <typename Key>
class DictionaryBlockInputStream : public DictionaryBlockInputStreamBase
{
public:
DictionaryBlockInputStream(
std::shared_ptr<const IDictionaryBase> dictionary,
std::shared_ptr<const IDictionary> dictionary,
UInt64 max_block_size,
PaddedPODArray<Key> && ids,
PaddedPODArray<UInt64> && ids,
const Names & column_names);
DictionaryBlockInputStream(
std::shared_ptr<const IDictionaryBase> dictionary,
std::shared_ptr<const IDictionary> dictionary,
UInt64 max_block_size,
const PaddedPODArray<StringRef> & keys,
const Names & column_names);
@ -48,7 +43,7 @@ public:
// and get_view_columns_function to get key representation.
// Now used in trie dictionary, where columns are stored as ip and mask, and are showed as string
DictionaryBlockInputStream(
std::shared_ptr<const IDictionaryBase> dictionary,
std::shared_ptr<const IDictionary> dictionary,
UInt64 max_block_size,
const Columns & data_columns,
const Names & column_names,
@ -61,21 +56,24 @@ protected:
Block getBlock(size_t start, size_t length) const override;
private:
Block
fillBlock(const PaddedPODArray<Key> & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const;
Block fillBlock(
const PaddedPODArray<UInt64> & ids_to_fill,
const Columns & keys,
const DataTypes & types,
ColumnsWithTypeAndName && view) const;
ColumnPtr getColumnFromIds(const PaddedPODArray<Key> & ids_to_fill) const;
static ColumnPtr getColumnFromIds(const PaddedPODArray<UInt64> & ids_to_fill);
void fillKeyColumns(
static void fillKeyColumns(
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t size,
const DictionaryStructure & dictionary_structure,
ColumnsWithTypeAndName & columns) const;
ColumnsWithTypeAndName & result);
std::shared_ptr<const IDictionaryBase> dictionary;
std::shared_ptr<const IDictionary> dictionary;
Names column_names;
PaddedPODArray<Key> ids;
PaddedPODArray<UInt64> ids;
ColumnsWithTypeAndName key_columns;
Columns data_columns;
@ -92,200 +90,4 @@ private:
DictionaryInputStreamKeyType key_type;
};
template <typename Key>
DictionaryBlockInputStream<Key>::DictionaryBlockInputStream(
std::shared_ptr<const IDictionaryBase> dictionary_, UInt64 max_block_size_, PaddedPODArray<Key> && ids_, const Names & column_names_)
: DictionaryBlockInputStreamBase(ids_.size(), max_block_size_)
, dictionary(dictionary_)
, column_names(column_names_)
, ids(std::move(ids_))
, key_type(DictionaryInputStreamKeyType::Id)
{
}
template <typename Key>
DictionaryBlockInputStream<Key>::DictionaryBlockInputStream(
std::shared_ptr<const IDictionaryBase> dictionary_,
UInt64 max_block_size_,
const PaddedPODArray<StringRef> & keys,
const Names & column_names_)
: DictionaryBlockInputStreamBase(keys.size(), max_block_size_)
, dictionary(dictionary_)
, column_names(column_names_)
, key_type(DictionaryInputStreamKeyType::ComplexKey)
{
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns);
}
template <typename Key>
DictionaryBlockInputStream<Key>::DictionaryBlockInputStream(
std::shared_ptr<const IDictionaryBase> dictionary_,
UInt64 max_block_size_,
const Columns & data_columns_,
const Names & column_names_,
GetColumnsFunction && get_key_columns_function_,
GetColumnsFunction && get_view_columns_function_)
: DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_)
, dictionary(dictionary_)
, column_names(column_names_)
, data_columns(data_columns_)
, get_key_columns_function(std::move(get_key_columns_function_))
, get_view_columns_function(std::move(get_view_columns_function_))
, key_type(DictionaryInputStreamKeyType::Callback)
{
}
template <typename Key>
Block DictionaryBlockInputStream<Key>::getBlock(size_t start, size_t length) const
{
/// TODO: Rewrite
switch (key_type)
{
case DictionaryInputStreamKeyType::ComplexKey:
{
Columns columns;
ColumnsWithTypeAndName view_columns;
columns.reserve(key_columns.size());
for (const auto & key_column : key_columns)
{
ColumnPtr column = key_column.column->cut(start, length);
columns.emplace_back(column);
view_columns.emplace_back(column, key_column.type, key_column.name);
}
return fillBlock({}, columns, {}, std::move(view_columns));
}
case DictionaryInputStreamKeyType::Id:
{
PaddedPODArray<Key> ids_to_fill(ids.begin() + start, ids.begin() + start + length);
return fillBlock(ids_to_fill, {}, {}, {});
}
case DictionaryInputStreamKeyType::Callback:
{
Columns columns;
columns.reserve(data_columns.size());
for (const auto & data_column : data_columns)
columns.push_back(data_column->cut(start, length));
const DictionaryStructure & dictionaty_structure = dictionary->getStructure();
const auto & attributes = *dictionaty_structure.key;
ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes);
ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes);
DataTypes types;
columns.clear();
for (const auto & key_column : keys_with_type_and_name)
{
columns.push_back(key_column.column);
types.push_back(key_column.type);
}
return fillBlock({}, columns, types, std::move(view_with_type_and_name));
}
}
throw Exception("Unexpected DictionaryInputStreamKeyType.", ErrorCodes::LOGICAL_ERROR);
}
template <typename Key>
Block DictionaryBlockInputStream<Key>::fillBlock(
const PaddedPODArray<Key> & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const
{
std::unordered_set<std::string> names(column_names.begin(), column_names.end());
DataTypes data_types = types;
ColumnsWithTypeAndName block_columns;
data_types.reserve(keys.size());
const DictionaryStructure & dictionaty_structure = dictionary->getStructure();
if (data_types.empty() && dictionaty_structure.key)
for (const auto & key : *dictionaty_structure.key)
data_types.push_back(key.type);
for (const auto & column : view)
if (names.find(column.name) != names.end())
block_columns.push_back(column);
const DictionaryStructure & structure = dictionary->getStructure();
ColumnPtr ids_column = getColumnFromIds(ids_to_fill);
if (structure.id && names.find(structure.id->name) != names.end())
{
block_columns.emplace_back(ids_column, std::make_shared<DataTypeUInt64>(), structure.id->name);
}
auto dictionary_key_type = dictionary->getKeyType();
for (const auto idx : ext::range(0, structure.attributes.size()))
{
const DictionaryAttribute & attribute = structure.attributes[idx];
if (names.find(attribute.name) != names.end())
{
ColumnPtr column;
if (dictionary_key_type == DictionaryKeyType::simple)
{
column = dictionary->getColumn(
attribute.name,
attribute.type,
{ids_column},
{std::make_shared<DataTypeUInt64>()},
nullptr /* default_values_column */);
}
else
{
column = dictionary->getColumn(
attribute.name,
attribute.type,
keys,
data_types,
nullptr /* default_values_column*/);
}
block_columns.emplace_back(column, attribute.type, attribute.name);
}
}
return Block(block_columns);
}
template <typename Key>
ColumnPtr DictionaryBlockInputStream<Key>::getColumnFromIds(const PaddedPODArray<Key> & ids_to_fill) const
{
auto column_vector = ColumnVector<UInt64>::create();
column_vector->getData().reserve(ids_to_fill.size());
for (UInt64 id : ids_to_fill)
column_vector->insertValue(id);
return column_vector;
}
template <typename Key>
void DictionaryBlockInputStream<Key>::fillKeyColumns(
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t size,
const DictionaryStructure & dictionary_structure,
ColumnsWithTypeAndName & res) const
{
MutableColumns columns;
columns.reserve(dictionary_structure.key->size());
for (const DictionaryAttribute & attribute : *dictionary_structure.key)
columns.emplace_back(attribute.type->createColumn());
for (auto idx : ext::range(start, size))
{
const auto & key = keys[idx];
const auto *ptr = key.data;
for (auto & column : columns)
ptr = column->deserializeAndInsertFromArena(ptr);
}
for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i)
res.emplace_back(
ColumnWithTypeAndName{std::move(columns[i]), (*dictionary_structure.key)[i].type, (*dictionary_structure.key)[i].name});
}
}

View File

@ -295,6 +295,28 @@ private:
bool use_default_value_from_column = false;
};
template <DictionaryKeyType key_type>
class DictionaryKeysArenaHolder;
template <>
class DictionaryKeysArenaHolder<DictionaryKeyType::simple>
{
public:
static Arena * getComplexKeyArena() { return nullptr; }
};
template <>
class DictionaryKeysArenaHolder<DictionaryKeyType::complex>
{
public:
Arena * getComplexKeyArena() { return &complex_key_arena; }
private:
Arena complex_key_arena;
};
template <DictionaryKeyType key_type>
class DictionaryKeysExtractor
{
@ -302,67 +324,96 @@ public:
using KeyType = std::conditional_t<key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(key_type != DictionaryKeyType::range, "Range key type is not supported by DictionaryKeysExtractor");
explicit DictionaryKeysExtractor(const Columns & key_columns, Arena & existing_arena)
explicit DictionaryKeysExtractor(const Columns & key_columns_, Arena * complex_key_arena_)
: key_columns(key_columns_)
, complex_key_arena(complex_key_arena_)
{
assert(!key_columns.empty());
if constexpr (key_type == DictionaryKeyType::simple)
keys = getColumnVectorData(key_columns.front());
else
keys = deserializeKeyColumnsInArena(key_columns, existing_arena);
}
const PaddedPODArray<KeyType> & getKeys() const
{
return keys;
}
private:
static PaddedPODArray<UInt64> getColumnVectorData(const ColumnPtr column)
{
PaddedPODArray<UInt64> result;
auto full_column = column->convertToFullColumnIfConst();
const auto *vector_col = checkAndGetColumn<ColumnVector<UInt64>>(full_column.get());
key_columns[0] = key_columns[0]->convertToFullColumnIfConst();
const auto * vector_col = checkAndGetColumn<ColumnVector<UInt64>>(key_columns[0].get());
if (!vector_col)
throw Exception{ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key expected UInt64"};
result.assign(vector_col->getData());
return result;
throw Exception(ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key expected UInt64");
}
static PaddedPODArray<StringRef> deserializeKeyColumnsInArena(const Columns & key_columns, Arena & temporary_arena)
keys_size = key_columns.front()->size();
}
inline size_t getKeysSize() const
{
size_t keys_size = key_columns.front()->size();
return keys_size;
}
PaddedPODArray<StringRef> result;
result.reserve(keys_size);
inline size_t getCurrentKeyIndex() const
{
return current_key_index;
}
PaddedPODArray<StringRef> temporary_column_data(key_columns.size());
inline KeyType extractCurrentKey()
{
assert(current_key_index < keys_size);
for (size_t key_index = 0; key_index < keys_size; ++key_index)
if constexpr (key_type == DictionaryKeyType::simple)
{
const auto & column_vector = static_cast<const ColumnVector<UInt64> &>(*key_columns[0]);
const auto & data = column_vector.getData();
auto key = data[current_key_index];
++current_key_index;
return key;
}
else
{
size_t allocated_size_for_columns = 0;
const char * block_start = nullptr;
for (size_t column_index = 0; column_index < key_columns.size(); ++column_index)
for (const auto & column : key_columns)
{
const auto & column = key_columns[column_index];
temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_arena, block_start);
allocated_size_for_columns += temporary_column_data[column_index].size;
StringRef serialized_data = column->serializeValueIntoArena(current_key_index, *complex_key_arena, block_start);
allocated_size_for_columns += serialized_data.size;
}
result.push_back(StringRef{block_start, allocated_size_for_columns});
++current_key_index;
current_complex_key = StringRef{block_start, allocated_size_for_columns};
return current_complex_key;
}
}
void rollbackCurrentKey() const
{
if constexpr (key_type == DictionaryKeyType::complex)
complex_key_arena->rollback(current_complex_key.size);
}
PaddedPODArray<KeyType> extractAllKeys()
{
PaddedPODArray<KeyType> result;
result.reserve(keys_size - current_key_index);
for (; current_key_index < keys_size;)
{
auto value = extractCurrentKey();
result.emplace_back(value);
}
return result;
}
PaddedPODArray<KeyType> keys;
void reset()
{
current_key_index = 0;
}
private:
Columns key_columns;
size_t keys_size = 0;
size_t current_key_index = 0;
KeyType current_complex_key {};
Arena * complex_key_arena;
};
/**
@ -370,9 +421,10 @@ private:
* If column is constant parameter backup_storage is used to store values.
*/
/// TODO: Remove
template <typename T>
static const PaddedPODArray<T> & getColumnVectorData(
const IDictionaryBase * dictionary,
const IDictionary * dictionary,
const ColumnPtr column,
PaddedPODArray<T> & backup_storage)
{

View File

@ -200,8 +200,21 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration
for (size_t i = 0; i < attributes.size(); ++i)
{
const auto & attribute_name = attributes[i].name;
const auto & attribute = attributes[i];
const auto & attribute_name = attribute.name;
attribute_name_to_index[attribute_name] = i;
if (attribute.hierarchical)
{
if (id && attribute.underlying_type != AttributeUnderlyingType::utUInt64)
throw Exception(ErrorCodes::TYPE_MISMATCH,
"Hierarchical attribute type for dictionary with simple key must be UInt64. Actual ({})",
toString(attribute.underlying_type));
else if (key)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary with complex key does not support hierarchy");
hierarchical_attribute_index = i;
}
}
if (attributes.empty())

View File

@ -153,6 +153,8 @@ struct DictionaryStructure final
std::unordered_map<std::string, size_t> attribute_name_to_index;
std::optional<DictionaryTypedSpecialAttribute> range_min;
std::optional<DictionaryTypedSpecialAttribute> range_max;
std::optional<size_t> hierarchical_attribute_index;
bool has_expressions = false;
bool access_to_key_from_attributes = false;

View File

@ -1,158 +1,33 @@
#include "DirectDictionary.h"
#include <IO/WriteHelpers.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
#include <Core/Defines.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnNullable.h>
#include <DataTypes/DataTypesDecimal.h>
#include <Common/HashTable/HashMap.h>
#include <Interpreters/AggregationCommon.h>
#include <Core/Defines.h>
#include <Common/HashTable/HashMap.h>
#include <DataStreams/IBlockInputStream.h>
#include <DataTypes/DataTypesDecimal.h>
#include <Functions/FunctionHelpers.h>
#include <Dictionaries/DictionaryFactory.h>
#include <Dictionaries/HierarchyDictionariesUtils.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int UNSUPPORTED_METHOD;
extern const int BAD_ARGUMENTS;
}
namespace
{
inline UInt64 getAt(const PaddedPODArray<UInt64> & arr, const size_t idx)
{
return arr[idx];
}
inline UInt64 getAt(const UInt64 & value, const size_t)
{
return value;
}
}
template <DictionaryKeyType dictionary_key_type>
DirectDictionary<dictionary_key_type>::DirectDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
BlockPtr saved_block_)
DictionarySourcePtr source_ptr_)
: IDictionary(dict_id_)
, dict_struct(dict_struct_)
, source_ptr{std::move(source_ptr_)}
, saved_block{std::move(saved_block_)}
{
if (!source_ptr->supportsSelectiveLoad())
throw Exception{full_name + ": source cannot be used with DirectDictionary", ErrorCodes::UNSUPPORTED_METHOD};
setup();
}
template <DictionaryKeyType dictionary_key_type>
void DirectDictionary<dictionary_key_type>::toParent(const PaddedPODArray<Key> & ids [[maybe_unused]], PaddedPODArray<Key> & out [[maybe_unused]]) const
{
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
{
const auto & attribute_name = hierarchical_attribute->name;
auto result_type = std::make_shared<DataTypeUInt64>();
auto input_column = result_type->createColumn();
auto & input_column_typed = assert_cast<ColumnVector<UInt64> &>(*input_column);
auto & data = input_column_typed.getData();
data.insert(ids.begin(), ids.end());
auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr});
const auto & result_column_typed = assert_cast<const ColumnVector<UInt64> &>(*column);
const auto & result_data = result_column_typed.getData();
out.assign(result_data);
}
else
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Hierarchy is not supported for complex key DirectDictionary");
}
template <DictionaryKeyType dictionary_key_type>
UInt64 DirectDictionary<dictionary_key_type>::getValueOrNullByKey(const Key & to_find) const
{
std::vector<Key> required_key = {to_find};
auto stream = source_ptr->loadIds(required_key);
stream->readPrefix();
bool is_found = false;
UInt64 result = hierarchical_attribute->null_value.template get<UInt64>();
while (const auto block = stream->read())
{
const IColumn & id_column = *block.safeGetByPosition(0).column;
for (const size_t attribute_idx : ext::range(0, dict_struct.attributes.size()))
{
if (is_found)
break;
const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column;
for (const auto row_idx : ext::range(0, id_column.size()))
{
const auto key = id_column[row_idx].get<UInt64>();
if (key == to_find && hierarchical_attribute->name == attribute_name_by_index.at(attribute_idx))
{
result = attribute_column[row_idx].get<Key>();
is_found = true;
break;
}
}
}
}
stream->readSuffix();
return result;
}
template <DictionaryKeyType dictionary_key_type>
template <typename ChildType, typename AncestorType>
void DirectDictionary<dictionary_key_type>::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
const auto null_value = hierarchical_attribute->null_value.template get<UInt64>();
const auto rows = out.size();
for (const auto row : ext::range(0, rows))
{
auto id = getAt(child_ids, row);
const auto ancestor_id = getAt(ancestor_ids, row);
for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
id = getValueOrNullByKey(id);
out[row] = id != null_value && id == ancestor_id;
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <DictionaryKeyType dictionary_key_type>
void DirectDictionary<dictionary_key_type>::isInVectorVector(
const PaddedPODArray<UInt64> & child_ids, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_ids, ancestor_ids, out);
}
template <DictionaryKeyType dictionary_key_type>
void DirectDictionary<dictionary_key_type>::isInVectorConstant(const PaddedPODArray<UInt64> & child_ids, const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_ids, ancestor_id, out);
}
template <DictionaryKeyType dictionary_key_type>
void DirectDictionary<dictionary_key_type>::isInConstantVector(const UInt64 child_id, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_id, ancestor_ids, out);
}
template <DictionaryKeyType dictionary_key_type>
@ -166,20 +41,20 @@ ColumnPtr DirectDictionary<dictionary_key_type>::getColumn(
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
dict_struct.validateKeyTypes(key_types);
Arena complex_key_arena;
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
const auto requested_keys = extractor.extractAllKeys();
const DictionaryAttribute & attribute = dict_struct.getAttribute(attribute_name, result_type);
DefaultValueProvider default_value_provider(attribute.null_value, default_values_column);
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, complex_key_arena);
const auto & requested_keys = extractor.getKeys();
HashMap<KeyType, size_t> key_to_fetched_index;
key_to_fetched_index.reserve(requested_keys.size());
auto fetched_from_storage = attribute.type->createColumn();
size_t fetched_key_index = 0;
size_t requested_attribute_index = attribute_index_by_name.find(attribute_name)->second;
size_t requested_attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
Columns block_key_columns;
size_t dictionary_keys_size = dict_struct.getKeysNames().size();
@ -191,26 +66,19 @@ ColumnPtr DirectDictionary<dictionary_key_type>::getColumn(
while (const auto block = stream->read())
{
auto block_columns = block.getColumns();
/// Split into keys columns and attribute columns
for (size_t i = 0; i < dictionary_keys_size; ++i)
{
block_key_columns.emplace_back(*block_columns.begin());
block_columns.erase(block_columns.begin());
}
block_key_columns.emplace_back(block.safeGetByPosition(i).column);
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, complex_key_arena);
const auto & block_keys = block_keys_extractor.getKeys();
size_t block_keys_size = block_keys.size();
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena());
auto block_keys = block_keys_extractor.extractAllKeys();
const auto & block_column = block.safeGetByPosition(dictionary_keys_size + requested_attribute_index).column;
fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys_size);
fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys.size());
for (size_t block_key_index = 0; block_key_index < block_keys_size; ++block_key_index)
for (size_t block_key_index = 0; block_key_index < block_keys.size(); ++block_key_index)
{
const auto & block_key = block_keys[block_key_index];
auto block_key = block_keys[block_key_index];
key_to_fetched_index[block_key] = fetched_key_index;
++fetched_key_index;
}
@ -223,10 +91,10 @@ ColumnPtr DirectDictionary<dictionary_key_type>::getColumn(
Field value_to_insert;
size_t requested_keys_size = requested_keys.size();
auto result = fetched_from_storage->cloneEmpty();
result->reserve(requested_keys_size);
for (size_t requested_key_index = 0; requested_key_index < requested_keys_size; ++requested_key_index)
{
const auto requested_key = requested_keys[requested_key_index];
@ -251,10 +119,9 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(const Columns &
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
dict_struct.validateKeyTypes(key_types);
Arena complex_key_arena;
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(key_columns, complex_key_arena);
const auto & requested_keys = requested_keys_extractor.getKeys();
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(key_columns, arena_holder.getComplexKeyArena());
auto requested_keys = requested_keys_extractor.extractAllKeys();
size_t requested_keys_size = requested_keys.size();
HashMap<KeyType, size_t> requested_key_to_index;
@ -279,25 +146,24 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(const Columns &
while (const auto block = stream->read())
{
auto block_columns = block.getColumns();
/// Split into keys columns and attribute columns
for (size_t i = 0; i < dictionary_keys_size; ++i)
{
block_key_columns.emplace_back(*block_columns.begin());
block_columns.erase(block_columns.begin());
}
block_key_columns.emplace_back(block.safeGetByPosition(i).column);
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, complex_key_arena);
const auto & block_keys = block_keys_extractor.getKeys();
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena());
size_t block_keys_size = block_keys_extractor.getKeysSize();
for (const auto & block_key : block_keys)
for (size_t i = 0; i < block_keys_size; ++i)
{
auto block_key = block_keys_extractor.extractCurrentKey();
const auto * it = requested_key_to_index.find(block_key);
assert(it);
size_t result_data_found_index = it->getMapped();
result_data[result_data_found_index] = true;
block_keys_extractor.rollbackCurrentKey();
}
block_key_columns.clear();
@ -310,6 +176,37 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(const Columns &
return result;
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr DirectDictionary<dictionary_key_type>::getHierarchy(
ColumnPtr key_column,
const DataTypePtr & key_type) const
{
if (dictionary_key_type == DictionaryKeyType::simple)
{
auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type);
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
return result;
}
else
return nullptr;
}
template <DictionaryKeyType dictionary_key_type>
ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::isInHierarchy(
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type) const
{
if (dictionary_key_type == DictionaryKeyType::simple)
{
auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type);
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
return result;
}
else
return nullptr;
}
template <DictionaryKeyType dictionary_key_type>
BlockInputStreamPtr DirectDictionary<dictionary_key_type>::getSourceBlockInputStream(
const Columns & key_columns [[maybe_unused]],
@ -342,32 +239,6 @@ BlockInputStreamPtr DirectDictionary<dictionary_key_type>::getSourceBlockInputSt
return stream;
}
template <DictionaryKeyType dictionary_key_type>
void DirectDictionary<dictionary_key_type>::setup()
{
/// TODO: Move this to DictionaryStructure
size_t dictionary_attributes_size = dict_struct.attributes.size();
for (size_t i = 0; i < dictionary_attributes_size; ++i)
{
const auto & attribute = dict_struct.attributes[i];
attribute_index_by_name[attribute.name] = i;
attribute_name_by_index[i] = attribute.name;
if (attribute.hierarchical)
{
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"({}): hierarchical attributes are not supported for complex key direct dictionary",
full_name);
hierarchical_attribute = &attribute;
if (attribute.underlying_type != AttributeUnderlyingType::utUInt64)
throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH};
}
}
}
template <DictionaryKeyType dictionary_key_type>
BlockInputStreamPtr DirectDictionary<dictionary_key_type>::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const
{

View File

@ -18,11 +18,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
template <DictionaryKeyType dictionary_key_type>
class DirectDictionary final : public IDictionary
{
@ -33,8 +28,7 @@ public:
DirectDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
BlockPtr saved_block_ = nullptr);
DictionarySourcePtr source_ptr_);
std::string getTypeName() const override
{
@ -56,7 +50,7 @@ public:
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<DirectDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), saved_block);
return std::make_shared<DirectDictionary>(getDictionaryID(), dict_struct, source_ptr->clone());
}
const IDictionarySource * getSource() const override { return source_ptr.get(); }
@ -67,26 +61,9 @@ public:
bool isInjective(const std::string & attribute_name) const override
{
auto it = attribute_index_by_name.find(attribute_name);
if (it == attribute_index_by_name.end())
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"({}): no attribute with name ({}) in dictionary",
full_name,
attribute_name);
return dict_struct.attributes[it->second].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
bool hasHierarchy() const override { return hierarchical_attribute; }
void toParent(const PaddedPODArray<UInt64> & ids, PaddedPODArray<UInt64> & out) const override;
void isInVectorVector(
const PaddedPODArray<UInt64> & child_ids, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
void isInVectorConstant(const PaddedPODArray<UInt64> & child_ids, const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const override;
void isInConstantVector(const UInt64 child_id, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
DictionaryKeyType getKeyType() const override { return dictionary_key_type; }
ColumnPtr getColumn(
@ -98,30 +75,25 @@ public:
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); }
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override;
ColumnUInt8::Ptr isInHierarchy(
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:
void setup();
BlockInputStreamPtr getSourceBlockInputStream(const Columns & key_columns, const PaddedPODArray<KeyType> & requested_keys) const;
UInt64 getValueOrNullByKey(const UInt64 & to_find) const;
template <typename ChildType, typename AncestorType>
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
std::unordered_map<std::string, size_t> attribute_index_by_name;
std::unordered_map<size_t, std::string> attribute_name_by_index;
const DictionaryAttribute * hierarchical_attribute = nullptr;
mutable std::atomic<size_t> query_count{0};
BlockPtr saved_block;
};
extern template class DirectDictionary<DictionaryKeyType::simple>;

View File

@ -1,20 +1,22 @@
#include "FlatDictionary.h"
#include <Core/Defines.h>
#include <Common/HashTable/HashMap.h>
#include <DataTypes/DataTypesDecimal.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h>
#include <Functions/FunctionHelpers.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
#include <Dictionaries/DictionaryBlockInputStream.h>
#include <Dictionaries/DictionaryFactory.h>
#include <Dictionaries/HierarchyDictionariesUtils.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
@ -24,7 +26,6 @@ namespace ErrorCodes
static const auto initial_array_size = 1024;
static const auto max_array_size = 500000;
FlatDictionary::FlatDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
@ -45,69 +46,6 @@ FlatDictionary::FlatDictionary(
calculateBytesAllocated();
}
void FlatDictionary::toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
DictionaryDefaultValueExtractor<UInt64> extractor(null_value);
getItemsImpl<UInt64, UInt64>(
*hierarchical_attribute,
ids,
[&](const size_t row, const UInt64 value) { out[row] = value; },
extractor);
}
/// Allow to use single value in same way as array.
static inline FlatDictionary::Key getAt(const PaddedPODArray<FlatDictionary::Key> & arr, const size_t idx)
{
return arr[idx];
}
static inline FlatDictionary::Key getAt(const FlatDictionary::Key & value, const size_t)
{
return value;
}
template <typename ChildType, typename AncestorType>
void FlatDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
const auto & attr = std::get<ContainerType<Key>>(hierarchical_attribute->arrays);
const auto rows = out.size();
size_t loaded_size = attr.size();
for (const auto row : ext::range(0, rows))
{
auto id = getAt(child_ids, row);
const auto ancestor_id = getAt(ancestor_ids, row);
for (size_t i = 0; id < loaded_size && id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
id = attr[id];
out[row] = id != null_value && id == ancestor_id;
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
void FlatDictionary::isInVectorVector(
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_ids, ancestor_ids, out);
}
void FlatDictionary::isInVectorConstant(const PaddedPODArray<Key> & child_ids, const Key ancestor_id, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_ids, ancestor_id, out);
}
void FlatDictionary::isInConstantVector(const Key child_id, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const
{
isInImpl(child_id, ancestor_ids, out);
}
ColumnPtr FlatDictionary::getColumn(
const std::string & attribute_name,
const DataTypePtr & result_type,
@ -117,14 +55,16 @@ ColumnPtr FlatDictionary::getColumn(
{
ColumnPtr result;
PaddedPODArray<Key> backup_storage;
PaddedPODArray<UInt64> backup_storage;
const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage);
auto size = ids.size();
const auto & attribute = getAttribute(attribute_name);
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
const auto & attribute = attributes[attribute_index];
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
@ -183,10 +123,9 @@ ColumnPtr FlatDictionary::getColumn(
return result;
}
ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const
{
PaddedPODArray<Key> backup_storage;
PaddedPODArray<UInt64> backup_storage;
const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage);
auto result = ColumnUInt8::create(ext::size(ids));
@ -205,24 +144,118 @@ ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const Data
return result;
}
ColumnPtr FlatDictionary::getHierarchy(ColumnPtr key_column, const DataTypePtr &) const
{
PaddedPODArray<UInt64> keys_backup_storage;
const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage);
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const UInt64 null_value = std::get<UInt64>(hierarchical_attribute.null_values);
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.arrays);
auto is_key_valid_func = [&, this](auto & key)
{
return key < loaded_ids.size() && loaded_ids[key];
};
auto get_parent_key_func = [&, this](auto & hierarchy_key)
{
std::optional<UInt64> result;
if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key])
return result;
result = parent_keys[hierarchy_key];
return result;
};
auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_key_func);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
return dictionary_hierarchy_array;
}
ColumnUInt8::Ptr FlatDictionary::isInHierarchy(
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr &) const
{
PaddedPODArray<UInt64> keys_backup_storage;
const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage);
PaddedPODArray<UInt64> keys_in_backup_storage;
const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage);
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const UInt64 null_value = std::get<UInt64>(hierarchical_attribute.null_values);
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.arrays);
auto is_key_valid_func = [&, this](auto & key)
{
return key < loaded_ids.size() && loaded_ids[key];
};
auto get_parent_key_func = [&, this](auto & hierarchy_key)
{
std::optional<UInt64> result;
if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key])
return result;
result = parent_keys[hierarchy_key];
return result;
};
auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_key_func);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
return result;
}
ColumnPtr FlatDictionary::getDescendants(
ColumnPtr key_column,
const DataTypePtr &,
size_t level) const
{
PaddedPODArray<UInt64> keys_backup;
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.arrays);
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
for (size_t i = 0; i < parent_keys.size(); ++i)
{
auto parent_key = parent_keys[i];
if (loaded_ids[i])
parent_to_child[parent_key].emplace_back(static_cast<UInt64>(i));
}
auto result = getKeysDescendantsArray(keys, parent_to_child, level);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
return result;
}
void FlatDictionary::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttribute(attribute, attribute.null_value));
if (attribute.hierarchical)
{
hierarchical_attribute = &attributes.back();
if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64)
throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH};
}
}
}
void FlatDictionary::blockToAttributes(const Block & block)
@ -271,7 +304,7 @@ void FlatDictionary::updateData()
const auto & saved_id_column = *saved_block->safeGetByPosition(0).column;
const auto & update_id_column = *block.safeGetByPosition(0).column;
std::unordered_map<Key, std::vector<size_t>> update_ids;
std::unordered_map<UInt64, std::vector<size_t>> update_ids;
for (size_t row = 0; row < update_id_column.size(); ++row)
{
const auto id = update_id_column.get64(row);
@ -280,7 +313,7 @@ void FlatDictionary::updateData()
const size_t saved_rows = saved_id_column.size();
IColumn::Filter filter(saved_rows);
std::unordered_map<Key, std::vector<size_t>>::iterator it;
std::unordered_map<UInt64, std::vector<size_t>>::iterator it;
for (size_t row = 0; row < saved_id_column.size(); ++row)
{
@ -385,7 +418,6 @@ void FlatDictionary::createAttributeImpl<String>(Attribute & attribute, const Fi
attribute.arrays.emplace<ContainerType<StringRef>>(initial_array_size, StringRef(string_in_arena, string.size()));
}
FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value)
{
auto nullable_set = attribute.is_nullable ? std::make_optional<NullableSet>() : std::optional<NullableSet>{};
@ -408,7 +440,7 @@ FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttrib
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
void FlatDictionary::getItemsImpl(
const Attribute & attribute,
const PaddedPODArray<Key> & ids,
const PaddedPODArray<UInt64> & ids,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const
{
@ -425,7 +457,7 @@ void FlatDictionary::getItemsImpl(
}
template <typename T>
void FlatDictionary::resize(Attribute & attribute, const Key id)
void FlatDictionary::resize(Attribute & attribute, const UInt64 id)
{
if (id >= max_array_size)
throw Exception{full_name + ": identifier should be less than " + toString(max_array_size), ErrorCodes::ARGUMENT_OUT_OF_BOUND};
@ -440,7 +472,7 @@ void FlatDictionary::resize(Attribute & attribute, const Key id)
}
template <typename T>
void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T & value)
void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value)
{
auto & array = std::get<ContainerType<T>>(attribute.arrays);
array[id] = value;
@ -448,13 +480,13 @@ void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id,
}
template <>
void FlatDictionary::setAttributeValueImpl<String>(Attribute & attribute, const Key id, const String & value)
void FlatDictionary::setAttributeValueImpl<String>(Attribute & attribute, const UInt64 id, const String & value)
{
const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size());
setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()});
}
void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value)
void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value)
{
auto type_call = [&](const auto &dictionary_attribute_type)
{
@ -484,21 +516,11 @@ void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, cons
callOnDictionaryAttributeType(attribute.type, type_call);
}
const FlatDictionary::Attribute & FlatDictionary::getAttribute(const std::string & attribute_name) const
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
return attributes[it->second];
}
PaddedPODArray<FlatDictionary::Key> FlatDictionary::getIds() const
PaddedPODArray<UInt64> FlatDictionary::getIds() const
{
const auto ids_count = ext::size(loaded_ids);
PaddedPODArray<Key> ids;
PaddedPODArray<UInt64> ids;
ids.reserve(ids_count);
for (auto idx : ext::range(0, ids_count))
@ -509,8 +531,7 @@ PaddedPODArray<FlatDictionary::Key> FlatDictionary::getIds() const
BlockInputStreamPtr FlatDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
{
using BlockInputStreamType = DictionaryBlockInputStream<Key>;
return std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, getIds(), column_names);
return std::make_shared<DictionaryBlockInputStream>(shared_from_this(), max_block_size, getIds(), column_names);
}
void registerDictionaryFlat(DictionaryFactory & factory)

View File

@ -59,18 +59,9 @@ public:
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
bool hasHierarchy() const override { return hierarchical_attribute; }
void toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const override;
void isInVectorVector(
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
void isInVectorConstant(const PaddedPODArray<Key> & child_ids, const Key ancestor_id, PaddedPODArray<UInt8> & out) const override;
void isInConstantVector(const Key child_id, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; }
ColumnPtr getColumn(
@ -82,13 +73,27 @@ public:
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); }
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override;
ColumnUInt8::Ptr isInHierarchy(
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
ColumnPtr getDescendants(
ColumnPtr key_column,
const DataTypePtr & key_type,
size_t level) const override;
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:
template <typename Value>
using ContainerType = PaddedPODArray<Value>;
using NullableSet = HashSet<Key, DefaultHash<Key>>;
using NullableSet = HashSet<UInt64, DefaultHash<UInt64>>;
struct Attribute final
{
@ -151,24 +156,24 @@ private:
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
void getItemsImpl(
const Attribute & attribute,
const PaddedPODArray<Key> & ids,
const PaddedPODArray<UInt64> & ids,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;
template <typename T>
void resize(Attribute & attribute, const Key id);
void resize(Attribute & attribute, const UInt64 id);
template <typename T>
void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value);
void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value);
void setAttributeValue(Attribute & attribute, const Key id, const Field & value);
void setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value);
const Attribute & getAttribute(const std::string & attribute_name) const;
template <typename ChildType, typename AncestorType>
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
PaddedPODArray<Key> getIds() const;
PaddedPODArray<UInt64> getIds() const;
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
@ -177,7 +182,6 @@ private:
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;
const Attribute * hierarchical_attribute = nullptr;
std::vector<bool> loaded_ids;
size_t bytes_allocated = 0;
@ -185,6 +189,7 @@ private:
size_t bucket_count = 0;
mutable std::atomic<size_t> query_count{0};
/// TODO: Remove
BlockPtr saved_block;
};

File diff suppressed because it is too large Load Diff

View File

@ -4,17 +4,21 @@
#include <memory>
#include <variant>
#include <optional>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Core/Block.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <sparsehash/sparse_hash_map>
#include <ext/range.h>
#include "DictionaryStructure.h"
#include "IDictionary.h"
#include "IDictionarySource.h"
#include "DictionaryHelpers.h"
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <Core/Block.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/IDictionary.h>
#include <Dictionaries/IDictionarySource.h>
#include <Dictionaries/DictionaryHelpers.h>
/** This dictionary stores all content in a hash table in memory
* (a separate Key -> Value map for each attribute)
@ -24,19 +28,32 @@
namespace DB
{
template <DictionaryKeyType dictionary_key_type, bool sparse>
class HashedDictionary final : public IDictionary
{
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by hashed dictionary");
HashedDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
bool sparse_,
BlockPtr saved_block_ = nullptr);
std::string getTypeName() const override { return sparse ? "SparseHashed" : "Hashed"; }
std::string getTypeName() const override
{
if constexpr (dictionary_key_type == DictionaryKeyType::simple && sparse)
return "SparseHashed";
else if constexpr (dictionary_key_type == DictionaryKeyType::simple && !sparse)
return "Hashed";
else if constexpr (dictionary_key_type == DictionaryKeyType::complex && sparse)
return "ComplexKeySpareseHashed";
else
return "ComplexKeyHashed";
}
size_t getBytesAllocated() const override { return bytes_allocated; }
@ -50,7 +67,7 @@ public:
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<HashedDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block);
return std::make_shared<HashedDictionary<dictionary_key_type, sparse>>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block);
}
const IDictionarySource * getSource() const override { return source_ptr.get(); }
@ -61,14 +78,10 @@ public:
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
bool hasHierarchy() const override { return hierarchical_attribute; }
void toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const override;
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; }
DictionaryKeyType getKeyType() const override { return dictionary_key_type; }
ColumnPtr getColumn(
const std::string& attribute_name,
@ -79,36 +92,52 @@ public:
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
void isInVectorVector(
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
void isInVectorConstant(const PaddedPODArray<Key> & child_ids, const Key ancestor_id, PaddedPODArray<UInt8> & out) const override;
void isInConstantVector(const Key child_id, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); }
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & hierarchy_attribute_type) const override;
ColumnUInt8::Ptr isInHierarchy(
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
ColumnPtr getDescendants(
ColumnPtr key_column,
const DataTypePtr & key_type,
size_t level) const override;
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:
template <typename Value>
using CollectionType = HashMap<UInt64, Value>;
template <typename Value>
using CollectionPtrType = std::unique_ptr<CollectionType<Value>>;
using CollectionTypeNonSparse = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
HashMap<UInt64, Value>,
HashMapWithSavedHash<StringRef, Value, DefaultHash<StringRef>>>;
#if !defined(ARCADIA_BUILD)
template <typename Value>
using SparseCollectionType = google::sparse_hash_map<UInt64, Value, DefaultHash<UInt64>>;
template <typename Key, typename Value>
using SparseHashMap = google::sparse_hash_map<Key, Value, DefaultHash<Key>>;
#else
template <typename Value>
using SparseCollectionType = google::sparsehash::sparse_hash_map<UInt64, Value, DefaultHash<UInt64>>;
template <typename Key, typename Value>
using SparseHashMap = google::sparsehash::sparse_hash_map<Key, Value, DefaultHash<Key>>;
#endif
template <typename Value>
using SparseCollectionPtrType = std::unique_ptr<SparseCollectionType<Value>>;
using CollectionTypeSparse = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
SparseHashMap<UInt64, Value>,
SparseHashMap<StringRef, Value>>;
using NullableSet = HashSet<Key, DefaultHash<Key>>;
template <typename Value>
using CollectionType = std::conditional_t<sparse, CollectionTypeSparse<Value>, CollectionTypeNonSparse<Value>>;
using NullableSet = HashSet<KeyType, DefaultHash<KeyType>>;
struct Attribute final
{
AttributeUnderlyingType type;
std::optional<NullableSet> nullable_set;
std::optional<NullableSet> is_nullable_set;
std::variant<
UInt8,
@ -127,41 +156,27 @@ private:
Float64,
StringRef>
null_values;
std::variant<
CollectionPtrType<UInt8>,
CollectionPtrType<UInt16>,
CollectionPtrType<UInt32>,
CollectionPtrType<UInt64>,
CollectionPtrType<UInt128>,
CollectionPtrType<Int8>,
CollectionPtrType<Int16>,
CollectionPtrType<Int32>,
CollectionPtrType<Int64>,
CollectionPtrType<Decimal32>,
CollectionPtrType<Decimal64>,
CollectionPtrType<Decimal128>,
CollectionPtrType<Float32>,
CollectionPtrType<Float64>,
CollectionPtrType<StringRef>>
maps;
std::variant<
SparseCollectionPtrType<UInt8>,
SparseCollectionPtrType<UInt16>,
SparseCollectionPtrType<UInt32>,
SparseCollectionPtrType<UInt64>,
SparseCollectionPtrType<UInt128>,
SparseCollectionPtrType<Int8>,
SparseCollectionPtrType<Int16>,
SparseCollectionPtrType<Int32>,
SparseCollectionPtrType<Int64>,
SparseCollectionPtrType<Decimal32>,
SparseCollectionPtrType<Decimal64>,
SparseCollectionPtrType<Decimal128>,
SparseCollectionPtrType<Float32>,
SparseCollectionPtrType<Float64>,
SparseCollectionPtrType<StringRef>>
sparse_maps;
CollectionType<UInt8>,
CollectionType<UInt16>,
CollectionType<UInt32>,
CollectionType<UInt64>,
CollectionType<UInt128>,
CollectionType<Int8>,
CollectionType<Int16>,
CollectionType<Int32>,
CollectionType<Int64>,
CollectionType<Decimal32>,
CollectionType<Decimal64>,
CollectionType<Decimal128>,
CollectionType<Float32>,
CollectionType<Float64>,
CollectionType<StringRef>>
container;
std::unique_ptr<Arena> string_arena;
};
void createAttributes();
@ -172,76 +187,47 @@ private:
void loadData();
template <typename T>
void addAttributeSize(const Attribute & attribute);
void calculateBytesAllocated();
template <typename T>
void createAttributeImpl(Attribute & attribute, const Field & null_value);
Attribute createAttribute(const DictionaryAttribute& attribute, const Field & null_value);
template <typename AttributeType, typename OutputType, typename MapType, typename ValueSetter, typename DefaultValueExtractor>
void getItemsAttrImpl(
const MapType & attr,
const PaddedPODArray<Key> & ids,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
template <typename AttributeType, typename ValueSetter, typename NullableValueSetter, typename DefaultValueExtractor>
void getItemsImpl(
const Attribute & attribute,
const PaddedPODArray<Key> & ids,
DictionaryKeysExtractor<dictionary_key_type> & keys_extractor,
ValueSetter && set_value,
NullableValueSetter && set_nullable_value,
DefaultValueExtractor & default_value_extractor) const;
template <typename T>
bool setAttributeValueImpl(Attribute & attribute, const Key id, const T value);
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func);
bool setAttributeValue(Attribute & attribute, const Key id, const Field & value);
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const;
const Attribute & getAttribute(const std::string & attribute_name) const;
template <typename T>
void has(const Attribute & attribute, const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8> & out) const;
template <typename T, typename AttrType>
PaddedPODArray<Key> getIdsAttrImpl(const AttrType & attr) const;
template <typename T>
PaddedPODArray<Key> getIds(const Attribute & attribute) const;
PaddedPODArray<Key> getIds() const;
/// Preallocates the hashtable based on query progress
/// (Only while loading all data).
///
/// @see preallocate
template <typename T>
void resize(Attribute & attribute, size_t added_rows);
void resize(size_t added_rows);
template <typename AttrType, typename ChildType, typename AncestorType>
void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
template <typename ChildType, typename AncestorType>
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
StringRef copyKeyInArena(StringRef key);
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
const bool require_nonempty;
const bool sparse;
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;
const Attribute * hierarchical_attribute = nullptr;
size_t bytes_allocated = 0;
size_t element_count = 0;
size_t bucket_count = 0;
mutable std::atomic<size_t> query_count{0};
/// TODO: Remove
BlockPtr saved_block;
Arena complex_key_arena;
};
extern template class HashedDictionary<DictionaryKeyType::simple, false>;
extern template class HashedDictionary<DictionaryKeyType::simple, true>;
extern template class HashedDictionary<DictionaryKeyType::complex, false>;
extern template class HashedDictionary<DictionaryKeyType::complex, true>;
}

View File

@ -0,0 +1,156 @@
#include "HierarchyDictionariesUtils.h"
namespace DB
{
namespace ErrorCodes
{
extern const int UNSUPPORTED_METHOD;
}
namespace
{
/** In case of cache or direct dictionary we does not have structure with child to parent representation.
* This function build such structure calling getColumn for initial keys to request and for next keys in hierarchy,
* until all keys are requested or result key is null value.
* To distinguish null value key and key that is not present in dictionary, we use special default value column
* with max UInt64 value, if result column key has such value we assume that current key is not presented in dictionary storage.
*/
HashMap<UInt64, UInt64> getChildToParentHierarchyMapImpl(
const IDictionary * dictionary,
const DictionaryAttribute & hierarchical_attribute,
const PaddedPODArray<UInt64> & initial_keys_to_request,
const DataTypePtr & key_type)
{
UInt64 null_value = hierarchical_attribute.null_value.get<UInt64>();
ColumnPtr key_to_request_column = ColumnVector<UInt64>::create();
auto * key_to_request_column_typed = static_cast<ColumnVector<UInt64> *>(key_to_request_column->assumeMutable().get());
UInt64 key_not_in_storage_value = std::numeric_limits<UInt64>::max();
ColumnPtr key_not_in_storage_default_value_column = ColumnVector<UInt64>::create(initial_keys_to_request.size(), key_not_in_storage_value);
PaddedPODArray<UInt64> & keys_to_request = key_to_request_column_typed->getData();
keys_to_request.assign(initial_keys_to_request);
PaddedPODArray<UInt64> next_keys_to_request;
HashSet<UInt64> already_requested_keys;
HashMap<UInt64, UInt64> child_to_parent_key;
while (!keys_to_request.empty())
{
child_to_parent_key.reserve(child_to_parent_key.size() + keys_to_request.size());
auto parent_key_column = dictionary->getColumn(
hierarchical_attribute.name,
hierarchical_attribute.type,
{key_to_request_column},
{key_type},
key_not_in_storage_default_value_column);
const auto * parent_key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*parent_key_column);
if (!parent_key_column_typed)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
"Parent key column should be UInt64. Actual ({})",
hierarchical_attribute.type->getName());
const auto & parent_keys = parent_key_column_typed->getData();
next_keys_to_request.clear();
for (size_t i = 0; i < keys_to_request.size(); ++i)
{
auto key = keys_to_request[i];
auto parent_key = parent_keys[i];
if (parent_key == key_not_in_storage_value)
continue;
child_to_parent_key[key] = parent_key;
if (parent_key == null_value ||
already_requested_keys.find(parent_key) != nullptr)
continue;
already_requested_keys.insert(parent_key);
next_keys_to_request.emplace_back(parent_key);
}
keys_to_request.clear();
keys_to_request.assign(next_keys_to_request);
}
return child_to_parent_key;
}
}
ColumnPtr getKeysHierarchyDefaultImplementation(const IDictionary * dictionary, ColumnPtr key_column, const DataTypePtr & key_type)
{
key_column = key_column->convertToFullColumnIfConst();
const auto * key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*key_column);
if (!key_column_typed)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64");
const auto & dictionary_structure = dictionary->getStructure();
size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index;
const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index];
const PaddedPODArray<UInt64> & requested_keys = key_column_typed->getData();
HashMap<UInt64, UInt64> key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type);
auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; };
auto get_parent_key_func = [&](auto & key)
{
auto it = key_to_parent_key.find(key);
std::optional<UInt64> result = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
return result;
};
UInt64 null_value = hierarchical_attribute.null_value.get<UInt64>();
auto dictionary_hierarchy_array = getKeysHierarchyArray(requested_keys, null_value, is_key_valid_func, get_parent_key_func);
return dictionary_hierarchy_array;
}
ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation(
const IDictionary * dictionary,
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type)
{
key_column = key_column->convertToFullColumnIfConst();
in_key_column = in_key_column->convertToFullColumnIfConst();
const auto * key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*key_column);
if (!key_column_typed)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64");
const auto * in_key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*in_key_column);
if (!in_key_column_typed)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64");
const auto & dictionary_structure = dictionary->getStructure();
size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index;
const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index];
const PaddedPODArray<UInt64> & requested_keys = key_column_typed->getData();
HashMap<UInt64, UInt64> key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type);
auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; };
auto get_parent_key_func = [&](auto & key)
{
auto it = key_to_parent_key.find(key);
std::optional<UInt64> result = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
return result;
};
UInt64 null_value = hierarchical_attribute.null_value.get<UInt64>();
const auto & in_keys = in_key_column_typed->getData();
auto result = getKeysIsInHierarchyColumn(requested_keys, in_keys, null_value, is_key_valid_func, get_parent_key_func);
return result;
}
}

View File

@ -0,0 +1,467 @@
#pragma once
#include <common/types.h>
#include <Common/PODArray.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnArray.h>
#include <Dictionaries/IDictionary.h>
namespace DB
{
namespace detail
{
template <typename KeyType>
struct ElementsAndOffsets
{
PaddedPODArray<KeyType> elements;
PaddedPODArray<IColumn::Offset> offsets;
};
template <typename T>
struct IsKeyValidFuncInterface
{
bool operator()(T key [[maybe_unused]]) { return false; }
};
template <typename T>
struct GetParentKeyFuncInterface
{
std::optional<T> operator()(T key [[maybe_unused]]) { return {}; }
};
/** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client.
* Hierarchy iteration is stopped if key equals null value, get_parent_key_func returns null optional, or hierarchy depth
* greater or equal than DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH.
* IsKeyValidFunc used for each input hierarchy key, if it returns false result hierarchy for that key will have size 0.
* Hierarchy result is ElementsAndOffsets structure, for each element there is hierarchy array,
* with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0).
*
* Example:
* id parent_id
* 1 0
* 2 1
* 3 1
* 4 2
*
* If hierarchy_null_value will be 0. Requested keys [1, 2, 3, 4, 5].
* Result: [1], [2, 1], [3, 1], [4, 2, 1], []
* Elements: [1, 2, 1, 3, 1, 4, 2, 1]
* Offsets: [1, 3, 5, 8, 8]
*/
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
ElementsAndOffsets<KeyType> getHierarchy(
const PaddedPODArray<KeyType> & keys,
const KeyType & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_parent_key_func)
{
size_t hierarchy_keys_size = keys.size();
PaddedPODArray<KeyType> elements;
elements.reserve(hierarchy_keys_size);
PaddedPODArray<IColumn::Offset> offsets;
offsets.reserve(hierarchy_keys_size);
struct OffsetInArray
{
size_t offset_index;
size_t array_element_offset;
};
HashMap<KeyType, OffsetInArray> already_processes_keys_to_offset;
already_processes_keys_to_offset.reserve(hierarchy_keys_size);
for (size_t i = 0; i < hierarchy_keys_size; ++i)
{
auto hierarchy_key = keys[i];
size_t current_hierarchy_depth = 0;
bool is_key_valid = std::forward<IsKeyValidFunc>(is_key_valid_func)(hierarchy_key);
if (!is_key_valid)
{
offsets.emplace_back(elements.size());
continue;
}
while (true)
{
const auto * it = already_processes_keys_to_offset.find(hierarchy_key);
if (it)
{
const auto & index = it->getMapped();
size_t offset = index.offset_index;
bool is_loop = (offset == offsets.size());
if (unlikely(is_loop))
break;
size_t array_element_offset = index.array_element_offset;
size_t previous_offset_size = offset > 0 ? offsets[offset - 1] : 0;
size_t start_index = previous_offset_size + array_element_offset;
size_t end_index = offsets[offset];
elements.insertFromItself(elements.begin() + start_index, elements.begin() + end_index);
break;
}
if (hierarchy_key == hierarchy_null_value || current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
break;
already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth};
elements.emplace_back(hierarchy_key);
++current_hierarchy_depth;
std::optional<KeyType> parent_key = std::forward<GetParentKeyFunc>(get_parent_key_func)(hierarchy_key);
if (!parent_key.has_value())
break;
hierarchy_key = *parent_key;
}
offsets.emplace_back(elements.size());
}
ElementsAndOffsets<KeyType> result = {std::move(elements), std::move(offsets)};
return result;
}
/** Returns array with UInt8 represent if key from in_keys array is in hierarchy of key from keys column.
* If value in result array is 1 that means key from in_keys array is in hierarchy of key from
* keys array with same index, 0 therwise.
* For getting hierarchy implementation uses getKeysHierarchy function.
*
* Not: keys size must be equal to in_keys_size.
*/
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
PaddedPODArray<UInt8> getIsInHierarchy(
const PaddedPODArray<KeyType> & keys,
const PaddedPODArray<KeyType> & in_keys,
const KeyType & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_parent_func)
{
assert(keys.size() == in_keys.size());
PaddedPODArray<UInt8> result;
result.resize_fill(keys.size());
detail::ElementsAndOffsets<KeyType> hierarchy = detail::getHierarchy(
keys,
hierarchy_null_value,
std::forward<IsKeyValidFunc>(is_key_valid_func),
std::forward<GetParentKeyFunc>(get_parent_func));
auto & offsets = hierarchy.offsets;
auto & elements = hierarchy.elements;
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t i_elements_start = i > 0 ? offsets[i - 1] : 0;
size_t i_elements_end = offsets[i];
auto & key_to_find = in_keys[i];
const auto * begin = elements.begin() + i_elements_start;
const auto * end = elements.begin() + i_elements_end;
const auto * it = std::find(begin, end, key_to_find);
bool contains_key = (it != end);
result[i] = contains_key;
}
return result;
}
struct GetAllDescendantsStrategy { size_t level = 0; };
struct GetDescendantsAtSpecificLevelStrategy { size_t level = 0; };
/** Get descendants for keys iterating the hierarchy from parent to child using parent_to_child hash map provided by client.
* GetAllDescendantsStrategy get all descendants for key
* GetDescendantsAtSpecificLevelStrategy get descendants only for specific hierarchy level.
* Hierarchy result is ElementsAndOffsets structure, for each element there is descendants array,
* with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0).
*
* Example:
* id parent_id
* 1 0
* 2 1
* 3 1
* 4 2
*
* Example. Strategy GetAllDescendantsStrategy.
* Requested keys [0, 1, 2, 3, 4].
* Result: [1, 2, 3, 4], [2, 2, 4], [4], [], []
* Elements: [1, 2, 3, 4, 2, 3, 4, 4]
* Offsets: [4, 7, 8, 8, 8]
*
* Example. Strategy GetDescendantsAtSpecificLevelStrategy with level 1.
* Requested keys [0, 1, 2, 3, 4].
* Result: [1], [2, 3], [4], [], [];
* Offsets: [1, 3, 4, 4, 4];
*/
template <typename KeyType, typename Strategy>
ElementsAndOffsets<KeyType> getDescendants(
const PaddedPODArray<KeyType> & keys,
const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
Strategy strategy)
{
/// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants.
/// If strategy is GetDescendantsAtSpecificLevelStrategy we does not use cache strategy.
size_t keys_size = keys.size();
PaddedPODArray<KeyType> descendants;
descendants.reserve(keys_size);
PaddedPODArray<IColumn::Offset> descendants_offsets;
descendants_offsets.reserve(keys_size);
struct Range
{
size_t start_index;
size_t end_index;
};
static constexpr Int64 key_range_requires_update = -1;
HashMap<KeyType, Range> already_processed_keys_to_range [[maybe_unused]];
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
already_processed_keys_to_range.reserve(keys_size);
struct KeyAndDepth
{
KeyType key;
Int64 depth;
};
HashSet<KeyType> already_processed_keys_during_loop;
already_processed_keys_during_loop.reserve(keys_size);
PaddedPODArray<KeyAndDepth> next_keys_to_process_stack;
next_keys_to_process_stack.reserve(keys_size);
Int64 level = static_cast<Int64>(strategy.level);
for (size_t i = 0; i < keys_size; ++i)
{
const KeyType & requested_key = keys[i];
if (parent_to_child.find(requested_key) == nullptr)
{
descendants_offsets.emplace_back(descendants.size());
continue;
}
next_keys_to_process_stack.emplace_back(KeyAndDepth{requested_key, 0});
/** To cache range for key without recursive function calls and custom stack we put special
* signaling value on stack key_range_requires_update.
* When we pop such value from stack that means processing descendants for key is finished
* and we can update range with end_index.
*/
while (!next_keys_to_process_stack.empty())
{
KeyAndDepth key_to_process = next_keys_to_process_stack.back();
KeyType key = key_to_process.key;
Int64 depth = key_to_process.depth;
next_keys_to_process_stack.pop_back();
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
{
/// Update end_index for key
if (depth == key_range_requires_update)
{
auto * it = already_processed_keys_to_range.find(key);
assert(it);
auto & range_to_update = it->getMapped();
range_to_update.end_index = descendants.size();
continue;
}
}
if (unlikely(already_processed_keys_during_loop.find(key) != nullptr))
{
next_keys_to_process_stack.clear();
break;
}
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
{
const auto * already_processed_it = already_processed_keys_to_range.find(key);
if (already_processed_it)
{
Range range = already_processed_it->getMapped();
if (unlikely(range.start_index > range.end_index))
{
/// Broken range because there was loop
already_processed_keys_to_range.erase(key);
}
else
{
auto insert_start_iterator = descendants.begin() + range.start_index;
auto insert_end_iterator = descendants.begin() + range.end_index;
descendants.insertFromItself(insert_start_iterator, insert_end_iterator);
continue;
}
}
}
const auto * it = parent_to_child.find(key);
if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
continue;
if constexpr (std::is_same_v<Strategy, GetDescendantsAtSpecificLevelStrategy>)
{
if (depth > level)
continue;
}
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
{
/// Put special signaling value on stack and update cache with range start
size_t range_start_index = descendants.size();
already_processed_keys_to_range[key].start_index = range_start_index;
next_keys_to_process_stack.emplace_back(KeyAndDepth{key, key_range_requires_update});
}
already_processed_keys_during_loop.insert(key);
++depth;
const auto & children = it->getMapped();
for (auto child_key : children)
{
/// In case of GetAllDescendantsStrategy we add any descendant to result array
/// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level
if (std::is_same_v<Strategy, GetAllDescendantsStrategy> || depth == level)
descendants.emplace_back(child_key);
next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth});
}
}
already_processed_keys_during_loop.clear();
descendants_offsets.emplace_back(descendants.size());
}
ElementsAndOffsets<KeyType> result = {std::move(descendants), std::move(descendants_offsets)};
return result;
}
/// Converts ElementAndOffsets structure into ArrayColumn
template<typename KeyType>
ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets<KeyType> && elements_and_offsets)
{
auto elements_column = ColumnVector<KeyType>::create();
elements_column->getData() = std::move(elements_and_offsets.elements);
auto offsets_column = ColumnVector<IColumn::Offset>::create();
offsets_column->getData() = std::move(elements_and_offsets.offsets);
auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column));
return column_array;
}
}
/// Returns hierarchy array column for keys
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
ColumnPtr getKeysHierarchyArray(
const PaddedPODArray<KeyType> & keys,
const KeyType & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_parent_func)
{
auto elements_and_offsets = detail::getHierarchy(
keys,
hierarchy_null_value,
std::forward<IsKeyValidFunc>(is_key_valid_func),
std::forward<GetParentKeyFunc>(get_parent_func));
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
/// Returns is in hierarchy column for keys
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
ColumnUInt8::Ptr getKeysIsInHierarchyColumn(
const PaddedPODArray<KeyType> & hierarchy_keys,
const PaddedPODArray<KeyType> & hierarchy_in_keys,
const KeyType & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_parent_func)
{
auto is_in_hierarchy_data = detail::getIsInHierarchy(
hierarchy_keys,
hierarchy_in_keys,
hierarchy_null_value,
std::forward<IsKeyValidFunc>(is_key_valid_func),
std::forward<GetParentKeyFunc>(get_parent_func));
auto result = ColumnUInt8::create();
result->getData() = std::move(is_in_hierarchy_data);
return result;
}
/// Returns descendants array column for keys
template <typename KeyType>
ColumnPtr getKeysDescendantsArray(
const PaddedPODArray<KeyType> & requested_keys,
const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
size_t level)
{
if (level == 0)
{
detail::GetAllDescendantsStrategy strategy { .level = level };
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy);
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
else
{
detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level };
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy);
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
}
/** Default getHierarchy implementation for dictionaries that does not have structure with child to parent representation.
* Implementation will build such structure with getColumn calls, and then getHierarchy for such structure.
* Returns ColumnArray with hierarchy arrays for keys from key_column.
*/
ColumnPtr getKeysHierarchyDefaultImplementation(
const IDictionary * dictionary,
ColumnPtr key_column,
const DataTypePtr & key_type);
/** Default isInHierarchy implementation for dictionaries that does not have structure with child to parent representation.
* Implementation will build such structure with getColumn calls, and then getHierarchy for such structure.
* Returns UInt8 column if key from in_key_column is in key hierarchy from key_column.
*/
ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation(
const IDictionary * dictionary,
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type);
}

Some files were not shown because too many files have changed in this diff Show More