mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Merge remote-tracking branch 'origin/master' into HEAD
This commit is contained in:
commit
2c91fc55e9
@ -39,6 +39,8 @@ else()
|
||||
set(RECONFIGURE_MESSAGE_LEVEL STATUS)
|
||||
endif()
|
||||
|
||||
enable_language(C CXX ASM)
|
||||
|
||||
include (cmake/arch.cmake)
|
||||
include (cmake/target.cmake)
|
||||
include (cmake/tools.cmake)
|
||||
|
@ -62,6 +62,7 @@ if (NOT OPENLDAP_FOUND AND NOT MISSING_INTERNAL_LDAP_LIBRARY)
|
||||
if (
|
||||
( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "x86_64" ) OR
|
||||
( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "aarch64" ) OR
|
||||
( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "ppc64le" ) OR
|
||||
( "${_system_name}" STREQUAL "freebsd" AND "${_system_processor}" STREQUAL "x86_64" ) OR
|
||||
( "${_system_name}" STREQUAL "darwin" AND "${_system_processor}" STREQUAL "x86_64" )
|
||||
)
|
||||
|
@ -1,7 +1,7 @@
|
||||
if(NOT OS_FREEBSD AND NOT APPLE AND NOT ARCH_ARM)
|
||||
if(NOT OS_FREEBSD AND NOT APPLE)
|
||||
option(ENABLE_S3 "Enable S3" ${ENABLE_LIBRARIES})
|
||||
elseif(ENABLE_S3 OR USE_INTERNAL_AWS_S3_LIBRARY)
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on ARM, Apple or FreeBSD")
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on Apple or FreeBSD")
|
||||
endif()
|
||||
|
||||
if(NOT ENABLE_S3)
|
||||
|
@ -6,7 +6,7 @@ set (DEFAULT_LIBS "-nodefaultlibs")
|
||||
# We need builtins from Clang's RT even without libcxx - for ubsan+int128.
|
||||
# See https://bugs.llvm.org/show_bug.cgi?id=16404
|
||||
if (COMPILER_CLANG AND NOT (CMAKE_CROSSCOMPILING AND ARCH_AARCH64))
|
||||
execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.builtins-${CMAKE_SYSTEM_PROCESSOR}.a OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
else ()
|
||||
set (BUILTINS_LIBRARY "-lgcc")
|
||||
endif ()
|
||||
|
@ -86,8 +86,3 @@ if (LINKER_NAME)
|
||||
message(STATUS "Using custom linker by name: ${LINKER_NAME}")
|
||||
endif ()
|
||||
|
||||
if (ARCH_PPC64LE)
|
||||
if (COMPILER_CLANG OR (COMPILER_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8))
|
||||
message(FATAL_ERROR "Only gcc-8 or higher is supported for powerpc architecture")
|
||||
endif ()
|
||||
endif ()
|
||||
|
@ -160,6 +160,12 @@ if (NOT EXTERNAL_BOOST_FOUND)
|
||||
enable_language(ASM)
|
||||
SET(ASM_OPTIONS "-x assembler-with-cpp")
|
||||
|
||||
set (SRCS_CONTEXT
|
||||
${LIBRARY_DIR}/libs/context/src/dummy.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
|
||||
)
|
||||
|
||||
if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread"))
|
||||
add_compile_definitions(BOOST_USE_UCONTEXT)
|
||||
|
||||
@ -169,39 +175,34 @@ if (NOT EXTERNAL_BOOST_FOUND)
|
||||
add_compile_definitions(BOOST_USE_TSAN)
|
||||
endif()
|
||||
|
||||
set (SRCS_CONTEXT
|
||||
set (SRCS_CONTEXT ${SRCS_CONTEXT}
|
||||
${LIBRARY_DIR}/libs/context/src/fiber.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/continuation.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/dummy.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
|
||||
)
|
||||
elseif (ARCH_ARM)
|
||||
set (SRCS_CONTEXT
|
||||
endif()
|
||||
if (ARCH_ARM)
|
||||
set (SRCS_CONTEXT ${SRCS_CONTEXT}
|
||||
${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/dummy.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
|
||||
)
|
||||
elseif (ARCH_PPC64LE)
|
||||
set (SRCS_CONTEXT ${SRCS_CONTEXT}
|
||||
${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S
|
||||
)
|
||||
elseif(OS_DARWIN)
|
||||
set (SRCS_CONTEXT
|
||||
set (SRCS_CONTEXT ${SRCS_CONTEXT}
|
||||
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/dummy.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
|
||||
)
|
||||
else()
|
||||
set (SRCS_CONTEXT
|
||||
set (SRCS_CONTEXT ${SRCS_CONTEXT}
|
||||
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S
|
||||
${LIBRARY_DIR}/libs/context/src/dummy.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
|
||||
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
|
@ -97,12 +97,19 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS)
|
||||
set(TZ_OBJS ${TZ_OBJS} ${TZ_OBJ})
|
||||
|
||||
# https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake
|
||||
add_custom_command(OUTPUT ${TZ_OBJ}
|
||||
COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}
|
||||
COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS}
|
||||
# PPC64LE fails to do this with objcopy, use ld or lld instead
|
||||
if (ARCH_PPC64LE)
|
||||
add_custom_command(OUTPUT ${TZ_OBJ}
|
||||
COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}
|
||||
COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${TZ_OBJ} ${TIMEZONE_ID}
|
||||
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID})
|
||||
else()
|
||||
add_custom_command(OUTPUT ${TZ_OBJ}
|
||||
COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}
|
||||
COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS}
|
||||
--rename-section .data=.rodata,alloc,load,readonly,data,contents ${TIMEZONE_ID} ${TZ_OBJ}
|
||||
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID})
|
||||
|
||||
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID})
|
||||
endif()
|
||||
set_source_files_properties(${TZ_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true)
|
||||
endforeach(TIMEZONE)
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN))
|
||||
if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN))
|
||||
if (ENABLE_JEMALLOC)
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL}
|
||||
"jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64 or aarch64 on linux or freebsd.")
|
||||
"jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64 or ppc64le on linux or freebsd.")
|
||||
endif()
|
||||
set (ENABLE_JEMALLOC OFF)
|
||||
else()
|
||||
@ -107,6 +107,8 @@ if (ARCH_AMD64)
|
||||
set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64")
|
||||
elseif (ARCH_ARM)
|
||||
set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_aarch64")
|
||||
elseif (ARCH_PPC64LE)
|
||||
set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_ppc64le")
|
||||
else ()
|
||||
message (FATAL_ERROR "internal jemalloc: This arch is not supported")
|
||||
endif ()
|
||||
|
@ -0,0 +1,367 @@
|
||||
/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */
|
||||
#ifndef JEMALLOC_INTERNAL_DEFS_H_
|
||||
#define JEMALLOC_INTERNAL_DEFS_H_
|
||||
/*
|
||||
* If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all
|
||||
* public APIs to be prefixed. This makes it possible, with some care, to use
|
||||
* multiple allocators simultaneously.
|
||||
*/
|
||||
/* #undef JEMALLOC_PREFIX */
|
||||
/* #undef JEMALLOC_CPREFIX */
|
||||
|
||||
/*
|
||||
* Define overrides for non-standard allocator-related functions if they are
|
||||
* present on the system.
|
||||
*/
|
||||
#define JEMALLOC_OVERRIDE___LIBC_CALLOC
|
||||
#define JEMALLOC_OVERRIDE___LIBC_FREE
|
||||
#define JEMALLOC_OVERRIDE___LIBC_MALLOC
|
||||
#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN
|
||||
#define JEMALLOC_OVERRIDE___LIBC_REALLOC
|
||||
#define JEMALLOC_OVERRIDE___LIBC_VALLOC
|
||||
/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */
|
||||
|
||||
/*
|
||||
* JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs.
|
||||
* For shared libraries, symbol visibility mechanisms prevent these symbols
|
||||
* from being exported, but for static libraries, naming collisions are a real
|
||||
* possibility.
|
||||
*/
|
||||
#define JEMALLOC_PRIVATE_NAMESPACE je_
|
||||
|
||||
/*
|
||||
* Hyper-threaded CPUs may need a special instruction inside spin loops in
|
||||
* order to yield to another virtual CPU.
|
||||
*/
|
||||
#define CPU_SPINWAIT
|
||||
/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
|
||||
#define HAVE_CPU_SPINWAIT 0
|
||||
|
||||
/*
|
||||
* Number of significant bits in virtual addresses. This may be less than the
|
||||
* total number of bits in a pointer, e.g. on x64, for which the uppermost 16
|
||||
* bits are the same as bit 47.
|
||||
*/
|
||||
#define LG_VADDR 64
|
||||
|
||||
/* Defined if C11 atomics are available. */
|
||||
#define JEMALLOC_C11_ATOMICS 1
|
||||
|
||||
/* Defined if GCC __atomic atomics are available. */
|
||||
#define JEMALLOC_GCC_ATOMIC_ATOMICS 1
|
||||
/* and the 8-bit variant support. */
|
||||
#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1
|
||||
|
||||
/* Defined if GCC __sync atomics are available. */
|
||||
#define JEMALLOC_GCC_SYNC_ATOMICS 1
|
||||
/* and the 8-bit variant support. */
|
||||
#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1
|
||||
|
||||
/*
|
||||
* Defined if __builtin_clz() and __builtin_clzl() are available.
|
||||
*/
|
||||
#define JEMALLOC_HAVE_BUILTIN_CLZ
|
||||
|
||||
/*
|
||||
* Defined if os_unfair_lock_*() functions are available, as provided by Darwin.
|
||||
*/
|
||||
/* #undef JEMALLOC_OS_UNFAIR_LOCK */
|
||||
|
||||
/* Defined if syscall(2) is usable. */
|
||||
#define JEMALLOC_USE_SYSCALL
|
||||
|
||||
/*
|
||||
* Defined if secure_getenv(3) is available.
|
||||
*/
|
||||
// #define JEMALLOC_HAVE_SECURE_GETENV
|
||||
|
||||
/*
|
||||
* Defined if issetugid(2) is available.
|
||||
*/
|
||||
/* #undef JEMALLOC_HAVE_ISSETUGID */
|
||||
|
||||
/* Defined if pthread_atfork(3) is available. */
|
||||
#define JEMALLOC_HAVE_PTHREAD_ATFORK
|
||||
|
||||
/* Defined if pthread_setname_np(3) is available. */
|
||||
#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP
|
||||
|
||||
/*
|
||||
* Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available.
|
||||
*/
|
||||
#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1
|
||||
|
||||
/*
|
||||
* Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available.
|
||||
*/
|
||||
#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1
|
||||
|
||||
/*
|
||||
* Defined if mach_absolute_time() is available.
|
||||
*/
|
||||
/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */
|
||||
|
||||
/*
|
||||
* Defined if _malloc_thread_cleanup() exists. At least in the case of
|
||||
* FreeBSD, pthread_key_create() allocates, which if used during malloc
|
||||
* bootstrapping will cause recursion into the pthreads library. Therefore, if
|
||||
* _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in
|
||||
* malloc_tsd.
|
||||
*/
|
||||
/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */
|
||||
|
||||
/*
|
||||
* Defined if threaded initialization is known to be safe on this platform.
|
||||
* Among other things, it must be possible to initialize a mutex without
|
||||
* triggering allocation in order for threaded allocation to be safe.
|
||||
*/
|
||||
#define JEMALLOC_THREADED_INIT
|
||||
|
||||
/*
|
||||
* Defined if the pthreads implementation defines
|
||||
* _pthread_mutex_init_calloc_cb(), in which case the function is used in order
|
||||
* to avoid recursive allocation during mutex initialization.
|
||||
*/
|
||||
/* #undef JEMALLOC_MUTEX_INIT_CB */
|
||||
|
||||
/* Non-empty if the tls_model attribute is supported. */
|
||||
#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec")))
|
||||
|
||||
/*
|
||||
* JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
|
||||
* inline functions.
|
||||
*/
|
||||
/* #undef JEMALLOC_DEBUG */
|
||||
|
||||
/* JEMALLOC_STATS enables statistics calculation. */
|
||||
#define JEMALLOC_STATS
|
||||
|
||||
/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
|
||||
/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */
|
||||
|
||||
/* JEMALLOC_PROF enables allocation profiling. */
|
||||
/* #undef JEMALLOC_PROF */
|
||||
|
||||
/* Use libunwind for profile backtracing if defined. */
|
||||
/* #undef JEMALLOC_PROF_LIBUNWIND */
|
||||
|
||||
/* Use libgcc for profile backtracing if defined. */
|
||||
/* #undef JEMALLOC_PROF_LIBGCC */
|
||||
|
||||
/* Use gcc intrinsics for profile backtracing if defined. */
|
||||
/* #undef JEMALLOC_PROF_GCC */
|
||||
|
||||
/*
|
||||
* JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage
|
||||
* segment (DSS).
|
||||
*/
|
||||
#define JEMALLOC_DSS
|
||||
|
||||
/* Support memory filling (junk/zero). */
|
||||
#define JEMALLOC_FILL
|
||||
|
||||
/* Support utrace(2)-based tracing. */
|
||||
/* #undef JEMALLOC_UTRACE */
|
||||
|
||||
/* Support optional abort() on OOM. */
|
||||
/* #undef JEMALLOC_XMALLOC */
|
||||
|
||||
/* Support lazy locking (avoid locking unless a second thread is launched). */
|
||||
/* #undef JEMALLOC_LAZY_LOCK */
|
||||
|
||||
/*
|
||||
* Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
|
||||
* classes).
|
||||
*/
|
||||
/* #undef LG_QUANTUM */
|
||||
|
||||
/* One page is 2^LG_PAGE bytes. */
|
||||
#define LG_PAGE 16
|
||||
|
||||
/*
|
||||
* One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the
|
||||
* system does not explicitly support huge pages; system calls that require
|
||||
* explicit huge page support are separately configured.
|
||||
*/
|
||||
#define LG_HUGEPAGE 21
|
||||
|
||||
/*
|
||||
* If defined, adjacent virtual memory mappings with identical attributes
|
||||
* automatically coalesce, and they fragment when changes are made to subranges.
|
||||
* This is the normal order of things for mmap()/munmap(), but on Windows
|
||||
* VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e.
|
||||
* mappings do *not* coalesce/fragment.
|
||||
*/
|
||||
#define JEMALLOC_MAPS_COALESCE
|
||||
|
||||
/*
|
||||
* If defined, retain memory for later reuse by default rather than using e.g.
|
||||
* munmap() to unmap freed extents. This is enabled on 64-bit Linux because
|
||||
* common sequences of mmap()/munmap() calls will cause virtual memory map
|
||||
* holes.
|
||||
*/
|
||||
#define JEMALLOC_RETAIN
|
||||
|
||||
/* TLS is used to map arenas and magazine caches to threads. */
|
||||
#define JEMALLOC_TLS
|
||||
|
||||
/*
|
||||
* Used to mark unreachable code to quiet "end of non-void" compiler warnings.
|
||||
* Don't use this directly; instead use unreachable() from util.h
|
||||
*/
|
||||
#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable
|
||||
|
||||
/*
|
||||
* ffs*() functions to use for bitmapping. Don't use these directly; instead,
|
||||
* use ffs_*() from util.h.
|
||||
*/
|
||||
#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll
|
||||
#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl
|
||||
#define JEMALLOC_INTERNAL_FFS __builtin_ffs
|
||||
|
||||
/*
|
||||
* popcount*() functions to use for bitmapping.
|
||||
*/
|
||||
#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl
|
||||
#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount
|
||||
|
||||
/*
|
||||
* If defined, explicitly attempt to more uniformly distribute large allocation
|
||||
* pointer alignments across all cache indices.
|
||||
*/
|
||||
#define JEMALLOC_CACHE_OBLIVIOUS
|
||||
|
||||
/*
|
||||
* If defined, enable logging facilities. We make this a configure option to
|
||||
* avoid taking extra branches everywhere.
|
||||
*/
|
||||
/* #undef JEMALLOC_LOG */
|
||||
|
||||
/*
|
||||
* If defined, use readlinkat() (instead of readlink()) to follow
|
||||
* /etc/malloc_conf.
|
||||
*/
|
||||
/* #undef JEMALLOC_READLINKAT */
|
||||
|
||||
/*
|
||||
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
|
||||
*/
|
||||
/* #undef JEMALLOC_ZONE */
|
||||
|
||||
/*
|
||||
* Methods for determining whether the OS overcommits.
|
||||
* JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's
|
||||
* /proc/sys/vm.overcommit_memory file.
|
||||
* JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl.
|
||||
*/
|
||||
/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */
|
||||
#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
|
||||
|
||||
/* Defined if madvise(2) is available. */
|
||||
#define JEMALLOC_HAVE_MADVISE
|
||||
|
||||
/*
|
||||
* Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE
|
||||
* arguments to madvise(2).
|
||||
*/
|
||||
#define JEMALLOC_HAVE_MADVISE_HUGE
|
||||
|
||||
/*
|
||||
* Methods for purging unused pages differ between operating systems.
|
||||
*
|
||||
* madvise(..., MADV_FREE) : This marks pages as being unused, such that they
|
||||
* will be discarded rather than swapped out.
|
||||
* madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is
|
||||
* defined, this immediately discards pages,
|
||||
* such that new pages will be demand-zeroed if
|
||||
* the address region is later touched;
|
||||
* otherwise this behaves similarly to
|
||||
* MADV_FREE, though typically with higher
|
||||
* system overhead.
|
||||
*/
|
||||
#define JEMALLOC_PURGE_MADVISE_FREE
|
||||
#define JEMALLOC_PURGE_MADVISE_DONTNEED
|
||||
#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
|
||||
|
||||
/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */
|
||||
/* #undef JEMALLOC_DEFINE_MADVISE_FREE */
|
||||
|
||||
/*
|
||||
* Defined if MADV_DO[NT]DUMP is supported as an argument to madvise.
|
||||
*/
|
||||
#define JEMALLOC_MADVISE_DONTDUMP
|
||||
|
||||
/*
|
||||
* Defined if transparent huge pages (THPs) are supported via the
|
||||
* MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled.
|
||||
*/
|
||||
/* #undef JEMALLOC_THP */
|
||||
|
||||
/* Define if operating system has alloca.h header. */
|
||||
#define JEMALLOC_HAS_ALLOCA_H 1
|
||||
|
||||
/* C99 restrict keyword supported. */
|
||||
#define JEMALLOC_HAS_RESTRICT 1
|
||||
|
||||
/* For use by hash code. */
|
||||
/* #undef JEMALLOC_BIG_ENDIAN */
|
||||
|
||||
/* sizeof(int) == 2^LG_SIZEOF_INT. */
|
||||
#define LG_SIZEOF_INT 2
|
||||
|
||||
/* sizeof(long) == 2^LG_SIZEOF_LONG. */
|
||||
#define LG_SIZEOF_LONG 3
|
||||
|
||||
/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */
|
||||
#define LG_SIZEOF_LONG_LONG 3
|
||||
|
||||
/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */
|
||||
#define LG_SIZEOF_INTMAX_T 3
|
||||
|
||||
/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */
|
||||
#define JEMALLOC_GLIBC_MALLOC_HOOK
|
||||
|
||||
/* glibc memalign hook. */
|
||||
#define JEMALLOC_GLIBC_MEMALIGN_HOOK
|
||||
|
||||
/* pthread support */
|
||||
#define JEMALLOC_HAVE_PTHREAD
|
||||
|
||||
/* dlsym() support */
|
||||
#define JEMALLOC_HAVE_DLSYM
|
||||
|
||||
/* Adaptive mutex support in pthreads. */
|
||||
#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP
|
||||
|
||||
/* GNU specific sched_getcpu support */
|
||||
#define JEMALLOC_HAVE_SCHED_GETCPU
|
||||
|
||||
/* GNU specific sched_setaffinity support */
|
||||
#define JEMALLOC_HAVE_SCHED_SETAFFINITY
|
||||
|
||||
/*
|
||||
* If defined, all the features necessary for background threads are present.
|
||||
*/
|
||||
#define JEMALLOC_BACKGROUND_THREAD 1
|
||||
|
||||
/*
|
||||
* If defined, jemalloc symbols are not exported (doesn't work when
|
||||
* JEMALLOC_PREFIX is not defined).
|
||||
*/
|
||||
/* #undef JEMALLOC_EXPORT */
|
||||
|
||||
/* config.malloc_conf options string. */
|
||||
#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@"
|
||||
|
||||
/* If defined, jemalloc takes the malloc/free/etc. symbol names. */
|
||||
#define JEMALLOC_IS_MALLOC 1
|
||||
|
||||
/*
|
||||
* Defined if strerror_r returns char * if _GNU_SOURCE is defined.
|
||||
*/
|
||||
#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE
|
||||
|
||||
/* Performs additional safety checks when defined. */
|
||||
/* #undef JEMALLOC_OPT_SAFETY_CHECKS */
|
||||
|
||||
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */
|
63
contrib/openldap-cmake/linux_ppc64le/include/lber_types.h
Normal file
63
contrib/openldap-cmake/linux_ppc64le/include/lber_types.h
Normal file
@ -0,0 +1,63 @@
|
||||
/* include/lber_types.h. Generated from lber_types.hin by configure. */
|
||||
/* $OpenLDAP$ */
|
||||
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
|
||||
*
|
||||
* Copyright 1998-2020 The OpenLDAP Foundation.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* LBER types
|
||||
*/
|
||||
|
||||
#ifndef _LBER_TYPES_H
|
||||
#define _LBER_TYPES_H
|
||||
|
||||
#include <ldap_cdefs.h>
|
||||
|
||||
LDAP_BEGIN_DECL
|
||||
|
||||
/* LBER boolean, enum, integers (32 bits or larger) */
|
||||
#define LBER_INT_T int
|
||||
|
||||
/* LBER tags (32 bits or larger) */
|
||||
#define LBER_TAG_T long
|
||||
|
||||
/* LBER socket descriptor */
|
||||
#define LBER_SOCKET_T int
|
||||
|
||||
/* LBER lengths (32 bits or larger) */
|
||||
#define LBER_LEN_T long
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/* booleans, enumerations, and integers */
|
||||
typedef LBER_INT_T ber_int_t;
|
||||
|
||||
/* signed and unsigned versions */
|
||||
typedef signed LBER_INT_T ber_sint_t;
|
||||
typedef unsigned LBER_INT_T ber_uint_t;
|
||||
|
||||
/* tags */
|
||||
typedef unsigned LBER_TAG_T ber_tag_t;
|
||||
|
||||
/* "socket" descriptors */
|
||||
typedef LBER_SOCKET_T ber_socket_t;
|
||||
|
||||
/* lengths */
|
||||
typedef unsigned LBER_LEN_T ber_len_t;
|
||||
|
||||
/* signed lengths */
|
||||
typedef signed LBER_LEN_T ber_slen_t;
|
||||
|
||||
LDAP_END_DECL
|
||||
|
||||
#endif /* _LBER_TYPES_H */
|
74
contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h
Normal file
74
contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h
Normal file
@ -0,0 +1,74 @@
|
||||
/* include/ldap_config.h. Generated from ldap_config.hin by configure. */
|
||||
/* $OpenLDAP$ */
|
||||
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
|
||||
*
|
||||
* Copyright 1998-2020 The OpenLDAP Foundation.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file works in conjunction with OpenLDAP configure system.
|
||||
* If you do no like the values below, adjust your configure options.
|
||||
*/
|
||||
|
||||
#ifndef _LDAP_CONFIG_H
|
||||
#define _LDAP_CONFIG_H
|
||||
|
||||
/* directory separator */
|
||||
#ifndef LDAP_DIRSEP
|
||||
#ifndef _WIN32
|
||||
#define LDAP_DIRSEP "/"
|
||||
#else
|
||||
#define LDAP_DIRSEP "\\"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* directory for temporary files */
|
||||
#if defined(_WIN32)
|
||||
# define LDAP_TMPDIR "C:\\." /* we don't have much of a choice */
|
||||
#elif defined( _P_tmpdir )
|
||||
# define LDAP_TMPDIR _P_tmpdir
|
||||
#elif defined( P_tmpdir )
|
||||
# define LDAP_TMPDIR P_tmpdir
|
||||
#elif defined( _PATH_TMPDIR )
|
||||
# define LDAP_TMPDIR _PATH_TMPDIR
|
||||
#else
|
||||
# define LDAP_TMPDIR LDAP_DIRSEP "tmp"
|
||||
#endif
|
||||
|
||||
/* directories */
|
||||
#ifndef LDAP_BINDIR
|
||||
#define LDAP_BINDIR "/tmp/ldap-prefix/bin"
|
||||
#endif
|
||||
#ifndef LDAP_SBINDIR
|
||||
#define LDAP_SBINDIR "/tmp/ldap-prefix/sbin"
|
||||
#endif
|
||||
#ifndef LDAP_DATADIR
|
||||
#define LDAP_DATADIR "/tmp/ldap-prefix/share/openldap"
|
||||
#endif
|
||||
#ifndef LDAP_SYSCONFDIR
|
||||
#define LDAP_SYSCONFDIR "/tmp/ldap-prefix/etc/openldap"
|
||||
#endif
|
||||
#ifndef LDAP_LIBEXECDIR
|
||||
#define LDAP_LIBEXECDIR "/tmp/ldap-prefix/libexec"
|
||||
#endif
|
||||
#ifndef LDAP_MODULEDIR
|
||||
#define LDAP_MODULEDIR "/tmp/ldap-prefix/libexec/openldap"
|
||||
#endif
|
||||
#ifndef LDAP_RUNDIR
|
||||
#define LDAP_RUNDIR "/tmp/ldap-prefix/var"
|
||||
#endif
|
||||
#ifndef LDAP_LOCALEDIR
|
||||
#define LDAP_LOCALEDIR ""
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* _LDAP_CONFIG_H */
|
61
contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h
Normal file
61
contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h
Normal file
@ -0,0 +1,61 @@
|
||||
/* include/ldap_features.h. Generated from ldap_features.hin by configure. */
|
||||
/* $OpenLDAP$ */
|
||||
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
|
||||
*
|
||||
* Copyright 1998-2020 The OpenLDAP Foundation.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted only as authorized by the OpenLDAP
|
||||
* Public License.
|
||||
*
|
||||
* A copy of this license is available in file LICENSE in the
|
||||
* top-level directory of the distribution or, alternatively, at
|
||||
* <http://www.OpenLDAP.org/license.html>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* LDAP Features
|
||||
*/
|
||||
|
||||
#ifndef _LDAP_FEATURES_H
|
||||
#define _LDAP_FEATURES_H 1
|
||||
|
||||
/* OpenLDAP API version macros */
|
||||
#define LDAP_VENDOR_VERSION 20501
|
||||
#define LDAP_VENDOR_VERSION_MAJOR 2
|
||||
#define LDAP_VENDOR_VERSION_MINOR 5
|
||||
#define LDAP_VENDOR_VERSION_PATCH X
|
||||
|
||||
/*
|
||||
** WORK IN PROGRESS!
|
||||
**
|
||||
** OpenLDAP reentrancy/thread-safeness should be dynamically
|
||||
** checked using ldap_get_option().
|
||||
**
|
||||
** The -lldap implementation is not thread-safe.
|
||||
**
|
||||
** The -lldap_r implementation is:
|
||||
** LDAP_API_FEATURE_THREAD_SAFE (basic thread safety)
|
||||
** but also be:
|
||||
** LDAP_API_FEATURE_SESSION_THREAD_SAFE
|
||||
** LDAP_API_FEATURE_OPERATION_THREAD_SAFE
|
||||
**
|
||||
** The preprocessor flag LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE
|
||||
** can be used to determine if -lldap_r is available at compile
|
||||
** time. You must define LDAP_THREAD_SAFE if and only if you
|
||||
** link with -lldap_r.
|
||||
**
|
||||
** If you fail to define LDAP_THREAD_SAFE when linking with
|
||||
** -lldap_r or define LDAP_THREAD_SAFE when linking with -lldap,
|
||||
** provided header definitions and declarations may be incorrect.
|
||||
**
|
||||
*/
|
||||
|
||||
/* is -lldap_r available or not */
|
||||
#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1
|
||||
|
||||
/* LDAP v2 Referrals */
|
||||
/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */
|
||||
|
||||
#endif /* LDAP_FEATURES */
|
1169
contrib/openldap-cmake/linux_ppc64le/include/portable.h
Normal file
1169
contrib/openldap-cmake/linux_ppc64le/include/portable.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,7 +1,7 @@
|
||||
# docker build -t yandex/clickhouse-fasttest .
|
||||
FROM ubuntu:20.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=10
|
||||
ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \
|
||||
@ -43,20 +43,20 @@ RUN apt-get update \
|
||||
clang-tidy-${LLVM_VERSION} \
|
||||
cmake \
|
||||
curl \
|
||||
lsof \
|
||||
expect \
|
||||
fakeroot \
|
||||
git \
|
||||
gdb \
|
||||
git \
|
||||
gperf \
|
||||
lld-${LLVM_VERSION} \
|
||||
llvm-${LLVM_VERSION} \
|
||||
lsof \
|
||||
moreutils \
|
||||
ninja-build \
|
||||
psmisc \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-lxml \
|
||||
python3-pip \
|
||||
python3-requests \
|
||||
python3-termcolor \
|
||||
rename \
|
||||
|
@ -8,6 +8,9 @@ trap 'kill $(jobs -pr) ||:' EXIT
|
||||
# that we can run the "everything else" stage from the cloned source.
|
||||
stage=${stage:-}
|
||||
|
||||
# Compiler version, normally set by Dockerfile
|
||||
export LLVM_VERSION=${LLVM_VERSION:-11}
|
||||
|
||||
# A variable to pass additional flags to CMake.
|
||||
# Here we explicitly default it to nothing so that bash doesn't complain about
|
||||
# it being undefined. Also read it as array so that we can pass an empty list
|
||||
@ -124,22 +127,26 @@ continue
|
||||
|
||||
function clone_root
|
||||
{
|
||||
git clone https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt"
|
||||
git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt"
|
||||
|
||||
(
|
||||
cd "$FASTTEST_SOURCE"
|
||||
if [ "$PULL_REQUEST_NUMBER" != "0" ]; then
|
||||
if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then
|
||||
if git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then
|
||||
git checkout FETCH_HEAD
|
||||
echo 'Clonned merge head'
|
||||
echo "Checked out pull/$PULL_REQUEST_NUMBER/merge ($(git rev-parse FETCH_HEAD))"
|
||||
else
|
||||
git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/head"
|
||||
git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/head"
|
||||
git checkout "$COMMIT_SHA"
|
||||
echo 'Checked out to commit'
|
||||
echo "Checked out nominal SHA $COMMIT_SHA for PR $PULL_REQUEST_NUMBER"
|
||||
fi
|
||||
else
|
||||
if [ -v COMMIT_SHA ]; then
|
||||
git fetch --depth 1 origin "$COMMIT_SHA"
|
||||
git checkout "$COMMIT_SHA"
|
||||
echo "Checked out nominal SHA $COMMIT_SHA for master"
|
||||
else
|
||||
echo "Using default repository head $(git rev-parse HEAD)"
|
||||
fi
|
||||
fi
|
||||
)
|
||||
@ -181,7 +188,7 @@ function clone_submodules
|
||||
)
|
||||
|
||||
git submodule sync
|
||||
git submodule update --init --recursive "${SUBMODULES_TO_UPDATE[@]}"
|
||||
git submodule update --depth 1 --init --recursive "${SUBMODULES_TO_UPDATE[@]}"
|
||||
git submodule foreach git reset --hard
|
||||
git submodule foreach git checkout @ -f
|
||||
git submodule foreach git clean -xfd
|
||||
@ -215,7 +222,7 @@ function run_cmake
|
||||
|
||||
(
|
||||
cd "$FASTTEST_BUILD"
|
||||
cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt"
|
||||
cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER="clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="clang-${LLVM_VERSION}" "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt"
|
||||
)
|
||||
}
|
||||
|
||||
@ -223,7 +230,7 @@ function build
|
||||
{
|
||||
(
|
||||
cd "$FASTTEST_BUILD"
|
||||
time ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt"
|
||||
time ninja clickhouse-bundle 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt"
|
||||
if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then
|
||||
cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse"
|
||||
fi
|
||||
@ -420,7 +427,7 @@ case "$stage" in
|
||||
# See the compatibility hacks in `clone_root` stage above. Remove at the same time,
|
||||
# after Nov 1, 2020.
|
||||
cd "$FASTTEST_WORKSPACE"
|
||||
clone_submodules | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt"
|
||||
clone_submodules 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt"
|
||||
;&
|
||||
"run_cmake")
|
||||
run_cmake
|
||||
@ -431,7 +438,7 @@ case "$stage" in
|
||||
"configure")
|
||||
# The `install_log.txt` is also needed for compatibility with old CI task --
|
||||
# if there is no log, it will decide that build failed.
|
||||
configure | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
|
||||
configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
|
||||
;&
|
||||
"run_tests")
|
||||
run_tests
|
||||
|
@ -126,7 +126,13 @@ Contribute all new information in English language. Other languages are translat
|
||||
|
||||
### Adding a New File
|
||||
|
||||
When adding a new file:
|
||||
When you add a new file, it should end with a link like:
|
||||
|
||||
`[Original article](https://clickhouse.tech/docs/<path-to-the-page>) <!--hide-->`
|
||||
|
||||
and there should be **a new empty line** after it.
|
||||
|
||||
{## When adding a new file:
|
||||
|
||||
- Make symbolic links for all other languages. You can use the following commands:
|
||||
|
||||
@ -134,7 +140,7 @@ When adding a new file:
|
||||
$ cd /ClickHouse/clone/directory/docs
|
||||
$ ln -sr en/new/file.md lang/new/file.md
|
||||
```
|
||||
|
||||
##}
|
||||
<a name="adding-a-new-language"/>
|
||||
|
||||
### Adding a New Language
|
||||
@ -195,8 +201,11 @@ Templates:
|
||||
|
||||
- [Function](_description_templates/template-function.md)
|
||||
- [Setting](_description_templates/template-setting.md)
|
||||
- [Server Setting](_description_templates/template-server-setting.md)
|
||||
- [Database or Table engine](_description_templates/template-engine.md)
|
||||
- [System table](_description_templates/template-system-table.md)
|
||||
- [Data type](_description_templates/data-type.md)
|
||||
- [Statement](_description_templates/statement.md)
|
||||
|
||||
|
||||
<a name="how-to-build-docs"/>
|
||||
|
@ -18,4 +18,8 @@ You can also use the following database engines:
|
||||
|
||||
- [Lazy](../../engines/database-engines/lazy.md)
|
||||
|
||||
- [Atomic](../../engines/database-engines/atomic.md)
|
||||
|
||||
- [PostgreSQL](../../engines/database-engines/postgresql.md)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/database_engines/) <!--hide-->
|
||||
|
138
docs/en/engines/database-engines/postgresql.md
Normal file
138
docs/en/engines/database-engines/postgresql.md
Normal file
@ -0,0 +1,138 @@
|
||||
---
|
||||
toc_priority: 35
|
||||
toc_title: PostgreSQL
|
||||
---
|
||||
|
||||
# PostgreSQL {#postgresql}
|
||||
|
||||
Allows to connect to databases on a remote [PostgreSQL](https://www.postgresql.org) server. Supports read and write operations (`SELECT` and `INSERT` queries) to exchange data between ClickHouse and PostgreSQL.
|
||||
|
||||
Gives the real-time access to table list and table structure from remote PostgreSQL with the help of `SHOW TABLES` and `DESCRIBE TABLE` queries.
|
||||
|
||||
Supports table structure modifications (`ALTER TABLE ... ADD|DROP COLUMN`). If `use_table_cache` parameter (see the Engine Parameters below) it set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries.
|
||||
|
||||
## Creating a Database {#creating-a-database}
|
||||
|
||||
``` sql
|
||||
CREATE DATABASE test_database
|
||||
ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]);
|
||||
```
|
||||
|
||||
**Engine Parameters**
|
||||
|
||||
- `host:port` — PostgreSQL server address.
|
||||
- `database` — Remote database name.
|
||||
- `user` — PostgreSQL user.
|
||||
- `password` — User password.
|
||||
- `use_table_cache` — Defines if the database table structure is cached or not. Optional. Default value: `0`.
|
||||
|
||||
## Data Types Support {#data_types-support}
|
||||
|
||||
| PostgerSQL | ClickHouse |
|
||||
|------------------|--------------------------------------------------------------|
|
||||
| DATE | [Date](../../sql-reference/data-types/date.md) |
|
||||
| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) |
|
||||
| REAL | [Float32](../../sql-reference/data-types/float.md) |
|
||||
| DOUBLE | [Float64](../../sql-reference/data-types/float.md) |
|
||||
| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) |
|
||||
| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) |
|
||||
| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) |
|
||||
| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) |
|
||||
| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) |
|
||||
| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) |
|
||||
| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) |
|
||||
| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))|
|
||||
| ARRAY | [Array](../../sql-reference/data-types/array.md) |
|
||||
|
||||
|
||||
## Examples of Use {#examples-of-use}
|
||||
|
||||
Database in ClickHouse, exchanging data with the PostgreSQL server:
|
||||
|
||||
``` sql
|
||||
CREATE DATABASE test_database
|
||||
ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1);
|
||||
```
|
||||
|
||||
``` sql
|
||||
SHOW DATABASES;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─name──────────┐
|
||||
│ default │
|
||||
│ test_database │
|
||||
│ system │
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
``` sql
|
||||
SHOW TABLES FROM test_database;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─name───────┐
|
||||
│ test_table │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
Reading data from the PostgreSQL table:
|
||||
|
||||
``` sql
|
||||
SELECT * FROM test_database.test_table;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─id─┬─value─┐
|
||||
│ 1 │ 2 │
|
||||
└────┴───────┘
|
||||
```
|
||||
|
||||
Writing data to the PostgreSQL table:
|
||||
|
||||
``` sql
|
||||
INSERT INTO test_database.test_table VALUES (3,4);
|
||||
SELECT * FROM test_database.test_table;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─int_id─┬─value─┐
|
||||
│ 1 │ 2 │
|
||||
│ 3 │ 4 │
|
||||
└────────┴───────┘
|
||||
```
|
||||
|
||||
Consider the table structure was modified in PostgreSQL:
|
||||
|
||||
``` sql
|
||||
postgre> ALTER TABLE test_table ADD COLUMN data Text
|
||||
```
|
||||
|
||||
As the `use_table_cache` parameter was set to `1` when the database was created, the table structure in ClickHouse was cached and therefore not modified:
|
||||
|
||||
``` sql
|
||||
DESCRIBE TABLE test_database.test_table;
|
||||
```
|
||||
``` text
|
||||
┌─name───┬─type──────────────┐
|
||||
│ id │ Nullable(Integer) │
|
||||
│ value │ Nullable(Integer) │
|
||||
└────────┴───────────────────┘
|
||||
```
|
||||
|
||||
After detaching the table and attaching it again, the structure was updated:
|
||||
|
||||
``` sql
|
||||
DETACH TABLE test_database.test_table;
|
||||
ATTACH TABLE test_database.test_table;
|
||||
DESCRIBE TABLE test_database.test_table;
|
||||
```
|
||||
``` text
|
||||
┌─name───┬─type──────────────┐
|
||||
│ id │ Nullable(Integer) │
|
||||
│ value │ Nullable(Integer) │
|
||||
│ data │ Nullable(String) │
|
||||
└────────┴───────────────────┘
|
||||
```
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/database-engines/postgresql/) <!--hide-->
|
@ -47,12 +47,17 @@ Engines for communicating with other data storage and processing systems.
|
||||
|
||||
Engines in the family:
|
||||
|
||||
- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka)
|
||||
- [MySQL](../../engines/table-engines/integrations/mysql.md#mysql)
|
||||
- [ODBC](../../engines/table-engines/integrations/odbc.md#table-engine-odbc)
|
||||
- [JDBC](../../engines/table-engines/integrations/jdbc.md#table-engine-jdbc)
|
||||
- [HDFS](../../engines/table-engines/integrations/hdfs.md#hdfs)
|
||||
- [S3](../../engines/table-engines/integrations/s3.md#table-engine-s3)
|
||||
|
||||
- [ODBC](../../engines/table-engines/integrations/odbc.md)
|
||||
- [JDBC](../../engines/table-engines/integrations/jdbc.md)
|
||||
- [MySQL](../../engines/table-engines/integrations/mysql.md)
|
||||
- [MongoDB](../../engines/table-engines/integrations/mongodb.md)
|
||||
- [HDFS](../../engines/table-engines/integrations/hdfs.md)
|
||||
- [S3](../../engines/table-engines/integrations/s3.md)
|
||||
- [Kafka](../../engines/table-engines/integrations/kafka.md)
|
||||
- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md)
|
||||
- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md)
|
||||
- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)
|
||||
|
||||
### Special Engines {#special-engines}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 6
|
||||
toc_priority: 9
|
||||
toc_title: EmbeddedRocksDB
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 4
|
||||
toc_priority: 6
|
||||
toc_title: HDFS
|
||||
---
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
---
|
||||
toc_folder_title: Integrations
|
||||
toc_priority: 30
|
||||
toc_priority: 1
|
||||
---
|
||||
|
||||
# Table Engines for Integrations {#table-engines-for-integrations}
|
||||
@ -19,5 +19,3 @@ List of supported integrations:
|
||||
- [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md)
|
||||
- [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md)
|
||||
- [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/) <!--hide-->
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 2
|
||||
toc_priority: 3
|
||||
toc_title: JDBC
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 5
|
||||
toc_priority: 8
|
||||
toc_title: Kafka
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 7
|
||||
toc_priority: 5
|
||||
toc_title: MongoDB
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 3
|
||||
toc_priority: 4
|
||||
toc_title: MySQL
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 1
|
||||
toc_priority: 2
|
||||
toc_title: ODBC
|
||||
---
|
||||
|
||||
|
@ -1,11 +1,11 @@
|
||||
---
|
||||
toc_priority: 8
|
||||
toc_priority: 11
|
||||
toc_title: PostgreSQL
|
||||
---
|
||||
|
||||
# PostgreSQL {#postgresql}
|
||||
|
||||
The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server.
|
||||
The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data that is stored on a remote PostgreSQL server.
|
||||
|
||||
## Creating a Table {#creating-a-table}
|
||||
|
||||
@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1],
|
||||
name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2],
|
||||
...
|
||||
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password');
|
||||
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]);
|
||||
```
|
||||
|
||||
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query.
|
||||
@ -29,25 +29,51 @@ The table structure can differ from the original PostgreSQL table structure:
|
||||
**Engine Parameters**
|
||||
|
||||
- `host:port` — PostgreSQL server address.
|
||||
|
||||
- `database` — Remote database name.
|
||||
|
||||
- `table` — Remote table name.
|
||||
|
||||
- `user` — PostgreSQL user.
|
||||
|
||||
- `password` — User password.
|
||||
- `schema` — Non-default table schema. Optional.
|
||||
|
||||
SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
|
||||
## Implementation Details {#implementation-details}
|
||||
|
||||
Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server.
|
||||
`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
|
||||
|
||||
Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server.
|
||||
|
||||
All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes.
|
||||
|
||||
INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
|
||||
`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
|
||||
|
||||
PostgreSQL Array types converts into ClickHouse arrays.
|
||||
Be careful in PostgreSQL an array data created like a type_name[] may contain multi-dimensional arrays of different dimensions in different table rows in same column, but in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column.
|
||||
PostgreSQL `Array` types are converted into ClickHouse arrays.
|
||||
|
||||
!!! info "Note"
|
||||
Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column.
|
||||
|
||||
Replicas priority for PostgreSQL dictionary source is supported. The bigger the number in map, the less the priority. The highest priority is `0`.
|
||||
|
||||
In the example below replica `example01-1` has the highest priority:
|
||||
|
||||
```xml
|
||||
<postgresql>
|
||||
<port>5432</port>
|
||||
<user>clickhouse</user>
|
||||
<password>qwerty</password>
|
||||
<replica>
|
||||
<host>example01-1</host>
|
||||
<priority>1</priority>
|
||||
</replica>
|
||||
<replica>
|
||||
<host>example01-2</host>
|
||||
<priority>2</priority>
|
||||
</replica>
|
||||
<db>db_name</db>
|
||||
<table>table_name</table>
|
||||
<where>id=10</where>
|
||||
<invalidate_query>SQL_QUERY</invalidate_query>
|
||||
</postgresql>
|
||||
</source>
|
||||
```
|
||||
|
||||
## Usage Example {#usage-example}
|
||||
|
||||
@ -64,10 +90,10 @@ PRIMARY KEY (int_id));
|
||||
|
||||
CREATE TABLE
|
||||
|
||||
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
|
||||
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
|
||||
INSERT 0 1
|
||||
|
||||
postgresql> select * from test;
|
||||
postgresql> SELECT * FROM test;
|
||||
int_id | int_nullable | float | str | float_nullable
|
||||
--------+--------------+-------+------+----------------
|
||||
1 | | 2 | test |
|
||||
@ -87,20 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre
|
||||
```
|
||||
|
||||
``` sql
|
||||
SELECT * FROM postgresql_table WHERE str IN ('test')
|
||||
SELECT * FROM postgresql_table WHERE str IN ('test');
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─float_nullable─┬─str──┬─int_id─┐
|
||||
│ ᴺᵁᴸᴸ │ test │ 1 │
|
||||
└────────────────┴──────┴────────┘
|
||||
1 rows in set. Elapsed: 0.019 sec.
|
||||
```
|
||||
|
||||
Using Non-default Schema:
|
||||
|
||||
## See Also {#see-also}
|
||||
```text
|
||||
postgres=# CREATE SCHEMA "nice.schema";
|
||||
|
||||
- [The ‘postgresql’ table function](../../../sql-reference/table-functions/postgresql.md)
|
||||
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
|
||||
|
||||
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE pg_table_schema_with_dots (a UInt32)
|
||||
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
|
||||
```
|
||||
|
||||
**See Also**
|
||||
|
||||
- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md)
|
||||
- [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/postgresql/) <!--hide-->
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 6
|
||||
toc_priority: 10
|
||||
toc_title: RabbitMQ
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 4
|
||||
toc_priority: 7
|
||||
toc_title: S3
|
||||
---
|
||||
|
||||
|
@ -502,7 +502,15 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa
|
||||
|
||||
## max_concurrent_queries {#max-concurrent-queries}
|
||||
|
||||
The maximum number of simultaneously processed requests.
|
||||
The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries).
|
||||
|
||||
!!! info "Note"
|
||||
These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
|
||||
|
||||
Possible values:
|
||||
|
||||
- Positive integer.
|
||||
- 0 — Disabled.
|
||||
|
||||
**Example**
|
||||
|
||||
@ -530,6 +538,21 @@ Default value: `0` that means no limit.
|
||||
|
||||
- [max_concurrent_queries](#max-concurrent-queries)
|
||||
|
||||
## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries}
|
||||
|
||||
The minimal number of marks read by the query for applying the [max_concurrent_queries](#max-concurrent-queries) setting.
|
||||
|
||||
Possible values:
|
||||
|
||||
- Positive integer.
|
||||
- 0 — Disabled.
|
||||
|
||||
**Example**
|
||||
|
||||
``` xml
|
||||
<min_marks_to_honor_max_concurrent_queries>10</min_marks_to_honor_max_concurrent_queries>
|
||||
```
|
||||
|
||||
## max_connections {#max-connections}
|
||||
|
||||
The maximum number of inbound connections.
|
||||
|
@ -1914,7 +1914,7 @@ Default value: `0`.
|
||||
|
||||
Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key.
|
||||
|
||||
By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards.
|
||||
By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards.
|
||||
|
||||
Possible values:
|
||||
|
||||
|
@ -69,6 +69,8 @@ Types of sources (`source_type`):
|
||||
- [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse)
|
||||
- [MongoDB](#dicts-external_dicts_dict_sources-mongodb)
|
||||
- [Redis](#dicts-external_dicts_dict_sources-redis)
|
||||
- [Cassandra](#dicts-external_dicts_dict_sources-cassandra)
|
||||
- [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql)
|
||||
|
||||
## Local File {#dicts-external_dicts_dict_sources-local_file}
|
||||
|
||||
|
@ -250,3 +250,53 @@ Result:
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
## bitHammingDistance {#bithammingdistance}
|
||||
|
||||
Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between the bit representations of two integer values. Can be used with [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) functions for detection of semi-duplicate strings. The smaller is the distance, the more likely those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
bitHammingDistance(int1, int2)
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `int1` — First integer value. [Int64](../../sql-reference/data-types/int-uint.md).
|
||||
- `int2` — Second integer value. [Int64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The Hamming distance.
|
||||
|
||||
Type: [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Examples**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT bitHammingDistance(111, 121);
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─bitHammingDistance(111, 121)─┐
|
||||
│ 3 │
|
||||
└──────────────────────────────┘
|
||||
```
|
||||
|
||||
With [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash):
|
||||
|
||||
``` sql
|
||||
SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'));
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐
|
||||
│ 5 │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
@ -7,6 +7,8 @@ toc_title: Hash
|
||||
|
||||
Hash functions can be used for the deterministic pseudo-random shuffling of elements.
|
||||
|
||||
Simhash is a hash function, which returns close hash values for close (similar) arguments.
|
||||
|
||||
## halfMD5 {#hash-functions-halfmd5}
|
||||
|
||||
[Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order.
|
||||
@ -482,3 +484,938 @@ Result:
|
||||
|
||||
- [xxHash](http://cyan4973.github.io/xxHash/).
|
||||
|
||||
## ngramSimHash {#ngramsimhash}
|
||||
|
||||
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramSimHash(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHash('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 1627567969 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive}
|
||||
|
||||
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramSimHashCaseInsensitive(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌──────Hash─┐
|
||||
│ 562180645 │
|
||||
└───────────┘
|
||||
```
|
||||
|
||||
## ngramSimHashUTF8 {#ngramsimhashutf8}
|
||||
|
||||
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramSimHashUTF8(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHashUTF8('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 1628157797 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8}
|
||||
|
||||
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramSimHashCaseInsensitiveUTF8(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 1636742693 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHash {#wordshinglesimhash}
|
||||
|
||||
Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHash(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2328277067 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive}
|
||||
|
||||
Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHashCaseInsensitive(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2194812424 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHashUTF8 {#wordshinglesimhashutf8}
|
||||
|
||||
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHashUTF8(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optinal. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2328277067 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8}
|
||||
|
||||
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Hash value.
|
||||
|
||||
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2194812424 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHash {#ngramminhash}
|
||||
|
||||
Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHash(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHash('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (18333312859352735453,9054248444481805918) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive}
|
||||
|
||||
Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHashCaseInsensitive(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (2106263556442004574,13203602793651726206) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashUTF8 {#ngramminhashutf8}
|
||||
|
||||
Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHashUTF8(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (18333312859352735453,6742163577938632877) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8}
|
||||
|
||||
Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple───────────────────────────────────────┐
|
||||
│ (12493625717655877135,13203602793651726206) │
|
||||
└─────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArg {#ngramminhasharg}
|
||||
|
||||
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHash](#ngramminhash) function with the same input. Is case sensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArg(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` n-grams each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArg('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive}
|
||||
|
||||
Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) function with the same input. Is case insensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` n-grams each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArgUTF8 {#ngramminhashargutf8}
|
||||
|
||||
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashUTF8](#ngramminhashutf8) function with the same input. Is case sensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArgUTF8(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` n-grams each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8}
|
||||
|
||||
Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) function with the same input. Is case insensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` n-grams each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHash {#wordshingleminhash}
|
||||
|
||||
Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHash(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (16452112859864147620,5844417301642981317) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive}
|
||||
|
||||
Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────┐
|
||||
│ (3065874883688416519,1634050779997673240) │
|
||||
└───────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashUTF8 {#wordshingleminhashutf8}
|
||||
|
||||
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (16452112859864147620,5844417301642981317) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8}
|
||||
|
||||
Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive.
|
||||
|
||||
Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two hashes — the minimum and the maximum.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────┐
|
||||
│ (3065874883688416519,1634050779997673240) │
|
||||
└───────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArg {#wordshingleminhasharg}
|
||||
|
||||
Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordshingleMinHash](#wordshingleminhash) function with the same input. Is case sensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArg(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` word shingles each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────┐
|
||||
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive}
|
||||
|
||||
Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) function with the same input. Is case insensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` word shingles each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────────────────────────────────┐
|
||||
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
|
||||
└────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8}
|
||||
|
||||
Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashUTF8](#wordshingleminhashutf8) function with the same input. Is case sensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArgUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` word shingles each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────┐
|
||||
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8}
|
||||
|
||||
Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) function with the same input. Is case insensitive.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — String. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Tuple with two tuples with `hashnum` word shingles each.
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────────────────────────────────┐
|
||||
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
|
||||
└────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
@ -111,4 +111,55 @@ Result:
|
||||
|
||||
- [Tuple](../../sql-reference/data-types/tuple.md)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-functions/) <!--hide-->
|
||||
## tupleHammingDistance {#tuplehammingdistance}
|
||||
|
||||
Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between two tuples of the same size.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
tupleHammingDistance(tuple1, tuple2)
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md).
|
||||
- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md).
|
||||
|
||||
Tuples should have the same type of the elements.
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The Hamming distance.
|
||||
|
||||
Type: [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Examples**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─HammingDistance─┐
|
||||
│ 2 │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
||||
Can be used with [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) functions for detection of semi-duplicate strings:
|
||||
|
||||
``` sql
|
||||
SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string);
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─HammingDistance─┐
|
||||
│ 2 │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
@ -21,16 +21,18 @@ You can use table functions in:
|
||||
!!! warning "Warning"
|
||||
You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled.
|
||||
|
||||
| Function | Description |
|
||||
|-----------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [file](../../sql-reference/table-functions/file.md) | Creates a File-engine table. |
|
||||
| [merge](../../sql-reference/table-functions/merge.md) | Creates a Merge-engine table. |
|
||||
| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. |
|
||||
| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a Distributed-engine table. |
|
||||
| [url](../../sql-reference/table-functions/url.md) | Creates a URL-engine table. |
|
||||
| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a MySQL-engine table. |
|
||||
| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a PostgreSQL-engine table. |
|
||||
| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a JDBC-engine table. |
|
||||
| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a ODBC-engine table. |
|
||||
| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a HDFS-engine table. |
|
||||
| [s3](../../sql-reference/table-functions/s3.md) | Creates a S3-engine table. |
|
||||
| Function | Description |
|
||||
|------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [file](../../sql-reference/table-functions/file.md) | Creates a [File](../../engines/table-engines/special/file.md)-engine table. |
|
||||
| [merge](../../sql-reference/table-functions/merge.md) | Creates a [Merge](../../engines/table-engines/special/merge.md)-engine table. |
|
||||
| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. |
|
||||
| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md)-engine table. |
|
||||
| [url](../../sql-reference/table-functions/url.md) | Creates a [Url](../../engines/table-engines/special/url.md)-engine table. |
|
||||
| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a [MySQL](../../engines/table-engines/integrations/mysql.md)-engine table. |
|
||||
| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)-engine table. |
|
||||
| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a [JDBC](../../engines/table-engines/integrations/jdbc.md)-engine table. |
|
||||
| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a [ODBC](../../engines/table-engines/integrations/odbc.md)-engine table. |
|
||||
| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a [HDFS](../../engines/table-engines/integrations/hdfs.md)-engine table. |
|
||||
| [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. |
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/) <!--hide-->
|
||||
|
@ -10,33 +10,17 @@ Allows `SELECT` and `INSERT` queries to be performed on data that is stored on a
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
postgresql('host:port', 'database', 'table', 'user', 'password')
|
||||
postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `host:port` — PostgreSQL server address.
|
||||
|
||||
- `database` — Remote database name.
|
||||
|
||||
- `table` — Remote table name.
|
||||
|
||||
- `user` — PostgreSQL user.
|
||||
|
||||
- `password` — User password.
|
||||
|
||||
|
||||
SELECT Queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
|
||||
|
||||
Simple `WHERE` clauses such as `=, !=, >, >=, <, <=, IN` are executed on the PostgreSQL server.
|
||||
|
||||
All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes.
|
||||
|
||||
INSERT Queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
|
||||
|
||||
PostgreSQL Array types converts into ClickHouse arrays.
|
||||
|
||||
Be careful in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows.
|
||||
- `schema` — Non-default table schema. Optional.
|
||||
|
||||
**Returned Value**
|
||||
|
||||
@ -45,6 +29,23 @@ A table object with the same columns as the original PostgreSQL table.
|
||||
!!! info "Note"
|
||||
In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below.
|
||||
|
||||
## Implementation Details {#implementation-details}
|
||||
|
||||
`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query.
|
||||
|
||||
Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server.
|
||||
|
||||
All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes.
|
||||
|
||||
`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement.
|
||||
|
||||
PostgreSQL Array types converts into ClickHouse arrays.
|
||||
|
||||
!!! info "Note"
|
||||
Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows.
|
||||
|
||||
Supports replicas priority for PostgreSQL dictionary source. The bigger the number in map, the less the priority. The highest priority is `0`.
|
||||
|
||||
**Examples**
|
||||
|
||||
Table in PostgreSQL:
|
||||
@ -60,10 +61,10 @@ PRIMARY KEY (int_id));
|
||||
|
||||
CREATE TABLE
|
||||
|
||||
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
|
||||
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
|
||||
INSERT 0 1
|
||||
|
||||
postgresql> select * from test;
|
||||
postgresql> SELECT * FROM test;
|
||||
int_id | int_nullable | float | str | float_nullable
|
||||
--------+--------------+-------+------+----------------
|
||||
1 | | 2 | test |
|
||||
@ -96,9 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p
|
||||
└────────┴──────────────┴───────┴──────┴────────────────┘
|
||||
```
|
||||
|
||||
Using Non-default Schema:
|
||||
|
||||
```text
|
||||
postgres=# CREATE SCHEMA "nice.schema";
|
||||
|
||||
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
|
||||
|
||||
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE pg_table_schema_with_dots (a UInt32)
|
||||
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
|
||||
```
|
||||
|
||||
**See Also**
|
||||
|
||||
- [The ‘PostgreSQL’ table engine](../../engines/table-engines/integrations/postgresql.md)
|
||||
- [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md)
|
||||
- [Using PostgreSQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/postgresql/) <!--hide-->
|
||||
|
@ -4,7 +4,7 @@ toc_priority: 27
|
||||
toc_title: "Введение"
|
||||
---
|
||||
|
||||
# Движки баз данных {#dvizhki-baz-dannykh}
|
||||
# Движки баз данных {#database-engines}
|
||||
|
||||
Движки баз данных обеспечивают работу с таблицами.
|
||||
|
||||
@ -18,3 +18,5 @@ toc_title: "Введение"
|
||||
|
||||
- [Lazy](../../engines/database-engines/lazy.md)
|
||||
|
||||
- [PostgreSQL](../../engines/database-engines/postgresql.md)
|
||||
|
||||
|
138
docs/ru/engines/database-engines/postgresql.md
Normal file
138
docs/ru/engines/database-engines/postgresql.md
Normal file
@ -0,0 +1,138 @@
|
||||
---
|
||||
toc_priority: 35
|
||||
toc_title: PostgreSQL
|
||||
---
|
||||
|
||||
# PostgreSQL {#postgresql}
|
||||
|
||||
Позволяет подключаться к БД на удаленном сервере [PostgreSQL](https://www.postgresql.org). Поддерживает операции чтения и записи (запросы `SELECT` и `INSERT`) для обмена данными между ClickHouse и PostgreSQL.
|
||||
|
||||
Позволяет в реальном времени получать от удаленного сервера PostgreSQL информацию о таблицах БД и их структуре с помощью запросов `SHOW TABLES` и `DESCRIBE TABLE`.
|
||||
|
||||
Поддерживает операции изменения структуры таблиц (`ALTER TABLE ... ADD|DROP COLUMN`). Если параметр `use_table_cache` (см. ниже раздел Параметры движка) установлен в значение `1`, структура таблицы кешируется, и изменения в структуре не отслеживаются, но будут обновлены, если выполнить команды `DETACH` и `ATTACH`.
|
||||
|
||||
## Создание БД {#creating-a-database}
|
||||
|
||||
``` sql
|
||||
CREATE DATABASE test_database
|
||||
ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]);
|
||||
```
|
||||
|
||||
**Параметры движка**
|
||||
|
||||
- `host:port` — адрес сервера PostgreSQL.
|
||||
- `database` — имя удаленной БД.
|
||||
- `user` — пользователь PostgreSQL.
|
||||
- `password` — пароль пользователя.
|
||||
- `use_table_cache` — определяет кеширование структуры таблиц БД. Необязательный параметр. Значение по умолчанию: `0`.
|
||||
|
||||
## Поддерживаемые типы данных {#data_types-support}
|
||||
|
||||
| PostgerSQL | ClickHouse |
|
||||
|------------------|--------------------------------------------------------------|
|
||||
| DATE | [Date](../../sql-reference/data-types/date.md) |
|
||||
| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) |
|
||||
| REAL | [Float32](../../sql-reference/data-types/float.md) |
|
||||
| DOUBLE | [Float64](../../sql-reference/data-types/float.md) |
|
||||
| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) |
|
||||
| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) |
|
||||
| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) |
|
||||
| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) |
|
||||
| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) |
|
||||
| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) |
|
||||
| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) |
|
||||
| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))|
|
||||
| ARRAY | [Array](../../sql-reference/data-types/array.md) |
|
||||
|
||||
|
||||
## Примеры использования {#examples-of-use}
|
||||
|
||||
Обмен данными между БД ClickHouse и сервером PostgreSQL:
|
||||
|
||||
``` sql
|
||||
CREATE DATABASE test_database
|
||||
ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1);
|
||||
```
|
||||
|
||||
``` sql
|
||||
SHOW DATABASES;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─name──────────┐
|
||||
│ default │
|
||||
│ test_database │
|
||||
│ system │
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
``` sql
|
||||
SHOW TABLES FROM test_database;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─name───────┐
|
||||
│ test_table │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
Чтение данных из таблицы PostgreSQL:
|
||||
|
||||
``` sql
|
||||
SELECT * FROM test_database.test_table;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─id─┬─value─┐
|
||||
│ 1 │ 2 │
|
||||
└────┴───────┘
|
||||
```
|
||||
|
||||
Запись данных в таблицу PostgreSQL:
|
||||
|
||||
``` sql
|
||||
INSERT INTO test_database.test_table VALUES (3,4);
|
||||
SELECT * FROM test_database.test_table;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─int_id─┬─value─┐
|
||||
│ 1 │ 2 │
|
||||
│ 3 │ 4 │
|
||||
└────────┴───────┘
|
||||
```
|
||||
|
||||
Пусть структура таблицы была изменена в PostgreSQL:
|
||||
|
||||
``` sql
|
||||
postgre> ALTER TABLE test_table ADD COLUMN data Text
|
||||
```
|
||||
|
||||
Поскольку при создании БД параметр `use_table_cache` был установлен в значение `1`, структура таблицы в ClickHouse была кеширована и поэтому не изменилась:
|
||||
|
||||
``` sql
|
||||
DESCRIBE TABLE test_database.test_table;
|
||||
```
|
||||
``` text
|
||||
┌─name───┬─type──────────────┐
|
||||
│ id │ Nullable(Integer) │
|
||||
│ value │ Nullable(Integer) │
|
||||
└────────┴───────────────────┘
|
||||
```
|
||||
|
||||
После того как таблицу «отцепили» и затем снова «прицепили», структура обновилась:
|
||||
|
||||
``` sql
|
||||
DETACH TABLE test_database.test_table;
|
||||
ATTACH TABLE test_database.test_table;
|
||||
DESCRIBE TABLE test_database.test_table;
|
||||
```
|
||||
``` text
|
||||
┌─name───┬─type──────────────┐
|
||||
│ id │ Nullable(Integer) │
|
||||
│ value │ Nullable(Integer) │
|
||||
│ data │ Nullable(String) │
|
||||
└────────┴───────────────────┘
|
||||
```
|
||||
|
||||
[Оригинальная статья](https://clickhouse.tech/docs/ru/database-engines/postgresql/) <!--hide-->
|
@ -16,7 +16,7 @@ toc_title: "Введение"
|
||||
- Возможно ли многопоточное выполнение запроса.
|
||||
- Параметры репликации данных.
|
||||
|
||||
## Семейства движков {#semeistva-dvizhkov}
|
||||
## Семейства движков {#engine-families}
|
||||
|
||||
### MergeTree {#mergetree}
|
||||
|
||||
@ -42,18 +42,23 @@ toc_title: "Введение"
|
||||
- [StripeLog](log-family/stripelog.md#stripelog)
|
||||
- [Log](log-family/log.md#log)
|
||||
|
||||
### Движки для интеграции {#dvizhki-dlia-integratsii}
|
||||
### Движки для интеграции {#integration-engines}
|
||||
|
||||
Движки для связи с другими системами хранения и обработки данных.
|
||||
|
||||
Движки семейства:
|
||||
|
||||
- [Kafka](integrations/kafka.md#kafka)
|
||||
- [MySQL](integrations/mysql.md#mysql)
|
||||
- [ODBC](integrations/odbc.md#table-engine-odbc)
|
||||
- [JDBC](integrations/jdbc.md#table-engine-jdbc)
|
||||
- [ODBC](../../engines/table-engines/integrations/odbc.md)
|
||||
- [JDBC](../../engines/table-engines/integrations/jdbc.md)
|
||||
- [MySQL](../../engines/table-engines/integrations/mysql.md)
|
||||
- [MongoDB](../../engines/table-engines/integrations/mongodb.md)
|
||||
- [HDFS](../../engines/table-engines/integrations/hdfs.md)
|
||||
- [Kafka](../../engines/table-engines/integrations/kafka.md)
|
||||
- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md)
|
||||
- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md)
|
||||
- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)
|
||||
|
||||
### Специальные движки {#spetsialnye-dvizhki}
|
||||
### Специальные движки {#special-engines}
|
||||
|
||||
Движки семейства:
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 6
|
||||
toc_priority: 9
|
||||
toc_title: EmbeddedRocksDB
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 4
|
||||
toc_priority: 6
|
||||
toc_title: HDFS
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 2
|
||||
toc_priority: 3
|
||||
toc_title: JDBC
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 5
|
||||
toc_priority: 8
|
||||
toc_title: Kafka
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 7
|
||||
toc_priority: 5
|
||||
toc_title: MongoDB
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 3
|
||||
toc_priority: 4
|
||||
toc_title: MySQL
|
||||
---
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
---
|
||||
toc_priority: 1
|
||||
toc_priority: 2
|
||||
toc_title: ODBC
|
||||
---
|
||||
|
||||
|
@ -1,11 +1,11 @@
|
||||
---
|
||||
toc_priority: 8
|
||||
toc_priority: 11
|
||||
toc_title: PostgreSQL
|
||||
---
|
||||
|
||||
# PosgtreSQL {#postgresql}
|
||||
#PostgreSQL {#postgresql}
|
||||
|
||||
Движок PostgreSQL позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере.
|
||||
Движок PostgreSQL позволяет выполнять запросы `SELECT` и `INSERT` для таблиц на удаленном сервере PostgreSQL.
|
||||
|
||||
## Создание таблицы {#creating-a-table}
|
||||
|
||||
@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1],
|
||||
name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2],
|
||||
...
|
||||
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password');
|
||||
) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]);
|
||||
```
|
||||
|
||||
Смотрите подробное описание запроса [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query).
|
||||
@ -29,25 +29,51 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
**Параметры движка**
|
||||
|
||||
- `host:port` — адрес сервера PostgreSQL.
|
||||
|
||||
- `database` — Имя базы данных на сервере PostgreSQL.
|
||||
|
||||
- `table` — Имя таблицы.
|
||||
|
||||
- `user` — Имя пользователя PostgreSQL.
|
||||
|
||||
- `password` — Пароль пользователя PostgreSQL.
|
||||
- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент.
|
||||
|
||||
SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса.
|
||||
## Особенности реализации {#implementation-details}
|
||||
|
||||
Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера.
|
||||
Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`.
|
||||
|
||||
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился.
|
||||
Простые условия для `WHERE`, такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN`, исполняются на стороне PostgreSQL сервера.
|
||||
|
||||
INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса.
|
||||
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того, как запрос к PostgreSQL закончился.
|
||||
|
||||
Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`.
|
||||
|
||||
PostgreSQL массивы конвертируются в массивы ClickHouse.
|
||||
Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
|
||||
|
||||
!!! info "Внимание"
|
||||
Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
|
||||
|
||||
При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`.
|
||||
|
||||
В примере ниже реплика `example01-1` имеет более высокий приоритет:
|
||||
|
||||
```xml
|
||||
<postgresql>
|
||||
<port>5432</port>
|
||||
<user>clickhouse</user>
|
||||
<password>qwerty</password>
|
||||
<replica>
|
||||
<host>example01-1</host>
|
||||
<priority>1</priority>
|
||||
</replica>
|
||||
<replica>
|
||||
<host>example01-2</host>
|
||||
<priority>2</priority>
|
||||
</replica>
|
||||
<db>db_name</db>
|
||||
<table>table_name</table>
|
||||
<where>id=10</where>
|
||||
<invalidate_query>SQL_QUERY</invalidate_query>
|
||||
</postgresql>
|
||||
</source>
|
||||
```
|
||||
|
||||
## Пример использования {#usage-example}
|
||||
|
||||
@ -64,17 +90,17 @@ PRIMARY KEY (int_id));
|
||||
|
||||
CREATE TABLE
|
||||
|
||||
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
|
||||
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
|
||||
INSERT 0 1
|
||||
|
||||
postgresql> select * from test;
|
||||
postgresql> SELECT * FROM test;
|
||||
int_id | int_nullable | float | str | float_nullable
|
||||
--------+--------------+-------+------+----------------
|
||||
1 | | 2 | test |
|
||||
(1 row)
|
||||
```
|
||||
|
||||
Таблица в ClickHouse, получение данных из PostgreSQL таблицы созданной выше:
|
||||
Таблица в ClickHouse, получение данных из PostgreSQL таблицы, созданной выше:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE default.postgresql_table
|
||||
@ -87,19 +113,33 @@ ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgre
|
||||
```
|
||||
|
||||
``` sql
|
||||
SELECT * FROM postgresql_table WHERE str IN ('test')
|
||||
SELECT * FROM postgresql_table WHERE str IN ('test');
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─float_nullable─┬─str──┬─int_id─┐
|
||||
│ ᴺᵁᴸᴸ │ test │ 1 │
|
||||
└────────────────┴──────┴────────┘
|
||||
1 rows in set. Elapsed: 0.019 sec.
|
||||
```
|
||||
|
||||
Using Non-default Schema:
|
||||
|
||||
## Смотри также {#see-also}
|
||||
```text
|
||||
postgres=# CREATE SCHEMA "nice.schema";
|
||||
|
||||
- [Табличная функция ‘postgresql’](../../../sql-reference/table-functions/postgresql.md)
|
||||
- [Использование PostgreSQL в качестве истояника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
|
||||
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
|
||||
|
||||
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE pg_table_schema_with_dots (a UInt32)
|
||||
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
|
||||
```
|
||||
|
||||
**См. также**
|
||||
|
||||
- [Табличная функция `postgresql`](../../../sql-reference/table-functions/postgresql.md)
|
||||
- [Использование PostgreSQL в качестве источника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
|
||||
|
||||
[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/postgresql/) <!--hide-->
|
||||
|
@ -481,7 +481,15 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
|
||||
|
||||
## max_concurrent_queries {#max-concurrent-queries}
|
||||
|
||||
Максимальное количество одновременно обрабатываемых запросов.
|
||||
Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries).
|
||||
|
||||
!!! info "Примечание"
|
||||
Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений.
|
||||
|
||||
Возможные значения:
|
||||
|
||||
- Положительное целое число.
|
||||
- 0 — выключена.
|
||||
|
||||
**Пример**
|
||||
|
||||
@ -509,6 +517,21 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
|
||||
|
||||
- [max_concurrent_queries](#max-concurrent-queries)
|
||||
|
||||
## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries}
|
||||
|
||||
Определяет минимальное количество засечек, считываемых запросом для применения настройки [max_concurrent_queries](#max-concurrent-queries).
|
||||
|
||||
Возможные значения:
|
||||
|
||||
- Положительное целое число.
|
||||
- 0 — выключена.
|
||||
|
||||
**Пример**
|
||||
|
||||
``` xml
|
||||
<min_marks_to_honor_max_concurrent_queries>10</min_marks_to_honor_max_concurrent_queries>
|
||||
```
|
||||
|
||||
## max_connections {#max-connections}
|
||||
|
||||
Максимальное количество входящих соединений.
|
||||
@ -1159,4 +1182,3 @@ ClickHouse использует ZooKeeper для хранения метадан
|
||||
</roles>
|
||||
</ldap>
|
||||
```
|
||||
|
||||
|
@ -1792,6 +1792,19 @@ ClickHouse генерирует исключение
|
||||
- [Движок Distributed](../../engines/table-engines/special/distributed.md#distributed)
|
||||
- [Управление распределёнными таблицами](../../sql-reference/statements/system.md#query-language-system-distributed)
|
||||
|
||||
## insert_distributed_one_random_shard {#insert_distributed_one_random_shard}
|
||||
|
||||
Включает или отключает режим вставки данных в [Distributed](../../engines/table-engines/special/distributed.md#distributed)) таблицу в случайный шард при отсутствии ключ шардирования.
|
||||
|
||||
По умолчанию при вставке данных в `Distributed` таблицу с несколькими шардами и при отсутствии ключа шардирования сервер ClickHouse будет отклонять любой запрос на вставку данных. Когда `insert_distributed_one_random_shard = 1`, вставки принимаются, а данные записываются в случайный шард.
|
||||
|
||||
Возможные значения:
|
||||
|
||||
- 0 — если у таблицы несколько шардов, но ключ шардирования отсутствует, вставка данных отклоняется.
|
||||
- 1 — если ключ шардирования отсутствует, то вставка данных осуществляется в случайный шард среди всех доступных шардов.
|
||||
|
||||
Значение по умолчанию: `0`.
|
||||
|
||||
## insert_shard_id {#insert_shard_id}
|
||||
|
||||
Если не `0`, указывает, в какой шард [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицы данные будут вставлены синхронно.
|
||||
|
@ -69,6 +69,7 @@ SETTINGS(format_csv_allow_single_quotes = 0)
|
||||
- [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse)
|
||||
- [MongoDB](#dicts-external_dicts_dict_sources-mongodb)
|
||||
- [Redis](#dicts-external_dicts_dict_sources-redis)
|
||||
- [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql)
|
||||
|
||||
## Локальный файл {#dicts-external_dicts_dict_sources-local_file}
|
||||
|
||||
|
@ -240,3 +240,53 @@ SELECT bitCount(333);
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
## bitHammingDistance {#bithammingdistance}
|
||||
|
||||
Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между битовыми представлениями двух целых чисел. Может быть использовано с функциями [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) для проверки двух строк на схожесть. Чем меньше расстояние, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
bitHammingDistance(int1, int2)
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `int1` — первое целое число. [Int64](../../sql-reference/data-types/int-uint.md).
|
||||
- `int2` — второе целое число. [Int64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Расстояние Хэмминга.
|
||||
|
||||
Тип: [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Примеры**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT bitHammingDistance(111, 121);
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─bitHammingDistance(111, 121)─┐
|
||||
│ 3 │
|
||||
└──────────────────────────────┘
|
||||
```
|
||||
|
||||
Используя [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash):
|
||||
|
||||
``` sql
|
||||
SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'));
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐
|
||||
│ 5 │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
@ -7,6 +7,8 @@ toc_title: "Функции хэширования"
|
||||
|
||||
Функции хэширования могут использоваться для детерминированного псевдослучайного разбрасывания элементов.
|
||||
|
||||
Simhash – это хеш-функция, которая для близких значений возвращает близкий хеш.
|
||||
|
||||
## halfMD5 {#hash-functions-halfmd5}
|
||||
|
||||
[Интерпретирует](../../sql-reference/functions/hash-functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш [MD5](https://ru.wikipedia.org/wiki/MD5) для каждой из них. Затем объединяет хэши, берет первые 8 байт хэша результирующей строки и интерпретирует их как значение типа `UInt64` с big-endian порядком байтов.
|
||||
@ -484,3 +486,937 @@ SELECT xxHash32('Hello, world!');
|
||||
|
||||
- [xxHash](http://cyan4973.github.io/xxHash/).
|
||||
|
||||
## ngramSimHash {#ngramsimhash}
|
||||
|
||||
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramSimHash(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHash('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 1627567969 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive}
|
||||
|
||||
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramSimHashCaseInsensitive(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌──────Hash─┐
|
||||
│ 562180645 │
|
||||
└───────────┘
|
||||
```
|
||||
|
||||
## ngramSimHashUTF8 {#ngramsimhashutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramSimHashUTF8(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHashUTF8('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 1628157797 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramSimHashCaseInsensitiveUTF8(string[, ngramsize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 1636742693 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHash {#wordshinglesimhash}
|
||||
|
||||
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHash(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2328277067 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive}
|
||||
|
||||
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHashCaseInsensitive(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2194812424 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHashUTF8 {#wordshinglesimhashutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHashUTF8(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2328277067 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Значение хеш-функции от строки.
|
||||
|
||||
Тип: [UInt64](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌───────Hash─┐
|
||||
│ 2194812424 │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHash {#ngramminhash}
|
||||
|
||||
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramMinHash(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHash('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (18333312859352735453,9054248444481805918) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive}
|
||||
|
||||
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramMinHashCaseInsensitive(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (2106263556442004574,13203602793651726206) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashUTF8 {#ngramminhashutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
``` sql
|
||||
ngramMinHashUTF8(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (18333312859352735453,6742163577938632877) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple───────────────────────────────────────┐
|
||||
│ (12493625717655877135,13203602793651726206) │
|
||||
└─────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArg {#ngramminhasharg}
|
||||
|
||||
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHash](#ngramminhash) с теми же входными данными. Функция регистрозависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArg(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArg('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive}
|
||||
|
||||
Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArgUTF8 {#ngramminhashargutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashUTF8](#ngramminhashutf8) с теми же входными данными. Функция регистрозависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArgUTF8(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────────────┐
|
||||
│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │
|
||||
└───────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHash {#wordshingleminhash}
|
||||
|
||||
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHash(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (16452112859864147620,5844417301642981317) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive}
|
||||
|
||||
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────┐
|
||||
│ (3065874883688416519,1634050779997673240) │
|
||||
└───────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashUTF8 {#wordshingleminhashutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────┐
|
||||
│ (16452112859864147620,5844417301642981317) │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая.
|
||||
|
||||
Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж с двумя хешами — минимальным и максимальным.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────┐
|
||||
│ (3065874883688416519,1634050779997673240) │
|
||||
└───────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArg {#wordshingleminhasharg}
|
||||
|
||||
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordshingleMinHash](#wordshingleminhash) с теми же входными данными. Функция регистрозависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArg(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────┐
|
||||
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive}
|
||||
|
||||
Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────────────────────────────────┐
|
||||
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
|
||||
└────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashUTF8](#wordshingleminhashutf8) с теми же входными данными. Функция регистрозависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArgUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple─────────────────────────────────────────────────────────────────┐
|
||||
│ (('OLAP','database','analytical'),('online','oriented','processing')) │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8}
|
||||
|
||||
Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum])
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `string` — строка. [String](../../sql-reference/data-types/string.md).
|
||||
- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов.
|
||||
|
||||
Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))).
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─Tuple──────────────────────────────────────────────────────────────────┐
|
||||
│ (('queries','database','analytical'),('oriented','processing','DBMS')) │
|
||||
└────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
@ -111,3 +111,55 @@ SELECT untuple((* EXCEPT (v2, v3),)) FROM kv;
|
||||
|
||||
- [Tuple](../../sql-reference/data-types/tuple.md)
|
||||
|
||||
## tupleHammingDistance {#tuplehammingdistance}
|
||||
|
||||
Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между двумя кортежами одинакового размера.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
tupleHammingDistance(tuple1, tuple2)
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `tuple1` — первый кортеж. [Tuple](../../sql-reference/data-types/tuple.md).
|
||||
- `tuple2` — второй кортеж. [Tuple](../../sql-reference/data-types/tuple.md).
|
||||
|
||||
Кортежи должны иметь одинаковый размер и тип элементов.
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Расстояние Хэмминга.
|
||||
|
||||
Тип: [UInt8](../../sql-reference/data-types/int-uint.md).
|
||||
|
||||
**Примеры**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance;
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─HammingDistance─┐
|
||||
│ 2 │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
||||
Может быть использовано с функциями [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) для проверки строк на совпадение:
|
||||
|
||||
``` sql
|
||||
SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string);
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
``` text
|
||||
┌─HammingDistance─┐
|
||||
│ 2 │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
@ -117,7 +117,7 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] [AFTER name_after | F
|
||||
|
||||
- TTL
|
||||
|
||||
Примеры изменения TTL столбца смотрите в разделе [TTL столбца](ttl.md#mergetree-column-ttl).
|
||||
Примеры изменения TTL столбца смотрите в разделе [TTL столбца](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl).
|
||||
|
||||
Если указано `IF EXISTS`, запрос не возвращает ошибку, если столбца не существует.
|
||||
|
||||
|
@ -5,43 +5,46 @@ toc_title: postgresql
|
||||
|
||||
# postgresql {#postgresql}
|
||||
|
||||
Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом PostgreSQL сервере.
|
||||
Позволяет выполнять запросы `SELECT` и `INSERT` над таблицами удаленной БД PostgreSQL.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
postgresql('host:port', 'database', 'table', 'user', 'password')
|
||||
postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`])
|
||||
```
|
||||
|
||||
**Параметры**
|
||||
**Аргументы**
|
||||
|
||||
- `host:port` — адрес сервера PostgreSQL.
|
||||
|
||||
- `database` — имя базы данных на удалённом сервере.
|
||||
|
||||
- `table` — имя таблицы на удалённом сервере.
|
||||
|
||||
- `user` — пользователь PostgreSQL.
|
||||
|
||||
- `password` — пароль пользователя.
|
||||
|
||||
|
||||
SELECT запросы на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого `SELECT` запроса.
|
||||
|
||||
Простые условия для `WHERE` такие как `=, !=, >, >=, <, <=, IN` исполняются на стороне PostgreSQL сервера.
|
||||
|
||||
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился.
|
||||
|
||||
INSERT запросы на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого `INSERT` запроса.
|
||||
|
||||
PostgreSQL массивы конвертируются в массивы ClickHouse.
|
||||
Будьте осторожны в PostgreSQL массивы созданные как type_name[], являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы, внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
|
||||
- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент.
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
Объект таблицы с теми же столбцами, что и в исходной таблице PostgreSQL.
|
||||
Таблица с теми же столбцами, что и в исходной таблице PostgreSQL.
|
||||
|
||||
!!! info "Примечание"
|
||||
В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. See examples below.
|
||||
В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже.
|
||||
|
||||
## Особенности реализации {#implementation-details}
|
||||
|
||||
Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`.
|
||||
|
||||
Простые условия для `WHERE` такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN` исполняются на стороне PostgreSQL сервера.
|
||||
|
||||
Все операции объединения, аггрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того как запрос к PostgreSQL закончился.
|
||||
|
||||
Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`.
|
||||
|
||||
PostgreSQL массивы конвертируются в массивы ClickHouse.
|
||||
|
||||
!!! info "Примечание"
|
||||
Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы.
|
||||
|
||||
При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`.
|
||||
|
||||
**Примеры**
|
||||
|
||||
@ -58,10 +61,10 @@ PRIMARY KEY (int_id));
|
||||
|
||||
CREATE TABLE
|
||||
|
||||
postgres=# insert into test (int_id, str, "float") VALUES (1,'test',2);
|
||||
postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2);
|
||||
INSERT 0 1
|
||||
|
||||
postgresql> select * from test;
|
||||
postgresql> SELECT * FROM test;
|
||||
int_id | int_nullable | float | str | float_nullable
|
||||
--------+--------------+-------+------+----------------
|
||||
1 | | 2 | test |
|
||||
@ -80,7 +83,7 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p
|
||||
└────────┴──────────────┴───────┴──────┴────────────────┘
|
||||
```
|
||||
|
||||
Вставка:
|
||||
Вставка данных:
|
||||
|
||||
```sql
|
||||
INSERT INTO TABLE FUNCTION postgresql('localhost:5432', 'test', 'test', 'postgrsql_user', 'password') (int_id, float) VALUES (2, 3);
|
||||
@ -94,7 +97,24 @@ SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'p
|
||||
└────────┴──────────────┴───────┴──────┴────────────────┘
|
||||
```
|
||||
|
||||
**Смотрите также**
|
||||
Using Non-default Schema:
|
||||
|
||||
- [Движок таблиц ‘PostgreSQL’](../../sql-reference/table-functions/postgresql.md)
|
||||
```text
|
||||
postgres=# CREATE SCHEMA "nice.schema";
|
||||
|
||||
postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer);
|
||||
|
||||
postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i)
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE pg_table_schema_with_dots (a UInt32)
|
||||
ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema');
|
||||
```
|
||||
|
||||
**См. также**
|
||||
|
||||
- [Движок таблиц PostgreSQL](../../sql-reference/table-functions/postgresql.md)
|
||||
- [Использование PostgreSQL как источника данных для внешнего словаря](../../sql-reference/table-functions/postgresql.md#dicts-external_dicts_dict_sources-postgresql)
|
||||
|
||||
[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/postgresql/) <!--hide-->
|
||||
|
@ -42,11 +42,16 @@ if (OS_LINUX)
|
||||
set(RESOURCE_OBJS ${RESOURCE_OBJS} ${RESOURCE_OBJ})
|
||||
|
||||
# https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake
|
||||
add_custom_command(OUTPUT ${RESOURCE_OBJ}
|
||||
COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}
|
||||
COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ})
|
||||
|
||||
# PPC64LE fails to do this with objcopy, use ld or lld instead
|
||||
if (ARCH_PPC64LE)
|
||||
add_custom_command(OUTPUT ${RESOURCE_OBJ}
|
||||
COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${RESOURCE_FILE})
|
||||
else()
|
||||
add_custom_command(OUTPUT ${RESOURCE_OBJ}
|
||||
COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}
|
||||
COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ})
|
||||
endif()
|
||||
set_source_files_properties(${RESOURCE_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true)
|
||||
endforeach(RESOURCE_FILE)
|
||||
|
||||
|
@ -521,14 +521,17 @@ void HedgedConnections::processNewReplicaState(HedgedConnectionsFactory::State s
|
||||
|
||||
void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect)
|
||||
{
|
||||
/// It's important to remove file descriptor from epoll exactly before cancelling packet_receiver,
|
||||
/// because otherwise another thread can try to receive a packet, get this file descriptor
|
||||
/// from epoll and resume cancelled packet_receiver.
|
||||
epoll.remove(replica.packet_receiver->getFileDescriptor());
|
||||
epoll.remove(replica.change_replica_timeout.getDescriptor());
|
||||
|
||||
replica.packet_receiver->cancel();
|
||||
replica.change_replica_timeout.reset();
|
||||
|
||||
epoll.remove(replica.packet_receiver->getFileDescriptor());
|
||||
--offset_states[fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset].active_connection_count;
|
||||
fd_to_replica_location.erase(replica.packet_receiver->getFileDescriptor());
|
||||
|
||||
epoll.remove(replica.change_replica_timeout.getDescriptor());
|
||||
timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor());
|
||||
|
||||
--active_connection_count;
|
||||
|
@ -48,7 +48,7 @@ struct HashMapCell
|
||||
|
||||
value_type value;
|
||||
|
||||
HashMapCell() {}
|
||||
HashMapCell() = default;
|
||||
HashMapCell(const Key & key_, const State &) : value(key_, NoInitTag()) {}
|
||||
HashMapCell(const value_type & value_, const State &) : value(value_) {}
|
||||
|
||||
@ -114,8 +114,39 @@ struct HashMapCell
|
||||
|
||||
static void move(HashMapCell * /* old_location */, HashMapCell * /* new_location */) {}
|
||||
|
||||
template <size_t I>
|
||||
auto & get() & {
|
||||
if constexpr (I == 0) return value.first;
|
||||
else if constexpr (I == 1) return value.second;
|
||||
}
|
||||
|
||||
template <size_t I>
|
||||
auto const & get() const & {
|
||||
if constexpr (I == 0) return value.first;
|
||||
else if constexpr (I == 1) return value.second;
|
||||
}
|
||||
|
||||
template <size_t I>
|
||||
auto && get() && {
|
||||
if constexpr (I == 0) return std::move(value.first);
|
||||
else if constexpr (I == 1) return std::move(value.second);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
namespace std
|
||||
{
|
||||
|
||||
template <typename Key, typename TMapped, typename Hash, typename TState>
|
||||
struct tuple_size<HashMapCell<Key, TMapped, Hash, TState>> : std::integral_constant<size_t, 2> { };
|
||||
|
||||
template <typename Key, typename TMapped, typename Hash, typename TState>
|
||||
struct tuple_element<0, HashMapCell<Key, TMapped, Hash, TState>> { using type = Key; };
|
||||
|
||||
template <typename Key, typename TMapped, typename Hash, typename TState>
|
||||
struct tuple_element<1, HashMapCell<Key, TMapped, Hash, TState>> { using type = TMapped; };
|
||||
}
|
||||
|
||||
template <typename Key, typename TMapped, typename Hash, typename TState = HashTableNoState>
|
||||
struct HashMapCellWithSavedHash : public HashMapCell<Key, TMapped, Hash, TState>
|
||||
{
|
||||
@ -227,6 +258,19 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
namespace std
|
||||
{
|
||||
|
||||
template <typename Key, typename TMapped, typename Hash, typename TState>
|
||||
struct tuple_size<HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> : std::integral_constant<size_t, 2> { };
|
||||
|
||||
template <typename Key, typename TMapped, typename Hash, typename TState>
|
||||
struct tuple_element<0, HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> { using type = Key; };
|
||||
|
||||
template <typename Key, typename TMapped, typename Hash, typename TState>
|
||||
struct tuple_element<1, HashMapCellWithSavedHash<Key, TMapped, Hash, TState>> { using type = TMapped; };
|
||||
}
|
||||
|
||||
|
||||
template <
|
||||
typename Key,
|
||||
|
@ -530,6 +530,31 @@ public:
|
||||
this->c_end += bytes_to_copy;
|
||||
}
|
||||
|
||||
template <typename ... TAllocatorParams>
|
||||
void insertFromItself(iterator from_begin, iterator from_end, TAllocatorParams && ... allocator_params)
|
||||
{
|
||||
static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
|
||||
|
||||
/// Convert iterators to indexes because reserve can invalidate iterators
|
||||
size_t start_index = from_begin - begin();
|
||||
size_t end_index = from_end - begin();
|
||||
size_t copy_size = end_index - start_index;
|
||||
|
||||
assert(start_index <= end_index);
|
||||
|
||||
size_t required_capacity = this->size() + copy_size;
|
||||
if (required_capacity > this->capacity())
|
||||
this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward<TAllocatorParams>(allocator_params)...);
|
||||
|
||||
size_t bytes_to_copy = this->byte_size(copy_size);
|
||||
if (bytes_to_copy)
|
||||
{
|
||||
auto begin = this->c_start + this->byte_size(start_index);
|
||||
memcpy(this->c_end, reinterpret_cast<const void *>(&*begin), bytes_to_copy);
|
||||
this->c_end += bytes_to_copy;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename It1, typename It2>
|
||||
void insert_assume_reserved(It1 from_begin, It2 from_end)
|
||||
{
|
||||
|
@ -35,7 +35,7 @@ std::string signalToErrorMessage(int sig, const siginfo_t & info, const ucontext
|
||||
else
|
||||
error << "Address: " << info.si_addr;
|
||||
|
||||
#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__)
|
||||
#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) && !defined(__powerpc__)
|
||||
auto err_mask = context.uc_mcontext.gregs[REG_ERR];
|
||||
if ((err_mask & 0x02))
|
||||
error << " Access: write.";
|
||||
@ -186,6 +186,8 @@ static void * getCallerAddress(const ucontext_t & context)
|
||||
# endif
|
||||
#elif defined(__aarch64__)
|
||||
return reinterpret_cast<void *>(context.uc_mcontext.pc);
|
||||
#elif defined(__powerpc64__)
|
||||
return reinterpret_cast<void *>(context.uc_mcontext.gp_regs[PT_NIP]);
|
||||
#else
|
||||
return nullptr;
|
||||
#endif
|
||||
|
@ -33,6 +33,19 @@ TEST(Common, PODArrayInsert)
|
||||
EXPECT_EQ(str, std::string(chars.data(), chars.size()));
|
||||
}
|
||||
|
||||
TEST(Common, PODArrayInsertFromItself)
|
||||
{
|
||||
{
|
||||
PaddedPODArray<UInt64> array { 1 };
|
||||
|
||||
for (size_t i = 0; i < 3; ++i)
|
||||
array.insertFromItself(array.begin(), array.end());
|
||||
|
||||
PaddedPODArray<UInt64> expected {1,1,1,1,1,1,1,1};
|
||||
ASSERT_EQ(array,expected);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(Common, PODPushBackRawMany)
|
||||
{
|
||||
PODArray<char> chars;
|
||||
|
@ -41,7 +41,7 @@ void NativeBlockOutputStream::flush()
|
||||
}
|
||||
|
||||
|
||||
void NativeBlockOutputStream::writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit)
|
||||
static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit)
|
||||
{
|
||||
/** If there are columns-constants - then we materialize them.
|
||||
* (Since the data type does not know how to serialize / deserialize constants.)
|
||||
|
@ -30,8 +30,6 @@ public:
|
||||
void write(const Block & block) override;
|
||||
void flush() override;
|
||||
|
||||
static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit);
|
||||
|
||||
String getContentType() const override { return "application/octet-stream"; }
|
||||
|
||||
private:
|
||||
|
@ -567,7 +567,7 @@ void DatabaseAtomic::renameDictionaryInMemoryUnlocked(const StorageID & old_name
|
||||
auto result = external_loader.getLoadResult(toString(old_name.uuid));
|
||||
if (!result.object)
|
||||
return;
|
||||
const auto & dict = dynamic_cast<const IDictionaryBase &>(*result.object);
|
||||
const auto & dict = dynamic_cast<const IDictionary &>(*result.object);
|
||||
dict.updateDictionaryName(new_name);
|
||||
}
|
||||
void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid)
|
||||
|
@ -49,7 +49,7 @@ void DatabaseWithDictionaries::attachDictionary(const String & dictionary_name,
|
||||
/// Attach the dictionary as table too.
|
||||
try
|
||||
{
|
||||
/// TODO Make StorageDictionary an owner of IDictionaryBase objects.
|
||||
/// TODO Make StorageDictionary an owner of IDictionary objects.
|
||||
/// All DDL operations with dictionaries will work with StorageDictionary table,
|
||||
/// and StorageDictionary will be responsible for loading of DDL dictionaries.
|
||||
/// ExternalLoaderDatabaseConfigRepository and other hacks related to ExternalLoader
|
||||
|
@ -13,7 +13,9 @@
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
#include <Common/ProfileEvents.h>
|
||||
#include <Common/ProfilingScopedRWLock.h>
|
||||
|
||||
#include <Dictionaries/DictionaryBlockInputStream.h>
|
||||
#include <Dictionaries/HierarchyDictionariesUtils.h>
|
||||
|
||||
namespace ProfileEvents
|
||||
{
|
||||
@ -39,7 +41,6 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int CACHE_DICTIONARY_UPDATE_FAIL;
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int UNSUPPORTED_METHOD;
|
||||
}
|
||||
|
||||
@ -70,8 +71,6 @@ CacheDictionary<dictionary_key_type>::CacheDictionary(
|
||||
{
|
||||
if (!source_ptr->supportsSelectiveLoad())
|
||||
throw Exception{full_name + ": source cannot be used with CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD};
|
||||
|
||||
setupHierarchicalAttribute();
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
@ -120,164 +119,6 @@ const IDictionarySource * CacheDictionary<dictionary_key_type>::getSource() cons
|
||||
return source_ptr.get();
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void CacheDictionary<dictionary_key_type>::toParent(const PaddedPODArray<UInt64> & ids [[maybe_unused]], PaddedPODArray<UInt64> & out [[maybe_unused]]) const
|
||||
{
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
/// Run update on requested keys before fetch from storage
|
||||
const auto & attribute_name = hierarchical_attribute->name;
|
||||
|
||||
auto result_type = std::make_shared<DataTypeUInt64>();
|
||||
auto input_column = result_type->createColumn();
|
||||
auto & input_column_typed = assert_cast<ColumnVector<UInt64> &>(*input_column);
|
||||
auto & data = input_column_typed.getData();
|
||||
data.insert(ids.begin(), ids.end());
|
||||
|
||||
auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr});
|
||||
const auto & result_column_typed = assert_cast<const ColumnVector<UInt64> &>(*column);
|
||||
const auto & result_data = result_column_typed.getData();
|
||||
|
||||
out.assign(result_data);
|
||||
}
|
||||
else
|
||||
throw Exception("Hierarchy is not supported for complex key CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD);
|
||||
}
|
||||
|
||||
|
||||
/// Allow to use single value in same way as array.
|
||||
static inline UInt64 getAt(const PaddedPODArray<UInt64> & arr, const size_t idx)
|
||||
{
|
||||
return arr[idx];
|
||||
}
|
||||
static inline UInt64 getAt(const UInt64 & value, const size_t)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
template <typename AncestorType>
|
||||
void CacheDictionary<dictionary_key_type>::isInImpl(const PaddedPODArray<Key> & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
/// Transform all children to parents until ancestor id or null_value will be reached.
|
||||
|
||||
size_t out_size = out.size();
|
||||
memset(out.data(), 0xFF, out_size); /// 0xFF means "not calculated"
|
||||
|
||||
const auto null_value = hierarchical_attribute->null_value.get<UInt64>();
|
||||
|
||||
PaddedPODArray<Key> children(out_size, 0);
|
||||
PaddedPODArray<Key> parents(child_ids.begin(), child_ids.end());
|
||||
|
||||
for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
|
||||
{
|
||||
size_t out_idx = 0;
|
||||
size_t parents_idx = 0;
|
||||
size_t new_children_idx = 0;
|
||||
|
||||
while (out_idx < out_size)
|
||||
{
|
||||
/// Already calculated
|
||||
if (out[out_idx] != 0xFF)
|
||||
{
|
||||
++out_idx;
|
||||
continue;
|
||||
}
|
||||
|
||||
/// No parent
|
||||
if (parents[parents_idx] == null_value)
|
||||
{
|
||||
out[out_idx] = 0;
|
||||
}
|
||||
/// Found ancestor
|
||||
else if (parents[parents_idx] == getAt(ancestor_ids, parents_idx))
|
||||
{
|
||||
out[out_idx] = 1;
|
||||
}
|
||||
/// Loop detected
|
||||
else if (children[new_children_idx] == parents[parents_idx])
|
||||
{
|
||||
out[out_idx] = 1;
|
||||
}
|
||||
/// Found intermediate parent, add this value to search at next loop iteration
|
||||
else
|
||||
{
|
||||
children[new_children_idx] = parents[parents_idx];
|
||||
++new_children_idx;
|
||||
}
|
||||
|
||||
++out_idx;
|
||||
++parents_idx;
|
||||
}
|
||||
|
||||
if (new_children_idx == 0)
|
||||
break;
|
||||
|
||||
/// Transform all children to its parents.
|
||||
children.resize(new_children_idx);
|
||||
parents.resize(new_children_idx);
|
||||
|
||||
toParent(children, parents);
|
||||
}
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void CacheDictionary<dictionary_key_type>::isInVectorVector(
|
||||
const PaddedPODArray<UInt64> & child_ids, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_ids, ancestor_ids, out);
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void CacheDictionary<dictionary_key_type>::isInVectorConstant(const PaddedPODArray<UInt64> & child_ids, const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_ids, ancestor_id, out);
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void CacheDictionary<dictionary_key_type>::isInConstantVector(const UInt64 child_id, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
/// Special case with single child value.
|
||||
|
||||
const auto null_value = hierarchical_attribute->null_value.get<UInt64>();
|
||||
|
||||
PaddedPODArray<Key> child(1, child_id);
|
||||
PaddedPODArray<Key> parent(1);
|
||||
std::vector<Key> ancestors(1, child_id);
|
||||
|
||||
/// Iteratively find all ancestors for child.
|
||||
for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
|
||||
{
|
||||
toParent(child, parent);
|
||||
|
||||
if (parent[0] == null_value)
|
||||
break;
|
||||
|
||||
child[0] = parent[0];
|
||||
ancestors.push_back(parent[0]);
|
||||
}
|
||||
|
||||
/// Assuming short hierarchy, so linear search is Ok.
|
||||
for (size_t i = 0, out_size = out.size(); i < out_size; ++i)
|
||||
out[i] = std::find(ancestors.begin(), ancestors.end(), ancestor_ids[i]) != ancestors.end();
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void CacheDictionary<dictionary_key_type>::setupHierarchicalAttribute()
|
||||
{
|
||||
/// TODO: Move this to DictionaryStructure
|
||||
for (const auto & attribute : dict_struct.attributes)
|
||||
{
|
||||
if (attribute.hierarchical)
|
||||
{
|
||||
hierarchical_attribute = &attribute;
|
||||
|
||||
if (attribute.underlying_type != AttributeUnderlyingType::utUInt64)
|
||||
throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
ColumnPtr CacheDictionary<dictionary_key_type>::getColumn(
|
||||
const std::string & attribute_name,
|
||||
@ -296,23 +137,6 @@ Columns CacheDictionary<dictionary_key_type>::getColumns(
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types,
|
||||
const Columns & default_values_columns) const
|
||||
{
|
||||
if (dictionary_key_type == DictionaryKeyType::complex)
|
||||
dict_struct.validateKeyTypes(key_types);
|
||||
|
||||
Arena complex_keys_arena;
|
||||
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, complex_keys_arena);
|
||||
auto & keys = extractor.getKeys();
|
||||
|
||||
return getColumnsImpl(attribute_names, key_columns, keys, default_values_columns);
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
Columns CacheDictionary<dictionary_key_type>::getColumnsImpl(
|
||||
const Strings & attribute_names,
|
||||
const Columns & key_columns,
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
const Columns & default_values_columns) const
|
||||
{
|
||||
/**
|
||||
* Flow of getColumsImpl
|
||||
@ -328,6 +152,13 @@ Columns CacheDictionary<dictionary_key_type>::getColumnsImpl(
|
||||
* use default value.
|
||||
*/
|
||||
|
||||
if (dictionary_key_type == DictionaryKeyType::complex)
|
||||
dict_struct.validateKeyTypes(key_types);
|
||||
|
||||
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
|
||||
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
|
||||
auto keys = extractor.extractAllKeys();
|
||||
|
||||
DictionaryStorageFetchRequest request(dict_struct, attribute_names, default_values_columns);
|
||||
|
||||
FetchResult result_of_fetch_from_storage;
|
||||
@ -440,9 +271,10 @@ ColumnUInt8::Ptr CacheDictionary<dictionary_key_type>::hasKeys(const Columns & k
|
||||
if (dictionary_key_type == DictionaryKeyType::complex)
|
||||
dict_struct.validateKeyTypes(key_types);
|
||||
|
||||
Arena complex_keys_arena;
|
||||
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, complex_keys_arena);
|
||||
const auto & keys = extractor.getKeys();
|
||||
|
||||
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
|
||||
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
|
||||
const auto keys = extractor.extractAllKeys();
|
||||
|
||||
/// We make empty request just to fetch if keys exists
|
||||
DictionaryStorageFetchRequest request(dict_struct, {}, {});
|
||||
@ -526,6 +358,37 @@ ColumnUInt8::Ptr CacheDictionary<dictionary_key_type>::hasKeys(const Columns & k
|
||||
return result;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
ColumnPtr CacheDictionary<dictionary_key_type>::getHierarchy(
|
||||
ColumnPtr key_column [[maybe_unused]],
|
||||
const DataTypePtr & key_type [[maybe_unused]]) const
|
||||
{
|
||||
if (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type);
|
||||
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
|
||||
return result;
|
||||
}
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
ColumnUInt8::Ptr CacheDictionary<dictionary_key_type>::isInHierarchy(
|
||||
ColumnPtr key_column [[maybe_unused]],
|
||||
ColumnPtr in_key_column [[maybe_unused]],
|
||||
const DataTypePtr & key_type [[maybe_unused]]) const
|
||||
{
|
||||
if (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type);
|
||||
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
|
||||
return result;
|
||||
}
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
MutableColumns CacheDictionary<dictionary_key_type>::aggregateColumnsInOrderOfKeys(
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
@ -618,19 +481,18 @@ MutableColumns CacheDictionary<dictionary_key_type>::aggregateColumns(
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
BlockInputStreamPtr CacheDictionary<dictionary_key_type>::getBlockInputStream(const Names & column_names, size_t max_block_size) const
|
||||
{
|
||||
using BlockInputStreamType = DictionaryBlockInputStream<Key>;
|
||||
std::shared_ptr<BlockInputStreamType> stream;
|
||||
std::shared_ptr<DictionaryBlockInputStream> stream;
|
||||
|
||||
{
|
||||
/// Write lock on storage
|
||||
const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
|
||||
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
|
||||
stream = std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names);
|
||||
stream = std::make_shared<DictionaryBlockInputStream>(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names);
|
||||
else
|
||||
{
|
||||
auto keys = cache_storage_ptr->getCachedComplexKeys();
|
||||
stream = std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, keys, column_names);
|
||||
stream = std::make_shared<DictionaryBlockInputStream>(shared_from_this(), max_block_size, keys, column_names);
|
||||
}
|
||||
}
|
||||
|
||||
@ -660,14 +522,20 @@ void CacheDictionary<dictionary_key_type>::update(CacheDictionaryUpdateUnitPtr<d
|
||||
|
||||
size_t found_keys_size = 0;
|
||||
|
||||
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(update_unit_ptr->key_columns, update_unit_ptr->complex_key_arena);
|
||||
const auto & requested_keys = requested_keys_extractor.getKeys();
|
||||
Arena * complex_key_arena = update_unit_ptr->complex_keys_arena_holder.getComplexKeyArena();
|
||||
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(update_unit_ptr->key_columns, complex_key_arena);
|
||||
auto requested_keys = requested_keys_extractor.extractAllKeys();
|
||||
|
||||
HashSet<KeyType> not_found_keys;
|
||||
|
||||
std::vector<UInt64> requested_keys_vector;
|
||||
std::vector<size_t> requested_complex_key_rows;
|
||||
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
|
||||
requested_keys_vector.reserve(requested_keys.size());
|
||||
else
|
||||
requested_complex_key_rows.reserve(requested_keys.size());
|
||||
|
||||
auto & key_index_to_state_from_storage = update_unit_ptr->key_index_to_state;
|
||||
|
||||
for (size_t i = 0; i < key_index_to_state_from_storage.size(); ++i)
|
||||
@ -727,8 +595,8 @@ void CacheDictionary<dictionary_key_type>::update(CacheDictionaryUpdateUnitPtr<d
|
||||
block_columns.erase(block_columns.begin());
|
||||
}
|
||||
|
||||
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, update_unit_ptr->complex_key_arena);
|
||||
const auto & keys_extracted_from_block = keys_extractor.getKeys();
|
||||
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, complex_key_arena);
|
||||
auto keys_extracted_from_block = keys_extractor.extractAllKeys();
|
||||
|
||||
for (size_t index_of_attribute = 0; index_of_attribute < fetched_columns_during_update.size(); ++index_of_attribute)
|
||||
{
|
||||
@ -740,6 +608,7 @@ void CacheDictionary<dictionary_key_type>::update(CacheDictionaryUpdateUnitPtr<d
|
||||
for (size_t i = 0; i < keys_extracted_from_block.size(); ++i)
|
||||
{
|
||||
auto fetched_key_from_source = keys_extracted_from_block[i];
|
||||
|
||||
not_found_keys.erase(fetched_key_from_source);
|
||||
update_unit_ptr->requested_keys_to_fetched_columns_during_update_index[fetched_key_from_source] = found_keys_size;
|
||||
found_keys_in_source.emplace_back(fetched_key_from_source);
|
||||
|
@ -130,33 +130,18 @@ public:
|
||||
|
||||
std::exception_ptr getLastException() const override;
|
||||
|
||||
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && hierarchical_attribute; }
|
||||
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); }
|
||||
|
||||
void toParent(const PaddedPODArray<UInt64> & ids, PaddedPODArray<UInt64> & out) const override;
|
||||
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override;
|
||||
|
||||
void isInVectorVector(
|
||||
const PaddedPODArray<UInt64> & child_ids,
|
||||
const PaddedPODArray<UInt64> & ancestor_ids,
|
||||
PaddedPODArray<UInt8> & out) const override;
|
||||
|
||||
void isInVectorConstant(
|
||||
const PaddedPODArray<UInt64> & child_ids,
|
||||
const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const override;
|
||||
|
||||
void isInConstantVector(
|
||||
const UInt64 child_id,
|
||||
const PaddedPODArray<UInt64> & ancestor_ids,
|
||||
PaddedPODArray<UInt8> & out) const override;
|
||||
ColumnUInt8::Ptr isInHierarchy(
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr & key_type) const override;
|
||||
|
||||
private:
|
||||
using FetchResult = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, SimpleKeysStorageFetchResult, ComplexKeysStorageFetchResult>;
|
||||
|
||||
Columns getColumnsImpl(
|
||||
const Strings & attribute_names,
|
||||
const Columns & key_columns,
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
const Columns & default_values_columns) const;
|
||||
|
||||
static MutableColumns aggregateColumnsInOrderOfKeys(
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
const DictionaryStorageFetchRequest & request,
|
||||
@ -171,8 +156,6 @@ private:
|
||||
const MutableColumns & fetched_columns_during_update,
|
||||
const HashMap<KeyType, size_t> & found_keys_to_fetched_columns_during_update_index);
|
||||
|
||||
void setupHierarchicalAttribute();
|
||||
|
||||
void update(CacheDictionaryUpdateUnitPtr<dictionary_key_type> update_unit_ptr);
|
||||
|
||||
/// Update dictionary source pointer if required and return it. Thread safe.
|
||||
@ -193,9 +176,6 @@ private:
|
||||
return source_ptr;
|
||||
}
|
||||
|
||||
template <typename AncestorType>
|
||||
void isInImpl(const PaddedPODArray<Key> & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
|
||||
|
||||
const DictionaryStructure dict_struct;
|
||||
|
||||
/// Dictionary source should be used with mutex
|
||||
@ -218,8 +198,6 @@ private:
|
||||
/// readers. Surprisingly this lock is also used for last_exception pointer.
|
||||
mutable std::shared_mutex rw_lock;
|
||||
|
||||
const DictionaryAttribute * hierarchical_attribute = nullptr;
|
||||
|
||||
mutable std::exception_ptr last_exception;
|
||||
mutable std::atomic<size_t> error_count {0};
|
||||
mutable std::atomic<std::chrono::system_clock::time_point> backoff_end_time{std::chrono::system_clock::time_point{}};
|
||||
|
@ -66,8 +66,9 @@ public:
|
||||
|
||||
HashMap<KeyType, size_t> requested_keys_to_fetched_columns_during_update_index;
|
||||
MutableColumns fetched_columns_during_update;
|
||||
|
||||
/// Complex keys are serialized in this arena
|
||||
Arena complex_key_arena;
|
||||
DictionaryKeysArenaHolder<dictionary_key_type> complex_keys_arena_holder;
|
||||
|
||||
private:
|
||||
template <DictionaryKeyType>
|
||||
|
@ -1,594 +0,0 @@
|
||||
#include "ComplexKeyHashedDictionary.h"
|
||||
#include <ext/map.h>
|
||||
#include <ext/range.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include "DictionaryBlockInputStream.h"
|
||||
#include "DictionaryFactory.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int DICTIONARY_IS_EMPTY;
|
||||
}
|
||||
|
||||
ComplexKeyHashedDictionary::ComplexKeyHashedDictionary(
|
||||
const StorageID & dict_id_,
|
||||
const DictionaryStructure & dict_struct_,
|
||||
DictionarySourcePtr source_ptr_,
|
||||
const DictionaryLifetime dict_lifetime_,
|
||||
bool require_nonempty_,
|
||||
BlockPtr saved_block_)
|
||||
: IDictionaryBase(dict_id_)
|
||||
, dict_struct(dict_struct_)
|
||||
, source_ptr{std::move(source_ptr_)}
|
||||
, dict_lifetime(dict_lifetime_)
|
||||
, require_nonempty(require_nonempty_)
|
||||
, saved_block{std::move(saved_block_)}
|
||||
{
|
||||
createAttributes();
|
||||
loadData();
|
||||
calculateBytesAllocated();
|
||||
}
|
||||
|
||||
ColumnPtr ComplexKeyHashedDictionary::getColumn(
|
||||
const std::string & attribute_name,
|
||||
const DataTypePtr & result_type,
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types,
|
||||
const ColumnPtr & default_values_column) const
|
||||
{
|
||||
dict_struct.validateKeyTypes(key_types);
|
||||
|
||||
ColumnPtr result;
|
||||
|
||||
const auto & attribute = getAttribute(attribute_name);
|
||||
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
|
||||
|
||||
auto keys_size = key_columns.front()->size();
|
||||
|
||||
ColumnUInt8::MutablePtr col_null_map_to;
|
||||
ColumnUInt8::Container * vec_null_map_to = nullptr;
|
||||
if (attribute.is_nullable)
|
||||
{
|
||||
col_null_map_to = ColumnUInt8::create(keys_size, false);
|
||||
vec_null_map_to = &col_null_map_to->getData();
|
||||
}
|
||||
|
||||
auto type_call = [&](const auto &dictionary_attribute_type)
|
||||
{
|
||||
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
|
||||
using AttributeType = typename Type::AttributeType;
|
||||
using ValueType = DictionaryValueType<AttributeType>;
|
||||
using ColumnProvider = DictionaryAttributeColumnProvider<AttributeType>;
|
||||
|
||||
const auto attribute_null_value = std::get<ValueType>(attribute.null_values);
|
||||
AttributeType null_value = static_cast<AttributeType>(attribute_null_value);
|
||||
DictionaryDefaultValueExtractor<AttributeType> default_value_extractor(std::move(null_value), default_values_column);
|
||||
|
||||
auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size);
|
||||
|
||||
if constexpr (std::is_same_v<AttributeType, String>)
|
||||
{
|
||||
auto * out = column.get();
|
||||
|
||||
getItemsImpl<StringRef, StringRef>(
|
||||
attribute,
|
||||
key_columns,
|
||||
[&](const size_t row, const StringRef value, bool is_null)
|
||||
{
|
||||
if (attribute.is_nullable)
|
||||
(*vec_null_map_to)[row] = is_null;
|
||||
|
||||
out->insertData(value.data, value.size);
|
||||
},
|
||||
default_value_extractor);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & out = column->getData();
|
||||
|
||||
getItemsImpl<AttributeType, AttributeType>(
|
||||
attribute,
|
||||
key_columns,
|
||||
[&](const size_t row, const auto value, bool is_null)
|
||||
{
|
||||
if (attribute.is_nullable)
|
||||
(*vec_null_map_to)[row] = is_null;
|
||||
|
||||
out[row] = value;
|
||||
},
|
||||
default_value_extractor);
|
||||
}
|
||||
|
||||
result = std::move(column);
|
||||
};
|
||||
|
||||
callOnDictionaryAttributeType(attribute.type, type_call);
|
||||
|
||||
if (attribute.is_nullable)
|
||||
{
|
||||
result = ColumnNullable::create(result, std::move(col_null_map_to));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ColumnUInt8::Ptr ComplexKeyHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
|
||||
{
|
||||
dict_struct.validateKeyTypes(key_types);
|
||||
|
||||
auto size = key_columns.front()->size();
|
||||
auto result = ColumnUInt8::create(size);
|
||||
auto& out = result->getData();
|
||||
|
||||
const auto & attribute = attributes.front();
|
||||
|
||||
auto type_call = [&](const auto & dictionary_attribute_type)
|
||||
{
|
||||
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
|
||||
using AttributeType = typename Type::AttributeType;
|
||||
using ValueType = DictionaryValueType<AttributeType>;
|
||||
|
||||
has<ValueType>(attribute, key_columns, out);
|
||||
};
|
||||
|
||||
callOnDictionaryAttributeType(attribute.type, type_call);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void ComplexKeyHashedDictionary::createAttributes()
|
||||
{
|
||||
const auto size = dict_struct.attributes.size();
|
||||
attributes.reserve(size);
|
||||
|
||||
for (const auto & attribute : dict_struct.attributes)
|
||||
{
|
||||
attribute_index_by_name.emplace(attribute.name, attributes.size());
|
||||
attributes.push_back(createAttribute(attribute, attribute.null_value));
|
||||
|
||||
if (attribute.hierarchical)
|
||||
throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
|
||||
ErrorCodes::TYPE_MISMATCH};
|
||||
}
|
||||
}
|
||||
|
||||
void ComplexKeyHashedDictionary::blockToAttributes(const Block & block)
|
||||
{
|
||||
/// created upfront to avoid excess allocations
|
||||
const auto keys_size = dict_struct.key->size();
|
||||
StringRefs keys(keys_size);
|
||||
|
||||
const auto attributes_size = attributes.size();
|
||||
const auto rows = block.rows();
|
||||
element_count += rows;
|
||||
|
||||
const auto key_column_ptrs = ext::map<Columns>(
|
||||
ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; });
|
||||
|
||||
const auto attribute_column_ptrs = ext::map<Columns>(ext::range(0, attributes_size), [&](const size_t attribute_idx)
|
||||
{
|
||||
return block.safeGetByPosition(keys_size + attribute_idx).column;
|
||||
});
|
||||
|
||||
for (const auto row_idx : ext::range(0, rows))
|
||||
{
|
||||
/// calculate key once per row
|
||||
const auto key = placeKeysInPool(row_idx, key_column_ptrs, keys, keys_pool);
|
||||
|
||||
auto should_rollback = false;
|
||||
|
||||
for (const auto attribute_idx : ext::range(0, attributes_size))
|
||||
{
|
||||
const auto & attribute_column = *attribute_column_ptrs[attribute_idx];
|
||||
auto & attribute = attributes[attribute_idx];
|
||||
const auto inserted = setAttributeValue(attribute, key, attribute_column[row_idx]);
|
||||
if (!inserted)
|
||||
should_rollback = true;
|
||||
}
|
||||
|
||||
/// @note on multiple equal keys the mapped value for the first one is stored
|
||||
if (should_rollback)
|
||||
keys_pool.rollback(key.size);
|
||||
}
|
||||
}
|
||||
|
||||
void ComplexKeyHashedDictionary::updateData()
|
||||
{
|
||||
/// created upfront to avoid excess allocations
|
||||
const auto keys_size = dict_struct.key->size();
|
||||
StringRefs keys(keys_size);
|
||||
|
||||
const auto attributes_size = attributes.size();
|
||||
|
||||
if (!saved_block || saved_block->rows() == 0)
|
||||
{
|
||||
auto stream = source_ptr->loadUpdatedAll();
|
||||
stream->readPrefix();
|
||||
|
||||
while (const auto block = stream->read())
|
||||
{
|
||||
/// We are using this method to keep saved data if input stream consists of multiple blocks
|
||||
if (!saved_block)
|
||||
saved_block = std::make_shared<DB::Block>(block.cloneEmpty());
|
||||
for (const auto attribute_idx : ext::range(0, keys_size + attributes_size))
|
||||
{
|
||||
const IColumn & update_column = *block.getByPosition(attribute_idx).column.get();
|
||||
MutableColumnPtr saved_column = saved_block->getByPosition(attribute_idx).column->assumeMutable();
|
||||
saved_column->insertRangeFrom(update_column, 0, update_column.size());
|
||||
}
|
||||
}
|
||||
stream->readSuffix();
|
||||
}
|
||||
else
|
||||
{
|
||||
auto stream = source_ptr->loadUpdatedAll();
|
||||
|
||||
stream->readPrefix();
|
||||
while (Block block = stream->read())
|
||||
{
|
||||
const auto saved_key_column_ptrs = ext::map<Columns>(
|
||||
ext::range(0, keys_size), [&](const size_t key_idx) { return saved_block->safeGetByPosition(key_idx).column; });
|
||||
|
||||
const auto update_key_column_ptrs = ext::map<Columns>(
|
||||
ext::range(0, keys_size), [&](const size_t key_idx) { return block.safeGetByPosition(key_idx).column; });
|
||||
|
||||
Arena temp_key_pool;
|
||||
ContainerType<std::vector<size_t>> update_key_hash;
|
||||
|
||||
for (size_t i = 0; i < block.rows(); ++i)
|
||||
{
|
||||
const auto u_key = placeKeysInPool(i, update_key_column_ptrs, keys, temp_key_pool);
|
||||
update_key_hash[u_key].push_back(i);
|
||||
}
|
||||
|
||||
const size_t rows = saved_block->rows();
|
||||
IColumn::Filter filter(rows);
|
||||
|
||||
for (size_t i = 0; i < saved_block->rows(); ++i)
|
||||
{
|
||||
const auto s_key = placeKeysInPool(i, saved_key_column_ptrs, keys, temp_key_pool);
|
||||
auto * it = update_key_hash.find(s_key);
|
||||
if (it)
|
||||
filter[i] = 0;
|
||||
else
|
||||
filter[i] = 1;
|
||||
}
|
||||
|
||||
auto block_columns = block.mutateColumns();
|
||||
for (const auto attribute_idx : ext::range(0, keys_size + attributes_size))
|
||||
{
|
||||
auto & column = saved_block->safeGetByPosition(attribute_idx).column;
|
||||
const auto & filtered_column = column->filter(filter, -1);
|
||||
|
||||
block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size());
|
||||
}
|
||||
|
||||
saved_block->setColumns(std::move(block_columns));
|
||||
}
|
||||
stream->readSuffix();
|
||||
}
|
||||
|
||||
if (saved_block)
|
||||
blockToAttributes(*saved_block.get());
|
||||
}
|
||||
|
||||
void ComplexKeyHashedDictionary::loadData()
|
||||
{
|
||||
if (!source_ptr->hasUpdateField())
|
||||
{
|
||||
auto stream = source_ptr->loadAll();
|
||||
stream->readPrefix();
|
||||
|
||||
while (const auto block = stream->read())
|
||||
blockToAttributes(block);
|
||||
|
||||
stream->readSuffix();
|
||||
}
|
||||
else
|
||||
updateData();
|
||||
|
||||
if (require_nonempty && 0 == element_count)
|
||||
throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute)
|
||||
{
|
||||
const auto & map_ref = std::get<ContainerType<T>>(attribute.maps);
|
||||
bytes_allocated += sizeof(ContainerType<T>) + map_ref.getBufferSizeInBytes();
|
||||
bucket_count = map_ref.getBufferSizeInCells();
|
||||
}
|
||||
|
||||
template <>
|
||||
void ComplexKeyHashedDictionary::addAttributeSize<String>(const Attribute & attribute)
|
||||
{
|
||||
const auto & map_ref = std::get<ContainerType<StringRef>>(attribute.maps);
|
||||
bytes_allocated += sizeof(ContainerType<StringRef>) + map_ref.getBufferSizeInBytes();
|
||||
bucket_count = map_ref.getBufferSizeInCells();
|
||||
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
|
||||
}
|
||||
|
||||
void ComplexKeyHashedDictionary::calculateBytesAllocated()
|
||||
{
|
||||
bytes_allocated += attributes.size() * sizeof(attributes.front());
|
||||
|
||||
for (const auto & attribute : attributes)
|
||||
{
|
||||
auto type_call = [&](const auto & dictionary_attribute_type)
|
||||
{
|
||||
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
|
||||
using AttributeType = typename Type::AttributeType;
|
||||
|
||||
addAttributeSize<AttributeType>(attribute);
|
||||
};
|
||||
|
||||
callOnDictionaryAttributeType(attribute.type, type_call);
|
||||
}
|
||||
|
||||
bytes_allocated += keys_pool.size();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value)
|
||||
{
|
||||
attribute.null_values = T(null_value.get<T>());
|
||||
attribute.maps.emplace<ContainerType<T>>();
|
||||
}
|
||||
|
||||
template <>
|
||||
void ComplexKeyHashedDictionary::createAttributeImpl<String>(Attribute & attribute, const Field & null_value)
|
||||
{
|
||||
attribute.string_arena = std::make_unique<Arena>();
|
||||
const String & string = null_value.get<String>();
|
||||
const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size());
|
||||
attribute.null_values.emplace<StringRef>(string_in_arena, string.size());
|
||||
attribute.maps.emplace<ContainerType<StringRef>>();
|
||||
}
|
||||
|
||||
ComplexKeyHashedDictionary::Attribute
|
||||
ComplexKeyHashedDictionary::createAttribute(const DictionaryAttribute & attribute, const Field & null_value)
|
||||
{
|
||||
auto nullable_set = attribute.is_nullable ? std::make_unique<NullableSet>() : nullptr;
|
||||
Attribute attr{attribute.underlying_type, attribute.is_nullable, std::move(nullable_set), {}, {}, {}};
|
||||
|
||||
auto type_call = [&](const auto &dictionary_attribute_type)
|
||||
{
|
||||
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
|
||||
using AttributeType = typename Type::AttributeType;
|
||||
createAttributeImpl<AttributeType>(attr, null_value);
|
||||
};
|
||||
|
||||
callOnDictionaryAttributeType(attribute.underlying_type, type_call);
|
||||
|
||||
return attr;
|
||||
}
|
||||
|
||||
|
||||
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
|
||||
void ComplexKeyHashedDictionary::getItemsImpl(
|
||||
const Attribute & attribute,
|
||||
const Columns & key_columns,
|
||||
ValueSetter && set_value,
|
||||
DefaultValueExtractor & default_value_extractor) const
|
||||
{
|
||||
const auto & attr = std::get<ContainerType<AttributeType>>(attribute.maps);
|
||||
|
||||
const auto keys_size = key_columns.size();
|
||||
StringRefs keys(keys_size);
|
||||
Arena temporary_keys_pool;
|
||||
|
||||
const auto rows = key_columns.front()->size();
|
||||
for (const auto i : ext::range(0, rows))
|
||||
{
|
||||
/// copy key data to arena so it is contiguous and return StringRef to it
|
||||
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
|
||||
|
||||
const auto it = attr.find(key);
|
||||
|
||||
if (it)
|
||||
{
|
||||
set_value(i, static_cast<OutputType>(it->getMapped()), false);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (attribute.is_nullable && attribute.nullable_set->find(key) != nullptr)
|
||||
set_value(i, default_value_extractor[i], true);
|
||||
else
|
||||
set_value(i, default_value_extractor[i], false);
|
||||
}
|
||||
|
||||
/// free memory allocated for the key
|
||||
temporary_keys_pool.rollback(key.size);
|
||||
}
|
||||
|
||||
query_count.fetch_add(rows, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value)
|
||||
{
|
||||
auto & map = std::get<ContainerType<T>>(attribute.maps);
|
||||
const auto pair = map.insert({key, value});
|
||||
return pair.second;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool ComplexKeyHashedDictionary::setAttributeValueImpl<String>(Attribute & attribute, const StringRef key, const String value)
|
||||
{
|
||||
const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size());
|
||||
return setAttributeValueImpl<StringRef>(attribute, key, StringRef{string_in_arena, value.size()});
|
||||
}
|
||||
|
||||
bool ComplexKeyHashedDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value)
|
||||
{
|
||||
bool result = false;
|
||||
|
||||
auto type_call = [&](const auto &dictionary_attribute_type)
|
||||
{
|
||||
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
|
||||
using AttributeType = typename Type::AttributeType;
|
||||
|
||||
if (attribute.is_nullable)
|
||||
{
|
||||
if (value.isNull())
|
||||
{
|
||||
attribute.nullable_set->insert(key);
|
||||
result = true;
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
attribute.nullable_set->erase(key);
|
||||
}
|
||||
}
|
||||
|
||||
result = setAttributeValueImpl<AttributeType>(attribute, key, value.get<AttributeType>());
|
||||
};
|
||||
|
||||
callOnDictionaryAttributeType(attribute.type, type_call);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const ComplexKeyHashedDictionary::Attribute & ComplexKeyHashedDictionary::getAttribute(const std::string & attribute_name) const
|
||||
{
|
||||
const auto it = attribute_index_by_name.find(attribute_name);
|
||||
if (it == std::end(attribute_index_by_name))
|
||||
throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
|
||||
|
||||
return attributes[it->second];
|
||||
}
|
||||
|
||||
StringRef ComplexKeyHashedDictionary::placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool)
|
||||
{
|
||||
const auto keys_size = key_columns.size();
|
||||
size_t sum_keys_size{};
|
||||
|
||||
const char * block_start = nullptr;
|
||||
for (size_t j = 0; j < keys_size; ++j)
|
||||
{
|
||||
keys[j] = key_columns[j]->serializeValueIntoArena(row, pool, block_start);
|
||||
sum_keys_size += keys[j].size;
|
||||
}
|
||||
|
||||
const auto * key_start = block_start;
|
||||
for (size_t j = 0; j < keys_size; ++j)
|
||||
{
|
||||
keys[j].data = key_start;
|
||||
key_start += keys[j].size;
|
||||
}
|
||||
|
||||
return {block_start, sum_keys_size};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ComplexKeyHashedDictionary::has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
const auto & attr = std::get<ContainerType<T>>(attribute.maps);
|
||||
const auto keys_size = key_columns.size();
|
||||
StringRefs keys(keys_size);
|
||||
Arena temporary_keys_pool;
|
||||
const auto rows = key_columns.front()->size();
|
||||
|
||||
for (const auto i : ext::range(0, rows))
|
||||
{
|
||||
/// copy key data to arena so it is contiguous and return StringRef to it
|
||||
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
|
||||
|
||||
const auto it = attr.find(key);
|
||||
out[i] = static_cast<bool>(it);
|
||||
|
||||
if (attribute.is_nullable && !out[i])
|
||||
out[i] = attribute.nullable_set->find(key) != nullptr;
|
||||
|
||||
/// free memory allocated for the key
|
||||
temporary_keys_pool.rollback(key.size);
|
||||
}
|
||||
|
||||
query_count.fetch_add(rows, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
std::vector<StringRef> ComplexKeyHashedDictionary::getKeys() const
|
||||
{
|
||||
const Attribute & attribute = attributes.front();
|
||||
|
||||
std::vector<StringRef> result;
|
||||
|
||||
auto type_call = [&](const auto & dictionary_attribute_type)
|
||||
{
|
||||
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
|
||||
using AttributeType = typename Type::AttributeType;
|
||||
|
||||
if constexpr (std::is_same_v<AttributeType, String>)
|
||||
{
|
||||
result = getKeys<StringRef>(attribute);
|
||||
}
|
||||
else
|
||||
{
|
||||
result = getKeys<AttributeType>(attribute);
|
||||
}
|
||||
};
|
||||
|
||||
callOnDictionaryAttributeType(attribute.type, type_call);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<StringRef> ComplexKeyHashedDictionary::getKeys(const Attribute & attribute) const
|
||||
{
|
||||
const ContainerType<T> & attr = std::get<ContainerType<T>>(attribute.maps);
|
||||
std::vector<StringRef> keys;
|
||||
keys.reserve(attr.size());
|
||||
for (const auto & key : attr)
|
||||
keys.push_back(key.getKey());
|
||||
|
||||
if (attribute.is_nullable)
|
||||
{
|
||||
for (const auto & key: *attribute.nullable_set)
|
||||
keys.push_back(key.getKey());
|
||||
}
|
||||
|
||||
return keys;
|
||||
}
|
||||
|
||||
BlockInputStreamPtr ComplexKeyHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
|
||||
{
|
||||
using BlockInputStreamType = DictionaryBlockInputStream<UInt64>;
|
||||
auto vector_keys = getKeys();
|
||||
|
||||
PaddedPODArray<StringRef> keys;
|
||||
keys.reserve(vector_keys.size());
|
||||
keys.assign(vector_keys.begin(), vector_keys.end());
|
||||
|
||||
return std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, keys, column_names);
|
||||
}
|
||||
|
||||
void registerDictionaryComplexKeyHashed(DictionaryFactory & factory)
|
||||
{
|
||||
auto create_layout = [=](const std::string &,
|
||||
const DictionaryStructure & dict_struct,
|
||||
const Poco::Util::AbstractConfiguration & config,
|
||||
const std::string & config_prefix,
|
||||
DictionarySourcePtr source_ptr) -> DictionaryPtr
|
||||
{
|
||||
if (!dict_struct.key)
|
||||
throw Exception{"'key' is required for dictionary of layout 'complex_key_hashed'", ErrorCodes::BAD_ARGUMENTS};
|
||||
|
||||
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
|
||||
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
|
||||
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
|
||||
return std::make_unique<ComplexKeyHashedDictionary>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
|
||||
};
|
||||
factory.registerLayout("complex_key_hashed", create_layout, true);
|
||||
}
|
||||
|
||||
}
|
@ -1,185 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
#include <variant>
|
||||
#include <Columns/ColumnDecimal.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Common/Arena.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
#include <Core/Block.h>
|
||||
#include <common/StringRef.h>
|
||||
#include <ext/range.h>
|
||||
#include "IDictionary.h"
|
||||
#include "IDictionarySource.h"
|
||||
#include "DictionaryStructure.h"
|
||||
#include "DictionaryHelpers.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ComplexKeyHashedDictionary final : public IDictionaryBase
|
||||
{
|
||||
public:
|
||||
ComplexKeyHashedDictionary(
|
||||
const StorageID & dict_id_,
|
||||
const DictionaryStructure & dict_struct_,
|
||||
DictionarySourcePtr source_ptr_,
|
||||
const DictionaryLifetime dict_lifetime_,
|
||||
bool require_nonempty_,
|
||||
BlockPtr saved_block_ = nullptr);
|
||||
|
||||
std::string getKeyDescription() const { return key_description; }
|
||||
|
||||
std::string getTypeName() const override { return "ComplexKeyHashed"; }
|
||||
|
||||
size_t getBytesAllocated() const override { return bytes_allocated; }
|
||||
|
||||
size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }
|
||||
|
||||
double getHitRate() const override { return 1.0; }
|
||||
|
||||
size_t getElementCount() const override { return element_count; }
|
||||
|
||||
double getLoadFactor() const override { return static_cast<double>(element_count) / bucket_count; }
|
||||
|
||||
std::shared_ptr<const IExternalLoadable> clone() const override
|
||||
{
|
||||
return std::make_shared<ComplexKeyHashedDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block);
|
||||
}
|
||||
|
||||
const IDictionarySource * getSource() const override { return source_ptr.get(); }
|
||||
|
||||
const DictionaryLifetime & getLifetime() const override { return dict_lifetime; }
|
||||
|
||||
const DictionaryStructure & getStructure() const override { return dict_struct; }
|
||||
|
||||
bool isInjective(const std::string & attribute_name) const override
|
||||
{
|
||||
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
|
||||
}
|
||||
|
||||
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; }
|
||||
|
||||
ColumnPtr getColumn(
|
||||
const std::string& attribute_name,
|
||||
const DataTypePtr & result_type,
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types,
|
||||
const ColumnPtr & default_values_column) const override;
|
||||
|
||||
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
|
||||
|
||||
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
|
||||
|
||||
private:
|
||||
template <typename Value>
|
||||
using ContainerType = HashMapWithSavedHash<StringRef, Value, StringRefHash>;
|
||||
|
||||
using NullableSet = HashSetWithSavedHash<StringRef, StringRefHash>;
|
||||
|
||||
struct Attribute final
|
||||
{
|
||||
AttributeUnderlyingType type;
|
||||
bool is_nullable;
|
||||
std::unique_ptr<NullableSet> nullable_set;
|
||||
|
||||
std::variant<
|
||||
UInt8,
|
||||
UInt16,
|
||||
UInt32,
|
||||
UInt64,
|
||||
UInt128,
|
||||
Int8,
|
||||
Int16,
|
||||
Int32,
|
||||
Int64,
|
||||
Decimal32,
|
||||
Decimal64,
|
||||
Decimal128,
|
||||
Float32,
|
||||
Float64,
|
||||
StringRef>
|
||||
null_values;
|
||||
std::variant<
|
||||
ContainerType<UInt8>,
|
||||
ContainerType<UInt16>,
|
||||
ContainerType<UInt32>,
|
||||
ContainerType<UInt64>,
|
||||
ContainerType<UInt128>,
|
||||
ContainerType<Int8>,
|
||||
ContainerType<Int16>,
|
||||
ContainerType<Int32>,
|
||||
ContainerType<Int64>,
|
||||
ContainerType<Decimal32>,
|
||||
ContainerType<Decimal64>,
|
||||
ContainerType<Decimal128>,
|
||||
ContainerType<Float32>,
|
||||
ContainerType<Float64>,
|
||||
ContainerType<StringRef>>
|
||||
maps;
|
||||
std::unique_ptr<Arena> string_arena;
|
||||
};
|
||||
|
||||
void createAttributes();
|
||||
|
||||
void blockToAttributes(const Block & block);
|
||||
|
||||
void updateData();
|
||||
|
||||
void loadData();
|
||||
|
||||
template <typename T>
|
||||
void addAttributeSize(const Attribute & attribute);
|
||||
|
||||
void calculateBytesAllocated();
|
||||
|
||||
template <typename T>
|
||||
static void createAttributeImpl(Attribute & attribute, const Field & null_value);
|
||||
|
||||
static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value);
|
||||
|
||||
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
|
||||
void getItemsImpl(
|
||||
const Attribute & attribute,
|
||||
const Columns & key_columns,
|
||||
ValueSetter && set_value,
|
||||
DefaultValueExtractor & default_value_extractor) const;
|
||||
|
||||
template <typename T>
|
||||
static bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value);
|
||||
|
||||
static bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value);
|
||||
|
||||
const Attribute & getAttribute(const std::string & attribute_name) const;
|
||||
|
||||
static StringRef placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool);
|
||||
|
||||
template <typename T>
|
||||
void has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray<UInt8> & out) const;
|
||||
|
||||
std::vector<StringRef> getKeys() const;
|
||||
|
||||
template <typename T>
|
||||
std::vector<StringRef> getKeys(const Attribute & attribute) const;
|
||||
|
||||
const DictionaryStructure dict_struct;
|
||||
const DictionarySourcePtr source_ptr;
|
||||
const DictionaryLifetime dict_lifetime;
|
||||
const bool require_nonempty;
|
||||
const std::string key_description{dict_struct.getKeyDescription()};
|
||||
|
||||
std::map<std::string, size_t> attribute_index_by_name;
|
||||
std::vector<Attribute> attributes;
|
||||
Arena keys_pool;
|
||||
|
||||
size_t bytes_allocated = 0;
|
||||
size_t element_count = 0;
|
||||
size_t bucket_count = 0;
|
||||
mutable std::atomic<size_t> query_count{0};
|
||||
|
||||
BlockPtr saved_block;
|
||||
};
|
||||
|
||||
}
|
200
src/Dictionaries/DictionaryBlockInputStream.cpp
Normal file
200
src/Dictionaries/DictionaryBlockInputStream.cpp
Normal file
@ -0,0 +1,200 @@
|
||||
#include "DictionaryBlockInputStream.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
DictionaryBlockInputStream::DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionary> dictionary_, UInt64 max_block_size_, PaddedPODArray<UInt64> && ids_, const Names & column_names_)
|
||||
: DictionaryBlockInputStreamBase(ids_.size(), max_block_size_)
|
||||
, dictionary(dictionary_)
|
||||
, column_names(column_names_)
|
||||
, ids(std::move(ids_))
|
||||
, key_type(DictionaryInputStreamKeyType::Id)
|
||||
{
|
||||
}
|
||||
|
||||
DictionaryBlockInputStream::DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionary> dictionary_,
|
||||
UInt64 max_block_size_,
|
||||
const PaddedPODArray<StringRef> & keys,
|
||||
const Names & column_names_)
|
||||
: DictionaryBlockInputStreamBase(keys.size(), max_block_size_)
|
||||
, dictionary(dictionary_)
|
||||
, column_names(column_names_)
|
||||
, key_type(DictionaryInputStreamKeyType::ComplexKey)
|
||||
{
|
||||
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
|
||||
fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns);
|
||||
}
|
||||
|
||||
DictionaryBlockInputStream::DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionary> dictionary_,
|
||||
UInt64 max_block_size_,
|
||||
const Columns & data_columns_,
|
||||
const Names & column_names_,
|
||||
GetColumnsFunction && get_key_columns_function_,
|
||||
GetColumnsFunction && get_view_columns_function_)
|
||||
: DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_)
|
||||
, dictionary(dictionary_)
|
||||
, column_names(column_names_)
|
||||
, data_columns(data_columns_)
|
||||
, get_key_columns_function(std::move(get_key_columns_function_))
|
||||
, get_view_columns_function(std::move(get_view_columns_function_))
|
||||
, key_type(DictionaryInputStreamKeyType::Callback)
|
||||
{
|
||||
}
|
||||
|
||||
Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const
|
||||
{
|
||||
/// TODO: Rewrite
|
||||
switch (key_type)
|
||||
{
|
||||
case DictionaryInputStreamKeyType::ComplexKey:
|
||||
{
|
||||
Columns columns;
|
||||
ColumnsWithTypeAndName view_columns;
|
||||
columns.reserve(key_columns.size());
|
||||
for (const auto & key_column : key_columns)
|
||||
{
|
||||
ColumnPtr column = key_column.column->cut(start, length);
|
||||
columns.emplace_back(column);
|
||||
view_columns.emplace_back(column, key_column.type, key_column.name);
|
||||
}
|
||||
return fillBlock({}, columns, {}, std::move(view_columns));
|
||||
}
|
||||
|
||||
case DictionaryInputStreamKeyType::Id:
|
||||
{
|
||||
PaddedPODArray<UInt64> ids_to_fill(ids.begin() + start, ids.begin() + start + length);
|
||||
return fillBlock(ids_to_fill, {}, {}, {});
|
||||
}
|
||||
|
||||
case DictionaryInputStreamKeyType::Callback:
|
||||
{
|
||||
Columns columns;
|
||||
columns.reserve(data_columns.size());
|
||||
for (const auto & data_column : data_columns)
|
||||
columns.push_back(data_column->cut(start, length));
|
||||
const DictionaryStructure & dictionaty_structure = dictionary->getStructure();
|
||||
const auto & attributes = *dictionaty_structure.key;
|
||||
ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes);
|
||||
ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes);
|
||||
DataTypes types;
|
||||
columns.clear();
|
||||
for (const auto & key_column : keys_with_type_and_name)
|
||||
{
|
||||
columns.push_back(key_column.column);
|
||||
types.push_back(key_column.type);
|
||||
}
|
||||
return fillBlock({}, columns, types, std::move(view_with_type_and_name));
|
||||
}
|
||||
}
|
||||
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected DictionaryInputStreamKeyType.");
|
||||
}
|
||||
|
||||
Block DictionaryBlockInputStream::fillBlock(
|
||||
const PaddedPODArray<UInt64> & ids_to_fill,
|
||||
const Columns & keys,
|
||||
const DataTypes & types,
|
||||
ColumnsWithTypeAndName && view) const
|
||||
{
|
||||
std::unordered_set<std::string> names(column_names.begin(), column_names.end());
|
||||
|
||||
DataTypes data_types = types;
|
||||
ColumnsWithTypeAndName block_columns;
|
||||
|
||||
data_types.reserve(keys.size());
|
||||
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
|
||||
if (data_types.empty() && dictionary_structure.key)
|
||||
for (const auto & key : *dictionary_structure.key)
|
||||
data_types.push_back(key.type);
|
||||
|
||||
for (const auto & column : view)
|
||||
if (names.find(column.name) != names.end())
|
||||
block_columns.push_back(column);
|
||||
|
||||
const DictionaryStructure & structure = dictionary->getStructure();
|
||||
ColumnPtr ids_column = getColumnFromIds(ids_to_fill);
|
||||
|
||||
if (structure.id && names.find(structure.id->name) != names.end())
|
||||
{
|
||||
block_columns.emplace_back(ids_column, std::make_shared<DataTypeUInt64>(), structure.id->name);
|
||||
}
|
||||
|
||||
auto dictionary_key_type = dictionary->getKeyType();
|
||||
|
||||
for (const auto idx : ext::range(0, structure.attributes.size()))
|
||||
{
|
||||
const DictionaryAttribute & attribute = structure.attributes[idx];
|
||||
if (names.find(attribute.name) != names.end())
|
||||
{
|
||||
ColumnPtr column;
|
||||
|
||||
if (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
column = dictionary->getColumn(
|
||||
attribute.name,
|
||||
attribute.type,
|
||||
{ids_column},
|
||||
{std::make_shared<DataTypeUInt64>()},
|
||||
nullptr /* default_values_column */);
|
||||
}
|
||||
else
|
||||
{
|
||||
column = dictionary->getColumn(
|
||||
attribute.name,
|
||||
attribute.type,
|
||||
keys,
|
||||
data_types,
|
||||
nullptr /* default_values_column*/);
|
||||
}
|
||||
|
||||
block_columns.emplace_back(column, attribute.type, attribute.name);
|
||||
}
|
||||
}
|
||||
|
||||
return Block(block_columns);
|
||||
}
|
||||
|
||||
ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray<UInt64> & ids_to_fill)
|
||||
{
|
||||
auto column_vector = ColumnVector<UInt64>::create();
|
||||
column_vector->getData().assign(ids_to_fill);
|
||||
return column_vector;
|
||||
}
|
||||
|
||||
void DictionaryBlockInputStream::fillKeyColumns(
|
||||
const PaddedPODArray<StringRef> & keys,
|
||||
size_t start,
|
||||
size_t size,
|
||||
const DictionaryStructure & dictionary_structure,
|
||||
ColumnsWithTypeAndName & result)
|
||||
{
|
||||
MutableColumns columns;
|
||||
columns.reserve(dictionary_structure.key->size());
|
||||
|
||||
for (const DictionaryAttribute & attribute : *dictionary_structure.key)
|
||||
columns.emplace_back(attribute.type->createColumn());
|
||||
|
||||
for (auto idx : ext::range(start, size))
|
||||
{
|
||||
const auto & key = keys[idx];
|
||||
const auto *ptr = key.data;
|
||||
for (auto & column : columns)
|
||||
ptr = column->deserializeAndInsertFromArena(ptr);
|
||||
}
|
||||
|
||||
for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i)
|
||||
{
|
||||
const auto & dictionary_attribute = (*dictionary_structure.key)[i];
|
||||
result.emplace_back(ColumnWithTypeAndName{std::move(columns[i]), dictionary_attribute.type, dictionary_attribute.name});
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -16,27 +16,22 @@
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
/// TODO: Remove this class
|
||||
/* BlockInputStream implementation for external dictionaries
|
||||
* read() returns blocks consisting of the in-memory contents of the dictionaries
|
||||
*/
|
||||
template <typename Key>
|
||||
class DictionaryBlockInputStream : public DictionaryBlockInputStreamBase
|
||||
{
|
||||
public:
|
||||
DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary,
|
||||
std::shared_ptr<const IDictionary> dictionary,
|
||||
UInt64 max_block_size,
|
||||
PaddedPODArray<Key> && ids,
|
||||
PaddedPODArray<UInt64> && ids,
|
||||
const Names & column_names);
|
||||
|
||||
DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary,
|
||||
std::shared_ptr<const IDictionary> dictionary,
|
||||
UInt64 max_block_size,
|
||||
const PaddedPODArray<StringRef> & keys,
|
||||
const Names & column_names);
|
||||
@ -48,7 +43,7 @@ public:
|
||||
// and get_view_columns_function to get key representation.
|
||||
// Now used in trie dictionary, where columns are stored as ip and mask, and are showed as string
|
||||
DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary,
|
||||
std::shared_ptr<const IDictionary> dictionary,
|
||||
UInt64 max_block_size,
|
||||
const Columns & data_columns,
|
||||
const Names & column_names,
|
||||
@ -61,21 +56,24 @@ protected:
|
||||
Block getBlock(size_t start, size_t length) const override;
|
||||
|
||||
private:
|
||||
Block
|
||||
fillBlock(const PaddedPODArray<Key> & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const;
|
||||
Block fillBlock(
|
||||
const PaddedPODArray<UInt64> & ids_to_fill,
|
||||
const Columns & keys,
|
||||
const DataTypes & types,
|
||||
ColumnsWithTypeAndName && view) const;
|
||||
|
||||
ColumnPtr getColumnFromIds(const PaddedPODArray<Key> & ids_to_fill) const;
|
||||
static ColumnPtr getColumnFromIds(const PaddedPODArray<UInt64> & ids_to_fill);
|
||||
|
||||
void fillKeyColumns(
|
||||
static void fillKeyColumns(
|
||||
const PaddedPODArray<StringRef> & keys,
|
||||
size_t start,
|
||||
size_t size,
|
||||
const DictionaryStructure & dictionary_structure,
|
||||
ColumnsWithTypeAndName & columns) const;
|
||||
ColumnsWithTypeAndName & result);
|
||||
|
||||
std::shared_ptr<const IDictionaryBase> dictionary;
|
||||
std::shared_ptr<const IDictionary> dictionary;
|
||||
Names column_names;
|
||||
PaddedPODArray<Key> ids;
|
||||
PaddedPODArray<UInt64> ids;
|
||||
ColumnsWithTypeAndName key_columns;
|
||||
|
||||
Columns data_columns;
|
||||
@ -92,200 +90,4 @@ private:
|
||||
DictionaryInputStreamKeyType key_type;
|
||||
};
|
||||
|
||||
|
||||
template <typename Key>
|
||||
DictionaryBlockInputStream<Key>::DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary_, UInt64 max_block_size_, PaddedPODArray<Key> && ids_, const Names & column_names_)
|
||||
: DictionaryBlockInputStreamBase(ids_.size(), max_block_size_)
|
||||
, dictionary(dictionary_)
|
||||
, column_names(column_names_)
|
||||
, ids(std::move(ids_))
|
||||
, key_type(DictionaryInputStreamKeyType::Id)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Key>
|
||||
DictionaryBlockInputStream<Key>::DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary_,
|
||||
UInt64 max_block_size_,
|
||||
const PaddedPODArray<StringRef> & keys,
|
||||
const Names & column_names_)
|
||||
: DictionaryBlockInputStreamBase(keys.size(), max_block_size_)
|
||||
, dictionary(dictionary_)
|
||||
, column_names(column_names_)
|
||||
, key_type(DictionaryInputStreamKeyType::ComplexKey)
|
||||
{
|
||||
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
|
||||
fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns);
|
||||
}
|
||||
|
||||
template <typename Key>
|
||||
DictionaryBlockInputStream<Key>::DictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary_,
|
||||
UInt64 max_block_size_,
|
||||
const Columns & data_columns_,
|
||||
const Names & column_names_,
|
||||
GetColumnsFunction && get_key_columns_function_,
|
||||
GetColumnsFunction && get_view_columns_function_)
|
||||
: DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_)
|
||||
, dictionary(dictionary_)
|
||||
, column_names(column_names_)
|
||||
, data_columns(data_columns_)
|
||||
, get_key_columns_function(std::move(get_key_columns_function_))
|
||||
, get_view_columns_function(std::move(get_view_columns_function_))
|
||||
, key_type(DictionaryInputStreamKeyType::Callback)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
template <typename Key>
|
||||
Block DictionaryBlockInputStream<Key>::getBlock(size_t start, size_t length) const
|
||||
{
|
||||
/// TODO: Rewrite
|
||||
switch (key_type)
|
||||
{
|
||||
case DictionaryInputStreamKeyType::ComplexKey:
|
||||
{
|
||||
Columns columns;
|
||||
ColumnsWithTypeAndName view_columns;
|
||||
columns.reserve(key_columns.size());
|
||||
for (const auto & key_column : key_columns)
|
||||
{
|
||||
ColumnPtr column = key_column.column->cut(start, length);
|
||||
columns.emplace_back(column);
|
||||
view_columns.emplace_back(column, key_column.type, key_column.name);
|
||||
}
|
||||
return fillBlock({}, columns, {}, std::move(view_columns));
|
||||
}
|
||||
|
||||
case DictionaryInputStreamKeyType::Id:
|
||||
{
|
||||
PaddedPODArray<Key> ids_to_fill(ids.begin() + start, ids.begin() + start + length);
|
||||
return fillBlock(ids_to_fill, {}, {}, {});
|
||||
}
|
||||
|
||||
case DictionaryInputStreamKeyType::Callback:
|
||||
{
|
||||
Columns columns;
|
||||
columns.reserve(data_columns.size());
|
||||
for (const auto & data_column : data_columns)
|
||||
columns.push_back(data_column->cut(start, length));
|
||||
const DictionaryStructure & dictionaty_structure = dictionary->getStructure();
|
||||
const auto & attributes = *dictionaty_structure.key;
|
||||
ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes);
|
||||
ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes);
|
||||
DataTypes types;
|
||||
columns.clear();
|
||||
for (const auto & key_column : keys_with_type_and_name)
|
||||
{
|
||||
columns.push_back(key_column.column);
|
||||
types.push_back(key_column.type);
|
||||
}
|
||||
return fillBlock({}, columns, types, std::move(view_with_type_and_name));
|
||||
}
|
||||
}
|
||||
|
||||
throw Exception("Unexpected DictionaryInputStreamKeyType.", ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
|
||||
template <typename Key>
|
||||
Block DictionaryBlockInputStream<Key>::fillBlock(
|
||||
const PaddedPODArray<Key> & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const
|
||||
{
|
||||
std::unordered_set<std::string> names(column_names.begin(), column_names.end());
|
||||
|
||||
DataTypes data_types = types;
|
||||
ColumnsWithTypeAndName block_columns;
|
||||
|
||||
data_types.reserve(keys.size());
|
||||
const DictionaryStructure & dictionaty_structure = dictionary->getStructure();
|
||||
if (data_types.empty() && dictionaty_structure.key)
|
||||
for (const auto & key : *dictionaty_structure.key)
|
||||
data_types.push_back(key.type);
|
||||
|
||||
for (const auto & column : view)
|
||||
if (names.find(column.name) != names.end())
|
||||
block_columns.push_back(column);
|
||||
|
||||
const DictionaryStructure & structure = dictionary->getStructure();
|
||||
ColumnPtr ids_column = getColumnFromIds(ids_to_fill);
|
||||
|
||||
if (structure.id && names.find(structure.id->name) != names.end())
|
||||
{
|
||||
block_columns.emplace_back(ids_column, std::make_shared<DataTypeUInt64>(), structure.id->name);
|
||||
}
|
||||
|
||||
auto dictionary_key_type = dictionary->getKeyType();
|
||||
|
||||
for (const auto idx : ext::range(0, structure.attributes.size()))
|
||||
{
|
||||
const DictionaryAttribute & attribute = structure.attributes[idx];
|
||||
if (names.find(attribute.name) != names.end())
|
||||
{
|
||||
ColumnPtr column;
|
||||
|
||||
if (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
column = dictionary->getColumn(
|
||||
attribute.name,
|
||||
attribute.type,
|
||||
{ids_column},
|
||||
{std::make_shared<DataTypeUInt64>()},
|
||||
nullptr /* default_values_column */);
|
||||
}
|
||||
else
|
||||
{
|
||||
column = dictionary->getColumn(
|
||||
attribute.name,
|
||||
attribute.type,
|
||||
keys,
|
||||
data_types,
|
||||
nullptr /* default_values_column*/);
|
||||
}
|
||||
|
||||
block_columns.emplace_back(column, attribute.type, attribute.name);
|
||||
}
|
||||
}
|
||||
|
||||
return Block(block_columns);
|
||||
}
|
||||
|
||||
template <typename Key>
|
||||
ColumnPtr DictionaryBlockInputStream<Key>::getColumnFromIds(const PaddedPODArray<Key> & ids_to_fill) const
|
||||
{
|
||||
auto column_vector = ColumnVector<UInt64>::create();
|
||||
column_vector->getData().reserve(ids_to_fill.size());
|
||||
for (UInt64 id : ids_to_fill)
|
||||
column_vector->insertValue(id);
|
||||
return column_vector;
|
||||
}
|
||||
|
||||
|
||||
template <typename Key>
|
||||
void DictionaryBlockInputStream<Key>::fillKeyColumns(
|
||||
const PaddedPODArray<StringRef> & keys,
|
||||
size_t start,
|
||||
size_t size,
|
||||
const DictionaryStructure & dictionary_structure,
|
||||
ColumnsWithTypeAndName & res) const
|
||||
{
|
||||
MutableColumns columns;
|
||||
columns.reserve(dictionary_structure.key->size());
|
||||
|
||||
for (const DictionaryAttribute & attribute : *dictionary_structure.key)
|
||||
columns.emplace_back(attribute.type->createColumn());
|
||||
|
||||
for (auto idx : ext::range(start, size))
|
||||
{
|
||||
const auto & key = keys[idx];
|
||||
const auto *ptr = key.data;
|
||||
for (auto & column : columns)
|
||||
ptr = column->deserializeAndInsertFromArena(ptr);
|
||||
}
|
||||
|
||||
for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i)
|
||||
res.emplace_back(
|
||||
ColumnWithTypeAndName{std::move(columns[i]), (*dictionary_structure.key)[i].type, (*dictionary_structure.key)[i].name});
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -295,6 +295,28 @@ private:
|
||||
bool use_default_value_from_column = false;
|
||||
};
|
||||
|
||||
template <DictionaryKeyType key_type>
|
||||
class DictionaryKeysArenaHolder;
|
||||
|
||||
template <>
|
||||
class DictionaryKeysArenaHolder<DictionaryKeyType::simple>
|
||||
{
|
||||
public:
|
||||
static Arena * getComplexKeyArena() { return nullptr; }
|
||||
};
|
||||
|
||||
template <>
|
||||
class DictionaryKeysArenaHolder<DictionaryKeyType::complex>
|
||||
{
|
||||
public:
|
||||
|
||||
Arena * getComplexKeyArena() { return &complex_key_arena; }
|
||||
|
||||
private:
|
||||
Arena complex_key_arena;
|
||||
};
|
||||
|
||||
|
||||
template <DictionaryKeyType key_type>
|
||||
class DictionaryKeysExtractor
|
||||
{
|
||||
@ -302,67 +324,96 @@ public:
|
||||
using KeyType = std::conditional_t<key_type == DictionaryKeyType::simple, UInt64, StringRef>;
|
||||
static_assert(key_type != DictionaryKeyType::range, "Range key type is not supported by DictionaryKeysExtractor");
|
||||
|
||||
explicit DictionaryKeysExtractor(const Columns & key_columns, Arena & existing_arena)
|
||||
explicit DictionaryKeysExtractor(const Columns & key_columns_, Arena * complex_key_arena_)
|
||||
: key_columns(key_columns_)
|
||||
, complex_key_arena(complex_key_arena_)
|
||||
{
|
||||
assert(!key_columns.empty());
|
||||
|
||||
if constexpr (key_type == DictionaryKeyType::simple)
|
||||
keys = getColumnVectorData(key_columns.front());
|
||||
{
|
||||
key_columns[0] = key_columns[0]->convertToFullColumnIfConst();
|
||||
|
||||
const auto * vector_col = checkAndGetColumn<ColumnVector<UInt64>>(key_columns[0].get());
|
||||
if (!vector_col)
|
||||
throw Exception(ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key expected UInt64");
|
||||
}
|
||||
|
||||
keys_size = key_columns.front()->size();
|
||||
}
|
||||
|
||||
inline size_t getKeysSize() const
|
||||
{
|
||||
return keys_size;
|
||||
}
|
||||
|
||||
inline size_t getCurrentKeyIndex() const
|
||||
{
|
||||
return current_key_index;
|
||||
}
|
||||
|
||||
inline KeyType extractCurrentKey()
|
||||
{
|
||||
assert(current_key_index < keys_size);
|
||||
|
||||
if constexpr (key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
const auto & column_vector = static_cast<const ColumnVector<UInt64> &>(*key_columns[0]);
|
||||
const auto & data = column_vector.getData();
|
||||
|
||||
auto key = data[current_key_index];
|
||||
++current_key_index;
|
||||
return key;
|
||||
}
|
||||
else
|
||||
keys = deserializeKeyColumnsInArena(key_columns, existing_arena);
|
||||
}
|
||||
|
||||
|
||||
const PaddedPODArray<KeyType> & getKeys() const
|
||||
{
|
||||
return keys;
|
||||
}
|
||||
|
||||
private:
|
||||
static PaddedPODArray<UInt64> getColumnVectorData(const ColumnPtr column)
|
||||
{
|
||||
PaddedPODArray<UInt64> result;
|
||||
|
||||
auto full_column = column->convertToFullColumnIfConst();
|
||||
const auto *vector_col = checkAndGetColumn<ColumnVector<UInt64>>(full_column.get());
|
||||
|
||||
if (!vector_col)
|
||||
throw Exception{ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key expected UInt64"};
|
||||
|
||||
result.assign(vector_col->getData());
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static PaddedPODArray<StringRef> deserializeKeyColumnsInArena(const Columns & key_columns, Arena & temporary_arena)
|
||||
{
|
||||
size_t keys_size = key_columns.front()->size();
|
||||
|
||||
PaddedPODArray<StringRef> result;
|
||||
result.reserve(keys_size);
|
||||
|
||||
PaddedPODArray<StringRef> temporary_column_data(key_columns.size());
|
||||
|
||||
for (size_t key_index = 0; key_index < keys_size; ++key_index)
|
||||
{
|
||||
size_t allocated_size_for_columns = 0;
|
||||
const char * block_start = nullptr;
|
||||
|
||||
for (size_t column_index = 0; column_index < key_columns.size(); ++column_index)
|
||||
for (const auto & column : key_columns)
|
||||
{
|
||||
const auto & column = key_columns[column_index];
|
||||
temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_arena, block_start);
|
||||
allocated_size_for_columns += temporary_column_data[column_index].size;
|
||||
StringRef serialized_data = column->serializeValueIntoArena(current_key_index, *complex_key_arena, block_start);
|
||||
allocated_size_for_columns += serialized_data.size;
|
||||
}
|
||||
|
||||
result.push_back(StringRef{block_start, allocated_size_for_columns});
|
||||
++current_key_index;
|
||||
current_complex_key = StringRef{block_start, allocated_size_for_columns};
|
||||
return current_complex_key;
|
||||
}
|
||||
}
|
||||
|
||||
void rollbackCurrentKey() const
|
||||
{
|
||||
if constexpr (key_type == DictionaryKeyType::complex)
|
||||
complex_key_arena->rollback(current_complex_key.size);
|
||||
}
|
||||
|
||||
PaddedPODArray<KeyType> extractAllKeys()
|
||||
{
|
||||
PaddedPODArray<KeyType> result;
|
||||
result.reserve(keys_size - current_key_index);
|
||||
|
||||
for (; current_key_index < keys_size;)
|
||||
{
|
||||
auto value = extractCurrentKey();
|
||||
result.emplace_back(value);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
PaddedPODArray<KeyType> keys;
|
||||
void reset()
|
||||
{
|
||||
current_key_index = 0;
|
||||
}
|
||||
private:
|
||||
Columns key_columns;
|
||||
|
||||
size_t keys_size = 0;
|
||||
size_t current_key_index = 0;
|
||||
|
||||
KeyType current_complex_key {};
|
||||
Arena * complex_key_arena;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -370,9 +421,10 @@ private:
|
||||
|
||||
* If column is constant parameter backup_storage is used to store values.
|
||||
*/
|
||||
/// TODO: Remove
|
||||
template <typename T>
|
||||
static const PaddedPODArray<T> & getColumnVectorData(
|
||||
const IDictionaryBase * dictionary,
|
||||
const IDictionary * dictionary,
|
||||
const ColumnPtr column,
|
||||
PaddedPODArray<T> & backup_storage)
|
||||
{
|
||||
|
@ -200,8 +200,21 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration
|
||||
|
||||
for (size_t i = 0; i < attributes.size(); ++i)
|
||||
{
|
||||
const auto & attribute_name = attributes[i].name;
|
||||
const auto & attribute = attributes[i];
|
||||
const auto & attribute_name = attribute.name;
|
||||
attribute_name_to_index[attribute_name] = i;
|
||||
|
||||
if (attribute.hierarchical)
|
||||
{
|
||||
if (id && attribute.underlying_type != AttributeUnderlyingType::utUInt64)
|
||||
throw Exception(ErrorCodes::TYPE_MISMATCH,
|
||||
"Hierarchical attribute type for dictionary with simple key must be UInt64. Actual ({})",
|
||||
toString(attribute.underlying_type));
|
||||
else if (key)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary with complex key does not support hierarchy");
|
||||
|
||||
hierarchical_attribute_index = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (attributes.empty())
|
||||
|
@ -153,6 +153,8 @@ struct DictionaryStructure final
|
||||
std::unordered_map<std::string, size_t> attribute_name_to_index;
|
||||
std::optional<DictionaryTypedSpecialAttribute> range_min;
|
||||
std::optional<DictionaryTypedSpecialAttribute> range_max;
|
||||
std::optional<size_t> hierarchical_attribute_index;
|
||||
|
||||
bool has_expressions = false;
|
||||
bool access_to_key_from_attributes = false;
|
||||
|
||||
|
@ -1,158 +1,33 @@
|
||||
#include "DirectDictionary.h"
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include "DictionaryBlockInputStream.h"
|
||||
#include "DictionaryFactory.h"
|
||||
#include <Core/Defines.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Interpreters/AggregationCommon.h>
|
||||
|
||||
#include <Core/Defines.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <DataStreams/IBlockInputStream.h>
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
|
||||
#include <Dictionaries/DictionaryFactory.h>
|
||||
#include <Dictionaries/HierarchyDictionariesUtils.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int UNSUPPORTED_METHOD;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
inline UInt64 getAt(const PaddedPODArray<UInt64> & arr, const size_t idx)
|
||||
{
|
||||
return arr[idx];
|
||||
}
|
||||
|
||||
inline UInt64 getAt(const UInt64 & value, const size_t)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
DirectDictionary<dictionary_key_type>::DirectDictionary(
|
||||
const StorageID & dict_id_,
|
||||
const DictionaryStructure & dict_struct_,
|
||||
DictionarySourcePtr source_ptr_,
|
||||
BlockPtr saved_block_)
|
||||
DictionarySourcePtr source_ptr_)
|
||||
: IDictionary(dict_id_)
|
||||
, dict_struct(dict_struct_)
|
||||
, source_ptr{std::move(source_ptr_)}
|
||||
, saved_block{std::move(saved_block_)}
|
||||
{
|
||||
if (!source_ptr->supportsSelectiveLoad())
|
||||
throw Exception{full_name + ": source cannot be used with DirectDictionary", ErrorCodes::UNSUPPORTED_METHOD};
|
||||
|
||||
setup();
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void DirectDictionary<dictionary_key_type>::toParent(const PaddedPODArray<Key> & ids [[maybe_unused]], PaddedPODArray<Key> & out [[maybe_unused]]) const
|
||||
{
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
const auto & attribute_name = hierarchical_attribute->name;
|
||||
|
||||
auto result_type = std::make_shared<DataTypeUInt64>();
|
||||
auto input_column = result_type->createColumn();
|
||||
auto & input_column_typed = assert_cast<ColumnVector<UInt64> &>(*input_column);
|
||||
auto & data = input_column_typed.getData();
|
||||
data.insert(ids.begin(), ids.end());
|
||||
|
||||
auto column = getColumn({attribute_name}, result_type, {std::move(input_column)}, {result_type}, {nullptr});
|
||||
const auto & result_column_typed = assert_cast<const ColumnVector<UInt64> &>(*column);
|
||||
const auto & result_data = result_column_typed.getData();
|
||||
|
||||
out.assign(result_data);
|
||||
}
|
||||
else
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Hierarchy is not supported for complex key DirectDictionary");
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
UInt64 DirectDictionary<dictionary_key_type>::getValueOrNullByKey(const Key & to_find) const
|
||||
{
|
||||
std::vector<Key> required_key = {to_find};
|
||||
|
||||
auto stream = source_ptr->loadIds(required_key);
|
||||
stream->readPrefix();
|
||||
|
||||
bool is_found = false;
|
||||
UInt64 result = hierarchical_attribute->null_value.template get<UInt64>();
|
||||
|
||||
while (const auto block = stream->read())
|
||||
{
|
||||
const IColumn & id_column = *block.safeGetByPosition(0).column;
|
||||
|
||||
for (const size_t attribute_idx : ext::range(0, dict_struct.attributes.size()))
|
||||
{
|
||||
if (is_found)
|
||||
break;
|
||||
|
||||
const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column;
|
||||
|
||||
for (const auto row_idx : ext::range(0, id_column.size()))
|
||||
{
|
||||
const auto key = id_column[row_idx].get<UInt64>();
|
||||
|
||||
if (key == to_find && hierarchical_attribute->name == attribute_name_by_index.at(attribute_idx))
|
||||
{
|
||||
result = attribute_column[row_idx].get<Key>();
|
||||
is_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stream->readSuffix();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
template <typename ChildType, typename AncestorType>
|
||||
void DirectDictionary<dictionary_key_type>::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
const auto null_value = hierarchical_attribute->null_value.template get<UInt64>();
|
||||
const auto rows = out.size();
|
||||
|
||||
for (const auto row : ext::range(0, rows))
|
||||
{
|
||||
auto id = getAt(child_ids, row);
|
||||
const auto ancestor_id = getAt(ancestor_ids, row);
|
||||
|
||||
for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
|
||||
id = getValueOrNullByKey(id);
|
||||
|
||||
out[row] = id != null_value && id == ancestor_id;
|
||||
}
|
||||
|
||||
query_count.fetch_add(rows, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void DirectDictionary<dictionary_key_type>::isInVectorVector(
|
||||
const PaddedPODArray<UInt64> & child_ids, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_ids, ancestor_ids, out);
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void DirectDictionary<dictionary_key_type>::isInVectorConstant(const PaddedPODArray<UInt64> & child_ids, const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_ids, ancestor_id, out);
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void DirectDictionary<dictionary_key_type>::isInConstantVector(const UInt64 child_id, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_id, ancestor_ids, out);
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
@ -166,20 +41,20 @@ ColumnPtr DirectDictionary<dictionary_key_type>::getColumn(
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
|
||||
dict_struct.validateKeyTypes(key_types);
|
||||
|
||||
Arena complex_key_arena;
|
||||
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
|
||||
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
|
||||
const auto requested_keys = extractor.extractAllKeys();
|
||||
|
||||
const DictionaryAttribute & attribute = dict_struct.getAttribute(attribute_name, result_type);
|
||||
DefaultValueProvider default_value_provider(attribute.null_value, default_values_column);
|
||||
|
||||
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, complex_key_arena);
|
||||
const auto & requested_keys = extractor.getKeys();
|
||||
|
||||
HashMap<KeyType, size_t> key_to_fetched_index;
|
||||
key_to_fetched_index.reserve(requested_keys.size());
|
||||
|
||||
auto fetched_from_storage = attribute.type->createColumn();
|
||||
|
||||
size_t fetched_key_index = 0;
|
||||
size_t requested_attribute_index = attribute_index_by_name.find(attribute_name)->second;
|
||||
size_t requested_attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
|
||||
|
||||
Columns block_key_columns;
|
||||
size_t dictionary_keys_size = dict_struct.getKeysNames().size();
|
||||
@ -191,26 +66,19 @@ ColumnPtr DirectDictionary<dictionary_key_type>::getColumn(
|
||||
|
||||
while (const auto block = stream->read())
|
||||
{
|
||||
auto block_columns = block.getColumns();
|
||||
|
||||
/// Split into keys columns and attribute columns
|
||||
for (size_t i = 0; i < dictionary_keys_size; ++i)
|
||||
{
|
||||
block_key_columns.emplace_back(*block_columns.begin());
|
||||
block_columns.erase(block_columns.begin());
|
||||
}
|
||||
block_key_columns.emplace_back(block.safeGetByPosition(i).column);
|
||||
|
||||
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, complex_key_arena);
|
||||
const auto & block_keys = block_keys_extractor.getKeys();
|
||||
size_t block_keys_size = block_keys.size();
|
||||
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena());
|
||||
auto block_keys = block_keys_extractor.extractAllKeys();
|
||||
|
||||
const auto & block_column = block.safeGetByPosition(dictionary_keys_size + requested_attribute_index).column;
|
||||
fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys_size);
|
||||
fetched_from_storage->insertRangeFrom(*block_column, 0, block_keys.size());
|
||||
|
||||
for (size_t block_key_index = 0; block_key_index < block_keys_size; ++block_key_index)
|
||||
for (size_t block_key_index = 0; block_key_index < block_keys.size(); ++block_key_index)
|
||||
{
|
||||
const auto & block_key = block_keys[block_key_index];
|
||||
|
||||
auto block_key = block_keys[block_key_index];
|
||||
key_to_fetched_index[block_key] = fetched_key_index;
|
||||
++fetched_key_index;
|
||||
}
|
||||
@ -223,10 +91,10 @@ ColumnPtr DirectDictionary<dictionary_key_type>::getColumn(
|
||||
Field value_to_insert;
|
||||
|
||||
size_t requested_keys_size = requested_keys.size();
|
||||
|
||||
auto result = fetched_from_storage->cloneEmpty();
|
||||
result->reserve(requested_keys_size);
|
||||
|
||||
|
||||
for (size_t requested_key_index = 0; requested_key_index < requested_keys_size; ++requested_key_index)
|
||||
{
|
||||
const auto requested_key = requested_keys[requested_key_index];
|
||||
@ -251,10 +119,9 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(const Columns &
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
|
||||
dict_struct.validateKeyTypes(key_types);
|
||||
|
||||
Arena complex_key_arena;
|
||||
|
||||
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(key_columns, complex_key_arena);
|
||||
const auto & requested_keys = requested_keys_extractor.getKeys();
|
||||
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
|
||||
DictionaryKeysExtractor<dictionary_key_type> requested_keys_extractor(key_columns, arena_holder.getComplexKeyArena());
|
||||
auto requested_keys = requested_keys_extractor.extractAllKeys();
|
||||
size_t requested_keys_size = requested_keys.size();
|
||||
|
||||
HashMap<KeyType, size_t> requested_key_to_index;
|
||||
@ -279,25 +146,24 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(const Columns &
|
||||
|
||||
while (const auto block = stream->read())
|
||||
{
|
||||
auto block_columns = block.getColumns();
|
||||
|
||||
/// Split into keys columns and attribute columns
|
||||
for (size_t i = 0; i < dictionary_keys_size; ++i)
|
||||
{
|
||||
block_key_columns.emplace_back(*block_columns.begin());
|
||||
block_columns.erase(block_columns.begin());
|
||||
}
|
||||
block_key_columns.emplace_back(block.safeGetByPosition(i).column);
|
||||
|
||||
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, complex_key_arena);
|
||||
const auto & block_keys = block_keys_extractor.getKeys();
|
||||
DictionaryKeysExtractor<dictionary_key_type> block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena());
|
||||
size_t block_keys_size = block_keys_extractor.getKeysSize();
|
||||
|
||||
for (const auto & block_key : block_keys)
|
||||
for (size_t i = 0; i < block_keys_size; ++i)
|
||||
{
|
||||
auto block_key = block_keys_extractor.extractCurrentKey();
|
||||
|
||||
const auto * it = requested_key_to_index.find(block_key);
|
||||
assert(it);
|
||||
|
||||
size_t result_data_found_index = it->getMapped();
|
||||
result_data[result_data_found_index] = true;
|
||||
|
||||
block_keys_extractor.rollbackCurrentKey();
|
||||
}
|
||||
|
||||
block_key_columns.clear();
|
||||
@ -310,6 +176,37 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(const Columns &
|
||||
return result;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
ColumnPtr DirectDictionary<dictionary_key_type>::getHierarchy(
|
||||
ColumnPtr key_column,
|
||||
const DataTypePtr & key_type) const
|
||||
{
|
||||
if (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type);
|
||||
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
|
||||
return result;
|
||||
}
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::isInHierarchy(
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr & key_type) const
|
||||
{
|
||||
if (dictionary_key_type == DictionaryKeyType::simple)
|
||||
{
|
||||
auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type);
|
||||
query_count.fetch_add(key_column->size(), std::memory_order_relaxed);
|
||||
return result;
|
||||
}
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
BlockInputStreamPtr DirectDictionary<dictionary_key_type>::getSourceBlockInputStream(
|
||||
const Columns & key_columns [[maybe_unused]],
|
||||
@ -342,32 +239,6 @@ BlockInputStreamPtr DirectDictionary<dictionary_key_type>::getSourceBlockInputSt
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
void DirectDictionary<dictionary_key_type>::setup()
|
||||
{
|
||||
/// TODO: Move this to DictionaryStructure
|
||||
size_t dictionary_attributes_size = dict_struct.attributes.size();
|
||||
for (size_t i = 0; i < dictionary_attributes_size; ++i)
|
||||
{
|
||||
const auto & attribute = dict_struct.attributes[i];
|
||||
attribute_index_by_name[attribute.name] = i;
|
||||
attribute_name_by_index[i] = attribute.name;
|
||||
|
||||
if (attribute.hierarchical)
|
||||
{
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"({}): hierarchical attributes are not supported for complex key direct dictionary",
|
||||
full_name);
|
||||
|
||||
hierarchical_attribute = &attribute;
|
||||
|
||||
if (attribute.underlying_type != AttributeUnderlyingType::utUInt64)
|
||||
throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
BlockInputStreamPtr DirectDictionary<dictionary_key_type>::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const
|
||||
{
|
||||
|
@ -18,11 +18,6 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type>
|
||||
class DirectDictionary final : public IDictionary
|
||||
{
|
||||
@ -33,8 +28,7 @@ public:
|
||||
DirectDictionary(
|
||||
const StorageID & dict_id_,
|
||||
const DictionaryStructure & dict_struct_,
|
||||
DictionarySourcePtr source_ptr_,
|
||||
BlockPtr saved_block_ = nullptr);
|
||||
DictionarySourcePtr source_ptr_);
|
||||
|
||||
std::string getTypeName() const override
|
||||
{
|
||||
@ -56,7 +50,7 @@ public:
|
||||
|
||||
std::shared_ptr<const IExternalLoadable> clone() const override
|
||||
{
|
||||
return std::make_shared<DirectDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), saved_block);
|
||||
return std::make_shared<DirectDictionary>(getDictionaryID(), dict_struct, source_ptr->clone());
|
||||
}
|
||||
|
||||
const IDictionarySource * getSource() const override { return source_ptr.get(); }
|
||||
@ -67,26 +61,9 @@ public:
|
||||
|
||||
bool isInjective(const std::string & attribute_name) const override
|
||||
{
|
||||
auto it = attribute_index_by_name.find(attribute_name);
|
||||
|
||||
if (it == attribute_index_by_name.end())
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"({}): no attribute with name ({}) in dictionary",
|
||||
full_name,
|
||||
attribute_name);
|
||||
|
||||
return dict_struct.attributes[it->second].injective;
|
||||
return dict_struct.getAttribute(attribute_name).injective;
|
||||
}
|
||||
|
||||
bool hasHierarchy() const override { return hierarchical_attribute; }
|
||||
|
||||
void toParent(const PaddedPODArray<UInt64> & ids, PaddedPODArray<UInt64> & out) const override;
|
||||
|
||||
void isInVectorVector(
|
||||
const PaddedPODArray<UInt64> & child_ids, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
|
||||
void isInVectorConstant(const PaddedPODArray<UInt64> & child_ids, const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const override;
|
||||
void isInConstantVector(const UInt64 child_id, const PaddedPODArray<UInt64> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
|
||||
|
||||
DictionaryKeyType getKeyType() const override { return dictionary_key_type; }
|
||||
|
||||
ColumnPtr getColumn(
|
||||
@ -98,30 +75,25 @@ public:
|
||||
|
||||
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
|
||||
|
||||
bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); }
|
||||
|
||||
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override;
|
||||
|
||||
ColumnUInt8::Ptr isInHierarchy(
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr & key_type) const override;
|
||||
|
||||
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
|
||||
|
||||
private:
|
||||
void setup();
|
||||
|
||||
BlockInputStreamPtr getSourceBlockInputStream(const Columns & key_columns, const PaddedPODArray<KeyType> & requested_keys) const;
|
||||
|
||||
UInt64 getValueOrNullByKey(const UInt64 & to_find) const;
|
||||
|
||||
template <typename ChildType, typename AncestorType>
|
||||
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
|
||||
|
||||
const DictionaryStructure dict_struct;
|
||||
const DictionarySourcePtr source_ptr;
|
||||
const DictionaryLifetime dict_lifetime;
|
||||
|
||||
std::unordered_map<std::string, size_t> attribute_index_by_name;
|
||||
std::unordered_map<size_t, std::string> attribute_name_by_index;
|
||||
|
||||
const DictionaryAttribute * hierarchical_attribute = nullptr;
|
||||
|
||||
mutable std::atomic<size_t> query_count{0};
|
||||
|
||||
BlockPtr saved_block;
|
||||
};
|
||||
|
||||
extern template class DirectDictionary<DictionaryKeyType::simple>;
|
||||
|
@ -1,20 +1,22 @@
|
||||
#include "FlatDictionary.h"
|
||||
|
||||
#include <Core/Defines.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
|
||||
#include "DictionaryBlockInputStream.h"
|
||||
#include "DictionaryFactory.h"
|
||||
#include <Dictionaries/DictionaryBlockInputStream.h>
|
||||
#include <Dictionaries/DictionaryFactory.h>
|
||||
#include <Dictionaries/HierarchyDictionariesUtils.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int DICTIONARY_IS_EMPTY;
|
||||
@ -24,7 +26,6 @@ namespace ErrorCodes
|
||||
static const auto initial_array_size = 1024;
|
||||
static const auto max_array_size = 500000;
|
||||
|
||||
|
||||
FlatDictionary::FlatDictionary(
|
||||
const StorageID & dict_id_,
|
||||
const DictionaryStructure & dict_struct_,
|
||||
@ -45,69 +46,6 @@ FlatDictionary::FlatDictionary(
|
||||
calculateBytesAllocated();
|
||||
}
|
||||
|
||||
|
||||
void FlatDictionary::toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const
|
||||
{
|
||||
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
|
||||
DictionaryDefaultValueExtractor<UInt64> extractor(null_value);
|
||||
|
||||
getItemsImpl<UInt64, UInt64>(
|
||||
*hierarchical_attribute,
|
||||
ids,
|
||||
[&](const size_t row, const UInt64 value) { out[row] = value; },
|
||||
extractor);
|
||||
}
|
||||
|
||||
|
||||
/// Allow to use single value in same way as array.
|
||||
static inline FlatDictionary::Key getAt(const PaddedPODArray<FlatDictionary::Key> & arr, const size_t idx)
|
||||
{
|
||||
return arr[idx];
|
||||
}
|
||||
static inline FlatDictionary::Key getAt(const FlatDictionary::Key & value, const size_t)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
template <typename ChildType, typename AncestorType>
|
||||
void FlatDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
|
||||
const auto & attr = std::get<ContainerType<Key>>(hierarchical_attribute->arrays);
|
||||
const auto rows = out.size();
|
||||
|
||||
size_t loaded_size = attr.size();
|
||||
for (const auto row : ext::range(0, rows))
|
||||
{
|
||||
auto id = getAt(child_ids, row);
|
||||
const auto ancestor_id = getAt(ancestor_ids, row);
|
||||
|
||||
for (size_t i = 0; id < loaded_size && id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i)
|
||||
id = attr[id];
|
||||
|
||||
out[row] = id != null_value && id == ancestor_id;
|
||||
}
|
||||
|
||||
query_count.fetch_add(rows, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
|
||||
void FlatDictionary::isInVectorVector(
|
||||
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_ids, ancestor_ids, out);
|
||||
}
|
||||
|
||||
void FlatDictionary::isInVectorConstant(const PaddedPODArray<Key> & child_ids, const Key ancestor_id, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_ids, ancestor_id, out);
|
||||
}
|
||||
|
||||
void FlatDictionary::isInConstantVector(const Key child_id, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
isInImpl(child_id, ancestor_ids, out);
|
||||
}
|
||||
|
||||
ColumnPtr FlatDictionary::getColumn(
|
||||
const std::string & attribute_name,
|
||||
const DataTypePtr & result_type,
|
||||
@ -117,14 +55,16 @@ ColumnPtr FlatDictionary::getColumn(
|
||||
{
|
||||
ColumnPtr result;
|
||||
|
||||
PaddedPODArray<Key> backup_storage;
|
||||
PaddedPODArray<UInt64> backup_storage;
|
||||
const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage);
|
||||
|
||||
auto size = ids.size();
|
||||
|
||||
const auto & attribute = getAttribute(attribute_name);
|
||||
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
|
||||
|
||||
size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
|
||||
const auto & attribute = attributes[attribute_index];
|
||||
|
||||
auto type_call = [&](const auto & dictionary_attribute_type)
|
||||
{
|
||||
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
|
||||
@ -183,10 +123,9 @@ ColumnPtr FlatDictionary::getColumn(
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const
|
||||
{
|
||||
PaddedPODArray<Key> backup_storage;
|
||||
PaddedPODArray<UInt64> backup_storage;
|
||||
const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage);
|
||||
|
||||
auto result = ColumnUInt8::create(ext::size(ids));
|
||||
@ -205,24 +144,118 @@ ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const Data
|
||||
return result;
|
||||
}
|
||||
|
||||
ColumnPtr FlatDictionary::getHierarchy(ColumnPtr key_column, const DataTypePtr &) const
|
||||
{
|
||||
PaddedPODArray<UInt64> keys_backup_storage;
|
||||
const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage);
|
||||
|
||||
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
|
||||
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
|
||||
|
||||
const UInt64 null_value = std::get<UInt64>(hierarchical_attribute.null_values);
|
||||
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.arrays);
|
||||
|
||||
auto is_key_valid_func = [&, this](auto & key)
|
||||
{
|
||||
return key < loaded_ids.size() && loaded_ids[key];
|
||||
};
|
||||
|
||||
auto get_parent_key_func = [&, this](auto & hierarchy_key)
|
||||
{
|
||||
std::optional<UInt64> result;
|
||||
|
||||
if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key])
|
||||
return result;
|
||||
|
||||
result = parent_keys[hierarchy_key];
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_key_func);
|
||||
|
||||
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
|
||||
|
||||
return dictionary_hierarchy_array;
|
||||
}
|
||||
|
||||
ColumnUInt8::Ptr FlatDictionary::isInHierarchy(
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr &) const
|
||||
{
|
||||
PaddedPODArray<UInt64> keys_backup_storage;
|
||||
const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage);
|
||||
|
||||
PaddedPODArray<UInt64> keys_in_backup_storage;
|
||||
const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage);
|
||||
|
||||
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
|
||||
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
|
||||
|
||||
const UInt64 null_value = std::get<UInt64>(hierarchical_attribute.null_values);
|
||||
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.arrays);
|
||||
|
||||
auto is_key_valid_func = [&, this](auto & key)
|
||||
{
|
||||
return key < loaded_ids.size() && loaded_ids[key];
|
||||
};
|
||||
|
||||
auto get_parent_key_func = [&, this](auto & hierarchy_key)
|
||||
{
|
||||
std::optional<UInt64> result;
|
||||
|
||||
if (hierarchy_key >= loaded_ids.size() || !loaded_ids[hierarchy_key])
|
||||
return result;
|
||||
|
||||
result = parent_keys[hierarchy_key];
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_key_func);
|
||||
|
||||
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ColumnPtr FlatDictionary::getDescendants(
|
||||
ColumnPtr key_column,
|
||||
const DataTypePtr &,
|
||||
size_t level) const
|
||||
{
|
||||
PaddedPODArray<UInt64> keys_backup;
|
||||
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
|
||||
|
||||
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
|
||||
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
|
||||
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.arrays);
|
||||
|
||||
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
|
||||
|
||||
for (size_t i = 0; i < parent_keys.size(); ++i)
|
||||
{
|
||||
auto parent_key = parent_keys[i];
|
||||
|
||||
if (loaded_ids[i])
|
||||
parent_to_child[parent_key].emplace_back(static_cast<UInt64>(i));
|
||||
}
|
||||
|
||||
auto result = getKeysDescendantsArray(keys, parent_to_child, level);
|
||||
|
||||
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void FlatDictionary::createAttributes()
|
||||
{
|
||||
const auto size = dict_struct.attributes.size();
|
||||
attributes.reserve(size);
|
||||
|
||||
for (const auto & attribute : dict_struct.attributes)
|
||||
{
|
||||
attribute_index_by_name.emplace(attribute.name, attributes.size());
|
||||
attributes.push_back(createAttribute(attribute, attribute.null_value));
|
||||
|
||||
if (attribute.hierarchical)
|
||||
{
|
||||
hierarchical_attribute = &attributes.back();
|
||||
|
||||
if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64)
|
||||
throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FlatDictionary::blockToAttributes(const Block & block)
|
||||
@ -271,7 +304,7 @@ void FlatDictionary::updateData()
|
||||
const auto & saved_id_column = *saved_block->safeGetByPosition(0).column;
|
||||
const auto & update_id_column = *block.safeGetByPosition(0).column;
|
||||
|
||||
std::unordered_map<Key, std::vector<size_t>> update_ids;
|
||||
std::unordered_map<UInt64, std::vector<size_t>> update_ids;
|
||||
for (size_t row = 0; row < update_id_column.size(); ++row)
|
||||
{
|
||||
const auto id = update_id_column.get64(row);
|
||||
@ -280,7 +313,7 @@ void FlatDictionary::updateData()
|
||||
|
||||
const size_t saved_rows = saved_id_column.size();
|
||||
IColumn::Filter filter(saved_rows);
|
||||
std::unordered_map<Key, std::vector<size_t>>::iterator it;
|
||||
std::unordered_map<UInt64, std::vector<size_t>>::iterator it;
|
||||
|
||||
for (size_t row = 0; row < saved_id_column.size(); ++row)
|
||||
{
|
||||
@ -385,7 +418,6 @@ void FlatDictionary::createAttributeImpl<String>(Attribute & attribute, const Fi
|
||||
attribute.arrays.emplace<ContainerType<StringRef>>(initial_array_size, StringRef(string_in_arena, string.size()));
|
||||
}
|
||||
|
||||
|
||||
FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value)
|
||||
{
|
||||
auto nullable_set = attribute.is_nullable ? std::make_optional<NullableSet>() : std::optional<NullableSet>{};
|
||||
@ -408,7 +440,7 @@ FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttrib
|
||||
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
|
||||
void FlatDictionary::getItemsImpl(
|
||||
const Attribute & attribute,
|
||||
const PaddedPODArray<Key> & ids,
|
||||
const PaddedPODArray<UInt64> & ids,
|
||||
ValueSetter && set_value,
|
||||
DefaultValueExtractor & default_value_extractor) const
|
||||
{
|
||||
@ -425,7 +457,7 @@ void FlatDictionary::getItemsImpl(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void FlatDictionary::resize(Attribute & attribute, const Key id)
|
||||
void FlatDictionary::resize(Attribute & attribute, const UInt64 id)
|
||||
{
|
||||
if (id >= max_array_size)
|
||||
throw Exception{full_name + ": identifier should be less than " + toString(max_array_size), ErrorCodes::ARGUMENT_OUT_OF_BOUND};
|
||||
@ -440,7 +472,7 @@ void FlatDictionary::resize(Attribute & attribute, const Key id)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T & value)
|
||||
void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value)
|
||||
{
|
||||
auto & array = std::get<ContainerType<T>>(attribute.arrays);
|
||||
array[id] = value;
|
||||
@ -448,13 +480,13 @@ void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id,
|
||||
}
|
||||
|
||||
template <>
|
||||
void FlatDictionary::setAttributeValueImpl<String>(Attribute & attribute, const Key id, const String & value)
|
||||
void FlatDictionary::setAttributeValueImpl<String>(Attribute & attribute, const UInt64 id, const String & value)
|
||||
{
|
||||
const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size());
|
||||
setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()});
|
||||
}
|
||||
|
||||
void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value)
|
||||
void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value)
|
||||
{
|
||||
auto type_call = [&](const auto &dictionary_attribute_type)
|
||||
{
|
||||
@ -484,21 +516,11 @@ void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, cons
|
||||
callOnDictionaryAttributeType(attribute.type, type_call);
|
||||
}
|
||||
|
||||
|
||||
const FlatDictionary::Attribute & FlatDictionary::getAttribute(const std::string & attribute_name) const
|
||||
{
|
||||
const auto it = attribute_index_by_name.find(attribute_name);
|
||||
if (it == std::end(attribute_index_by_name))
|
||||
throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
|
||||
|
||||
return attributes[it->second];
|
||||
}
|
||||
|
||||
PaddedPODArray<FlatDictionary::Key> FlatDictionary::getIds() const
|
||||
PaddedPODArray<UInt64> FlatDictionary::getIds() const
|
||||
{
|
||||
const auto ids_count = ext::size(loaded_ids);
|
||||
|
||||
PaddedPODArray<Key> ids;
|
||||
PaddedPODArray<UInt64> ids;
|
||||
ids.reserve(ids_count);
|
||||
|
||||
for (auto idx : ext::range(0, ids_count))
|
||||
@ -509,8 +531,7 @@ PaddedPODArray<FlatDictionary::Key> FlatDictionary::getIds() const
|
||||
|
||||
BlockInputStreamPtr FlatDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
|
||||
{
|
||||
using BlockInputStreamType = DictionaryBlockInputStream<Key>;
|
||||
return std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, getIds(), column_names);
|
||||
return std::make_shared<DictionaryBlockInputStream>(shared_from_this(), max_block_size, getIds(), column_names);
|
||||
}
|
||||
|
||||
void registerDictionaryFlat(DictionaryFactory & factory)
|
||||
|
@ -59,18 +59,9 @@ public:
|
||||
|
||||
bool isInjective(const std::string & attribute_name) const override
|
||||
{
|
||||
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
|
||||
return dict_struct.getAttribute(attribute_name).injective;
|
||||
}
|
||||
|
||||
bool hasHierarchy() const override { return hierarchical_attribute; }
|
||||
|
||||
void toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const override;
|
||||
|
||||
void isInVectorVector(
|
||||
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
|
||||
void isInVectorConstant(const PaddedPODArray<Key> & child_ids, const Key ancestor_id, PaddedPODArray<UInt8> & out) const override;
|
||||
void isInConstantVector(const Key child_id, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
|
||||
|
||||
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; }
|
||||
|
||||
ColumnPtr getColumn(
|
||||
@ -82,13 +73,27 @@ public:
|
||||
|
||||
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
|
||||
|
||||
bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); }
|
||||
|
||||
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override;
|
||||
|
||||
ColumnUInt8::Ptr isInHierarchy(
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr & key_type) const override;
|
||||
|
||||
ColumnPtr getDescendants(
|
||||
ColumnPtr key_column,
|
||||
const DataTypePtr & key_type,
|
||||
size_t level) const override;
|
||||
|
||||
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
|
||||
|
||||
private:
|
||||
template <typename Value>
|
||||
using ContainerType = PaddedPODArray<Value>;
|
||||
|
||||
using NullableSet = HashSet<Key, DefaultHash<Key>>;
|
||||
using NullableSet = HashSet<UInt64, DefaultHash<UInt64>>;
|
||||
|
||||
struct Attribute final
|
||||
{
|
||||
@ -151,24 +156,24 @@ private:
|
||||
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
|
||||
void getItemsImpl(
|
||||
const Attribute & attribute,
|
||||
const PaddedPODArray<Key> & ids,
|
||||
const PaddedPODArray<UInt64> & ids,
|
||||
ValueSetter && set_value,
|
||||
DefaultValueExtractor & default_value_extractor) const;
|
||||
|
||||
template <typename T>
|
||||
void resize(Attribute & attribute, const Key id);
|
||||
void resize(Attribute & attribute, const UInt64 id);
|
||||
|
||||
template <typename T>
|
||||
void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value);
|
||||
void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const T & value);
|
||||
|
||||
void setAttributeValue(Attribute & attribute, const Key id, const Field & value);
|
||||
void setAttributeValue(Attribute & attribute, const UInt64 id, const Field & value);
|
||||
|
||||
const Attribute & getAttribute(const std::string & attribute_name) const;
|
||||
|
||||
template <typename ChildType, typename AncestorType>
|
||||
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
|
||||
|
||||
PaddedPODArray<Key> getIds() const;
|
||||
PaddedPODArray<UInt64> getIds() const;
|
||||
|
||||
const DictionaryStructure dict_struct;
|
||||
const DictionarySourcePtr source_ptr;
|
||||
@ -177,7 +182,6 @@ private:
|
||||
|
||||
std::map<std::string, size_t> attribute_index_by_name;
|
||||
std::vector<Attribute> attributes;
|
||||
const Attribute * hierarchical_attribute = nullptr;
|
||||
std::vector<bool> loaded_ids;
|
||||
|
||||
size_t bytes_allocated = 0;
|
||||
@ -185,6 +189,7 @@ private:
|
||||
size_t bucket_count = 0;
|
||||
mutable std::atomic<size_t> query_count{0};
|
||||
|
||||
/// TODO: Remove
|
||||
BlockPtr saved_block;
|
||||
};
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -4,17 +4,21 @@
|
||||
#include <memory>
|
||||
#include <variant>
|
||||
#include <optional>
|
||||
#include <Columns/ColumnDecimal.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Core/Block.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
|
||||
#include <sparsehash/sparse_hash_map>
|
||||
#include <ext/range.h>
|
||||
#include "DictionaryStructure.h"
|
||||
#include "IDictionary.h"
|
||||
#include "IDictionarySource.h"
|
||||
#include "DictionaryHelpers.h"
|
||||
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
#include <Core/Block.h>
|
||||
|
||||
#include <Columns/ColumnDecimal.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
|
||||
#include <Dictionaries/DictionaryStructure.h>
|
||||
#include <Dictionaries/IDictionary.h>
|
||||
#include <Dictionaries/IDictionarySource.h>
|
||||
#include <Dictionaries/DictionaryHelpers.h>
|
||||
|
||||
/** This dictionary stores all content in a hash table in memory
|
||||
* (a separate Key -> Value map for each attribute)
|
||||
@ -24,19 +28,32 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
template <DictionaryKeyType dictionary_key_type, bool sparse>
|
||||
class HashedDictionary final : public IDictionary
|
||||
{
|
||||
public:
|
||||
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
|
||||
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by hashed dictionary");
|
||||
|
||||
HashedDictionary(
|
||||
const StorageID & dict_id_,
|
||||
const DictionaryStructure & dict_struct_,
|
||||
DictionarySourcePtr source_ptr_,
|
||||
const DictionaryLifetime dict_lifetime_,
|
||||
bool require_nonempty_,
|
||||
bool sparse_,
|
||||
BlockPtr saved_block_ = nullptr);
|
||||
|
||||
std::string getTypeName() const override { return sparse ? "SparseHashed" : "Hashed"; }
|
||||
std::string getTypeName() const override
|
||||
{
|
||||
if constexpr (dictionary_key_type == DictionaryKeyType::simple && sparse)
|
||||
return "SparseHashed";
|
||||
else if constexpr (dictionary_key_type == DictionaryKeyType::simple && !sparse)
|
||||
return "Hashed";
|
||||
else if constexpr (dictionary_key_type == DictionaryKeyType::complex && sparse)
|
||||
return "ComplexKeySpareseHashed";
|
||||
else
|
||||
return "ComplexKeyHashed";
|
||||
}
|
||||
|
||||
size_t getBytesAllocated() const override { return bytes_allocated; }
|
||||
|
||||
@ -50,7 +67,7 @@ public:
|
||||
|
||||
std::shared_ptr<const IExternalLoadable> clone() const override
|
||||
{
|
||||
return std::make_shared<HashedDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block);
|
||||
return std::make_shared<HashedDictionary<dictionary_key_type, sparse>>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block);
|
||||
}
|
||||
|
||||
const IDictionarySource * getSource() const override { return source_ptr.get(); }
|
||||
@ -61,14 +78,10 @@ public:
|
||||
|
||||
bool isInjective(const std::string & attribute_name) const override
|
||||
{
|
||||
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
|
||||
return dict_struct.getAttribute(attribute_name).injective;
|
||||
}
|
||||
|
||||
bool hasHierarchy() const override { return hierarchical_attribute; }
|
||||
|
||||
void toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const override;
|
||||
|
||||
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; }
|
||||
DictionaryKeyType getKeyType() const override { return dictionary_key_type; }
|
||||
|
||||
ColumnPtr getColumn(
|
||||
const std::string& attribute_name,
|
||||
@ -79,36 +92,52 @@ public:
|
||||
|
||||
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
|
||||
|
||||
void isInVectorVector(
|
||||
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
|
||||
void isInVectorConstant(const PaddedPODArray<Key> & child_ids, const Key ancestor_id, PaddedPODArray<UInt8> & out) const override;
|
||||
void isInConstantVector(const Key child_id, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
|
||||
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); }
|
||||
|
||||
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & hierarchy_attribute_type) const override;
|
||||
|
||||
ColumnUInt8::Ptr isInHierarchy(
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr & key_type) const override;
|
||||
|
||||
ColumnPtr getDescendants(
|
||||
ColumnPtr key_column,
|
||||
const DataTypePtr & key_type,
|
||||
size_t level) const override;
|
||||
|
||||
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
|
||||
|
||||
private:
|
||||
template <typename Value>
|
||||
using CollectionType = HashMap<UInt64, Value>;
|
||||
template <typename Value>
|
||||
using CollectionPtrType = std::unique_ptr<CollectionType<Value>>;
|
||||
using CollectionTypeNonSparse = std::conditional_t<
|
||||
dictionary_key_type == DictionaryKeyType::simple,
|
||||
HashMap<UInt64, Value>,
|
||||
HashMapWithSavedHash<StringRef, Value, DefaultHash<StringRef>>>;
|
||||
|
||||
#if !defined(ARCADIA_BUILD)
|
||||
template <typename Value>
|
||||
using SparseCollectionType = google::sparse_hash_map<UInt64, Value, DefaultHash<UInt64>>;
|
||||
template <typename Key, typename Value>
|
||||
using SparseHashMap = google::sparse_hash_map<Key, Value, DefaultHash<Key>>;
|
||||
#else
|
||||
template <typename Value>
|
||||
using SparseCollectionType = google::sparsehash::sparse_hash_map<UInt64, Value, DefaultHash<UInt64>>;
|
||||
template <typename Key, typename Value>
|
||||
using SparseHashMap = google::sparsehash::sparse_hash_map<Key, Value, DefaultHash<Key>>;
|
||||
#endif
|
||||
|
||||
template <typename Value>
|
||||
using SparseCollectionPtrType = std::unique_ptr<SparseCollectionType<Value>>;
|
||||
using CollectionTypeSparse = std::conditional_t<
|
||||
dictionary_key_type == DictionaryKeyType::simple,
|
||||
SparseHashMap<UInt64, Value>,
|
||||
SparseHashMap<StringRef, Value>>;
|
||||
|
||||
using NullableSet = HashSet<Key, DefaultHash<Key>>;
|
||||
template <typename Value>
|
||||
using CollectionType = std::conditional_t<sparse, CollectionTypeSparse<Value>, CollectionTypeNonSparse<Value>>;
|
||||
|
||||
using NullableSet = HashSet<KeyType, DefaultHash<KeyType>>;
|
||||
|
||||
struct Attribute final
|
||||
{
|
||||
AttributeUnderlyingType type;
|
||||
std::optional<NullableSet> nullable_set;
|
||||
std::optional<NullableSet> is_nullable_set;
|
||||
|
||||
std::variant<
|
||||
UInt8,
|
||||
@ -127,41 +156,27 @@ private:
|
||||
Float64,
|
||||
StringRef>
|
||||
null_values;
|
||||
|
||||
std::variant<
|
||||
CollectionPtrType<UInt8>,
|
||||
CollectionPtrType<UInt16>,
|
||||
CollectionPtrType<UInt32>,
|
||||
CollectionPtrType<UInt64>,
|
||||
CollectionPtrType<UInt128>,
|
||||
CollectionPtrType<Int8>,
|
||||
CollectionPtrType<Int16>,
|
||||
CollectionPtrType<Int32>,
|
||||
CollectionPtrType<Int64>,
|
||||
CollectionPtrType<Decimal32>,
|
||||
CollectionPtrType<Decimal64>,
|
||||
CollectionPtrType<Decimal128>,
|
||||
CollectionPtrType<Float32>,
|
||||
CollectionPtrType<Float64>,
|
||||
CollectionPtrType<StringRef>>
|
||||
maps;
|
||||
std::variant<
|
||||
SparseCollectionPtrType<UInt8>,
|
||||
SparseCollectionPtrType<UInt16>,
|
||||
SparseCollectionPtrType<UInt32>,
|
||||
SparseCollectionPtrType<UInt64>,
|
||||
SparseCollectionPtrType<UInt128>,
|
||||
SparseCollectionPtrType<Int8>,
|
||||
SparseCollectionPtrType<Int16>,
|
||||
SparseCollectionPtrType<Int32>,
|
||||
SparseCollectionPtrType<Int64>,
|
||||
SparseCollectionPtrType<Decimal32>,
|
||||
SparseCollectionPtrType<Decimal64>,
|
||||
SparseCollectionPtrType<Decimal128>,
|
||||
SparseCollectionPtrType<Float32>,
|
||||
SparseCollectionPtrType<Float64>,
|
||||
SparseCollectionPtrType<StringRef>>
|
||||
sparse_maps;
|
||||
CollectionType<UInt8>,
|
||||
CollectionType<UInt16>,
|
||||
CollectionType<UInt32>,
|
||||
CollectionType<UInt64>,
|
||||
CollectionType<UInt128>,
|
||||
CollectionType<Int8>,
|
||||
CollectionType<Int16>,
|
||||
CollectionType<Int32>,
|
||||
CollectionType<Int64>,
|
||||
CollectionType<Decimal32>,
|
||||
CollectionType<Decimal64>,
|
||||
CollectionType<Decimal128>,
|
||||
CollectionType<Float32>,
|
||||
CollectionType<Float64>,
|
||||
CollectionType<StringRef>>
|
||||
container;
|
||||
|
||||
std::unique_ptr<Arena> string_arena;
|
||||
|
||||
};
|
||||
|
||||
void createAttributes();
|
||||
@ -172,76 +187,47 @@ private:
|
||||
|
||||
void loadData();
|
||||
|
||||
template <typename T>
|
||||
void addAttributeSize(const Attribute & attribute);
|
||||
|
||||
void calculateBytesAllocated();
|
||||
|
||||
template <typename T>
|
||||
void createAttributeImpl(Attribute & attribute, const Field & null_value);
|
||||
|
||||
Attribute createAttribute(const DictionaryAttribute& attribute, const Field & null_value);
|
||||
|
||||
template <typename AttributeType, typename OutputType, typename MapType, typename ValueSetter, typename DefaultValueExtractor>
|
||||
void getItemsAttrImpl(
|
||||
const MapType & attr,
|
||||
const PaddedPODArray<Key> & ids,
|
||||
ValueSetter && set_value,
|
||||
DefaultValueExtractor & default_value_extractor) const;
|
||||
|
||||
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultValueExtractor>
|
||||
template <typename AttributeType, typename ValueSetter, typename NullableValueSetter, typename DefaultValueExtractor>
|
||||
void getItemsImpl(
|
||||
const Attribute & attribute,
|
||||
const PaddedPODArray<Key> & ids,
|
||||
DictionaryKeysExtractor<dictionary_key_type> & keys_extractor,
|
||||
ValueSetter && set_value,
|
||||
NullableValueSetter && set_nullable_value,
|
||||
DefaultValueExtractor & default_value_extractor) const;
|
||||
|
||||
template <typename T>
|
||||
bool setAttributeValueImpl(Attribute & attribute, const Key id, const T value);
|
||||
template <typename GetContainerFunc>
|
||||
void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func);
|
||||
|
||||
bool setAttributeValue(Attribute & attribute, const Key id, const Field & value);
|
||||
template <typename GetContainerFunc>
|
||||
void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const;
|
||||
|
||||
const Attribute & getAttribute(const std::string & attribute_name) const;
|
||||
|
||||
template <typename T>
|
||||
void has(const Attribute & attribute, const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8> & out) const;
|
||||
|
||||
template <typename T, typename AttrType>
|
||||
PaddedPODArray<Key> getIdsAttrImpl(const AttrType & attr) const;
|
||||
template <typename T>
|
||||
PaddedPODArray<Key> getIds(const Attribute & attribute) const;
|
||||
|
||||
PaddedPODArray<Key> getIds() const;
|
||||
|
||||
/// Preallocates the hashtable based on query progress
|
||||
/// (Only while loading all data).
|
||||
///
|
||||
/// @see preallocate
|
||||
template <typename T>
|
||||
void resize(Attribute & attribute, size_t added_rows);
|
||||
void resize(size_t added_rows);
|
||||
|
||||
template <typename AttrType, typename ChildType, typename AncestorType>
|
||||
void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
|
||||
template <typename ChildType, typename AncestorType>
|
||||
void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
|
||||
StringRef copyKeyInArena(StringRef key);
|
||||
|
||||
const DictionaryStructure dict_struct;
|
||||
const DictionarySourcePtr source_ptr;
|
||||
const DictionaryLifetime dict_lifetime;
|
||||
const bool require_nonempty;
|
||||
const bool sparse;
|
||||
|
||||
std::map<std::string, size_t> attribute_index_by_name;
|
||||
std::vector<Attribute> attributes;
|
||||
const Attribute * hierarchical_attribute = nullptr;
|
||||
|
||||
size_t bytes_allocated = 0;
|
||||
size_t element_count = 0;
|
||||
size_t bucket_count = 0;
|
||||
mutable std::atomic<size_t> query_count{0};
|
||||
|
||||
/// TODO: Remove
|
||||
BlockPtr saved_block;
|
||||
Arena complex_key_arena;
|
||||
};
|
||||
|
||||
extern template class HashedDictionary<DictionaryKeyType::simple, false>;
|
||||
extern template class HashedDictionary<DictionaryKeyType::simple, true>;
|
||||
|
||||
extern template class HashedDictionary<DictionaryKeyType::complex, false>;
|
||||
extern template class HashedDictionary<DictionaryKeyType::complex, true>;
|
||||
|
||||
}
|
||||
|
156
src/Dictionaries/HierarchyDictionariesUtils.cpp
Normal file
156
src/Dictionaries/HierarchyDictionariesUtils.cpp
Normal file
@ -0,0 +1,156 @@
|
||||
#include "HierarchyDictionariesUtils.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int UNSUPPORTED_METHOD;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
/** In case of cache or direct dictionary we does not have structure with child to parent representation.
|
||||
* This function build such structure calling getColumn for initial keys to request and for next keys in hierarchy,
|
||||
* until all keys are requested or result key is null value.
|
||||
* To distinguish null value key and key that is not present in dictionary, we use special default value column
|
||||
* with max UInt64 value, if result column key has such value we assume that current key is not presented in dictionary storage.
|
||||
*/
|
||||
HashMap<UInt64, UInt64> getChildToParentHierarchyMapImpl(
|
||||
const IDictionary * dictionary,
|
||||
const DictionaryAttribute & hierarchical_attribute,
|
||||
const PaddedPODArray<UInt64> & initial_keys_to_request,
|
||||
const DataTypePtr & key_type)
|
||||
{
|
||||
UInt64 null_value = hierarchical_attribute.null_value.get<UInt64>();
|
||||
|
||||
ColumnPtr key_to_request_column = ColumnVector<UInt64>::create();
|
||||
auto * key_to_request_column_typed = static_cast<ColumnVector<UInt64> *>(key_to_request_column->assumeMutable().get());
|
||||
|
||||
UInt64 key_not_in_storage_value = std::numeric_limits<UInt64>::max();
|
||||
ColumnPtr key_not_in_storage_default_value_column = ColumnVector<UInt64>::create(initial_keys_to_request.size(), key_not_in_storage_value);
|
||||
|
||||
PaddedPODArray<UInt64> & keys_to_request = key_to_request_column_typed->getData();
|
||||
keys_to_request.assign(initial_keys_to_request);
|
||||
|
||||
PaddedPODArray<UInt64> next_keys_to_request;
|
||||
HashSet<UInt64> already_requested_keys;
|
||||
|
||||
HashMap<UInt64, UInt64> child_to_parent_key;
|
||||
|
||||
while (!keys_to_request.empty())
|
||||
{
|
||||
child_to_parent_key.reserve(child_to_parent_key.size() + keys_to_request.size());
|
||||
|
||||
auto parent_key_column = dictionary->getColumn(
|
||||
hierarchical_attribute.name,
|
||||
hierarchical_attribute.type,
|
||||
{key_to_request_column},
|
||||
{key_type},
|
||||
key_not_in_storage_default_value_column);
|
||||
|
||||
const auto * parent_key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*parent_key_column);
|
||||
if (!parent_key_column_typed)
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
|
||||
"Parent key column should be UInt64. Actual ({})",
|
||||
hierarchical_attribute.type->getName());
|
||||
|
||||
const auto & parent_keys = parent_key_column_typed->getData();
|
||||
next_keys_to_request.clear();
|
||||
|
||||
for (size_t i = 0; i < keys_to_request.size(); ++i)
|
||||
{
|
||||
auto key = keys_to_request[i];
|
||||
auto parent_key = parent_keys[i];
|
||||
|
||||
if (parent_key == key_not_in_storage_value)
|
||||
continue;
|
||||
|
||||
child_to_parent_key[key] = parent_key;
|
||||
|
||||
if (parent_key == null_value ||
|
||||
already_requested_keys.find(parent_key) != nullptr)
|
||||
continue;
|
||||
|
||||
already_requested_keys.insert(parent_key);
|
||||
next_keys_to_request.emplace_back(parent_key);
|
||||
}
|
||||
|
||||
keys_to_request.clear();
|
||||
keys_to_request.assign(next_keys_to_request);
|
||||
}
|
||||
|
||||
return child_to_parent_key;
|
||||
}
|
||||
}
|
||||
|
||||
ColumnPtr getKeysHierarchyDefaultImplementation(const IDictionary * dictionary, ColumnPtr key_column, const DataTypePtr & key_type)
|
||||
{
|
||||
key_column = key_column->convertToFullColumnIfConst();
|
||||
const auto * key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*key_column);
|
||||
if (!key_column_typed)
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64");
|
||||
|
||||
const auto & dictionary_structure = dictionary->getStructure();
|
||||
size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index;
|
||||
const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index];
|
||||
|
||||
const PaddedPODArray<UInt64> & requested_keys = key_column_typed->getData();
|
||||
HashMap<UInt64, UInt64> key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type);
|
||||
|
||||
auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; };
|
||||
|
||||
auto get_parent_key_func = [&](auto & key)
|
||||
{
|
||||
auto it = key_to_parent_key.find(key);
|
||||
std::optional<UInt64> result = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
|
||||
return result;
|
||||
};
|
||||
|
||||
UInt64 null_value = hierarchical_attribute.null_value.get<UInt64>();
|
||||
|
||||
auto dictionary_hierarchy_array = getKeysHierarchyArray(requested_keys, null_value, is_key_valid_func, get_parent_key_func);
|
||||
return dictionary_hierarchy_array;
|
||||
}
|
||||
|
||||
ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation(
|
||||
const IDictionary * dictionary,
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr & key_type)
|
||||
{
|
||||
key_column = key_column->convertToFullColumnIfConst();
|
||||
in_key_column = in_key_column->convertToFullColumnIfConst();
|
||||
|
||||
const auto * key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*key_column);
|
||||
if (!key_column_typed)
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64");
|
||||
|
||||
const auto * in_key_column_typed = checkAndGetColumn<ColumnVector<UInt64>>(*in_key_column);
|
||||
if (!in_key_column_typed)
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64");
|
||||
|
||||
const auto & dictionary_structure = dictionary->getStructure();
|
||||
size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index;
|
||||
const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index];
|
||||
|
||||
const PaddedPODArray<UInt64> & requested_keys = key_column_typed->getData();
|
||||
HashMap<UInt64, UInt64> key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type);
|
||||
|
||||
auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; };
|
||||
|
||||
auto get_parent_key_func = [&](auto & key)
|
||||
{
|
||||
auto it = key_to_parent_key.find(key);
|
||||
std::optional<UInt64> result = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
|
||||
return result;
|
||||
};
|
||||
|
||||
UInt64 null_value = hierarchical_attribute.null_value.get<UInt64>();
|
||||
const auto & in_keys = in_key_column_typed->getData();
|
||||
|
||||
auto result = getKeysIsInHierarchyColumn(requested_keys, in_keys, null_value, is_key_valid_func, get_parent_key_func);
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
467
src/Dictionaries/HierarchyDictionariesUtils.h
Normal file
467
src/Dictionaries/HierarchyDictionariesUtils.h
Normal file
@ -0,0 +1,467 @@
|
||||
#pragma once
|
||||
|
||||
#include <common/types.h>
|
||||
#include <Common/PODArray.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
|
||||
#include <Columns/IColumn.h>
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
|
||||
#include <Dictionaries/IDictionary.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <typename KeyType>
|
||||
struct ElementsAndOffsets
|
||||
{
|
||||
PaddedPODArray<KeyType> elements;
|
||||
PaddedPODArray<IColumn::Offset> offsets;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct IsKeyValidFuncInterface
|
||||
{
|
||||
bool operator()(T key [[maybe_unused]]) { return false; }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct GetParentKeyFuncInterface
|
||||
{
|
||||
std::optional<T> operator()(T key [[maybe_unused]]) { return {}; }
|
||||
};
|
||||
|
||||
/** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client.
|
||||
* Hierarchy iteration is stopped if key equals null value, get_parent_key_func returns null optional, or hierarchy depth
|
||||
* greater or equal than DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH.
|
||||
* IsKeyValidFunc used for each input hierarchy key, if it returns false result hierarchy for that key will have size 0.
|
||||
* Hierarchy result is ElementsAndOffsets structure, for each element there is hierarchy array,
|
||||
* with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0).
|
||||
*
|
||||
* Example:
|
||||
* id parent_id
|
||||
* 1 0
|
||||
* 2 1
|
||||
* 3 1
|
||||
* 4 2
|
||||
*
|
||||
* If hierarchy_null_value will be 0. Requested keys [1, 2, 3, 4, 5].
|
||||
* Result: [1], [2, 1], [3, 1], [4, 2, 1], []
|
||||
* Elements: [1, 2, 1, 3, 1, 4, 2, 1]
|
||||
* Offsets: [1, 3, 5, 8, 8]
|
||||
*/
|
||||
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
|
||||
ElementsAndOffsets<KeyType> getHierarchy(
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
const KeyType & hierarchy_null_value,
|
||||
IsKeyValidFunc && is_key_valid_func,
|
||||
GetParentKeyFunc && get_parent_key_func)
|
||||
{
|
||||
size_t hierarchy_keys_size = keys.size();
|
||||
|
||||
PaddedPODArray<KeyType> elements;
|
||||
elements.reserve(hierarchy_keys_size);
|
||||
|
||||
PaddedPODArray<IColumn::Offset> offsets;
|
||||
offsets.reserve(hierarchy_keys_size);
|
||||
|
||||
struct OffsetInArray
|
||||
{
|
||||
size_t offset_index;
|
||||
size_t array_element_offset;
|
||||
};
|
||||
|
||||
HashMap<KeyType, OffsetInArray> already_processes_keys_to_offset;
|
||||
already_processes_keys_to_offset.reserve(hierarchy_keys_size);
|
||||
|
||||
for (size_t i = 0; i < hierarchy_keys_size; ++i)
|
||||
{
|
||||
auto hierarchy_key = keys[i];
|
||||
size_t current_hierarchy_depth = 0;
|
||||
|
||||
bool is_key_valid = std::forward<IsKeyValidFunc>(is_key_valid_func)(hierarchy_key);
|
||||
|
||||
if (!is_key_valid)
|
||||
{
|
||||
offsets.emplace_back(elements.size());
|
||||
continue;
|
||||
}
|
||||
|
||||
while (true)
|
||||
{
|
||||
const auto * it = already_processes_keys_to_offset.find(hierarchy_key);
|
||||
|
||||
if (it)
|
||||
{
|
||||
const auto & index = it->getMapped();
|
||||
|
||||
size_t offset = index.offset_index;
|
||||
|
||||
bool is_loop = (offset == offsets.size());
|
||||
|
||||
if (unlikely(is_loop))
|
||||
break;
|
||||
|
||||
size_t array_element_offset = index.array_element_offset;
|
||||
|
||||
size_t previous_offset_size = offset > 0 ? offsets[offset - 1] : 0;
|
||||
size_t start_index = previous_offset_size + array_element_offset;
|
||||
size_t end_index = offsets[offset];
|
||||
|
||||
elements.insertFromItself(elements.begin() + start_index, elements.begin() + end_index);
|
||||
break;
|
||||
}
|
||||
|
||||
if (hierarchy_key == hierarchy_null_value || current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
|
||||
break;
|
||||
|
||||
already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth};
|
||||
elements.emplace_back(hierarchy_key);
|
||||
++current_hierarchy_depth;
|
||||
|
||||
std::optional<KeyType> parent_key = std::forward<GetParentKeyFunc>(get_parent_key_func)(hierarchy_key);
|
||||
|
||||
if (!parent_key.has_value())
|
||||
break;
|
||||
|
||||
hierarchy_key = *parent_key;
|
||||
}
|
||||
|
||||
offsets.emplace_back(elements.size());
|
||||
}
|
||||
|
||||
ElementsAndOffsets<KeyType> result = {std::move(elements), std::move(offsets)};
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Returns array with UInt8 represent if key from in_keys array is in hierarchy of key from keys column.
|
||||
* If value in result array is 1 that means key from in_keys array is in hierarchy of key from
|
||||
* keys array with same index, 0 therwise.
|
||||
* For getting hierarchy implementation uses getKeysHierarchy function.
|
||||
*
|
||||
* Not: keys size must be equal to in_keys_size.
|
||||
*/
|
||||
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
|
||||
PaddedPODArray<UInt8> getIsInHierarchy(
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
const PaddedPODArray<KeyType> & in_keys,
|
||||
const KeyType & hierarchy_null_value,
|
||||
IsKeyValidFunc && is_key_valid_func,
|
||||
GetParentKeyFunc && get_parent_func)
|
||||
{
|
||||
assert(keys.size() == in_keys.size());
|
||||
|
||||
PaddedPODArray<UInt8> result;
|
||||
result.resize_fill(keys.size());
|
||||
|
||||
detail::ElementsAndOffsets<KeyType> hierarchy = detail::getHierarchy(
|
||||
keys,
|
||||
hierarchy_null_value,
|
||||
std::forward<IsKeyValidFunc>(is_key_valid_func),
|
||||
std::forward<GetParentKeyFunc>(get_parent_func));
|
||||
|
||||
auto & offsets = hierarchy.offsets;
|
||||
auto & elements = hierarchy.elements;
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
size_t i_elements_start = i > 0 ? offsets[i - 1] : 0;
|
||||
size_t i_elements_end = offsets[i];
|
||||
|
||||
auto & key_to_find = in_keys[i];
|
||||
|
||||
const auto * begin = elements.begin() + i_elements_start;
|
||||
const auto * end = elements.begin() + i_elements_end;
|
||||
|
||||
const auto * it = std::find(begin, end, key_to_find);
|
||||
|
||||
bool contains_key = (it != end);
|
||||
result[i] = contains_key;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct GetAllDescendantsStrategy { size_t level = 0; };
|
||||
struct GetDescendantsAtSpecificLevelStrategy { size_t level = 0; };
|
||||
|
||||
/** Get descendants for keys iterating the hierarchy from parent to child using parent_to_child hash map provided by client.
|
||||
* GetAllDescendantsStrategy get all descendants for key
|
||||
* GetDescendantsAtSpecificLevelStrategy get descendants only for specific hierarchy level.
|
||||
* Hierarchy result is ElementsAndOffsets structure, for each element there is descendants array,
|
||||
* with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0).
|
||||
*
|
||||
* Example:
|
||||
* id parent_id
|
||||
* 1 0
|
||||
* 2 1
|
||||
* 3 1
|
||||
* 4 2
|
||||
*
|
||||
* Example. Strategy GetAllDescendantsStrategy.
|
||||
* Requested keys [0, 1, 2, 3, 4].
|
||||
* Result: [1, 2, 3, 4], [2, 2, 4], [4], [], []
|
||||
* Elements: [1, 2, 3, 4, 2, 3, 4, 4]
|
||||
* Offsets: [4, 7, 8, 8, 8]
|
||||
*
|
||||
* Example. Strategy GetDescendantsAtSpecificLevelStrategy with level 1.
|
||||
* Requested keys [0, 1, 2, 3, 4].
|
||||
* Result: [1], [2, 3], [4], [], [];
|
||||
* Offsets: [1, 3, 4, 4, 4];
|
||||
*/
|
||||
template <typename KeyType, typename Strategy>
|
||||
ElementsAndOffsets<KeyType> getDescendants(
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
|
||||
Strategy strategy)
|
||||
{
|
||||
/// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants.
|
||||
/// If strategy is GetDescendantsAtSpecificLevelStrategy we does not use cache strategy.
|
||||
size_t keys_size = keys.size();
|
||||
|
||||
PaddedPODArray<KeyType> descendants;
|
||||
descendants.reserve(keys_size);
|
||||
|
||||
PaddedPODArray<IColumn::Offset> descendants_offsets;
|
||||
descendants_offsets.reserve(keys_size);
|
||||
|
||||
struct Range
|
||||
{
|
||||
size_t start_index;
|
||||
size_t end_index;
|
||||
};
|
||||
|
||||
static constexpr Int64 key_range_requires_update = -1;
|
||||
HashMap<KeyType, Range> already_processed_keys_to_range [[maybe_unused]];
|
||||
|
||||
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
|
||||
already_processed_keys_to_range.reserve(keys_size);
|
||||
|
||||
struct KeyAndDepth
|
||||
{
|
||||
KeyType key;
|
||||
Int64 depth;
|
||||
};
|
||||
|
||||
HashSet<KeyType> already_processed_keys_during_loop;
|
||||
already_processed_keys_during_loop.reserve(keys_size);
|
||||
|
||||
PaddedPODArray<KeyAndDepth> next_keys_to_process_stack;
|
||||
next_keys_to_process_stack.reserve(keys_size);
|
||||
|
||||
Int64 level = static_cast<Int64>(strategy.level);
|
||||
|
||||
for (size_t i = 0; i < keys_size; ++i)
|
||||
{
|
||||
const KeyType & requested_key = keys[i];
|
||||
|
||||
if (parent_to_child.find(requested_key) == nullptr)
|
||||
{
|
||||
descendants_offsets.emplace_back(descendants.size());
|
||||
continue;
|
||||
}
|
||||
|
||||
next_keys_to_process_stack.emplace_back(KeyAndDepth{requested_key, 0});
|
||||
|
||||
/** To cache range for key without recursive function calls and custom stack we put special
|
||||
* signaling value on stack key_range_requires_update.
|
||||
* When we pop such value from stack that means processing descendants for key is finished
|
||||
* and we can update range with end_index.
|
||||
*/
|
||||
while (!next_keys_to_process_stack.empty())
|
||||
{
|
||||
KeyAndDepth key_to_process = next_keys_to_process_stack.back();
|
||||
|
||||
KeyType key = key_to_process.key;
|
||||
Int64 depth = key_to_process.depth;
|
||||
next_keys_to_process_stack.pop_back();
|
||||
|
||||
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
|
||||
{
|
||||
/// Update end_index for key
|
||||
if (depth == key_range_requires_update)
|
||||
{
|
||||
auto * it = already_processed_keys_to_range.find(key);
|
||||
assert(it);
|
||||
|
||||
auto & range_to_update = it->getMapped();
|
||||
range_to_update.end_index = descendants.size();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(already_processed_keys_during_loop.find(key) != nullptr))
|
||||
{
|
||||
next_keys_to_process_stack.clear();
|
||||
break;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
|
||||
{
|
||||
const auto * already_processed_it = already_processed_keys_to_range.find(key);
|
||||
|
||||
if (already_processed_it)
|
||||
{
|
||||
Range range = already_processed_it->getMapped();
|
||||
|
||||
if (unlikely(range.start_index > range.end_index))
|
||||
{
|
||||
/// Broken range because there was loop
|
||||
already_processed_keys_to_range.erase(key);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto insert_start_iterator = descendants.begin() + range.start_index;
|
||||
auto insert_end_iterator = descendants.begin() + range.end_index;
|
||||
descendants.insertFromItself(insert_start_iterator, insert_end_iterator);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto * it = parent_to_child.find(key);
|
||||
|
||||
if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
|
||||
continue;
|
||||
|
||||
if constexpr (std::is_same_v<Strategy, GetDescendantsAtSpecificLevelStrategy>)
|
||||
{
|
||||
if (depth > level)
|
||||
continue;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
|
||||
{
|
||||
/// Put special signaling value on stack and update cache with range start
|
||||
size_t range_start_index = descendants.size();
|
||||
already_processed_keys_to_range[key].start_index = range_start_index;
|
||||
next_keys_to_process_stack.emplace_back(KeyAndDepth{key, key_range_requires_update});
|
||||
}
|
||||
|
||||
already_processed_keys_during_loop.insert(key);
|
||||
|
||||
++depth;
|
||||
|
||||
const auto & children = it->getMapped();
|
||||
|
||||
for (auto child_key : children)
|
||||
{
|
||||
/// In case of GetAllDescendantsStrategy we add any descendant to result array
|
||||
/// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level
|
||||
if (std::is_same_v<Strategy, GetAllDescendantsStrategy> || depth == level)
|
||||
descendants.emplace_back(child_key);
|
||||
|
||||
next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth});
|
||||
}
|
||||
}
|
||||
|
||||
already_processed_keys_during_loop.clear();
|
||||
|
||||
descendants_offsets.emplace_back(descendants.size());
|
||||
}
|
||||
|
||||
ElementsAndOffsets<KeyType> result = {std::move(descendants), std::move(descendants_offsets)};
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Converts ElementAndOffsets structure into ArrayColumn
|
||||
template<typename KeyType>
|
||||
ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets<KeyType> && elements_and_offsets)
|
||||
{
|
||||
auto elements_column = ColumnVector<KeyType>::create();
|
||||
elements_column->getData() = std::move(elements_and_offsets.elements);
|
||||
|
||||
auto offsets_column = ColumnVector<IColumn::Offset>::create();
|
||||
offsets_column->getData() = std::move(elements_and_offsets.offsets);
|
||||
|
||||
auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column));
|
||||
|
||||
return column_array;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns hierarchy array column for keys
|
||||
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
|
||||
ColumnPtr getKeysHierarchyArray(
|
||||
const PaddedPODArray<KeyType> & keys,
|
||||
const KeyType & hierarchy_null_value,
|
||||
IsKeyValidFunc && is_key_valid_func,
|
||||
GetParentKeyFunc && get_parent_func)
|
||||
{
|
||||
auto elements_and_offsets = detail::getHierarchy(
|
||||
keys,
|
||||
hierarchy_null_value,
|
||||
std::forward<IsKeyValidFunc>(is_key_valid_func),
|
||||
std::forward<GetParentKeyFunc>(get_parent_func));
|
||||
|
||||
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
|
||||
}
|
||||
|
||||
/// Returns is in hierarchy column for keys
|
||||
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
|
||||
ColumnUInt8::Ptr getKeysIsInHierarchyColumn(
|
||||
const PaddedPODArray<KeyType> & hierarchy_keys,
|
||||
const PaddedPODArray<KeyType> & hierarchy_in_keys,
|
||||
const KeyType & hierarchy_null_value,
|
||||
IsKeyValidFunc && is_key_valid_func,
|
||||
GetParentKeyFunc && get_parent_func)
|
||||
{
|
||||
auto is_in_hierarchy_data = detail::getIsInHierarchy(
|
||||
hierarchy_keys,
|
||||
hierarchy_in_keys,
|
||||
hierarchy_null_value,
|
||||
std::forward<IsKeyValidFunc>(is_key_valid_func),
|
||||
std::forward<GetParentKeyFunc>(get_parent_func));
|
||||
|
||||
auto result = ColumnUInt8::create();
|
||||
result->getData() = std::move(is_in_hierarchy_data);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Returns descendants array column for keys
|
||||
template <typename KeyType>
|
||||
ColumnPtr getKeysDescendantsArray(
|
||||
const PaddedPODArray<KeyType> & requested_keys,
|
||||
const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
|
||||
size_t level)
|
||||
{
|
||||
if (level == 0)
|
||||
{
|
||||
detail::GetAllDescendantsStrategy strategy { .level = level };
|
||||
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy);
|
||||
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
|
||||
}
|
||||
else
|
||||
{
|
||||
detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level };
|
||||
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy);
|
||||
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
|
||||
}
|
||||
}
|
||||
|
||||
/** Default getHierarchy implementation for dictionaries that does not have structure with child to parent representation.
|
||||
* Implementation will build such structure with getColumn calls, and then getHierarchy for such structure.
|
||||
* Returns ColumnArray with hierarchy arrays for keys from key_column.
|
||||
*/
|
||||
ColumnPtr getKeysHierarchyDefaultImplementation(
|
||||
const IDictionary * dictionary,
|
||||
ColumnPtr key_column,
|
||||
const DataTypePtr & key_type);
|
||||
|
||||
/** Default isInHierarchy implementation for dictionaries that does not have structure with child to parent representation.
|
||||
* Implementation will build such structure with getColumn calls, and then getHierarchy for such structure.
|
||||
* Returns UInt8 column if key from in_key_column is in key hierarchy from key_column.
|
||||
*/
|
||||
ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation(
|
||||
const IDictionary * dictionary,
|
||||
ColumnPtr key_column,
|
||||
ColumnPtr in_key_column,
|
||||
const DataTypePtr & key_type);
|
||||
|
||||
}
|
@ -24,8 +24,8 @@ namespace ErrorCodes
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
struct IDictionaryBase;
|
||||
using DictionaryPtr = std::unique_ptr<IDictionaryBase>;
|
||||
struct IDictionary;
|
||||
using DictionaryPtr = std::unique_ptr<IDictionary>;
|
||||
|
||||
/** DictionaryKeyType provides IDictionary client information about
|
||||
* which key type is supported by dictionary.
|
||||
@ -47,13 +47,11 @@ enum class DictionaryKeyType
|
||||
/**
|
||||
* Base class for Dictionaries implementation.
|
||||
*/
|
||||
struct IDictionaryBase : public IExternalLoadable
|
||||
struct IDictionary : public IExternalLoadable
|
||||
{
|
||||
using Key = UInt64;
|
||||
|
||||
IDictionaryBase(const StorageID & dict_id_)
|
||||
: dict_id(dict_id_)
|
||||
, full_name(dict_id.getInternalDictionaryName())
|
||||
explicit IDictionary(const StorageID & dictionary_id_)
|
||||
: dictionary_id(dictionary_id_)
|
||||
, full_name(dictionary_id.getInternalDictionaryName())
|
||||
{
|
||||
}
|
||||
|
||||
@ -61,14 +59,14 @@ struct IDictionaryBase : public IExternalLoadable
|
||||
StorageID getDictionaryID() const
|
||||
{
|
||||
std::lock_guard lock{name_mutex};
|
||||
return dict_id;
|
||||
return dictionary_id;
|
||||
}
|
||||
|
||||
void updateDictionaryName(const StorageID & new_name) const
|
||||
{
|
||||
std::lock_guard lock{name_mutex};
|
||||
assert(new_name.uuid == dict_id.uuid && dict_id.uuid != UUIDHelpers::Nil);
|
||||
dict_id = new_name;
|
||||
assert(new_name.uuid == dictionary_id.uuid && dictionary_id.uuid != UUIDHelpers::Nil);
|
||||
dictionary_id = new_name;
|
||||
}
|
||||
|
||||
const std::string & getLoadableName() const override final { return getFullName(); }
|
||||
@ -80,8 +78,9 @@ struct IDictionaryBase : public IExternalLoadable
|
||||
|
||||
std::string getDatabaseOrNoDatabaseTag() const
|
||||
{
|
||||
if (!dict_id.database_name.empty())
|
||||
return dict_id.database_name;
|
||||
if (!dictionary_id.database_name.empty())
|
||||
return dictionary_id.database_name;
|
||||
|
||||
return NO_DATABASE_TAG;
|
||||
}
|
||||
|
||||
@ -159,74 +158,65 @@ struct IDictionaryBase : public IExternalLoadable
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types) const = 0;
|
||||
|
||||
virtual bool hasHierarchy() const { return false; }
|
||||
|
||||
virtual ColumnPtr getHierarchy(
|
||||
ColumnPtr key_column [[maybe_unused]],
|
||||
const DataTypePtr & key_type [[maybe_unused]]) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"Method getHierarchy is not supported for {} dictionary.",
|
||||
getDictionaryID().getNameForLogs());
|
||||
}
|
||||
|
||||
virtual ColumnUInt8::Ptr isInHierarchy(
|
||||
ColumnPtr key_column [[maybe_unused]],
|
||||
ColumnPtr in_key_column [[maybe_unused]],
|
||||
const DataTypePtr & key_type [[maybe_unused]]) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"Method isInHierarchy is not supported for {} dictionary.",
|
||||
getDictionaryID().getNameForLogs());
|
||||
}
|
||||
|
||||
virtual ColumnPtr getDescendants(
|
||||
ColumnPtr key_column [[maybe_unused]],
|
||||
const DataTypePtr & key_type [[maybe_unused]],
|
||||
size_t level [[maybe_unused]]) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"Method getDescendants is not supported for {} dictionary.",
|
||||
getDictionaryID().getNameForLogs());
|
||||
}
|
||||
|
||||
virtual BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const = 0;
|
||||
|
||||
bool supportUpdates() const override { return true; }
|
||||
|
||||
bool isModified() const override
|
||||
{
|
||||
auto source = getSource();
|
||||
const auto * source = getSource();
|
||||
return source && source->isModified();
|
||||
}
|
||||
|
||||
virtual std::exception_ptr getLastException() const { return {}; }
|
||||
|
||||
std::shared_ptr<IDictionaryBase> shared_from_this()
|
||||
std::shared_ptr<IDictionary> shared_from_this()
|
||||
{
|
||||
return std::static_pointer_cast<IDictionaryBase>(IExternalLoadable::shared_from_this());
|
||||
return std::static_pointer_cast<IDictionary>(IExternalLoadable::shared_from_this());
|
||||
}
|
||||
|
||||
std::shared_ptr<const IDictionaryBase> shared_from_this() const
|
||||
std::shared_ptr<const IDictionary> shared_from_this() const
|
||||
{
|
||||
return std::static_pointer_cast<const IDictionaryBase>(IExternalLoadable::shared_from_this());
|
||||
return std::static_pointer_cast<const IDictionary>(IExternalLoadable::shared_from_this());
|
||||
}
|
||||
|
||||
private:
|
||||
mutable std::mutex name_mutex;
|
||||
mutable StorageID dict_id;
|
||||
mutable StorageID dictionary_id;
|
||||
|
||||
protected:
|
||||
const String full_name;
|
||||
};
|
||||
|
||||
struct IDictionary : IDictionaryBase
|
||||
{
|
||||
IDictionary(const StorageID & dict_id_) : IDictionaryBase(dict_id_) {}
|
||||
|
||||
virtual bool hasHierarchy() const = 0;
|
||||
|
||||
virtual void toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const = 0;
|
||||
|
||||
/// TODO: Rewrite
|
||||
/// Methods for hierarchy.
|
||||
|
||||
virtual void isInVectorVector(
|
||||
const PaddedPODArray<Key> & /*child_ids*/, const PaddedPODArray<Key> & /*ancestor_ids*/, PaddedPODArray<UInt8> & /*out*/) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs());
|
||||
}
|
||||
|
||||
virtual void
|
||||
isInVectorConstant(const PaddedPODArray<Key> & /*child_ids*/, const Key /*ancestor_id*/, PaddedPODArray<UInt8> & /*out*/) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs());
|
||||
}
|
||||
|
||||
virtual void
|
||||
isInConstantVector(const Key /*child_id*/, const PaddedPODArray<Key> & /*ancestor_ids*/, PaddedPODArray<UInt8> & /*out*/) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs());
|
||||
}
|
||||
|
||||
void isInConstantConstant(const Key child_id, const Key ancestor_id, UInt8 & out) const
|
||||
{
|
||||
PaddedPODArray<UInt8> out_arr(1);
|
||||
isInVectorConstant(PaddedPODArray<Key>(1, child_id), ancestor_id, out_arr);
|
||||
out = out_arr[0];
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -195,7 +195,7 @@ IPAddressDictionary::IPAddressDictionary(
|
||||
DictionarySourcePtr source_ptr_,
|
||||
const DictionaryLifetime dict_lifetime_,
|
||||
bool require_nonempty_)
|
||||
: IDictionaryBase(dict_id_)
|
||||
: IDictionary(dict_id_)
|
||||
, dict_struct(dict_struct_)
|
||||
, source_ptr{std::move(source_ptr_)}
|
||||
, dict_lifetime(dict_lifetime_)
|
||||
@ -804,9 +804,6 @@ static auto keyViewGetter()
|
||||
|
||||
BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
|
||||
{
|
||||
using BlockInputStreamType = DictionaryBlockInputStream<UInt64>;
|
||||
|
||||
|
||||
const bool is_ipv4 = std::get_if<IPv4Container>(&ip_column) != nullptr;
|
||||
|
||||
auto get_keys = [is_ipv4](const Columns & columns, const std::vector<DictionaryAttribute> & dict_attributes)
|
||||
@ -827,12 +824,12 @@ BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & colum
|
||||
if (is_ipv4)
|
||||
{
|
||||
auto get_view = keyViewGetter<ColumnVector<UInt32>, true>();
|
||||
return std::make_shared<BlockInputStreamType>(
|
||||
return std::make_shared<DictionaryBlockInputStream>(
|
||||
shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view));
|
||||
}
|
||||
|
||||
auto get_view = keyViewGetter<ColumnFixedString, false>();
|
||||
return std::make_shared<BlockInputStreamType>(
|
||||
return std::make_shared<DictionaryBlockInputStream>(
|
||||
shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view));
|
||||
}
|
||||
|
||||
|
@ -20,7 +20,7 @@
|
||||
|
||||
namespace DB
|
||||
{
|
||||
class IPAddressDictionary final : public IDictionaryBase
|
||||
class IPAddressDictionary final : public IDictionary
|
||||
{
|
||||
public:
|
||||
IPAddressDictionary(
|
||||
|
@ -30,7 +30,7 @@ IPolygonDictionary::IPolygonDictionary(
|
||||
const DictionaryLifetime dict_lifetime_,
|
||||
InputType input_type_,
|
||||
PointType point_type_)
|
||||
: IDictionaryBase(dict_id_)
|
||||
: IDictionary(dict_id_)
|
||||
, dict_struct(dict_struct_)
|
||||
, source_ptr(std::move(source_ptr_))
|
||||
, dict_lifetime(dict_lifetime_)
|
||||
@ -142,7 +142,6 @@ ColumnPtr IPolygonDictionary::getColumn(
|
||||
callOnDictionaryAttributeType(attribute.underlying_type, type_call);
|
||||
}
|
||||
|
||||
|
||||
query_count.fetch_add(requested_key_points.size(), std::memory_order_relaxed);
|
||||
|
||||
return result;
|
||||
|
@ -24,7 +24,7 @@ namespace bg = boost::geometry;
|
||||
* An implementation should inherit from this base class and preprocess the data upon construction if needed.
|
||||
* It must override the find method of this class which retrieves the polygon containing a single point.
|
||||
*/
|
||||
class IPolygonDictionary : public IDictionaryBase
|
||||
class IPolygonDictionary : public IDictionary
|
||||
{
|
||||
public:
|
||||
/** Controls the different types of polygons allowed as input.
|
||||
|
@ -24,7 +24,7 @@ public:
|
||||
using Key = UInt64;
|
||||
|
||||
RangeDictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary,
|
||||
std::shared_ptr<const IDictionary> dictionary,
|
||||
size_t max_block_size,
|
||||
const Names & column_names,
|
||||
PaddedPODArray<Key> && ids_to_fill,
|
||||
@ -49,7 +49,7 @@ private:
|
||||
const PaddedPODArray<RangeType> & block_start_dates,
|
||||
const PaddedPODArray<RangeType> & block_end_dates) const;
|
||||
|
||||
std::shared_ptr<const IDictionaryBase> dictionary;
|
||||
std::shared_ptr<const IDictionary> dictionary;
|
||||
NameSet column_names;
|
||||
PaddedPODArray<Key> ids;
|
||||
PaddedPODArray<RangeType> start_dates;
|
||||
@ -59,7 +59,7 @@ private:
|
||||
|
||||
template <typename RangeType>
|
||||
RangeDictionaryBlockInputStream<RangeType>::RangeDictionaryBlockInputStream(
|
||||
std::shared_ptr<const IDictionaryBase> dictionary_,
|
||||
std::shared_ptr<const IDictionary> dictionary_,
|
||||
size_t max_block_size_,
|
||||
const Names & column_names_,
|
||||
PaddedPODArray<Key> && ids_,
|
||||
|
@ -76,7 +76,7 @@ RangeHashedDictionary::RangeHashedDictionary(
|
||||
DictionarySourcePtr source_ptr_,
|
||||
const DictionaryLifetime dict_lifetime_,
|
||||
bool require_nonempty_)
|
||||
: IDictionaryBase(dict_id_)
|
||||
: IDictionary(dict_id_)
|
||||
, dict_struct(dict_struct_)
|
||||
, source_ptr{std::move(source_ptr_)}
|
||||
, dict_lifetime(dict_lifetime_)
|
||||
@ -185,10 +185,10 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, con
|
||||
auto range_column_storage_type = std::make_shared<DataTypeInt64>();
|
||||
auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type);
|
||||
|
||||
PaddedPODArray<Key> key_backup_storage;
|
||||
PaddedPODArray<UInt64> key_backup_storage;
|
||||
PaddedPODArray<RangeStorageType> range_backup_storage;
|
||||
|
||||
const PaddedPODArray<Key> & ids = getColumnVectorData(this, key_columns[0], key_backup_storage);
|
||||
const PaddedPODArray<UInt64> & ids = getColumnVectorData(this, key_columns[0], key_backup_storage);
|
||||
const PaddedPODArray<RangeStorageType> & dates = getColumnVectorData(this, range_column_updated, range_backup_storage);
|
||||
|
||||
const auto & attribute = attributes.front();
|
||||
@ -213,7 +213,7 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, con
|
||||
template <typename AttributeType>
|
||||
ColumnUInt8::Ptr RangeHashedDictionary::hasKeysImpl(
|
||||
const Attribute & attribute,
|
||||
const PaddedPODArray<Key> & ids,
|
||||
const PaddedPODArray<UInt64> & ids,
|
||||
const PaddedPODArray<RangeStorageType> & dates) const
|
||||
{
|
||||
auto result = ColumnUInt8::create(ids.size());
|
||||
@ -388,10 +388,10 @@ void RangeHashedDictionary::getItemsImpl(
|
||||
ValueSetter && set_value,
|
||||
DefaultValueExtractor & default_value_extractor) const
|
||||
{
|
||||
PaddedPODArray<Key> key_backup_storage;
|
||||
PaddedPODArray<UInt64> key_backup_storage;
|
||||
PaddedPODArray<RangeStorageType> range_backup_storage;
|
||||
|
||||
const PaddedPODArray<Key> & ids = getColumnVectorData(this, key_columns[0], key_backup_storage);
|
||||
const PaddedPODArray<UInt64> & ids = getColumnVectorData(this, key_columns[0], key_backup_storage);
|
||||
const PaddedPODArray<RangeStorageType> & dates = getColumnVectorData(this, key_columns[1], range_backup_storage);
|
||||
|
||||
const auto & attr = *std::get<Ptr<AttributeType>>(attribute.maps);
|
||||
@ -436,7 +436,7 @@ void RangeHashedDictionary::getItemsImpl(
|
||||
|
||||
|
||||
template <typename T>
|
||||
void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value)
|
||||
void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value)
|
||||
{
|
||||
using ValueType = std::conditional_t<std::is_same_v<T, String>, StringRef, T>;
|
||||
auto & map = *std::get<Ptr<ValueType>>(attribute.maps);
|
||||
@ -480,7 +480,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const K
|
||||
map.insert({id, Values<ValueType>{std::move(value_to_insert)}});
|
||||
}
|
||||
|
||||
void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value)
|
||||
void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value)
|
||||
{
|
||||
auto type_call = [&](const auto &dictionary_attribute_type)
|
||||
{
|
||||
@ -515,7 +515,7 @@ RangeHashedDictionary::getAttributeWithType(const std::string & attribute_name,
|
||||
|
||||
template <typename RangeType>
|
||||
void RangeHashedDictionary::getIdsAndDates(
|
||||
PaddedPODArray<Key> & ids,
|
||||
PaddedPODArray<UInt64> & ids,
|
||||
PaddedPODArray<RangeType> & start_dates,
|
||||
PaddedPODArray<RangeType> & end_dates) const
|
||||
{
|
||||
@ -536,7 +536,7 @@ void RangeHashedDictionary::getIdsAndDates(
|
||||
template <typename T, typename RangeType>
|
||||
void RangeHashedDictionary::getIdsAndDates(
|
||||
const Attribute & attribute,
|
||||
PaddedPODArray<Key> & ids,
|
||||
PaddedPODArray<UInt64> & ids,
|
||||
PaddedPODArray<RangeType> & start_dates,
|
||||
PaddedPODArray<RangeType> & end_dates) const
|
||||
{
|
||||
@ -567,7 +567,7 @@ void RangeHashedDictionary::getIdsAndDates(
|
||||
template <typename RangeType>
|
||||
BlockInputStreamPtr RangeHashedDictionary::getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const
|
||||
{
|
||||
PaddedPODArray<Key> ids;
|
||||
PaddedPODArray<UInt64> ids;
|
||||
PaddedPODArray<RangeType> start_dates;
|
||||
PaddedPODArray<RangeType> end_dates;
|
||||
getIdsAndDates(ids, start_dates, end_dates);
|
||||
|
@ -16,7 +16,7 @@
|
||||
|
||||
namespace DB
|
||||
{
|
||||
class RangeHashedDictionary final : public IDictionaryBase
|
||||
class RangeHashedDictionary final : public IDictionary
|
||||
{
|
||||
public:
|
||||
RangeHashedDictionary(
|
||||
@ -160,25 +160,25 @@ private:
|
||||
template <typename AttributeType>
|
||||
ColumnUInt8::Ptr hasKeysImpl(
|
||||
const Attribute & attribute,
|
||||
const PaddedPODArray<Key> & ids,
|
||||
const PaddedPODArray<UInt64> & ids,
|
||||
const PaddedPODArray<RangeStorageType> & dates) const;
|
||||
|
||||
template <typename T>
|
||||
static void setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value);
|
||||
static void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value);
|
||||
|
||||
static void setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value);
|
||||
static void setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value);
|
||||
|
||||
const Attribute & getAttribute(const std::string & attribute_name) const;
|
||||
|
||||
const Attribute & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const;
|
||||
|
||||
template <typename RangeType>
|
||||
void getIdsAndDates(PaddedPODArray<Key> & ids, PaddedPODArray<RangeType> & start_dates, PaddedPODArray<RangeType> & end_dates) const;
|
||||
void getIdsAndDates(PaddedPODArray<UInt64> & ids, PaddedPODArray<RangeType> & start_dates, PaddedPODArray<RangeType> & end_dates) const;
|
||||
|
||||
template <typename T, typename RangeType>
|
||||
void getIdsAndDates(
|
||||
const Attribute & attribute,
|
||||
PaddedPODArray<Key> & ids,
|
||||
PaddedPODArray<UInt64> & ids,
|
||||
PaddedPODArray<RangeType> & start_dates,
|
||||
PaddedPODArray<RangeType> & end_dates) const;
|
||||
|
||||
|
@ -57,7 +57,6 @@ void registerDictionaries()
|
||||
{
|
||||
auto & factory = DictionaryFactory::instance();
|
||||
registerDictionaryRangeHashed(factory);
|
||||
registerDictionaryComplexKeyHashed(factory);
|
||||
registerDictionaryTrie(factory);
|
||||
registerDictionaryFlat(factory);
|
||||
registerDictionaryHashed(factory);
|
||||
|
@ -1,7 +1,5 @@
|
||||
#if defined(__linux__) || defined(__FreeBSD__)
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Dictionaries/SSDCacheDictionaryStorage.h>
|
||||
|
225
src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp
Normal file
225
src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp
Normal file
@ -0,0 +1,225 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
|
||||
#include <Dictionaries/HierarchyDictionariesUtils.h>
|
||||
|
||||
using namespace DB;
|
||||
|
||||
TEST(HierarchyDictionariesUtils, getHierarchy)
|
||||
{
|
||||
{
|
||||
HashMap<UInt64, UInt64> child_to_parent;
|
||||
child_to_parent[1] = 0;
|
||||
child_to_parent[2] = 1;
|
||||
child_to_parent[3] = 1;
|
||||
child_to_parent[4] = 2;
|
||||
|
||||
auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; };
|
||||
|
||||
auto get_parent_key_func = [&](auto key)
|
||||
{
|
||||
auto it = child_to_parent.find(key);
|
||||
std::optional<UInt64> value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
|
||||
return value;
|
||||
};
|
||||
|
||||
UInt64 hierarchy_null_value_key = 0;
|
||||
PaddedPODArray<UInt64> keys = {1, 2, 3, 4, 5};
|
||||
|
||||
auto result = DB::detail::getHierarchy(
|
||||
keys,
|
||||
hierarchy_null_value_key,
|
||||
is_key_valid_func,
|
||||
get_parent_key_func);
|
||||
|
||||
const auto & actual_elements = result.elements;
|
||||
const auto & actual_offsets = result.offsets;
|
||||
|
||||
PaddedPODArray<UInt64> expected_elements = {1, 2, 1, 3, 1, 4, 2, 1};
|
||||
PaddedPODArray<IColumn::Offset> expected_offsets = {1, 3, 5, 8, 8};
|
||||
|
||||
ASSERT_EQ(actual_elements, expected_elements);
|
||||
ASSERT_EQ(actual_offsets, expected_offsets);
|
||||
}
|
||||
{
|
||||
HashMap<UInt64, UInt64> child_to_parent;
|
||||
child_to_parent[1] = 2;
|
||||
child_to_parent[2] = 1;
|
||||
|
||||
auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; };
|
||||
|
||||
auto get_parent_key_func = [&](auto key)
|
||||
{
|
||||
auto it = child_to_parent.find(key);
|
||||
std::optional<UInt64> value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
|
||||
return value;
|
||||
};
|
||||
|
||||
UInt64 hierarchy_null_value_key = 0;
|
||||
PaddedPODArray<UInt64> keys = {1, 2, 3};
|
||||
|
||||
auto result = DB::detail::getHierarchy(
|
||||
keys,
|
||||
hierarchy_null_value_key,
|
||||
is_key_valid_func,
|
||||
get_parent_key_func);
|
||||
|
||||
const auto & actual_elements = result.elements;
|
||||
const auto & actual_offsets = result.offsets;
|
||||
|
||||
PaddedPODArray<UInt64> expected_elements = {1, 2, 2};
|
||||
PaddedPODArray<IColumn::Offset> expected_offsets = {2, 3, 3};
|
||||
|
||||
ASSERT_EQ(actual_elements, expected_elements);
|
||||
ASSERT_EQ(actual_offsets, expected_offsets);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(HierarchyDictionariesUtils, getIsInHierarchy)
|
||||
{
|
||||
{
|
||||
HashMap<UInt64, UInt64> child_to_parent;
|
||||
child_to_parent[1] = 0;
|
||||
child_to_parent[2] = 1;
|
||||
child_to_parent[3] = 1;
|
||||
child_to_parent[4] = 2;
|
||||
|
||||
auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; };
|
||||
|
||||
auto get_parent_key_func = [&](auto key)
|
||||
{
|
||||
auto it = child_to_parent.find(key);
|
||||
std::optional<UInt64> value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
|
||||
return value;
|
||||
};
|
||||
|
||||
UInt64 hierarchy_null_value_key = 0;
|
||||
PaddedPODArray<UInt64> keys = {1, 2, 3, 4, 5};
|
||||
PaddedPODArray<UInt64> keys_in = {1, 1, 1, 2, 5};
|
||||
|
||||
PaddedPODArray<UInt8> actual = DB::detail::getIsInHierarchy(
|
||||
keys,
|
||||
keys_in,
|
||||
hierarchy_null_value_key,
|
||||
is_key_valid_func,
|
||||
get_parent_key_func);
|
||||
|
||||
PaddedPODArray<UInt8> expected = {1,1,1,1,0};
|
||||
|
||||
ASSERT_EQ(actual, expected);
|
||||
}
|
||||
{
|
||||
HashMap<UInt64, UInt64> child_to_parent;
|
||||
child_to_parent[1] = 2;
|
||||
child_to_parent[2] = 1;
|
||||
|
||||
auto is_key_valid_func = [&](auto key)
|
||||
{
|
||||
return child_to_parent.find(key) != nullptr;
|
||||
};
|
||||
|
||||
auto get_parent_key_func = [&](auto key)
|
||||
{
|
||||
auto it = child_to_parent.find(key);
|
||||
std::optional<UInt64> value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt);
|
||||
return value;
|
||||
};
|
||||
|
||||
UInt64 hierarchy_null_value_key = 0;
|
||||
PaddedPODArray<UInt64> keys = {1, 2, 3};
|
||||
PaddedPODArray<UInt64> keys_in = {1, 2, 3};
|
||||
|
||||
PaddedPODArray<UInt8> actual = DB::detail::getIsInHierarchy(
|
||||
keys,
|
||||
keys_in,
|
||||
hierarchy_null_value_key,
|
||||
is_key_valid_func,
|
||||
get_parent_key_func);
|
||||
|
||||
PaddedPODArray<UInt8> expected = {1, 1, 0};
|
||||
ASSERT_EQ(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(HierarchyDictionariesUtils, getDescendants)
|
||||
{
|
||||
{
|
||||
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
|
||||
parent_to_child[0].emplace_back(1);
|
||||
parent_to_child[1].emplace_back(2);
|
||||
parent_to_child[1].emplace_back(3);
|
||||
parent_to_child[2].emplace_back(4);
|
||||
|
||||
PaddedPODArray<UInt64> keys = {0, 1, 2, 3, 4};
|
||||
|
||||
{
|
||||
auto result = DB::detail::getDescendants(
|
||||
keys,
|
||||
parent_to_child,
|
||||
DB::detail::GetAllDescendantsStrategy());
|
||||
|
||||
const auto & actual_elements = result.elements;
|
||||
const auto & actual_offsets = result.offsets;
|
||||
|
||||
PaddedPODArray<UInt64> expected_elements = {1, 2, 3, 4, 2, 3, 4, 4};
|
||||
PaddedPODArray<IColumn::Offset> expected_offsets = {4, 7, 8, 8, 8};
|
||||
|
||||
ASSERT_EQ(actual_elements, expected_elements);
|
||||
ASSERT_EQ(actual_offsets, expected_offsets);
|
||||
}
|
||||
{
|
||||
auto result = DB::detail::getDescendants(
|
||||
keys,
|
||||
parent_to_child,
|
||||
DB::detail::GetDescendantsAtSpecificLevelStrategy{1});
|
||||
|
||||
const auto & actual_elements = result.elements;
|
||||
const auto & actual_offsets = result.offsets;
|
||||
|
||||
PaddedPODArray<UInt64> expected_elements = {1, 2, 3, 4};
|
||||
PaddedPODArray<IColumn::Offset> expected_offsets = {1, 3, 4, 4, 4};
|
||||
|
||||
ASSERT_EQ(actual_elements, expected_elements);
|
||||
ASSERT_EQ(actual_offsets, expected_offsets);
|
||||
}
|
||||
}
|
||||
{
|
||||
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
|
||||
parent_to_child[1].emplace_back(2);
|
||||
parent_to_child[2].emplace_back(1);
|
||||
|
||||
PaddedPODArray<UInt64> keys = {1, 2, 3};
|
||||
|
||||
{
|
||||
auto result = DB::detail::getDescendants(
|
||||
keys,
|
||||
parent_to_child,
|
||||
DB::detail::GetAllDescendantsStrategy());
|
||||
|
||||
const auto & actual_elements = result.elements;
|
||||
const auto & actual_offsets = result.offsets;
|
||||
|
||||
PaddedPODArray<UInt64> expected_elements = {2, 1, 1};
|
||||
PaddedPODArray<IColumn::Offset> expected_offsets = {2, 3, 3};
|
||||
|
||||
ASSERT_EQ(actual_elements, expected_elements);
|
||||
ASSERT_EQ(actual_offsets, expected_offsets);
|
||||
}
|
||||
{
|
||||
auto result = DB::detail::getDescendants(
|
||||
keys,
|
||||
parent_to_child,
|
||||
DB::detail::GetDescendantsAtSpecificLevelStrategy{1});
|
||||
|
||||
const auto & actual_elements = result.elements;
|
||||
const auto & actual_offsets = result.offsets;
|
||||
|
||||
PaddedPODArray<UInt64> expected_elements = {2, 1};
|
||||
PaddedPODArray<IColumn::Offset> expected_offsets = {1, 2, 2};
|
||||
|
||||
ASSERT_EQ(actual_elements, expected_elements);
|
||||
ASSERT_EQ(actual_offsets, expected_offsets);
|
||||
}
|
||||
}
|
||||
}
|
@ -26,7 +26,7 @@ SRCS(
|
||||
CassandraDictionarySource.cpp
|
||||
CassandraHelpers.cpp
|
||||
ClickHouseDictionarySource.cpp
|
||||
ComplexKeyHashedDictionary.cpp
|
||||
DictionaryBlockInputStream.cpp
|
||||
DictionaryBlockInputStreamBase.cpp
|
||||
DictionaryFactory.cpp
|
||||
DictionarySourceFactory.cpp
|
||||
@ -48,6 +48,7 @@ SRCS(
|
||||
FlatDictionary.cpp
|
||||
HTTPDictionarySource.cpp
|
||||
HashedDictionary.cpp
|
||||
HierarchyDictionariesUtils.cpp
|
||||
IPAddressDictionary.cpp
|
||||
LibraryDictionarySource.cpp
|
||||
LibraryDictionarySourceExternal.cpp
|
||||
|
@ -24,6 +24,8 @@ void registerFunctionsExternalDictionaries(FunctionFactory & factory)
|
||||
factory.registerFunction<FunctionDictGetString>();
|
||||
factory.registerFunction<FunctionDictGetHierarchy>();
|
||||
factory.registerFunction<FunctionDictIsIn>();
|
||||
factory.registerFunction<FunctionDictGetChildren>();
|
||||
factory.registerFunction<FunctionDictGetDescendants>();
|
||||
factory.registerFunction<FunctionDictGetUInt8OrDefault>();
|
||||
factory.registerFunction<FunctionDictGetUInt16OrDefault>();
|
||||
factory.registerFunction<FunctionDictGetUInt32OrDefault>();
|
||||
@ -40,6 +42,7 @@ void registerFunctionsExternalDictionaries(FunctionFactory & factory)
|
||||
factory.registerFunction<FunctionDictGetStringOrDefault>();
|
||||
factory.registerFunction<FunctionDictGetNoType<DictionaryGetFunctionType::get>>();
|
||||
factory.registerFunction<FunctionDictGetNoType<DictionaryGetFunctionType::getOrDefault>>();
|
||||
factory.registerFunction<FunctionDictGetOrNull>();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
|
||||
#include <Access/AccessFlags.h>
|
||||
|
||||
@ -28,16 +29,6 @@
|
||||
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
|
||||
#include <Dictionaries/FlatDictionary.h>
|
||||
#include <Dictionaries/HashedDictionary.h>
|
||||
#include <Dictionaries/CacheDictionary.h>
|
||||
#include <Dictionaries/ComplexKeyHashedDictionary.h>
|
||||
#include <Dictionaries/RangeHashedDictionary.h>
|
||||
#include <Dictionaries/IPAddressDictionary.h>
|
||||
#include <Dictionaries/PolygonDictionaryImplementations.h>
|
||||
#include <Dictionaries/DirectDictionary.h>
|
||||
|
||||
#include <ext/range.h>
|
||||
|
||||
#include <type_traits>
|
||||
@ -49,7 +40,6 @@ namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int UNSUPPORTED_METHOD;
|
||||
extern const int UNKNOWN_TYPE;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
@ -77,7 +67,7 @@ class FunctionDictHelper
|
||||
public:
|
||||
explicit FunctionDictHelper(const Context & context_) : context(context_) {}
|
||||
|
||||
std::shared_ptr<const IDictionaryBase> getDictionary(const String & dictionary_name)
|
||||
std::shared_ptr<const IDictionary> getDictionary(const String & dictionary_name)
|
||||
{
|
||||
auto dict = context.getExternalDictionariesLoader().getDictionary(dictionary_name, context);
|
||||
|
||||
@ -90,9 +80,13 @@ public:
|
||||
return dict;
|
||||
}
|
||||
|
||||
std::shared_ptr<const IDictionaryBase> getDictionary(const ColumnWithTypeAndName & column)
|
||||
std::shared_ptr<const IDictionary> getDictionary(const ColumnPtr & column)
|
||||
{
|
||||
const auto * dict_name_col = checkAndGetColumnConst<ColumnString>(column.column.get());
|
||||
const auto * dict_name_col = checkAndGetColumnConst<ColumnString>(column.get());
|
||||
|
||||
if (!dict_name_col)
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Expected const String column");
|
||||
|
||||
return getDictionary(dict_name_col->getValue<String>());
|
||||
}
|
||||
|
||||
@ -148,7 +142,6 @@ public:
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
private:
|
||||
size_t getNumberOfArguments() const override { return 0; }
|
||||
bool isVariadic() const override { return true; }
|
||||
|
||||
@ -187,7 +180,7 @@ private:
|
||||
if (input_rows_count == 0)
|
||||
return result_type->createColumn();
|
||||
|
||||
auto dictionary = helper.getDictionary(arguments[0]);
|
||||
auto dictionary = helper.getDictionary(arguments[0].column);
|
||||
auto dictionary_key_type = dictionary->getKeyType();
|
||||
|
||||
const ColumnWithTypeAndName & key_column_with_type = arguments[1];
|
||||
@ -238,6 +231,7 @@ private:
|
||||
return dictionary->hasKeys({key_column, range_col}, {std::make_shared<DataTypeUInt64>(), range_col_type});
|
||||
}
|
||||
|
||||
private:
|
||||
mutable FunctionDictHelper helper;
|
||||
};
|
||||
|
||||
@ -302,7 +296,7 @@ public:
|
||||
}
|
||||
|
||||
if (types.size() > 1)
|
||||
return std::make_shared<DataTypeTuple>(types);
|
||||
return std::make_shared<DataTypeTuple>(types, attribute_names);
|
||||
else
|
||||
return types.front();
|
||||
}
|
||||
@ -701,6 +695,163 @@ using FunctionDictGetDecimal64OrDefault = FunctionDictGetOrDefault<DataTypeDecim
|
||||
using FunctionDictGetDecimal128OrDefault = FunctionDictGetOrDefault<DataTypeDecimal<Decimal128>, NameDictGetDecimal128OrDefault>;
|
||||
using FunctionDictGetStringOrDefault = FunctionDictGetOrDefault<DataTypeString, NameDictGetStringOrDefault>;
|
||||
|
||||
class FunctionDictGetOrNull final : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "dictGetOrNull";
|
||||
|
||||
static FunctionPtr create(const Context &context)
|
||||
{
|
||||
return std::make_shared<FunctionDictGetOrNull>(context);
|
||||
}
|
||||
|
||||
explicit FunctionDictGetOrNull(const Context & context_)
|
||||
: dictionary_get_func_impl(context_)
|
||||
, dictionary_has_func_impl(context_)
|
||||
{}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
private:
|
||||
|
||||
size_t getNumberOfArguments() const override { return 0; }
|
||||
|
||||
bool isVariadic() const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForNulls() const override { return false; }
|
||||
|
||||
bool isDeterministic() const override { return false; }
|
||||
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; }
|
||||
|
||||
bool isInjective(const ColumnsWithTypeAndName & sample_columns) const override
|
||||
{
|
||||
return dictionary_get_func_impl.isInjective(sample_columns);
|
||||
}
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
auto result_type = dictionary_get_func_impl.getReturnTypeImpl(arguments);
|
||||
|
||||
WhichDataType result_data_type(result_type);
|
||||
if (result_data_type.isTuple())
|
||||
{
|
||||
const auto & data_type_tuple = static_cast<const DataTypeTuple &>(*result_type);
|
||||
auto elements_types_copy = data_type_tuple.getElements();
|
||||
for (auto & element_type : elements_types_copy)
|
||||
element_type = makeNullable(element_type);
|
||||
|
||||
result_type = std::make_shared<DataTypeTuple>(elements_types_copy, data_type_tuple.getElementNames());
|
||||
}
|
||||
else
|
||||
result_type = makeNullable(result_type);
|
||||
|
||||
return result_type;
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
|
||||
{
|
||||
/** We call dictHas function to get which map is key presented in dictionary.
|
||||
For key that presented in dictionary dict has result for that key index value will be 1. Otherwise 0.
|
||||
We invert result, and then for key that is not presented in dictionary value will be 1. Otherwise 0.
|
||||
This inverted result will be used as null column map.
|
||||
After that we call dict get function, by contract for key that are not presented in dictionary we
|
||||
return default value.
|
||||
We create nullable column from dict get result column and null column map.
|
||||
|
||||
2 additional implementation details:
|
||||
1. Result from dict get can be tuple if client requested multiple attributes we apply such operation on each result column.
|
||||
2. If column is already nullable we merge column null map with null map that we get from dict has.
|
||||
*/
|
||||
|
||||
auto dict_has_arguments = filterAttributeNameArgumentForDictHas(arguments);
|
||||
auto is_key_in_dictionary_column = dictionary_has_func_impl.executeImpl(dict_has_arguments, std::make_shared<DataTypeUInt8>(), input_rows_count);
|
||||
auto is_key_in_dictionary_column_mutable = is_key_in_dictionary_column->assumeMutable();
|
||||
ColumnVector<UInt8> & is_key_in_dictionary_column_typed = assert_cast<ColumnVector<UInt8> &>(*is_key_in_dictionary_column_mutable);
|
||||
PaddedPODArray<UInt8> & is_key_in_dictionary_data = is_key_in_dictionary_column_typed.getData();
|
||||
for (auto & key : is_key_in_dictionary_data)
|
||||
key = !key;
|
||||
|
||||
auto result_type = dictionary_get_func_impl.getReturnTypeImpl(arguments);
|
||||
auto dictionary_get_result_column = dictionary_get_func_impl.executeImpl(arguments, result_type, input_rows_count);
|
||||
|
||||
ColumnPtr result;
|
||||
|
||||
WhichDataType result_data_type(result_type);
|
||||
auto dictionary_get_result_column_mutable = dictionary_get_result_column->assumeMutable();
|
||||
|
||||
if (result_data_type.isTuple())
|
||||
{
|
||||
ColumnTuple & column_tuple = assert_cast<ColumnTuple &>(*dictionary_get_result_column_mutable);
|
||||
|
||||
const auto & columns = column_tuple.getColumns();
|
||||
size_t tuple_size = columns.size();
|
||||
|
||||
MutableColumns new_columns(tuple_size);
|
||||
for (size_t tuple_column_index = 0; tuple_column_index < tuple_size; ++tuple_column_index)
|
||||
{
|
||||
auto nullable_column_map = ColumnVector<UInt8>::create();
|
||||
auto & nullable_column_map_data = nullable_column_map->getData();
|
||||
nullable_column_map_data.assign(is_key_in_dictionary_data);
|
||||
|
||||
auto mutable_column = columns[tuple_column_index]->assumeMutable();
|
||||
if (ColumnNullable * nullable_column = typeid_cast<ColumnNullable *>(mutable_column.get()))
|
||||
{
|
||||
auto & null_map_data = nullable_column->getNullMapData();
|
||||
addNullMap(null_map_data, is_key_in_dictionary_data);
|
||||
new_columns[tuple_column_index] = std::move(mutable_column);
|
||||
}
|
||||
else
|
||||
new_columns[tuple_column_index] = ColumnNullable::create(std::move(mutable_column), std::move(nullable_column_map));
|
||||
}
|
||||
|
||||
result = ColumnTuple::create(std::move(new_columns));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ColumnNullable * nullable_column = typeid_cast<ColumnNullable *>(dictionary_get_result_column_mutable.get()))
|
||||
{
|
||||
auto & null_map_data = nullable_column->getNullMapData();
|
||||
addNullMap(null_map_data, is_key_in_dictionary_data);
|
||||
result = std::move(dictionary_get_result_column);
|
||||
}
|
||||
else
|
||||
result = ColumnNullable::create(std::move(dictionary_get_result_column), std::move(is_key_in_dictionary_column_mutable));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void addNullMap(PaddedPODArray<UInt8> & null_map, PaddedPODArray<UInt8> & null_map_to_add)
|
||||
{
|
||||
assert(null_map.size() == null_map_to_add.size());
|
||||
|
||||
for (size_t i = 0; i < null_map.size(); ++i)
|
||||
null_map[i] = null_map[i] || null_map_to_add[i];
|
||||
}
|
||||
|
||||
static ColumnsWithTypeAndName filterAttributeNameArgumentForDictHas(const ColumnsWithTypeAndName & arguments)
|
||||
{
|
||||
ColumnsWithTypeAndName dict_has_arguments;
|
||||
dict_has_arguments.reserve(arguments.size() - 1);
|
||||
size_t attribute_name_argument_index = 1;
|
||||
|
||||
for (size_t i = 0; i < arguments.size(); ++i)
|
||||
{
|
||||
if (i == attribute_name_argument_index)
|
||||
continue;
|
||||
|
||||
dict_has_arguments.emplace_back(arguments[i]);
|
||||
}
|
||||
|
||||
return dict_has_arguments;
|
||||
}
|
||||
|
||||
const FunctionDictGetNoType<DictionaryGetFunctionType::get> dictionary_get_func_impl;
|
||||
const FunctionDictHas dictionary_has_func_impl;
|
||||
};
|
||||
/// Functions to work with hierarchies.
|
||||
|
||||
class FunctionDictGetHierarchy final : public IFunction
|
||||
@ -727,12 +878,16 @@ private:
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception{"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName()
|
||||
+ ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of first argument of function ({}). Expected String. Actual type ({})",
|
||||
getName(),
|
||||
arguments[0]->getName());
|
||||
|
||||
if (!WhichDataType(arguments[1]).isUInt64())
|
||||
throw Exception{"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName()
|
||||
+ ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})",
|
||||
getName(),
|
||||
arguments[1]->getName());
|
||||
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
|
||||
}
|
||||
@ -744,109 +899,15 @@ private:
|
||||
if (input_rows_count == 0)
|
||||
return result_type->createColumn();
|
||||
|
||||
auto dict = helper.getDictionary(arguments[0]);
|
||||
ColumnPtr res;
|
||||
auto dictionary = helper.getDictionary(arguments[0].column);
|
||||
|
||||
/// TODO: Rewrite this
|
||||
if (!((res = executeDispatch<FlatDictionary>(arguments, result_type, dict))
|
||||
|| (res = executeDispatch<DirectDictionary<DictionaryKeyType::simple>>(arguments, result_type, dict))
|
||||
|| (res = executeDispatch<HashedDictionary>(arguments, result_type, dict))
|
||||
|| (res = executeDispatch<CacheDictionary<DictionaryKeyType::simple>>(arguments, result_type, dict))))
|
||||
throw Exception{"Unsupported dictionary type " + dict->getTypeName(), ErrorCodes::UNKNOWN_TYPE};
|
||||
if (!dictionary->hasHierarchy())
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
|
||||
"Dictionary ({}) does not support hierarchy",
|
||||
dictionary->getFullName());
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename DictionaryType>
|
||||
ColumnPtr executeDispatch(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const std::shared_ptr<const IDictionaryBase> & dict_ptr) const
|
||||
{
|
||||
const auto * dict = typeid_cast<const DictionaryType *>(dict_ptr.get());
|
||||
if (!dict)
|
||||
return nullptr;
|
||||
|
||||
if (!dict->hasHierarchy())
|
||||
throw Exception{"Dictionary does not have a hierarchy", ErrorCodes::UNSUPPORTED_METHOD};
|
||||
|
||||
const auto get_hierarchies = [&] (const PaddedPODArray<UInt64> & in, PaddedPODArray<UInt64> & out, PaddedPODArray<UInt64> & offsets)
|
||||
{
|
||||
const auto size = in.size();
|
||||
|
||||
/// copy of `in` array
|
||||
auto in_array = std::make_unique<PaddedPODArray<UInt64>>(std::begin(in), std::end(in));
|
||||
/// used for storing and handling result of ::toParent call
|
||||
auto out_array = std::make_unique<PaddedPODArray<UInt64>>(size);
|
||||
/// resulting hierarchies
|
||||
std::vector<std::vector<IDictionary::Key>> hierarchies(size); /// TODO Bad code, poor performance.
|
||||
|
||||
/// total number of non-zero elements, used for allocating all the required memory upfront
|
||||
size_t total_count = 0;
|
||||
|
||||
while (true)
|
||||
{
|
||||
auto all_zeroes = true;
|
||||
|
||||
/// erase zeroed identifiers, store non-zeroed ones
|
||||
for (const auto i : ext::range(0, size))
|
||||
{
|
||||
const auto id = (*in_array)[i];
|
||||
if (0 == id)
|
||||
continue;
|
||||
|
||||
|
||||
auto & hierarchy = hierarchies[i];
|
||||
|
||||
/// Checking for loop
|
||||
if (std::find(std::begin(hierarchy), std::end(hierarchy), id) != std::end(hierarchy))
|
||||
continue;
|
||||
|
||||
all_zeroes = false;
|
||||
/// place id at it's corresponding place
|
||||
hierarchy.push_back(id);
|
||||
|
||||
++total_count;
|
||||
}
|
||||
|
||||
if (all_zeroes)
|
||||
break;
|
||||
|
||||
/// translate all non-zero identifiers at once
|
||||
dict->toParent(*in_array, *out_array);
|
||||
|
||||
/// we're going to use the `in_array` from this iteration as `out_array` on the next one
|
||||
std::swap(in_array, out_array);
|
||||
}
|
||||
|
||||
out.reserve(total_count);
|
||||
offsets.resize(size);
|
||||
|
||||
for (const auto i : ext::range(0, size))
|
||||
{
|
||||
const auto & ids = hierarchies[i];
|
||||
out.insert_assume_reserved(std::begin(ids), std::end(ids));
|
||||
offsets[i] = out.size();
|
||||
}
|
||||
};
|
||||
|
||||
const auto * id_col_untyped = arguments[1].column.get();
|
||||
if (const auto * id_col = checkAndGetColumn<ColumnUInt64>(id_col_untyped))
|
||||
{
|
||||
const auto & in = id_col->getData();
|
||||
auto backend = ColumnUInt64::create();
|
||||
auto offsets = ColumnArray::ColumnOffsets::create();
|
||||
get_hierarchies(in, backend->getData(), offsets->getData());
|
||||
return ColumnArray::create(std::move(backend), std::move(offsets));
|
||||
}
|
||||
else if (const auto * id_col_const = checkAndGetColumnConst<ColumnVector<UInt64>>(id_col_untyped))
|
||||
{
|
||||
const PaddedPODArray<UInt64> in(1, id_col_const->getValue<UInt64>());
|
||||
auto backend = ColumnUInt64::create();
|
||||
auto offsets = ColumnArray::ColumnOffsets::create();
|
||||
get_hierarchies(in, backend->getData(), offsets->getData());
|
||||
auto array = ColumnArray::create(std::move(backend), std::move(offsets));
|
||||
return result_type->createColumnConst(id_col_const->size(), (*array)[0].get<Array>());
|
||||
}
|
||||
else
|
||||
throw Exception{"Second argument of function " + getName() + " must be UInt64", ErrorCodes::ILLEGAL_COLUMN};
|
||||
ColumnPtr result = dictionary->getHierarchy(arguments[1].column, std::make_shared<DataTypeUInt64>());
|
||||
return result;
|
||||
}
|
||||
|
||||
mutable FunctionDictHelper helper;
|
||||
@ -877,16 +938,22 @@ private:
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception{"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName()
|
||||
+ ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of first argument of function ({}). Expected String. Actual type ({})",
|
||||
getName(),
|
||||
arguments[0]->getName());
|
||||
|
||||
if (!WhichDataType(arguments[1]).isUInt64())
|
||||
throw Exception{"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName()
|
||||
+ ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})",
|
||||
getName(),
|
||||
arguments[1]->getName());
|
||||
|
||||
if (!WhichDataType(arguments[2]).isUInt64())
|
||||
throw Exception{"Illegal type " + arguments[2]->getName() + " of third argument of function " + getName()
|
||||
+ ", must be UInt64.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of third argument of function ({}). Expected UInt64. Actual type ({})",
|
||||
getName(),
|
||||
arguments[2]->getName());
|
||||
|
||||
return std::make_shared<DataTypeUInt8>();
|
||||
}
|
||||
@ -898,105 +965,163 @@ private:
|
||||
if (input_rows_count == 0)
|
||||
return result_type->createColumn();
|
||||
|
||||
auto dict = helper.getDictionary(arguments[0]);
|
||||
auto dict = helper.getDictionary(arguments[0].column);
|
||||
|
||||
ColumnPtr res;
|
||||
if (!((res = executeDispatch<FlatDictionary>(arguments, dict))
|
||||
|| (res = executeDispatch<DirectDictionary<DictionaryKeyType::simple>>(arguments, dict))
|
||||
|| (res = executeDispatch<HashedDictionary>(arguments, dict))
|
||||
|| (res = executeDispatch<CacheDictionary<DictionaryKeyType::simple>>(arguments, dict))))
|
||||
throw Exception{"Unsupported dictionary type " + dict->getTypeName(), ErrorCodes::UNKNOWN_TYPE};
|
||||
if (!dict->hasHierarchy())
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary ({}) does not support hierarchy", dict->getFullName());
|
||||
|
||||
ColumnPtr res = dict->isInHierarchy(arguments[1].column, arguments[2].column, std::make_shared<DataTypeUInt64>());
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename DictionaryType>
|
||||
ColumnPtr executeDispatch(const ColumnsWithTypeAndName & arguments, const std::shared_ptr<const IDictionaryBase> & dict_ptr) const
|
||||
mutable FunctionDictHelper helper;
|
||||
};
|
||||
|
||||
class FunctionDictGetChildren final : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "dictGetChildren";
|
||||
|
||||
static FunctionPtr create(const Context & context)
|
||||
{
|
||||
const auto * dict = typeid_cast<const DictionaryType *>(dict_ptr.get());
|
||||
if (!dict)
|
||||
return nullptr;
|
||||
|
||||
if (!dict->hasHierarchy())
|
||||
throw Exception{"Dictionary does not have a hierarchy", ErrorCodes::UNSUPPORTED_METHOD};
|
||||
|
||||
const auto * child_id_col_untyped = arguments[1].column.get();
|
||||
const auto * ancestor_id_col_untyped = arguments[2].column.get();
|
||||
|
||||
if (const auto * child_id_col = checkAndGetColumn<ColumnUInt64>(child_id_col_untyped))
|
||||
return execute(dict, child_id_col, ancestor_id_col_untyped);
|
||||
else if (const auto * child_id_col_const = checkAndGetColumnConst<ColumnVector<UInt64>>(child_id_col_untyped))
|
||||
return execute(dict, child_id_col_const, ancestor_id_col_untyped);
|
||||
else
|
||||
throw Exception{"Illegal column " + child_id_col_untyped->getName()
|
||||
+ " of second argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
|
||||
return std::make_shared<FunctionDictGetChildren>(context);
|
||||
}
|
||||
|
||||
template <typename DictionaryType>
|
||||
ColumnPtr execute(const DictionaryType * dict,
|
||||
const ColumnUInt64 * child_id_col, const IColumn * ancestor_id_col_untyped) const
|
||||
explicit FunctionDictGetChildren(const Context & context_)
|
||||
: helper(context_) {}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
private:
|
||||
size_t getNumberOfArguments() const override { return 2; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const final { return true; }
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0}; }
|
||||
bool isDeterministic() const override { return false; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (const auto * ancestor_id_col = checkAndGetColumn<ColumnUInt64>(ancestor_id_col_untyped))
|
||||
{
|
||||
auto out = ColumnUInt8::create();
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of first argument of function ({}). Expected String. Actual type ({})",
|
||||
getName(),
|
||||
arguments[0]->getName());
|
||||
|
||||
const auto & child_ids = child_id_col->getData();
|
||||
const auto & ancestor_ids = ancestor_id_col->getData();
|
||||
auto & data = out->getData();
|
||||
const auto size = child_id_col->size();
|
||||
data.resize(size);
|
||||
if (!WhichDataType(arguments[1]).isUInt64())
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})",
|
||||
getName(),
|
||||
arguments[1]->getName());
|
||||
|
||||
dict->isInVectorVector(child_ids, ancestor_ids, data);
|
||||
return out;
|
||||
}
|
||||
else if (const auto * ancestor_id_col_const = checkAndGetColumnConst<ColumnVector<UInt64>>(ancestor_id_col_untyped))
|
||||
{
|
||||
auto out = ColumnUInt8::create();
|
||||
|
||||
const auto & child_ids = child_id_col->getData();
|
||||
const auto ancestor_id = ancestor_id_col_const->getValue<UInt64>();
|
||||
auto & data = out->getData();
|
||||
const auto size = child_id_col->size();
|
||||
data.resize(size);
|
||||
|
||||
dict->isInVectorConstant(child_ids, ancestor_id, data);
|
||||
return out;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception{"Illegal column " + ancestor_id_col_untyped->getName()
|
||||
+ " of third argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
|
||||
}
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
|
||||
}
|
||||
|
||||
template <typename DictionaryType>
|
||||
ColumnPtr execute(const DictionaryType * dict, const ColumnConst * child_id_col, const IColumn * ancestor_id_col_untyped) const
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
|
||||
{
|
||||
if (const auto * ancestor_id_col = checkAndGetColumn<ColumnUInt64>(ancestor_id_col_untyped))
|
||||
if (input_rows_count == 0)
|
||||
return result_type->createColumn();
|
||||
|
||||
auto dictionary = helper.getDictionary(arguments[0].column);
|
||||
|
||||
if (!dictionary->hasHierarchy())
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
|
||||
"Dictionary ({}) does not support hierarchy",
|
||||
dictionary->getFullName());
|
||||
|
||||
ColumnPtr result = dictionary->getDescendants(arguments[1].column, std::make_shared<DataTypeUInt64>(), 1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
mutable FunctionDictHelper helper;
|
||||
};
|
||||
|
||||
class FunctionDictGetDescendants final : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "dictGetDescendants";
|
||||
|
||||
static FunctionPtr create(const Context & context)
|
||||
{
|
||||
return std::make_shared<FunctionDictGetDescendants>(context);
|
||||
}
|
||||
|
||||
explicit FunctionDictGetDescendants(const Context & context_)
|
||||
: helper(context_) {}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
private:
|
||||
size_t getNumberOfArguments() const override { return 0; }
|
||||
bool isVariadic() const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const final { return true; }
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0}; }
|
||||
bool isDeterministic() const override { return false; }
|
||||
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
size_t arguments_size = arguments.size();
|
||||
if (arguments_size < 2 || arguments_size > 3)
|
||||
{
|
||||
auto out = ColumnUInt8::create();
|
||||
|
||||
const auto child_id = child_id_col->getValue<UInt64>();
|
||||
const auto & ancestor_ids = ancestor_id_col->getData();
|
||||
auto & data = out->getData();
|
||||
const auto size = child_id_col->size();
|
||||
data.resize(size);
|
||||
|
||||
dict->isInConstantVector(child_id, ancestor_ids, data);
|
||||
return out;
|
||||
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
|
||||
"Illegal arguments size of function ({}). Expects 2 or 3 arguments size. Actual size ({})",
|
||||
getName(),
|
||||
arguments_size);
|
||||
}
|
||||
else if (const auto * ancestor_id_col_const = checkAndGetColumnConst<ColumnVector<UInt64>>(ancestor_id_col_untyped))
|
||||
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of first argument of function ({}). Expected const String. Actual type ({})",
|
||||
getName(),
|
||||
arguments[0]->getName());
|
||||
|
||||
if (!WhichDataType(arguments[1]).isUInt64())
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of second argument of function ({}). Expected UInt64. Actual type ({})",
|
||||
getName(),
|
||||
arguments[1]->getName());
|
||||
|
||||
if (arguments.size() == 3 && !isUnsignedInteger(arguments[2]))
|
||||
{
|
||||
const auto child_id = child_id_col->getValue<UInt64>();
|
||||
const auto ancestor_id = ancestor_id_col_const->getValue<UInt64>();
|
||||
UInt8 res = 0;
|
||||
|
||||
dict->isInConstantConstant(child_id, ancestor_id, res);
|
||||
return DataTypeUInt8().createColumnConst(child_id_col->size(), res);
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of third argument of function ({}). Expected const unsigned integer. Actual type ({})",
|
||||
getName(),
|
||||
arguments[2]->getName());
|
||||
}
|
||||
else
|
||||
throw Exception{"Illegal column " + ancestor_id_col_untyped->getName()
|
||||
+ " of third argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
|
||||
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
|
||||
{
|
||||
if (input_rows_count == 0)
|
||||
return result_type->createColumn();
|
||||
|
||||
auto dictionary = helper.getDictionary(arguments[0].column);
|
||||
|
||||
size_t level = 0;
|
||||
|
||||
if (arguments.size() == 3)
|
||||
{
|
||||
if (!isColumnConst(*arguments[2].column))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type of third argument of function ({}). Expected const unsigned integer.",
|
||||
getName());
|
||||
|
||||
level = static_cast<size_t>(arguments[2].column->get64(0));
|
||||
}
|
||||
|
||||
if (!dictionary->hasHierarchy())
|
||||
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
|
||||
"Dictionary ({}) does not support hierarchy",
|
||||
dictionary->getFullName());
|
||||
|
||||
ColumnPtr res = dictionary->getDescendants(arguments[1].column, std::make_shared<DataTypeUInt64>(), level);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
mutable FunctionDictHelper helper;
|
||||
|
@ -19,10 +19,13 @@ struct ExtractBool
|
||||
struct NameVisitParamExtractBool { static constexpr auto name = "visitParamExtractBool"; };
|
||||
using FunctionVisitParamExtractBool = FunctionsStringSearch<ExtractParamImpl<ExtractBool>, NameVisitParamExtractBool>;
|
||||
|
||||
struct NameSimpleJSONExtractBool { static constexpr auto name = "simpleJSONExtractBool"; };
|
||||
using FunctionSimpleJSONExtractBool = FunctionsStringSearch<ExtractParamImpl<ExtractBool>, NameSimpleJSONExtractBool>;
|
||||
|
||||
void registerFunctionVisitParamExtractBool(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionVisitParamExtractBool>();
|
||||
factory.registerFunction<FunctionSimpleJSONExtractBool>();
|
||||
}
|
||||
|
||||
}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user