Merge branch 'master' into ldap-role-mapping

* master: (207 commits)
  Update RadixSort.h
  rerun tests to be sure
  Update date_time_short perf test for toUnixTimestamp(Date())
  update test
  remove comments
  better
  fix tests
  style
  update copy pasted test
  better
  comments
  better merge
  new interface for the function
  better
  Fix comments
  Add missing file
  Make the code less bad
  initial
  test added
  style
  ...
Commit 81280072df by Denis Glazachev, 2020-11-26 18:29:13 +04:00
274 changed files with 11264 additions and 1917 deletions

2
.gitignore vendored
View File

@ -124,3 +124,5 @@ website/package-lock.json
# Toolchains
/cmake/toolchain/*
*.iml

6
.gitmodules vendored
View File

@ -44,6 +44,7 @@
[submodule "contrib/protobuf"]
path = contrib/protobuf
url = https://github.com/ClickHouse-Extras/protobuf.git
branch = v3.13.0.1
[submodule "contrib/boost"]
path = contrib/boost
url = https://github.com/ClickHouse-Extras/boost.git
@ -107,6 +108,7 @@
[submodule "contrib/grpc"]
path = contrib/grpc
url = https://github.com/ClickHouse-Extras/grpc.git
branch = v1.33.2
[submodule "contrib/aws"]
path = contrib/aws
url = https://github.com/ClickHouse-Extras/aws-sdk-cpp.git
@ -200,3 +202,7 @@
[submodule "contrib/xz"]
path = contrib/xz
url = https://github.com/xz-mirror/xz
[submodule "contrib/abseil-cpp"]
path = contrib/abseil-cpp
url = https://github.com/ClickHouse-Extras/abseil-cpp.git
branch = lts_2020_02_25

View File

@ -3,7 +3,6 @@
/// Macros for convenient usage of Poco logger.
#include <fmt/format.h>
#include <fmt/ostream.h>
#include <Poco/Logger.h>
#include <Poco/Message.h>
#include <Common/CurrentThread.h>

View File

@ -0,0 +1,19 @@
#define _GNU_SOURCE
#include <sys/socket.h>
#include <errno.h>
#include <fcntl.h>
#include "syscall.h"
int accept4(int fd, struct sockaddr *restrict addr, socklen_t *restrict len, int flg)
{
if (!flg) return accept(fd, addr, len);
int ret = socketcall_cp(accept4, fd, addr, len, flg, 0, 0);
if (ret>=0 || (errno != ENOSYS && errno != EINVAL)) return ret;
ret = accept(fd, addr, len);
if (ret<0) return ret;
if (flg & SOCK_CLOEXEC)
__syscall(SYS_fcntl, ret, F_SETFD, FD_CLOEXEC);
if (flg & SOCK_NONBLOCK)
__syscall(SYS_fcntl, ret, F_SETFL, O_NONBLOCK);
return ret;
}

View File

@ -0,0 +1,37 @@
#include <sys/epoll.h>
#include <signal.h>
#include <errno.h>
#include "syscall.h"
int epoll_create(int size)
{
return epoll_create1(0);
}
int epoll_create1(int flags)
{
int r = __syscall(SYS_epoll_create1, flags);
#ifdef SYS_epoll_create
if (r==-ENOSYS && !flags) r = __syscall(SYS_epoll_create, 1);
#endif
return __syscall_ret(r);
}
int epoll_ctl(int fd, int op, int fd2, struct epoll_event *ev)
{
return syscall(SYS_epoll_ctl, fd, op, fd2, ev);
}
int epoll_pwait(int fd, struct epoll_event *ev, int cnt, int to, const sigset_t *sigs)
{
int r = __syscall(SYS_epoll_pwait, fd, ev, cnt, to, sigs, _NSIG/8);
#ifdef SYS_epoll_wait
if (r==-ENOSYS && !sigs) r = __syscall(SYS_epoll_wait, fd, ev, cnt, to);
#endif
return __syscall_ret(r);
}
int epoll_wait(int fd, struct epoll_event *ev, int cnt, int to)
{
return epoll_pwait(fd, ev, cnt, to, 0);
}

View File

@ -0,0 +1,23 @@
#include <sys/eventfd.h>
#include <unistd.h>
#include <errno.h>
#include "syscall.h"
int eventfd(unsigned int count, int flags)
{
int r = __syscall(SYS_eventfd2, count, flags);
#ifdef SYS_eventfd
if (r==-ENOSYS && !flags) r = __syscall(SYS_eventfd, count);
#endif
return __syscall_ret(r);
}
int eventfd_read(int fd, eventfd_t *value)
{
return (sizeof(*value) == read(fd, value, sizeof(*value))) ? 0 : -1;
}
int eventfd_write(int fd, eventfd_t value)
{
return (sizeof(value) == write(fd, &value, sizeof(value))) ? 0 : -1;
}

View File

@ -0,0 +1,45 @@
#include <sys/auxv.h>
#include <unistd.h> // __environ
#include <errno.h>
// We don't have libc struct available here. Compute aux vector manually.
static unsigned long * __auxv = NULL;
static unsigned long __auxv_secure = 0;
static size_t __find_auxv(unsigned long type)
{
size_t i;
for (i = 0; __auxv[i]; i += 2)
{
if (__auxv[i] == type)
return i + 1;
}
return (size_t) -1;
}
__attribute__((constructor)) static void __auxv_init()
{
size_t i;
for (i = 0; __environ[i]; i++);
__auxv = (unsigned long *) (__environ + i + 1);
size_t secure_idx = __find_auxv(AT_SECURE);
if (secure_idx != ((size_t) -1))
__auxv_secure = __auxv[secure_idx];
}
unsigned long getauxval(unsigned long type)
{
if (type == AT_SECURE)
return __auxv_secure;
if (__auxv)
{
size_t index = __find_auxv(type);
if (index != ((size_t) -1))
return __auxv[index];
}
errno = ENOENT;
return 0;
}

View File

@ -0,0 +1,8 @@
#define _GNU_SOURCE
#include <stdlib.h>
#include <sys/auxv.h>
char * secure_getenv(const char * name)
{
return getauxval(AT_SECURE) ? NULL : getenv(name);
}

View File

@ -13,3 +13,11 @@ long __syscall(syscall_arg_t, ...);
__attribute__((visibility("hidden")))
void *__vdsosym(const char *, const char *);
#define syscall(...) __syscall_ret(__syscall(__VA_ARGS__))
#define socketcall(...) __syscall_ret(__socketcall(__VA_ARGS__))
#define __socketcall(nm,a,b,c,d,e,f) __syscall(SYS_##nm, a, b, c, d, e, f)
#define socketcall_cp socketcall

View File

@ -40,24 +40,10 @@ static int checkver(Verdef *def, int vsym, const char *vername, char *strings)
#define OK_TYPES (1<<STT_NOTYPE | 1<<STT_OBJECT | 1<<STT_FUNC | 1<<STT_COMMON)
#define OK_BINDS (1<<STB_GLOBAL | 1<<STB_WEAK | 1<<STB_GNU_UNIQUE)
extern char** environ;
static Ehdr *eh = NULL;
void *__vdsosym(const char *vername, const char *name);
// We don't have libc struct available here. Compute aux vector manually.
__attribute__((constructor)) static void auxv_init()
{
size_t i, *auxv;
for (i=0; environ[i]; i++);
auxv = (void *)(environ+i+1);
for (i=0; auxv[i] != AT_SYSINFO_EHDR; i+=2)
if (!auxv[i]) return;
if (!auxv[i+1]) return;
eh = (void *)auxv[i+1];
}
void *__vdsosym(const char *vername, const char *name)
{
size_t i;
Ehdr * eh = (void *) getauxval(AT_SYSINFO_EHDR);
if (!eh) return 0;
Phdr *ph = (void *)((char *)eh + eh->e_phoff);
size_t *dynv=0, base=-1;

View File

@ -6,11 +6,9 @@ Defines the following variables:
The include directories of the gRPC framework, including the include directories of the C++ wrapper.
``gRPC_LIBRARIES``
The libraries of the gRPC framework.
``gRPC_UNSECURE_LIBRARIES``
The libraries of the gRPC framework without SSL.
``_gRPC_CPP_PLUGIN``
``gRPC_CPP_PLUGIN``
The plugin for generating gRPC client and server C++ stubs from `.proto` files
``_gRPC_PYTHON_PLUGIN``
``gRPC_PYTHON_PLUGIN``
The plugin for generating gRPC client and server Python stubs from `.proto` files
The following :prop_tgt:`IMPORTED` targets are also defined:
@ -19,6 +17,13 @@ The following :prop_tgt:`IMPORTED` targets are also defined:
``grpc_cpp_plugin``
``grpc_python_plugin``
Set the following variables to adjust the behaviour of this script:
``gRPC_USE_UNSECURE_LIBRARIES``
if set, gRPC_LIBRARIES will be filled with the unsecure versions of the libraries (i.e. without SSL)
instead of the secure ones.
``gRPC_DEBUG``
if set, debug messages will be printed.
Add custom commands to process ``.proto`` files to C++::
protobuf_generate_grpc_cpp(<SRCS> <HDRS>
[DESCRIPTORS <DESC>] [EXPORT_MACRO <MACRO>] [<ARGN>...])
@ -242,6 +247,7 @@ find_library(gRPC_LIBRARY NAMES grpc)
find_library(gRPC_CPP_LIBRARY NAMES grpc++)
find_library(gRPC_UNSECURE_LIBRARY NAMES grpc_unsecure)
find_library(gRPC_CPP_UNSECURE_LIBRARY NAMES grpc++_unsecure)
find_library(gRPC_CARES_LIBRARY NAMES cares)
set(gRPC_LIBRARIES)
if(gRPC_USE_UNSECURE_LIBRARIES)
@ -259,6 +265,7 @@ else()
set(gRPC_LIBRARIES ${gRPC_LIBRARIES} ${gRPC_CPP_LIBRARY})
endif()
endif()
set(gRPC_LIBRARIES ${gRPC_LIBRARIES} ${gRPC_CARES_LIBRARY})
# Restore the original find library ordering.
if(gRPC_USE_STATIC_LIBS)
@ -278,11 +285,11 @@ else()
endif()
# Get full path to plugin.
find_program(_gRPC_CPP_PLUGIN
find_program(gRPC_CPP_PLUGIN
NAMES grpc_cpp_plugin
DOC "The plugin for generating gRPC client and server C++ stubs from `.proto` files")
find_program(_gRPC_PYTHON_PLUGIN
find_program(gRPC_PYTHON_PLUGIN
NAMES grpc_python_plugin
DOC "The plugin for generating gRPC client and server Python stubs from `.proto` files")
@ -317,14 +324,14 @@ endif()
#include(FindPackageHandleStandardArgs.cmake)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(gRPC
REQUIRED_VARS gRPC_LIBRARY gRPC_CPP_LIBRARY gRPC_UNSECURE_LIBRARY gRPC_CPP_UNSECURE_LIBRARY
gRPC_INCLUDE_DIR gRPC_CPP_INCLUDE_DIR _gRPC_CPP_PLUGIN _gRPC_PYTHON_PLUGIN)
REQUIRED_VARS gRPC_LIBRARY gRPC_CPP_LIBRARY gRPC_UNSECURE_LIBRARY gRPC_CPP_UNSECURE_LIBRARY gRPC_CARES_LIBRARY
gRPC_INCLUDE_DIR gRPC_CPP_INCLUDE_DIR gRPC_CPP_PLUGIN gRPC_PYTHON_PLUGIN)
if(gRPC_FOUND)
if(gRPC_DEBUG)
message(STATUS "gRPC: INCLUDE_DIRS=${gRPC_INCLUDE_DIRS}")
message(STATUS "gRPC: LIBRARIES=${gRPC_LIBRARIES}")
message(STATUS "gRPC: CPP_PLUGIN=${_gRPC_CPP_PLUGIN}")
message(STATUS "gRPC: PYTHON_PLUGIN=${_gRPC_PYTHON_PLUGIN}")
message(STATUS "gRPC: CPP_PLUGIN=${gRPC_CPP_PLUGIN}")
message(STATUS "gRPC: PYTHON_PLUGIN=${gRPC_PYTHON_PLUGIN}")
endif()
endif()

View File

@ -37,8 +37,8 @@ if(NOT USE_INTERNAL_GRPC_LIBRARY)
if(NOT gRPC_INCLUDE_DIRS OR NOT gRPC_LIBRARIES)
message(${RECONFIGURE_MESSAGE_LEVEL} "Can't find system gRPC library")
set(EXTERNAL_GRPC_LIBRARY_FOUND 0)
elseif(NOT _gRPC_CPP_PLUGIN)
message(${RECONFIGURE_MESSAGE_LEVEL} "Can't find system grcp_cpp_plugin")
elseif(NOT gRPC_CPP_PLUGIN)
message(${RECONFIGURE_MESSAGE_LEVEL} "Can't find system grpc_cpp_plugin")
set(EXTERNAL_GRPC_LIBRARY_FOUND 0)
else()
set(EXTERNAL_GRPC_LIBRARY_FOUND 1)
@ -53,8 +53,8 @@ if(NOT EXTERNAL_GRPC_LIBRARY_FOUND AND NOT MISSING_INTERNAL_GRPC_LIBRARY)
else()
set(gRPC_LIBRARIES grpc grpc++)
endif()
set(_gRPC_CPP_PLUGIN $<TARGET_FILE:grpc_cpp_plugin>)
set(_gRPC_PROTOC_EXECUTABLE $<TARGET_FILE:protobuf::protoc>)
set(gRPC_CPP_PLUGIN $<TARGET_FILE:grpc_cpp_plugin>)
set(gRPC_PYTHON_PLUGIN $<TARGET_FILE:grpc_python_plugin>)
include("${ClickHouse_SOURCE_DIR}/contrib/grpc-cmake/protobuf_generate_grpc.cmake")
@ -62,4 +62,4 @@ if(NOT EXTERNAL_GRPC_LIBRARY_FOUND AND NOT MISSING_INTERNAL_GRPC_LIBRARY)
set(USE_GRPC 1)
endif()
message(STATUS "Using gRPC=${USE_GRPC}: ${gRPC_INCLUDE_DIRS} : ${gRPC_LIBRARIES} : ${_gRPC_CPP_PLUGIN}")
message(STATUS "Using gRPC=${USE_GRPC}: ${gRPC_INCLUDE_DIRS} : ${gRPC_LIBRARIES} : ${gRPC_CPP_PLUGIN}")

1
contrib/abseil-cpp vendored Submodule

@ -0,0 +1 @@
Subproject commit 4f3b686f86c3ebaba7e4e926e62a79cb1c659a54

2
contrib/grpc vendored

@ -1 +1 @@
Subproject commit a6570b863cf76c9699580ba51c7827d5bffaac43
Subproject commit 7436366ceb341ba5c00ea29f1645e02a2b70bf93

View File

@ -1,6 +1,7 @@
set(_gRPC_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/grpc")
set(_gRPC_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/grpc")
# Use re2 from ClickHouse contrib, not from gRPC third_party.
if(NOT RE2_INCLUDE_DIR)
message(FATAL_ERROR " grpc: The location of the \"re2\" library is unknown")
endif()
@ -8,6 +9,7 @@ set(gRPC_RE2_PROVIDER "clickhouse" CACHE STRING "" FORCE)
set(_gRPC_RE2_INCLUDE_DIR "${RE2_INCLUDE_DIR}")
set(_gRPC_RE2_LIBRARIES "${RE2_LIBRARY}")
# Use zlib from ClickHouse contrib, not from gRPC third_party.
if(NOT ZLIB_INCLUDE_DIRS)
message(FATAL_ERROR " grpc: The location of the \"zlib\" library is unknown")
endif()
@ -15,6 +17,7 @@ set(gRPC_ZLIB_PROVIDER "clickhouse" CACHE STRING "" FORCE)
set(_gRPC_ZLIB_INCLUDE_DIR "${ZLIB_INCLUDE_DIRS}")
set(_gRPC_ZLIB_LIBRARIES "${ZLIB_LIBRARIES}")
# Use protobuf from ClickHouse contrib, not from gRPC third_party.
if(NOT Protobuf_INCLUDE_DIR OR NOT Protobuf_LIBRARY)
message(FATAL_ERROR " grpc: The location of the \"protobuf\" library is unknown")
elseif (NOT Protobuf_PROTOC_EXECUTABLE)
@ -29,21 +32,33 @@ set(_gRPC_PROTOBUF_PROTOC "protoc")
set(_gRPC_PROTOBUF_PROTOC_EXECUTABLE "${Protobuf_PROTOC_EXECUTABLE}")
set(_gRPC_PROTOBUF_PROTOC_LIBRARIES "${Protobuf_PROTOC_LIBRARY}")
# Use OpenSSL from ClickHouse contrib, not from gRPC third_party.
set(gRPC_SSL_PROVIDER "clickhouse" CACHE STRING "" FORCE)
set(_gRPC_SSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR})
set(_gRPC_SSL_LIBRARIES ${OPENSSL_LIBRARIES})
# Use abseil-cpp from ClickHouse contrib, not from gRPC third_party.
set(gRPC_ABSL_PROVIDER "clickhouse" CACHE STRING "" FORCE)
set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp")
if(NOT EXISTS "${ABSL_ROOT_DIR}/CMakeLists.txt")
message(FATAL_ERROR " grpc: submodule third_party/abseil-cpp is missing. To fix try run: \n git submodule update --init --recursive")
endif()
add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp")
# Choose to build static or shared library for c-ares.
if (MAKE_STATIC_LIBRARIES)
set(CARES_STATIC ON CACHE BOOL "" FORCE)
set(CARES_SHARED OFF CACHE BOOL "" FORCE)
else ()
set(CARES_STATIC OFF CACHE BOOL "" FORCE)
set(CARES_SHARED ON CACHE BOOL "" FORCE)
endif ()
# We don't want to build C# extensions.
set(gRPC_BUILD_CSHARP_EXT OFF)
# We don't want to build abseil tests, so we temporarily switch BUILD_TESTING off.
set(_gRPC_ORIG_BUILD_TESTING ${BUILD_TESTING})
set(BUILD_TESTING OFF)
add_subdirectory("${_gRPC_SOURCE_DIR}" "${_gRPC_BINARY_DIR}")
set(BUILD_TESTING ${_gRPC_ORIG_BUILD_TESTING})
# The contrib/grpc/CMakeLists.txt redefined the PROTOBUF_GENERATE_GRPC_CPP() function for its own purposes,
# so we need to redefine it back.
include("${ClickHouse_SOURCE_DIR}/contrib/grpc-cmake/protobuf_generate_grpc.cmake")

View File

@ -22,7 +22,16 @@ set_source_files_properties(${LIBUNWIND_C_SOURCES} PROPERTIES COMPILE_FLAGS "-st
set(LIBUNWIND_ASM_SOURCES
${LIBUNWIND_SOURCE_DIR}/src/UnwindRegistersRestore.S
${LIBUNWIND_SOURCE_DIR}/src/UnwindRegistersSave.S)
set_source_files_properties(${LIBUNWIND_ASM_SOURCES} PROPERTIES LANGUAGE C)
# CMake doesn't pass the correct architecture for Apple prior to CMake 3.19 [1].
# Work around this by compiling the assembly sources as C.
#
# [1]: https://gitlab.kitware.com/cmake/cmake/-/issues/20771
if (APPLE AND CMAKE_VERSION VERSION_LESS 3.19)
set_source_files_properties(${LIBUNWIND_ASM_SOURCES} PROPERTIES LANGUAGE C)
else()
enable_language(ASM)
endif()
set(LIBUNWIND_SOURCES
${LIBUNWIND_CXX_SOURCES}

2
contrib/protobuf vendored

@ -1 +1 @@
Subproject commit 445d1ae73a450b1e94622e7040989aa2048402e3
Subproject commit 73b12814204ad9068ba352914d0dc244648b48ee

View File

@ -56,6 +56,7 @@ RUN apt-get update \
libprotoc-dev \
libgrpc++-dev \
protobuf-compiler-grpc \
libc-ares-dev \
rapidjson-dev \
libsnappy-dev \
libparquet-dev \

View File

@ -288,6 +288,7 @@ TESTS_TO_SKIP=(
# Require python libraries like scipy, pandas and numpy
01322_ttest_scipy
01561_mann_whitney_scipy
01545_system_errors
# Checks system.errors

View File

@ -10,6 +10,11 @@ RUN apt-get update --yes \
gpg-agent \
debsig-verify \
strace \
protobuf-compiler \
protobuf-compiler-grpc \
libprotoc-dev \
libgrpc++-dev \
libc-ares-dev \
--yes --no-install-recommends
#RUN wget -nv -O - http://files.viva64.com/etc/pubkey.txt | sudo apt-key add -
@ -33,7 +38,8 @@ RUN set -x \
&& dpkg -i "${PKG_VERSION}.deb"
CMD echo "Running PVS version $PKG_VERSION" && cd /repo_folder && pvs-studio-analyzer credentials $LICENCE_NAME $LICENCE_KEY -o ./licence.lic \
&& cmake . -D"ENABLE_EMBEDDED_COMPILER"=OFF && ninja re2_st \
&& cmake . -D"ENABLE_EMBEDDED_COMPILER"=OFF -D"USE_INTERNAL_PROTOBUF_LIBRARY"=OFF -D"USE_INTERNAL_GRPC_LIBRARY"=OFF \
&& ninja re2_st clickhouse_grpc_protos \
&& pvs-studio-analyzer analyze -o pvs-studio.log -e contrib -j 4 -l ./licence.lic; \
plog-converter -a GA:1,2 -t fullhtml -o /test_output/pvs-studio-html-report pvs-studio.log; \
plog-converter -a GA:1,2 -t tasklist -o /test_output/pvs-studio-task-report.txt pvs-studio.log

View File

@ -152,7 +152,7 @@ You can specify default arguments for `Replicated` table engine in the server co
```xml
<default_replica_path>/clickhouse/tables/{shard}/{database}/{table}</default_replica_path>
<default_replica_name>{replica}</default_replica_path>
<default_replica_name>{replica}</default_replica_name>
```
In this case, you can omit arguments when creating tables:
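A minimal sketch of the shorter form this enables (the table and column names here are hypothetical, not from the original page):

```sql
-- With default_replica_path and default_replica_name configured as above,
-- the ReplicatedMergeTree arguments can be omitted entirely.
CREATE TABLE events (dt Date, id UInt64)
ENGINE = ReplicatedMergeTree
ORDER BY id;
```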

View File

@ -11,7 +11,7 @@ By going through this tutorial, you'll learn how to set up a simple ClickHouse
## Single Node Setup {#single-node-setup}
To postpone the complexities of a distributed environment, we'll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](../getting-started/install.md#install-from-deb-packages) or [rpm](../getting-started/install.md#from-rpm-packages) packages, but there are [alternatives](../getting-started/install.md#from-docker-image) for the operating systems that do no support them.
To postpone the complexities of a distributed environment, we'll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](../getting-started/install.md#install-from-deb-packages) or [rpm](../getting-started/install.md#from-rpm-packages) packages, but there are [alternatives](../getting-started/install.md#from-docker-image) for the operating systems that do not support them.
For example, you have chosen `deb` packages and executed:

View File

@ -44,11 +44,10 @@ stages, such as query planning or distributed queries.
To be useful, the tracing information has to be exported to a monitoring system
that supports OpenTelemetry, such as Jaeger or Prometheus. ClickHouse avoids
a dependency on a particular monitoring system, instead only
providing the tracing data conforming to the standard. A natural way to do so
in an SQL RDBMS is a system table. OpenTelemetry trace span information
a dependency on a particular monitoring system, instead only providing the
tracing data through a system table. OpenTelemetry trace span information
[required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span)
is stored in the system table called `system.opentelemetry_span_log`.
is stored in the `system.opentelemetry_span_log` table.
The table must be enabled in the server configuration, see the `opentelemetry_span_log`
element in the default config file `config.xml`. It is enabled by default.
@ -67,3 +66,31 @@ The table has the following columns:
The tags or attributes are saved as two parallel arrays, containing the keys
and values. Use `ARRAY JOIN` to work with them.
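For example, a minimal sketch of flattening the attribute arrays into key/value rows (the `attribute.names`/`attribute.values` columns also appear in the materialized-view example below):

```sql
SELECT
    operation_name,
    name,
    value
FROM system.opentelemetry_span_log
ARRAY JOIN attribute.names AS name, attribute.values AS value;
```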
## Integration with monitoring systems
At the moment, there is no ready tool that can export the tracing data from
ClickHouse to a monitoring system.
For testing, it is possible to set up the export using a materialized view with the URL engine over the `system.opentelemetry_span_log` table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format:
```sql
CREATE MATERIALIZED VIEW default.zipkin_spans
ENGINE = URL('http://127.0.0.1:9411/api/v2/spans', 'JSONEachRow')
SETTINGS output_format_json_named_tuples_as_objects = 1,
output_format_json_array_of_rows = 1 AS
SELECT
lower(hex(reinterpretAsFixedString(trace_id))) AS traceId,
lower(hex(parent_span_id)) AS parentId,
lower(hex(span_id)) AS id,
operation_name AS name,
start_time_us AS timestamp,
finish_time_us - start_time_us AS duration,
cast(tuple('clickhouse'), 'Tuple(serviceName text)') AS localEndpoint,
cast(tuple(
attribute.values[indexOf(attribute.names, 'db.statement')]),
'Tuple("db.statement" text)') AS tags
FROM system.opentelemetry_span_log
```
If an error occurs, the affected part of the log data is silently lost. Check the server log for error messages if the data does not arrive.
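Before relying on the export, a quick sanity check is to query the span log directly; a minimal sketch using the timing columns from the view above:

```sql
-- Most recent spans first; duration derived from the microsecond timestamps.
SELECT
    operation_name,
    finish_time_us - start_time_us AS duration_us
FROM system.opentelemetry_span_log
ORDER BY start_time_us DESC
LIMIT 10;
```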

View File

@ -25,7 +25,7 @@ SELECT [DISTINCT] expr_list
[ORDER BY expr_list] [WITH FILL] [FROM expr] [TO expr] [STEP expr]
[LIMIT [offset_value, ]n BY columns]
[LIMIT [n, ]m] [WITH TIES]
[UNION ALL ...]
[UNION ...]
[INTO OUTFILE filename]
[FORMAT format]
```
@ -46,7 +46,7 @@ Specifics of each optional clause are covered in separate sections, which are li
- [SELECT clause](#select-clause)
- [DISTINCT clause](../../../sql-reference/statements/select/distinct.md)
- [LIMIT clause](../../../sql-reference/statements/select/limit.md)
- [UNION ALL clause](../../../sql-reference/statements/select/union-all.md)
- [UNION clause](../../../sql-reference/statements/select/union-all.md)
- [INTO OUTFILE clause](../../../sql-reference/statements/select/into-outfile.md)
- [FORMAT clause](../../../sql-reference/statements/select/format.md)

View File

@ -1,5 +1,5 @@
---
toc_title: UNION ALL
toc_title: UNION
---
# UNION ALL Clause {#union-all-clause}
@ -25,10 +25,13 @@ Type casting is performed for unions. For example, if two queries being combined
Queries that are parts of `UNION ALL` can't be enclosed in round brackets. [ORDER BY](../../../sql-reference/statements/select/order-by.md) and [LIMIT](../../../sql-reference/statements/select/limit.md) are applied to separate queries, not to the final result. If you need to apply a conversion to the final result, you can put all the queries with `UNION ALL` in a subquery in the [FROM](../../../sql-reference/statements/select/from.md) clause.
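For illustration, a small sketch of that workaround (the values are made up):

```sql
-- ORDER BY applies to the whole union only when it is wrapped in a subquery.
SELECT x FROM
(
    SELECT 1 AS x
    UNION ALL
    SELECT 2
)
ORDER BY x;
```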
## Limitations {#limitations}
# UNION DISTINCT Clause {#union-distinct-clause}
The difference between `UNION ALL` and `UNION DISTINCT` is that `UNION DISTINCT` performs a distinct transform on the union result; it is equivalent to `SELECT DISTINCT` from a subquery containing `UNION ALL`.
# UNION Clause {#union-clause}
By default, `UNION` has the same behavior as `UNION DISTINCT`, but you can specify the union mode with the `union_default_mode` setting; its values can be 'ALL', 'DISTINCT', or an empty string. However, if you use `UNION` with `union_default_mode` set to the empty string, it throws an exception.
Only `UNION ALL` is supported. The regular `UNION` (`UNION DISTINCT`) is not supported. If you need `UNION DISTINCT`, you can write `SELECT DISTINCT` from a subquery containing `UNION ALL`.
## Implementation Details {#implementation-details}
Queries that are parts of `UNION ALL` can be run simultaneously, and their results can be mixed together.
Queries that are parts of `UNION/UNION ALL/UNION DISTINCT` can be run simultaneously, and their results can be mixed together.
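A short illustration of the modes described above (a sketch, assuming a server where the `union_default_mode` setting is available):

```sql
SELECT 1 UNION ALL SELECT 1;       -- keeps duplicates: two rows
SELECT 1 UNION DISTINCT SELECT 1;  -- deduplicates: one row

-- Plain UNION follows union_default_mode; the default empty value
-- makes it throw, so pick a mode first.
SET union_default_mode = 'ALL';
SELECT 1 UNION SELECT 1;           -- now behaves as UNION ALL
```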

View File

@ -4,53 +4,50 @@ ClickHouse is a column-oriented database management system (DBMS) for online analytical processing (OLAP)
In a traditional row-oriented DBMS, data is stored in this order:
| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime |
|-----|-------------|------------|--------------------|-----------|---------------------|
| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 |
| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 |
| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 |
| #N | … | … | … | … | … |
In other words, all the values belonging to a row are physically stored next to each other.
Common row-oriented DBMSs include `MySQL`, `Postgres`, and `MS SQL Server`.
In a column-oriented DBMS, data is stored in this order:
| Row: | #0 | #1 | #2 | #N |
|-------------|---------------------|---------------------|---------------------|-----|
| WatchID: | 89354350662 | 90329509958 | 89953706054 | … |
| JavaEnable: | 1 | 0 | 1 | … |
| Title: | Investor Relations | Contact us | Mission | … |
| GoodEvent: | 1 | 1 | 1 | … |
| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … |
These examples only show the order in which data is arranged: values from different columns are stored separately, and data from the same column is stored together.
Examples of column-oriented DBMSs: Vertica, Paraccel (Actian Matrix, Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise, Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, and kdb+.
Different orders for storing data suit different scenarios. The data access scenario covers: what queries are made, how often, and in what proportion; how much data each kind of query reads (rows, columns, and bytes); the relationship between reading data and updates; the size of the working data set and how locally it is used; whether transactions are used and how isolated they are; requirements for data replication and integrity; and the latency and throughput required for each type of query.
The higher the load on the system, the more important it is to customize it for the usage scenario, and the more fine-grained this customization becomes. No single system is equally well suited to all different scenarios. If a system is adaptable to a wide set of scenarios, then under high load it has to handle them all, and a trade-off has to be made. Balance or efficiency?
## Key Properties of the OLAP Scenario {#olapchang-jing-de-guan-jian-te-zheng}
- The vast majority of requests are reads
- Data is updated in fairly large batches (\> 1000 rows) rather than by single rows, or is not updated at all
- Data added to the database is not modified
- For reads, a fairly large number of rows is extracted from the database, but only a small subset of columns
- Wide tables, i.e. each table contains a large number of columns
- Queries are relatively rare (usually hundreds of queries per server per second, or fewer)
- For simple queries, latencies of around 50 ms are allowed
- Column values are fairly small: numbers and short strings (for example, 60 bytes per URL)
- High throughput is required when processing a single query (up to billions of rows per second per server)
- Transactions are not necessary
- Low requirements for data consistency
- There is one big table per query; all tables except it are small
- A query result is significantly smaller than the source data; in other words, data is filtered or aggregated, so the result fits in a single server's RAM
It is easy to see that the OLAP scenario is very different from other common business scenarios (such as OLTP or key-value), so trying to use an OLTP or key-value database to process analytical queries efficiently is not a perfect fit. For example, using an OLAP database to handle analytical requests usually works better than using MongoDB or Redis.

View File

@ -1,6 +1,6 @@
---
toc_priority: 5
toc_title: "ClickHouse Users"
---
# ClickHouse Users {#clickhouse-adopters}

View File

@ -1,3 +1,8 @@
---
toc_priority: 2
toc_title: Features of ClickHouse
---
# Features of ClickHouse {#clickhouse-de-te-xing}
## A True Column-Oriented DBMS {#zhen-zheng-de-lie-shi-shu-ju-ku-guan-li-xi-tong}
@ -12,9 +17,13 @@
Some column-oriented DBMSs (for example, InfiniDB CE and MonetDB) do not use data compression. However, data compression plays a crucial role in achieving excellent performance.
In addition to efficient general-purpose compression codecs with different trade-offs between disk space and CPU consumption, ClickHouse provides [specialized codecs](../sql-reference/statements/create/table.md#create-query-specialized-codecs) for particular kinds of data, which allow ClickHouse to compete with and outperform more niche databases, such as time-series ones.
## Disk Storage of Data {#shu-ju-de-ci-pan-cun-chu}
Many column-oriented databases (such as SAP HANA and Google PowerDrill) can only work in RAM, which leads to a larger hardware budget than is actually needed.
ClickHouse is designed to work on systems with traditional disks, which gives a lower cost per GB of storage, and it also makes full use of SSDs and extra RAM when they are available.
## Parallel Processing on Multiple Cores {#duo-he-xin-bing-xing-chu-li}
@ -27,9 +36,11 @@ ClickHouse uses all available resources on the server to process large queries in the most natural way
## SQL Support {#zhi-chi-sql}
ClickHouse supports a [declarative query language based on SQL](../sql-reference/index.md) that is identical to the [ANSI SQL standard](../sql-reference/ansi.md) in many cases.
Supported queries: [GROUP BY](../sql-reference/statements/select/group-by.md), [ORDER BY](../sql-reference/statements/select/order-by.md), [FROM](../sql-reference/statements/select/from.md), [JOIN](../sql-reference/statements/select/join.md), [IN](../sql-reference/operators/in.md), and non-correlated subqueries.
Correlated (dependent) subqueries and window functions are not supported yet, but will be implemented in the future.
## Vector Engine {#xiang-liang-yin-qing}
@ -55,12 +66,20 @@ ClickHouse provides various ways to trade data precision for query performance:
2. Running approximate queries on a partial sample of the data. In this case, only a small fraction of the data is retrieved from disk.
3. Aggregating over a limited number of randomly chosen keys instead of all of them. Under certain conditions on the key distribution in the data, this provides a reasonably accurate result while using fewer computing resources.
## Adaptive Join Algorithm {#adaptive-join-algorithm}
ClickHouse adaptively chooses how to [JOIN](../sql-reference/statements/select/join.md) multiple tables: it prefers the hash join algorithm and falls back to the merge join algorithm if there is more than one large table.
## Data Replication and Data Integrity Support {#zhi-chi-shu-ju-fu-zhi-he-shu-ju-wan-zheng-xing}
ClickHouse uses asynchronous multi-master replication. After data is written to any available replica, the system distributes it to the other replicas in the background to keep the data identical on all of them. In most cases ClickHouse recovers automatically after a failure; in a few complex cases manual recovery is needed.
For more information, see [Data replication](../engines/table-engines/mergetree-family/replication.md).
## Role-Based Access Control {#role-based-access-control}
ClickHouse implements user account management with SQL queries and allows [role-based access control](../operations/access-rights.md), similar to the ANSI SQL standard and popular relational database management systems.
# Limitations {#clickhouseke-xian-zhi}
1. No full-fledged transactions.

View File

@ -1,3 +1,8 @@
---
toc_priority: 4
toc_title: ClickHouse History
---
# ClickHouse History {#clickhouseli-shi}
ClickHouse was originally developed for [Yandex.Metrica](https://metrica.yandex.com/), [the second-largest web analytics platform in the world](http://w3techs.com/technologies/overview/traffic_analysis/all), and has been continuously used as the core component of that system for many years. So far the system holds more than 13 trillion records in ClickHouse and processes more than 20 billion events daily. It allows querying raw data directly and generating reports on the fly. This article briefly describes the goals of ClickHouse in the early stages of its development.

View File

@ -1,3 +1,8 @@
---
toc_priority: 3
toc_title: ClickHouse Performance
---
# Performance {#performance}
According to internal testing results at Yandex, ClickHouse shows better performance than comparable products of its class. You can view the detailed test results [here](https://clickhouse.tech/benchmark/dbms/).

View File

@ -3,18 +3,18 @@ toc_priority: 60
toc_title: clickhouse-local
---
# ClickHouse Local {#clickhouse-local}
The `clickhouse-local` program lets you perform fast processing on local files without having to deploy and configure a ClickHouse server.
It accepts data representing tables and queries it using the [ClickHouse SQL syntax](../../operations/utilities/clickhouse-local.md).
`clickhouse-local` uses the same core as ClickHouse Server, so it supports most features and the same set of formats and table engines.
By default `clickhouse-local` does not have access to data on the same host, but it supports loading the server configuration with the `--config-file` argument.
!!! warning "Warning"
It is not recommended to load the production server configuration into `clickhouse-local`, because data could be damaged in case of human error.
## Usage {#usage}
@ -26,21 +26,21 @@ clickhouse-local --structure "table_structure" --input-format "format_of_incomin
Arguments:
- `-S`, `--structure`: table structure of the input data.
- `-if`, `--input-format`: input format, `TSV` by default.
- `-f`, `--file`: path to the data, `stdin` by default.
- `-q` `--query`: SQL statements to execute, separated by `;`.
- `-N`, `--table`: table name for the output data, `table` by default.
- `-of`, `--format`, `--output-format`: output format, `TSV` by default.
- `--stacktrace`: whether to dump a stack trace in case of an exception.
- `--verbose`: show detailed information about query execution.
- `-s`: disables `stderr` logging.
- `--config-file`: path to a configuration file in the same format as for the ClickHouse server; the configuration is empty by default.
- `--help`: usage help for `clickhouse-local`.
There is also an argument for each ClickHouse configuration parameter, which can be used instead of `--config-file`.
## Examples {#examples}
``` bash
echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table"
@ -49,7 +49,7 @@ Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec.
3 4
```
Another example, similar to the previous one:
``` bash
$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table"
@ -58,7 +58,22 @@ Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec.
3 4
```
Besides `stdin` and the `--file` argument, you can open any number of files with the [`file` table function](../../sql-reference/table-functions/file.md):
```bash
$ echo 1 | tee 1.tsv
1
$ echo 2 | tee 2.tsv
2
$ clickhouse-local --query "
select * from file('1.tsv', TSV, 'a int') t1
cross join file('2.tsv', TSV, 'b int') t2"
1 2
```
Now let's query the memory usage of each Unix user:
``` bash
$ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty"

View File

@ -112,6 +112,8 @@ add_subdirectory (obfuscator)
add_subdirectory (install)
add_subdirectory (git-import)
#add_subdirectory (grpc-client)
if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
add_subdirectory (odbc-bridge)
endif ()

View File

@ -2515,7 +2515,7 @@ public:
{
std::string traceparent = options["opentelemetry-traceparent"].as<std::string>();
std::string error;
if (!context.getClientInfo().parseTraceparentHeader(
if (!context.getClientInfo().client_trace_context.parseTraceparentHeader(
traceparent, error))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
@ -2526,7 +2526,7 @@ public:
if (options.count("opentelemetry-tracestate"))
{
context.getClientInfo().opentelemetry_tracestate =
context.getClientInfo().client_trace_context.tracestate =
options["opentelemetry-tracestate"].as<std::string>();
}

View File

@ -62,6 +62,9 @@ decltype(auto) ClusterCopier::retry(T && func, UInt64 max_tries)
{
std::exception_ptr exception;
if (max_tries == 0)
throw Exception("Cannot perform zero retries", ErrorCodes::LOGICAL_ERROR);
for (UInt64 try_number = 1; try_number <= max_tries; ++try_number)
{
try
@ -605,7 +608,7 @@ TaskStatus ClusterCopier::tryMoveAllPiecesToDestinationTable(const TaskTable & t
settings_push.replication_alter_partitions_sync = 2;
query_alter_ast_string += " ALTER TABLE " + getQuotedTable(original_table) +
" ATTACH PARTITION " + partition_name +
((partition_name == "'all'") ? " ATTACH PARTITION ID " : " ATTACH PARTITION ") + partition_name +
" FROM " + getQuotedTable(helping_table);
LOG_DEBUG(log, "Executing ALTER query: {}", query_alter_ast_string);
@ -636,7 +639,7 @@ TaskStatus ClusterCopier::tryMoveAllPiecesToDestinationTable(const TaskTable & t
if (!task_table.isReplicatedTable())
{
query_deduplicate_ast_string += " OPTIMIZE TABLE " + getQuotedTable(original_table) +
" PARTITION " + partition_name + " DEDUPLICATE;";
((partition_name == "'all'") ? " PARTITION ID " : " PARTITION ") + partition_name + " DEDUPLICATE;";
LOG_DEBUG(log, "Executing OPTIMIZE DEDUPLICATE query: {}", query_alter_ast_string);
@ -807,7 +810,7 @@ bool ClusterCopier::tryDropPartitionPiece(
DatabaseAndTableName helping_table = DatabaseAndTableName(original_table.first, original_table.second + "_piece_" + toString(current_piece_number));
String query = "ALTER TABLE " + getQuotedTable(helping_table);
query += " DROP PARTITION " + task_partition.name + "";
query += ((task_partition.name == "'all'") ? " DROP PARTITION ID " : " DROP PARTITION ") + task_partition.name + "";
/// TODO: use this statement after servers will be updated up to 1.1.54310
// query += " DROP PARTITION ID '" + task_partition.name + "'";
@ -1567,7 +1570,7 @@ void ClusterCopier::dropParticularPartitionPieceFromAllHelpingTables(const TaskT
DatabaseAndTableName original_table = task_table.table_push;
DatabaseAndTableName helping_table = DatabaseAndTableName(original_table.first, original_table.second + "_piece_" + toString(current_piece_number));
String query = "ALTER TABLE " + getQuotedTable(helping_table) + " DROP PARTITION " + partition_name;
String query = "ALTER TABLE " + getQuotedTable(helping_table) + ((partition_name == "'all'") ? " DROP PARTITION ID " : " DROP PARTITION ") + partition_name;
const ClusterPtr & cluster_push = task_table.cluster_push;
Settings settings_push = task_cluster->settings_push;
@ -1670,14 +1673,24 @@ void ClusterCopier::createShardInternalTables(const ConnectionTimeouts & timeout
std::set<String> ClusterCopier::getShardPartitions(const ConnectionTimeouts & timeouts, TaskShard & task_shard)
{
std::set<String> res;
createShardInternalTables(timeouts, task_shard, false);
TaskTable & task_table = task_shard.task_table;
const String & partition_name = queryToString(task_table.engine_push_partition_key_ast);
if (partition_name == "'all'")
{
res.emplace("'all'");
return res;
}
String query;
{
WriteBufferFromOwnString wb;
wb << "SELECT DISTINCT " << queryToString(task_table.engine_push_partition_key_ast) << " AS partition FROM"
wb << "SELECT DISTINCT " << partition_name << " AS partition FROM"
<< " " << getQuotedTable(task_shard.table_read_shard) << " ORDER BY partition DESC";
query = wb.str();
}
@ -1692,7 +1705,6 @@ std::set<String> ClusterCopier::getShardPartitions(const ConnectionTimeouts & ti
local_context.setSettings(task_cluster->settings_pull);
Block block = getBlockWithAllStreamData(InterpreterFactory::get(query_ast, local_context)->execute().getInputStream());
std::set<String> res;
if (block)
{
ColumnWithTypeAndName & column = block.getByPosition(0);
@ -1803,7 +1815,7 @@ UInt64 ClusterCopier::executeQueryOnCluster(
if (execution_mode == ClusterExecutionMode::ON_EACH_NODE)
max_successful_executions_per_shard = 0;
std::atomic<size_t> origin_replicas_number;
std::atomic<size_t> origin_replicas_number = 0;
/// We need to execute query on one replica at least
auto do_for_shard = [&] (UInt64 shard_index, Settings shard_settings)

View File

@ -0,0 +1,7 @@
include_directories(${CMAKE_CURRENT_BINARY_DIR})
get_filename_component(rpc_proto "${CMAKE_CURRENT_SOURCE_DIR}/../server/grpc_protos/GrpcConnection.proto" ABSOLUTE)
protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${rpc_proto})
PROTOBUF_GENERATE_GRPC_CPP(GRPC_SRCS GRPC_HDRS ${rpc_proto})
add_executable(grpc-client grpc_client.cpp ${PROTO_SRCS} ${PROTO_HDRS} ${GRPC_SRCS} ${GRPC_HDRS})
target_link_libraries(grpc-client PUBLIC grpc++ PUBLIC libprotobuf PUBLIC daemon)

View File

@ -0,0 +1,173 @@
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <thread>
#include <stdlib.h>
#include <grpc++/channel.h>
#include <grpc++/client_context.h>
#include <grpc++/create_channel.h>
#include <grpc++/security/credentials.h>
#include "GrpcConnection.grpc.pb.h"
class GRPCClient
{
public:
explicit GRPCClient(std::shared_ptr<grpc::Channel> channel)
: stub_(GRPCConnection::GRPC::NewStub(channel))
{}
std::string Query(const GRPCConnection::User& userInfo,
const std::string& query,
std::vector<std::string> insert_data = {})
{
GRPCConnection::QueryRequest request;
grpc::Status status;
GRPCConnection::QueryResponse reply;
grpc::ClientContext context;
auto deadline = std::chrono::system_clock::now() + std::chrono::milliseconds(10000);
context.set_deadline(deadline);
auto user = std::make_unique<GRPCConnection::User>(userInfo);
auto querySettings = std::make_unique<GRPCConnection::QuerySettings>();
int id = rand();
request.set_allocated_user_info(user.release());
// interactive_delay in milliseconds
request.set_interactive_delay(1000);
querySettings->set_query(query);
querySettings->set_format("Values");
querySettings->set_query_id(std::to_string(id));
querySettings->set_data_stream((insert_data.size() != 0));
(*querySettings->mutable_settings())["max_query_size"] = "100";
request.set_allocated_query_info(querySettings.release());
void* got_tag = (void*)1;
bool ok = false;
std::unique_ptr<grpc::ClientReaderWriter<GRPCConnection::QueryRequest, GRPCConnection::QueryResponse> > reader(stub_->Query(&context));
reader->Write(request);
auto write = [&reply, &reader, &insert_data]()
{
GRPCConnection::QueryRequest request_insert;
for (const auto& data : insert_data)
{
request_insert.set_insert_data(data);
if (reply.exception_occured().empty())
{
reader->Write(request_insert);
}
else
{
break;
}
}
request_insert.set_insert_data("");
if (reply.exception_occured().empty())
{
reader->Write(request_insert);
}
// reader->WritesDone();
};
std::thread write_thread(write);
write_thread.detach();
while (reader->Read(&reply))
{
if (!reply.output().empty())
{
std::cout << "Query Part:\n " << id<< reply.output()<<'\n';
}
else if (reply.progress().read_rows()
|| reply.progress().read_bytes()
|| reply.progress().total_rows_to_read()
|| reply.progress().written_rows()
|| reply.progress().written_bytes())
{
std::cout << "Progress " << id<< ":{\n" << "read_rows: " << reply.progress().read_rows() << '\n'
<< "read_bytes: " << reply.progress().read_bytes() << '\n'
<< "total_rows_to_read: " << reply.progress().total_rows_to_read() << '\n'
<< "written_rows: " << reply.progress().written_rows() << '\n'
<< "written_bytes: " << reply.progress().written_bytes() << '\n';
}
else if (!reply.totals().empty())
{
std::cout << "Totals:\n " << id << " " << reply.totals() <<'\n';
}
else if (!reply.extremes().empty())
{
std::cout << "Extremes:\n " << id << " " << reply.extremes() <<'\n';
}
}
if (status.ok() && reply.exception_occured().empty())
{
return "";
}
else if (status.ok() && !reply.exception_occured().empty())
{
return reply.exception_occured();
}
else
{
return "RPC failed";
}
}
private:
std::unique_ptr<GRPCConnection::GRPC::Stub> stub_;
};
int main(int argc, char** argv)
{
GRPCConnection::User userInfo1;
userInfo1.set_user("default");
userInfo1.set_password("");
userInfo1.set_quota("default");
std::cout << "Try: " << argv[1] << std::endl;
grpc::ChannelArguments ch_args;
ch_args.SetMaxReceiveMessageSize(-1);
GRPCClient client(
grpc::CreateCustomChannel(argv[1], grpc::InsecureChannelCredentials(), ch_args));
{
std::cout << client.Query(userInfo1, "CREATE TABLE t (a UInt8) ENGINE = Memory") << std::endl;
std::cout << client.Query(userInfo1, "CREATE TABLE t (a UInt8) ENGINE = Memory") << std::endl;
std::cout << client.Query(userInfo1, "INSERT INTO t VALUES", {"(1),(2),(3)", "(4),(6),(5)"}) << std::endl;
std::cout << client.Query(userInfo1, "INSERT INTO t_not_defined VALUES", {"(1),(2),(3)", "(4),(6),(5)"}) << std::endl;
std::cout << client.Query(userInfo1, "SELECT a FROM t ORDER BY a") << std::endl;
std::cout << client.Query(userInfo1, "DROP TABLE t") << std::endl;
}
{
std::cout << client.Query(userInfo1, "SELECT count() FROM numbers(1)") << std::endl;
std::cout << client.Query(userInfo1, "SELECT 100") << std::endl;
std::cout << client.Query(userInfo1, "SELECT count() FROM numbers(10000000000)") << std::endl;
std::cout << client.Query(userInfo1, "SELECT count() FROM numbers(100)") << std::endl;
}
{
std::cout << client.Query(userInfo1, "CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory;") << std::endl;
std::cout << client.Query(userInfo1, "INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []);") << std::endl;
std::cout << client.Query(userInfo1, "SELECT s FROM arrays_test") << std::endl;
std::cout << client.Query(userInfo1, "DROP TABLE arrays_test") << std::endl;
std::cout << client.Query(userInfo1, "") << std::endl;
}
{//Check null return from pipe
std::cout << client.Query(userInfo1, "CREATE TABLE table2 (x UInt8, y UInt8) ENGINE = Memory;") << std::endl;
std::cout << client.Query(userInfo1, "SELECT x FROM table2") << std::endl;
std::cout << client.Query(userInfo1, "DROP TABLE table2") << std::endl;
}
{//Check Totals
std::cout << client.Query(userInfo1, "CREATE TABLE tabl (x UInt8, y UInt8) ENGINE = Memory;") << std::endl;
std::cout << client.Query(userInfo1, "INSERT INTO tabl VALUES (1, 2), (2, 4), (3, 2), (3, 3), (3, 4);") << std::endl;
std::cout << client.Query(userInfo1, "SELECT sum(x), y FROM tabl GROUP BY y WITH TOTALS") << std::endl;
std::cout << client.Query(userInfo1, "DROP TABLE tabl") << std::endl;
}
return 0;
}

View File

@ -109,6 +109,14 @@ void ODBCBridge::defineOptions(Poco::Util::OptionSet & options)
.argument("err-log-path")
.binding("logger.errorlog"));
options.addOption(Poco::Util::Option("stdout-path", "", "stdout log path, default console")
.argument("stdout-path")
.binding("logger.stdout"));
options.addOption(Poco::Util::Option("stderr-path", "", "stderr log path, default console")
.argument("stderr-path")
.binding("logger.stderr"));
using Me = std::decay_t<decltype(*this)>;
options.addOption(Poco::Util::Option("help", "", "produce this help message")
.binding("help")
@ -127,6 +135,27 @@ void ODBCBridge::initialize(Application & self)
config().setString("logger", "ODBCBridge");
/// Redirect stdout, stderr to specified files.
/// Some libraries and sanitizers write to stderr in case of errors.
const auto stdout_path = config().getString("logger.stdout", "");
if (!stdout_path.empty())
{
if (!freopen(stdout_path.c_str(), "a+", stdout))
throw Poco::OpenFileException("Cannot attach stdout to " + stdout_path);
/// Disable buffering for stdout.
setbuf(stdout, nullptr);
}
const auto stderr_path = config().getString("logger.stderr", "");
if (!stderr_path.empty())
{
if (!freopen(stderr_path.c_str(), "a+", stderr))
throw Poco::OpenFileException("Cannot attach stderr to " + stderr_path);
/// Disable buffering for stderr.
setbuf(stderr, nullptr);
}
buildLoggers(config(), logger(), self.commandName());
BaseDaemon::logRevision();

View File

@ -64,6 +64,7 @@
#include <Common/ThreadFuzzer.h>
#include <Server/MySQLHandlerFactory.h>
#include <Server/PostgreSQLHandlerFactory.h>
#include <Server/ProtocolServerAdapter.h>
#if !defined(ARCADIA_BUILD)
@ -84,6 +85,11 @@
# include <Poco/Net/SecureServerSocket.h>
#endif
#if USE_GRPC
# include <Server/GRPCServer.h>
#endif
namespace CurrentMetrics
{
extern const Metric Revision;
@ -806,7 +812,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
http_params->setTimeout(settings.http_receive_timeout);
http_params->setKeepAliveTimeout(keep_alive_timeout);
std::vector<std::unique_ptr<Poco::Net::TCPServer>> servers;
std::vector<ProtocolServerAdapter> servers;
std::vector<std::string> listen_hosts = DB::getMultipleValuesFromConfig(config(), "", "listen_host");
@ -1035,6 +1041,15 @@ int Server::main(const std::vector<std::string> & /*args*/)
LOG_INFO(log, "Listening for PostgreSQL compatibility protocol: " + address.toString());
});
#if USE_GRPC
create_server("grpc_port", [&](UInt16 port)
{
Poco::Net::SocketAddress server_address(listen_host, port);
servers.emplace_back(std::make_unique<GRPCServer>(*this, make_socket_address(listen_host, port)));
LOG_INFO(log, "Listening for gRPC protocol: " + server_address.toString());
});
#endif
/// Prometheus (if defined and not setup yet with http_port)
create_server("prometheus.port", [&](UInt16 port)
{
@ -1056,7 +1071,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
global_context->enableNamedSessions();
for (auto & server : servers)
server->start();
server.start();
{
String level_str = config().getString("text_log.level", "");
@ -1088,8 +1103,8 @@ int Server::main(const std::vector<std::string> & /*args*/)
int current_connections = 0;
for (auto & server : servers)
{
server->stop();
current_connections += server->currentConnections();
server.stop();
current_connections += server.currentConnections();
}
if (current_connections)
@ -1109,7 +1124,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
{
current_connections = 0;
for (auto & server : servers)
current_connections += server->currentConnections();
current_connections += server.currentConnections();
if (!current_connections)
break;
sleep_current_ms += sleep_one_ms;

View File

@ -134,6 +134,34 @@
<max_connections>4096</max_connections>
<keep_alive_timeout>3</keep_alive_timeout>
<!-- gRPC protocol (see src/Server/grpc_protos/clickhouse_grpc.proto for the API)
<grpc_port>9001</grpc_port>
<grpc>
<enable_ssl>true</enable_ssl> -->
<!-- The following two files are used only if enable_ssl=1
<ssl_cert_file>/path/to/ssl_cert_file</ssl_cert_file>
<ssl_key_file>/path/to/ssl_key_file</ssl_key_file> -->
<!-- Whether server will request client for a certificate
<ssl_require_client_auth>true</ssl_require_client_auth> -->
<!-- The following file is used only if ssl_require_client_auth=1
<ssl_ca_cert_file>/path/to/ssl_ca_cert_file</ssl_ca_cert_file> -->
<!-- Default compression algorithm (applied if client doesn't specify another algorithm).
Supported algorithms: none, deflate, gzip, stream_gzip
<compression>gzip</compression> -->
<!-- Default compression level (applied if client doesn't specify another level).
Supported levels: none, low, medium, high
<compression_level>high</compression_level> -->
<!-- Send/receive message size limits in bytes. -1 means unlimited
<max_send_message_size>-1</max_send_message_size>
<max_receive_message_size>4194304</max_receive_message_size>
</grpc> -->
<!-- Maximum number of concurrent queries. -->
<max_concurrent_queries>100</max_concurrent_queries>

View File

@ -0,0 +1,37 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionMannWhitney.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include "registerAggregateFunctions.h"
#include <AggregateFunctions/Helpers.h>
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
namespace DB
{
namespace
{
AggregateFunctionPtr createAggregateFunctionMannWhitneyUTest(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertBinary(name, argument_types);
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
return std::make_shared<AggregateFunctionMannWhitney>(argument_types, parameters);
}
}
void registerAggregateFunctionMannWhitney(AggregateFunctionFactory & factory)
{
factory.registerFunction("mannWhitneyUTest", createAggregateFunctionMannWhitneyUTest);
}
}

View File

@ -0,0 +1,246 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/FieldVisitors.h>
#include <Common/PODArray_fwd.h>
#include <common/types.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <limits>
#include <DataTypes/DataTypeArray.h>
#include <Common/ArenaAllocator.h>
#include <iostream>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
struct MannWhitneyData : public StatisticalSample<Float64, Float64>
{
/* Since the null hypothesis is "for randomly selected values X and Y from two populations,
* the probability of X being greater than Y is equal to the probability of Y being greater than X",
* or "the distribution F of the first sample equals the distribution G of the second sample",
* the alternatives for this hypothesis (H1) are "two-sided" (F != G), "less" (F < G), and "greater" (F > G). */
enum class Alternative
{
TwoSided,
Less,
Greater
};
/// The behaviour equals to the similar function from scipy.
/// https://github.com/scipy/scipy/blob/ab9e9f17e0b7b2d618c4d4d8402cd4c0c200d6c0/scipy/stats/stats.py#L6978
std::pair<Float64, Float64> getResult(Alternative alternative, bool continuity_correction)
{
ConcatenatedSamples both(this->x, this->y);
RanksArray ranks;
Float64 tie_correction;
/// Compute ranks according to both samples.
std::tie(ranks, tie_correction) = computeRanksAndTieCorrection(both);
const Float64 n1 = this->size_x;
const Float64 n2 = this->size_y;
Float64 r1 = 0;
for (size_t i = 0; i < n1; ++i)
r1 += ranks[i];
const Float64 u1 = n1 * n2 + (n1 * (n1 + 1.)) / 2. - r1;
const Float64 u2 = n1 * n2 - u1;
/// The distribution of U-statistic under null hypothesis H0 is symmetric with respect to meanrank.
const Float64 meanrank = n1 * n2 /2. + 0.5 * continuity_correction;
const Float64 sd = std::sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0);
Float64 u = 0;
if (alternative == Alternative::TwoSided)
/// It doesn't matter which u_i we take as u, because z will differ only in sign, and we take std::abs() of it.
u = std::max(u1, u2);
else if (alternative == Alternative::Less)
u = u1;
else if (alternative == Alternative::Greater)
u = u2;
Float64 z = (u - meanrank) / sd;
if (alternative == Alternative::TwoSided)
z = std::abs(z);
/// In fact the cdf is a probability function, i.e. the integral of the density over (-inf, z].
/// But since the standard normal distribution is symmetric, cdf(0) = 0.5, so we only have to compute the integral over [0, z].
const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);});
Float64 p_value = 0;
if (alternative == Alternative::TwoSided)
p_value = 1 - 2 * cdf;
else
p_value = 0.5 - cdf;
return {u2, p_value};
}
private:
using Sample = typename StatisticalSample<Float64, Float64>::SampleX;
/// We need to compute ranks according to all samples. Use this class to avoid extra copy and memory allocation.
class ConcatenatedSamples
{
public:
ConcatenatedSamples(const Sample & first_, const Sample & second_)
: first(first_), second(second_) {}
const Float64 & operator[](size_t ind) const
{
if (ind < first.size())
return first[ind];
return second[ind % first.size()];
}
size_t size() const
{
return first.size() + second.size();
}
private:
const Sample & first;
const Sample & second;
};
};
class AggregateFunctionMannWhitney final:
public IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney>
{
private:
using Alternative = typename MannWhitneyData::Alternative;
Alternative alternative;
bool continuity_correction{true};
public:
explicit AggregateFunctionMannWhitney(const DataTypes & arguments, const Array & params)
:IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney> ({arguments}, {})
{
if (params.size() > 2)
throw Exception("Aggregate function " + getName() + " require two parameter or less", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (params.empty())
{
alternative = Alternative::TwoSided;
return;
}
if (params[0].getType() != Field::Types::String)
throw Exception("Aggregate function " + getName() + " require require first parameter to be a String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto param = params[0].get<String>();
if (param == "two-sided")
alternative = Alternative::TwoSided;
else if (param == "less")
alternative = Alternative::Less;
else if (param == "greater")
alternative = Alternative::Greater;
else
throw Exception("Unknown parameter in aggregate function " + getName() +
". It must be one of: 'two sided', 'less', 'greater'", ErrorCodes::BAD_ARGUMENTS);
if (params.size() != 2)
return;
if (params[1].getType() != Field::Types::UInt64)
throw Exception("Aggregate function " + getName() + " require require second parameter to be a UInt64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
continuity_correction = static_cast<bool>(params[1].get<UInt64>());
}
String getName() const override
{
return "mannWhitneyUTest";
}
DataTypePtr getReturnType() const override
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"u_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value, arena);
else
this->data(place).addX(value, arena);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
if (!this->data(place).size_x || !this->data(place).size_y)
throw Exception("Aggregate function " + getName() + " require both samples to be non empty", ErrorCodes::BAD_ARGUMENTS);
auto [u_statistic, p_value] = this->data(place).getResult(alternative, continuity_correction);
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(u_statistic);
column_value.getData().push_back(p_value);
}
};
};
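Going by the constructor and `add()` above, a hypothetical invocation (a table `t` with a `Float64` value column and a 0/1 sample index is assumed; the registration name `mannWhitneyUTest` comes from the factory file above):

```sql
-- Returns a tuple (u_statistic, p_value); the optional parameters are the
-- alternative ('two-sided', 'less', 'greater') and the continuity correction.
SELECT mannWhitneyUTest('two-sided', 1)(value, sample_index)
FROM t;
```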

View File

@ -21,23 +21,10 @@ AggregateFunctionPtr createAggregateFunctionRankCorrelation(const std::string &
assertBinary(name, argument_types);
assertNoParameters(name, parameters);
AggregateFunctionPtr res;
if (isDecimal(argument_types[0]) || isDecimal(argument_types[1]))
{
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
}
else
{
res.reset(createWithTwoNumericTypes<AggregateFunctionRankCorrelation>(*argument_types[0], *argument_types[1], argument_types));
}
if (!res)
{
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
}
return res;
return std::make_shared<AggregateFunctionRankCorrelation>(argument_types);
}
}
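For reference, a hypothetical call to the simplified function (assuming it is registered under the name `rankCorr`, which this diff does not show; the table and columns are made up):

```sql
-- Spearman's rank correlation of two numeric columns, returned as Float64.
SELECT rankCorr(x, y) FROM t;
```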

View File

@ -1,73 +1,56 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/FieldVisitors.h>
#include <Common/PODArray_fwd.h>
#include <common/types.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <limits>
#include <DataTypes/DataTypeArray.h>
#include <Common/ArenaAllocator.h>
#include <type_traits>
namespace DB
{
template <template <typename> class Comparator>
struct ComparePairFirst final
struct RankCorrelationData : public StatisticalSample<Float64, Float64>
{
template <typename X, typename Y>
bool operator()(const std::pair<X, Y> & lhs, const std::pair<X, Y> & rhs) const
Float64 getResult()
{
return Comparator<X>{}(lhs.first, rhs.first);
RanksArray ranks_x;
std::tie(ranks_x, std::ignore) = computeRanksAndTieCorrection(this->x);
RanksArray ranks_y;
std::tie(ranks_y, std::ignore) = computeRanksAndTieCorrection(this->y);
/// In our case sizes of both samples are equal.
const auto size = this->size_x;
/// Count d^2 sum
Float64 answer = 0;
for (size_t j = 0; j < size; ++j)
answer += (ranks_x[j] - ranks_y[j]) * (ranks_x[j] - ranks_y[j]);
answer *= 6;
answer /= size * (size * size - 1);
answer = 1 - answer;
return answer;
}
};
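As a sanity check on the formula in getResult(), here is a self-contained sketch of Spearman's rho for tie-free data. ranksOfDistinct is a hypothetical stand-in for computeRanksAndTieCorrection, which additionally averages ranks over ties.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

// 1-based ranks for distinct values (no tie handling).
static std::vector<double> ranksOfDistinct(const std::vector<double> & v)
{
    std::vector<size_t> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](size_t a, size_t b) { return v[a] < v[b]; });
    std::vector<double> out(v.size());
    for (size_t i = 0; i < idx.size(); ++i)
        out[idx[i]] = i + 1;
    return out;
}

int main()
{
    const auto rx = ranksOfDistinct({1, 2, 3, 4, 5});
    const auto ry = ranksOfDistinct({2, 1, 4, 3, 5});
    double d2 = 0; // sum of squared rank differences
    for (size_t i = 0; i < rx.size(); ++i)
        d2 += (rx[i] - ry[i]) * (rx[i] - ry[i]);
    const double n = rx.size();
    std::printf("rho = %g\n", 1 - 6 * d2 / (n * (n * n - 1))); // 0.8 for this data
}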
template <template <typename> class Comparator>
struct ComparePairSecond final
{
template <typename X, typename Y>
bool operator()(const std::pair<X, Y> & lhs, const std::pair<X, Y> & rhs) const
{
return Comparator<Y>{}(lhs.second, rhs.second);
}
};
template <typename X = Float64, typename Y = Float64>
struct AggregateFunctionRankCorrelationData final
{
size_t size_x = 0;
using Allocator = MixedAlignedArenaAllocator<alignof(std::pair<X, Y>), 4096>;
using Array = PODArray<std::pair<X, Y>, 32, Allocator>;
Array values;
};
template <typename X, typename Y>
class AggregateFunctionRankCorrelation :
public IAggregateFunctionDataHelper<AggregateFunctionRankCorrelationData<X, Y>, AggregateFunctionRankCorrelation<X, Y>>
public IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation>
{
using Data = AggregateFunctionRankCorrelationData<X, Y>;
using Allocator = MixedAlignedArenaAllocator<alignof(std::pair<Float64, Float64>), 4096>;
using Array = PODArray<std::pair<Float64, Float64>, 32, Allocator>;
public:
explicit AggregateFunctionRankCorrelation(const DataTypes & arguments)
:IAggregateFunctionDataHelper<AggregateFunctionRankCorrelationData<X, Y>,AggregateFunctionRankCorrelation<X, Y>> ({arguments}, {})
:IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation> ({arguments}, {})
{}
String getName() const override
@ -80,24 +63,12 @@ public:
return std::make_shared<DataTypeNumber<Float64>>();
}
void insert(Data & a, const std::pair<X, Y> & x, Arena * arena) const
{
++a.size_x;
a.values.push_back(x, arena);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
auto & a = this->data(place);
auto new_x = assert_cast<const ColumnVector<X> &>(*columns[0]).getData()[row_num];
auto new_y = assert_cast<const ColumnVector<Y> &>(*columns[1]).getData()[row_num];
auto new_arg = std::make_pair(new_x, new_y);
a.size_x += 1;
a.values.push_back(new_arg, arena);
Float64 new_x = columns[0]->getFloat64(row_num);
Float64 new_y = columns[1]->getFloat64(row_num);
this->data(place).addX(new_x, arena);
this->data(place).addY(new_y, arena);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
@ -105,116 +76,22 @@ public:
auto & a = this->data(place);
auto & b = this->data(rhs);
if (b.size_x)
for (size_t i = 0; i < b.size_x; ++i)
insert(a, b.values[i], arena);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
{
const auto & value = this->data(place).values;
size_t size = this->data(place).size_x;
writeVarUInt(size, buf);
buf.write(reinterpret_cast<const char *>(value.data()), size * sizeof(value[0]));
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena * arena) const override
{
size_t size = 0;
readVarUInt(size, buf);
auto & value = this->data(place).values;
value.resize(size, arena);
buf.read(reinterpret_cast<char *>(value.data()), size * sizeof(value[0]));
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * /*arena*/) const override
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
const auto & value = this->data(place).values;
size_t size = this->data(place).size_x;
// create a copy of values not to format data
PODArrayWithStackMemory<std::pair<Float64, Float64>, 32> tmp_values;
tmp_values.resize(size);
for (size_t j = 0; j < size; ++ j)
tmp_values[j] = static_cast<std::pair<Float64, Float64>>(value[j]);
// sort x_values
std::sort(std::begin(tmp_values), std::end(tmp_values), ComparePairFirst<std::greater>{});
for (size_t j = 0; j < size;)
{
// replace x_values with their ranks
size_t rank = j + 1;
size_t same = 1;
size_t cur_sum = rank;
size_t cur_start = j;
while (j < size - 1)
{
if (tmp_values[j].first == tmp_values[j + 1].first)
{
// rank of (j + 1)th number
rank += 1;
++same;
cur_sum += rank;
++j;
}
else
break;
}
// insert rank is calculated as average of ranks of equal values
Float64 insert_rank = static_cast<Float64>(cur_sum) / same;
for (size_t i = cur_start; i <= j; ++i)
tmp_values[i].first = insert_rank;
++j;
}
// sort y_values
std::sort(std::begin(tmp_values), std::end(tmp_values), ComparePairSecond<std::greater>{});
// replace y_values with their ranks
for (size_t j = 0; j < size;)
{
// replace x_values with their ranks
size_t rank = j + 1;
size_t same = 1;
size_t cur_sum = rank;
size_t cur_start = j;
while (j < size - 1)
{
if (tmp_values[j].second == tmp_values[j + 1].second)
{
// rank of (j + 1)th number
rank += 1;
++same;
cur_sum += rank;
++j;
}
else
{
break;
}
}
// insert rank is calculated as average of ranks of equal values
Float64 insert_rank = static_cast<Float64>(cur_sum) / same;
for (size_t i = cur_start; i <= j; ++i)
tmp_values[i].second = insert_rank;
++j;
}
// count d^2 sum
Float64 answer = static_cast<Float64>(0);
for (size_t j = 0; j < size; ++ j)
answer += (tmp_values[j].first - tmp_values[j].second) * (tmp_values[j].first - tmp_values[j].second);
answer *= 6;
answer /= size * (size * size - 1);
answer = 1 - answer;
auto answer = this->data(place).getResult();
auto & column = static_cast<ColumnVector<Float64> &>(to);
column.getData().push_back(answer);

View File

@ -8,6 +8,7 @@
#include <IO/ReadHelpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/Moments.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
@ -30,310 +31,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int DECIMAL_OVERFLOW;
}
/**
Calculating univariate central moments
Levels:
level 2 (pop & samp): var, stddev
level 3: skewness
level 4: kurtosis
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
https://en.wikipedia.org/wiki/Skewness
https://en.wikipedia.org/wiki/Kurtosis
*/
template <typename T, size_t _level>
struct VarMoments
{
T m[_level + 1]{};
void add(T x)
{
++m[0];
m[1] += x;
m[2] += x * x;
if constexpr (_level >= 3) m[3] += x * x * x;
if constexpr (_level >= 4) m[4] += x * x * x * x;
}
void merge(const VarMoments & rhs)
{
m[0] += rhs.m[0];
m[1] += rhs.m[1];
m[2] += rhs.m[2];
if constexpr (_level >= 3) m[3] += rhs.m[3];
if constexpr (_level >= 4) m[4] += rhs.m[4];
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T getPopulation() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
/// Due to numerical errors, the result can be slightly less than zero,
/// but it should be impossible. Trim to zero.
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / m[0]);
}
T getSample() const
{
if (m[0] <= 1)
return std::numeric_limits<T>::quiet_NaN();
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / (m[0] - 1));
}
T getMoment3() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
return (m[3]
- (3 * m[2]
- 2 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
T getMoment4() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
return (m[4]
- (4 * m[3]
- (6 * m[2]
- 3 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
};
template <typename T, size_t _level>
class VarMomentsDecimal
{
public:
using NativeType = typename T::NativeType;
void add(NativeType x)
{
++m0;
getM(1) += x;
NativeType tmp;
bool overflow = common::mulOverflow(x, x, tmp) || common::addOverflow(getM(2), tmp, getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(3), tmp, getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(4), tmp, getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void merge(const VarMomentsDecimal & rhs)
{
m0 += rhs.m0;
getM(1) += rhs.getM(1);
bool overflow = common::addOverflow(getM(2), rhs.getM(2), getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::addOverflow(getM(3), rhs.getM(3), getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::addOverflow(getM(4), rhs.getM(4), getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void write(WriteBuffer & buf) const { writePODBinary(*this, buf); }
void read(ReadBuffer & buf) { readPODBinary(*this, buf); }
Float64 getPopulation(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / m0), scale));
}
Float64 getSample(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::quiet_NaN();
if (m0 == 1)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / (m0 - 1)), scale));
}
Float64 getMoment3(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(2 * getM(1), getM(1), tmp) ||
common::subOverflow(3 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(3), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
Float64 getMoment4(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(3 * getM(1), getM(1), tmp) ||
common::subOverflow(6 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(4 * getM(3), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(4), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
private:
UInt64 m0{};
NativeType m[_level]{};
NativeType & getM(size_t i) { return m[i - 1]; }
const NativeType & getM(size_t i) const { return m[i - 1]; }
};
/**
Calculating multivariate central moments
Levels:
level 2 (pop & samp): covar
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
*/
template <typename T>
struct CovarMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
}
void merge(const CovarMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED getPopulation() const
{
return (xy - x1 * y1 / m0) / m0;
}
T NO_SANITIZE_UNDEFINED getSample() const
{
if (m0 == 0)
return std::numeric_limits<T>::quiet_NaN();
return (xy - x1 * y1 / m0) / (m0 - 1);
}
};
template <typename T>
struct CorrMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
T x2{};
T y2{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
x2 += x * x;
y2 += y * y;
}
void merge(const CorrMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
x2 += rhs.x2;
y2 += rhs.y2;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED get() const
{
return (m0 * xy - x1 * y1) / sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1));
}
};
enum class StatisticsFunctionKind
{
varPop, varSamp,

View File

@ -0,0 +1,77 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionTTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Moments.h>
#include "registerAggregateFunctions.h"
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace DB
{
namespace
{
/** Student T-test applies to two samples of independent random variables
* that have normal distributions with equal (but unknown) variances.
* It allows answering the question of whether the means of the distributions differ.
*
* If variances are not considered equal, Welch T-test should be used instead.
*/
struct StudentTTestData : public TTestMoments<Float64>
{
static constexpr auto name = "studentTTest";
std::pair<Float64, Float64> getResult() const
{
Float64 mean_x = x1 / nx;
Float64 mean_y = y1 / ny;
/// To estimate the variance we first estimate two means.
/// That's why the number of degrees of freedom is the total number of values of both samples minus 2.
Float64 degrees_of_freedom = nx + ny - 2;
/// Calculate s^2
/// The original formula looks like
/// \frac{\sum_{i = 1}^{n_x}{(x_i - \bar{x}) ^ 2} + \sum_{i = 1}^{n_y}{(y_i - \bar{y}) ^ 2}}{n_x + n_y - 2}
/// But we made some mathematical transformations not to store the original sequences.
/// Also we dropped the sqrt, because it will be squared later.
Float64 all_x = x2 + nx * mean_x * mean_x - 2 * mean_x * x1;
Float64 all_y = y2 + ny * mean_y * mean_y - 2 * mean_y * y1;
Float64 s2 = (all_x + all_y) / degrees_of_freedom;
Float64 std_err2 = s2 * (1. / nx + 1. / ny);
/// t-statistic
Float64 t_stat = (mean_x - mean_y) / sqrt(std_err2);
return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
}
};
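The same algebra, sketched from the raw sums (count, sum, sum of squares) to show how TTestMoments gets by without storing the sequences; this is an illustration, not the patch code.

#include <cmath>
#include <cstdio>

static double studentT(double nx, double x1, double x2, double ny, double y1, double y2)
{
    const double mean_x = x1 / nx, mean_y = y1 / ny;
    const double all_x = x2 + nx * mean_x * mean_x - 2 * mean_x * x1; // = sum of (x_i - mean_x)^2
    const double all_y = y2 + ny * mean_y * mean_y - 2 * mean_y * y1;
    const double s2 = (all_x + all_y) / (nx + ny - 2); // pooled variance
    return (mean_x - mean_y) / std::sqrt(s2 * (1. / nx + 1. / ny));
}

int main()
{
    // x = {1, 2, 3}, y = {2, 3, 4}: means 2 and 3, pooled s^2 = 1, t = -1 / sqrt(2/3)
    std::printf("t = %g\n", studentT(3, 6, 14, 3, 9, 29)); // about -1.2247
}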
AggregateFunctionPtr createAggregateFunctionStudentTTest(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertBinary(name, argument_types);
assertNoParameters(name, parameters);
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::BAD_ARGUMENTS);
return std::make_shared<AggregateFunctionTTest<StudentTTestData>>(argument_types);
}
}
void registerAggregateFunctionStudentTTest(AggregateFunctionFactory & factory)
{
factory.registerFunction("studentTTest", createAggregateFunctionStudentTTest);
}
}

View File

@ -0,0 +1,154 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Core/Types.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <cmath>
/// This function is used in implementations of different T-Tests.
/// On Darwin it's unavailable in math.h but actually exists in the library (can be linked successfully).
#if defined(OS_DARWIN)
extern "C"
{
double lgamma_r(double x, int * signgamp);
}
#endif
namespace DB
{
class ReadBuffer;
class WriteBuffer;
/**
* If you have a cumulative distribution function F, then calculating the p-value for a given statistic T is simply 1 - F(T).
* In our case p-value is two-sided, so we multiply it by 2.
* So cumulative distribution function F equals to
* \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]
* where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function
*
* so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \].
*
* And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function
*
* Keeping in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
* and
* \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
*
* p-value can be calculated in terms of gamma functions and integrals more simply:
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* which simplifies to:
*
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* Read here for details https://rosettacode.org/wiki/Welch%27s_t-test#
*
* Both WelchTTest and StudentTTest have a t-statistic with Student's distribution but with different degrees of freedom.
* So the procedure of computing the p-value is the same.
*/
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
{
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
[degrees_of_freedom](double x) { return std::pow(x, degrees_of_freedom / 2 - 1) / std::sqrt(1 - x); });
int unused;
Float64 denominator = std::exp(
lgamma_r(degrees_of_freedom / 2, &unused)
+ lgamma_r(0.5, &unused)
- lgamma_r(degrees_of_freedom / 2 + 0.5, &unused));
return std::min(1.0, std::max(0.0, numerator / denominator));
}
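A quick self-contained numeric check of the formula above, using the same Simpson rule with a fixed step count: for 10 degrees of freedom the two-sided 5% critical value is t = 2.228, so the result should come out close to 0.05.

#include <cmath>
#include <cstddef>
#include <cstdio>

int main()
{
    const double df = 10, t = 2.228; // two-sided 5% point of Student's t, 10 d.o.f.
    const double upper = df / (t * t + df);
    const size_t n = 1000000; // even number of Simpson intervals
    auto f = [&](double x) { return std::pow(x, df / 2 - 1) / std::sqrt(1 - x); };
    double sum = f(0) + f(upper);
    for (size_t i = 1; i < n; ++i)
        sum += (i % 2 ? 4 : 2) * f(upper * i / n);
    const double numerator = sum * (upper / n) / 3;
    const double denominator = std::exp(
        std::lgamma(df / 2) + std::lgamma(0.5) - std::lgamma(df / 2 + 0.5));
    std::printf("p = %.4f\n", numerator / denominator); // ~0.0500
}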
/// Returns tuple of (t-statistic, p-value)
/// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf
template <typename Data>
class AggregateFunctionTTest :
public IAggregateFunctionDataHelper<Data, AggregateFunctionTTest<Data>>
{
public:
AggregateFunctionTTest(const DataTypes & arguments)
: IAggregateFunctionDataHelper<Data, AggregateFunctionTTest<Data>>({arguments}, {})
{
}
String getName() const override
{
return Data::name;
}
DataTypePtr getReturnType() const override
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"t_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value);
else
this->data(place).addX(value);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
{
this->data(place).read(buf);
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
auto [t_statistic, p_value] = this->data(place).getResult();
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(t_statistic);
column_value.getData().push_back(p_value);
}
};
};

View File

@ -0,0 +1,74 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionTTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Moments.h>
#include "registerAggregateFunctions.h"
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace DB
{
namespace
{
struct WelchTTestData : public TTestMoments<Float64>
{
static constexpr auto name = "welchTTest";
std::pair<Float64, Float64> getResult() const
{
Float64 mean_x = x1 / nx;
Float64 mean_y = y1 / ny;
/// s_x^2, s_y^2
/// The original formula looks like \frac{1}{size_x - 1} \sum_{i = 1}^{size_x}{(x_i - \bar{x}) ^ 2}
/// But we made some mathematical transformations not to store the original sequences.
/// Also we dropped the sqrt, because it will be squared later.
Float64 sx2 = (x2 + nx * mean_x * mean_x - 2 * mean_x * x1) / (nx - 1);
Float64 sy2 = (y2 + ny * mean_y * mean_y - 2 * mean_y * y1) / (ny - 1);
/// t-statistic
Float64 t_stat = (mean_x - mean_y) / sqrt(sx2 / nx + sy2 / ny);
/// degrees of freedom
Float64 numerator_sqrt = sx2 / nx + sy2 / ny;
Float64 numerator = numerator_sqrt * numerator_sqrt;
Float64 denominator_x = sx2 * sx2 / (nx * nx * (nx - 1));
Float64 denominator_y = sy2 * sy2 / (ny * ny * (ny - 1));
Float64 degrees_of_freedom = numerator / (denominator_x + denominator_y);
return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
}
};
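A small sketch of the Welch statistic and the Welch-Satterthwaite degrees of freedom from per-sample means and unbiased variances; with equal sizes and variances the formula reduces to nx + ny - 2, which makes it easy to spot-check.

#include <cmath>
#include <cstdio>

static void welch(double nx, double mx, double sx2, double ny, double my, double sy2)
{
    const double se2 = sx2 / nx + sy2 / ny;
    const double t = (mx - my) / std::sqrt(se2);
    const double df = se2 * se2
        / (sx2 * sx2 / (nx * nx * (nx - 1)) + sy2 * sy2 / (ny * ny * (ny - 1)));
    std::printf("t = %g, df = %g\n", t, df);
}

int main()
{
    // Equal variances and sizes: df degenerates to nx + ny - 2 = 18.
    welch(10, 5.0, 4.0, 10, 3.0, 4.0);
}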
AggregateFunctionPtr createAggregateFunctionWelchTTest(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertBinary(name, argument_types);
assertNoParameters(name, parameters);
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::BAD_ARGUMENTS);
return std::make_shared<AggregateFunctionTTest<WelchTTestData>>(argument_types);
}
}
void registerAggregateFunctionWelchTTest(AggregateFunctionFactory & factory)
{
factory.registerFunction("welchTTest", createAggregateFunctionWelchTTest);
}
}

View File

@ -0,0 +1,361 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int DECIMAL_OVERFLOW;
}
/**
Calculating univariate central moments
Levels:
level 2 (pop & samp): var, stddev
level 3: skewness
level 4: kurtosis
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
https://en.wikipedia.org/wiki/Skewness
https://en.wikipedia.org/wiki/Kurtosis
*/
template <typename T, size_t _level>
struct VarMoments
{
T m[_level + 1]{};
void add(T x)
{
++m[0];
m[1] += x;
m[2] += x * x;
if constexpr (_level >= 3) m[3] += x * x * x;
if constexpr (_level >= 4) m[4] += x * x * x * x;
}
void merge(const VarMoments & rhs)
{
m[0] += rhs.m[0];
m[1] += rhs.m[1];
m[2] += rhs.m[2];
if constexpr (_level >= 3) m[3] += rhs.m[3];
if constexpr (_level >= 4) m[4] += rhs.m[4];
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T getPopulation() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
/// Due to numerical errors, the result can be slightly less than zero,
/// but it should be impossible. Trim to zero.
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / m[0]);
}
T getSample() const
{
if (m[0] <= 1)
return std::numeric_limits<T>::quiet_NaN();
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / (m[0] - 1));
}
T getMoment3() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
/// \[ \frac{1}{m_0} (m_3 - (3 * m_2 - \frac{2 * {m_1}^2}{m_0}) * \frac{m_1}{m_0});\]
return (m[3]
- (3 * m[2]
- 2 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
T getMoment4() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
/// \[ \frac{1}{m_0}(m_4 - (4 * m_3 - (6 * m_2 - \frac{3 * m_1^2}{m_0} ) \frac{m_1}{m_0})\frac{m_1}{m_0})\]
return (m[4]
- (4 * m[3]
- (6 * m[2]
- 3 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
};
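A tiny numeric check of the moment algebra above: the variance is recoverable from the raw sums m0, m1, m2 alone.

#include <cstdio>
#include <initializer_list>

int main()
{
    double m0 = 0, m1 = 0, m2 = 0;
    for (double x : {1.0, 2.0, 3.0})
    {
        ++m0;
        m1 += x;
        m2 += x * x;
    }
    std::printf("pop var = %g, sample var = %g\n",
                (m2 - m1 * m1 / m0) / m0,        // 2/3
                (m2 - m1 * m1 / m0) / (m0 - 1)); // 1
}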
template <typename T, size_t _level>
class VarMomentsDecimal
{
public:
using NativeType = typename T::NativeType;
void add(NativeType x)
{
++m0;
getM(1) += x;
NativeType tmp;
bool overflow = common::mulOverflow(x, x, tmp) || common::addOverflow(getM(2), tmp, getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(3), tmp, getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(4), tmp, getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void merge(const VarMomentsDecimal & rhs)
{
m0 += rhs.m0;
getM(1) += rhs.getM(1);
bool overflow = common::addOverflow(getM(2), rhs.getM(2), getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::addOverflow(getM(3), rhs.getM(3), getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::addOverflow(getM(4), rhs.getM(4), getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void write(WriteBuffer & buf) const { writePODBinary(*this, buf); }
void read(ReadBuffer & buf) { readPODBinary(*this, buf); }
Float64 getPopulation(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / m0), scale));
}
Float64 getSample(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::quiet_NaN();
if (m0 == 1)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / (m0 - 1)), scale));
}
Float64 getMoment3(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(2 * getM(1), getM(1), tmp) ||
common::subOverflow(3 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(3), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
Float64 getMoment4(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(3 * getM(1), getM(1), tmp) ||
common::subOverflow(6 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(4 * getM(3), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(4), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
private:
UInt64 m0{};
NativeType m[_level]{};
NativeType & getM(size_t i) { return m[i - 1]; }
const NativeType & getM(size_t i) const { return m[i - 1]; }
};
/**
Calculating multivariate central moments
Levels:
level 2 (pop & samp): covar
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
*/
template <typename T>
struct CovarMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
}
void merge(const CovarMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED getPopulation() const
{
return (xy - x1 * y1 / m0) / m0;
}
T NO_SANITIZE_UNDEFINED getSample() const
{
if (m0 == 0)
return std::numeric_limits<T>::quiet_NaN();
return (xy - x1 * y1 / m0) / (m0 - 1);
}
};
template <typename T>
struct CorrMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
T x2{};
T y2{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
x2 += x * x;
y2 += y * y;
}
void merge(const CorrMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
x2 += rhs.x2;
y2 += rhs.y2;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED get() const
{
return (m0 * xy - x1 * y1) / sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1));
}
};
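A numeric check of the closed form in get(): exactly linear data yields a correlation of 1.

#include <cmath>
#include <cstdio>
#include <initializer_list>

int main()
{
    double m0 = 0, x1 = 0, y1 = 0, xy = 0, x2 = 0, y2 = 0;
    for (double x : {1.0, 2.0, 3.0, 4.0})
    {
        const double y = 2 * x + 1; // exact linear relation
        ++m0; x1 += x; y1 += y; xy += x * y; x2 += x * x; y2 += y * y;
    }
    std::printf("r = %g\n",
                (m0 * xy - x1 * y1) / std::sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1)));
}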
/// Data for calculation of Student and Welch T-Tests.
template <typename T>
struct TTestMoments
{
T nx{};
T ny{};
T x1{};
T y1{};
T x2{};
T y2{};
void addX(T value)
{
++nx;
x1 += value;
x2 += value * value;
}
void addY(T value)
{
++ny;
y1 += value;
y2 += value * value;
}
void merge(const TTestMoments & rhs)
{
nx += rhs.nx;
ny += rhs.ny;
x1 += rhs.x1;
y1 += rhs.y1;
x2 += rhs.x2;
y2 += rhs.y2;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
};
}

View File

@ -114,7 +114,7 @@ class QuantileTDigest
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortFloatTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem.mean; }

View File

@ -0,0 +1,114 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <Common/ArenaAllocator.h>
#include <numeric>
#include <algorithm>
#include <utility>
namespace DB
{
template <typename F>
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
{
const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b) - std::round(a)));
const long double h = (b - a) / iterations;
Float64 sum_odds = 0.0;
for (size_t i = 1; i < iterations; i += 2)
sum_odds += func(a + i * h);
Float64 sum_evens = 0.0;
for (size_t i = 2; i < iterations; i += 2)
sum_evens += func(a + i * h);
return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
}
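A standalone check of the composite Simpson rule used above: the integral of x^2 over [0, 1] is 1/3, and Simpson's rule is exact for polynomials up to degree three.

#include <cstddef>
#include <cstdio>

int main()
{
    const size_t iterations = 1000000; // must be even
    const double a = 0, b = 1;
    const double h = (b - a) / iterations;
    auto f = [](double x) { return x * x; };
    double sum_odds = 0, sum_evens = 0;
    for (size_t i = 1; i < iterations; i += 2) sum_odds += f(a + i * h);
    for (size_t i = 2; i < iterations; i += 2) sum_evens += f(a + i * h);
    std::printf("%.9f\n", (f(a) + f(b) + 2 * sum_evens + 4 * sum_odds) * h / 3); // 0.333333333
}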
/// Because ranks are adjusted, we have to store each of them in Float type.
using RanksArray = std::vector<Float64>;
template <typename Values>
std::pair<RanksArray, Float64> computeRanksAndTieCorrection(const Values & values)
{
const size_t size = values.size();
/// Save initial positions, then sort indices according to the values.
std::vector<size_t> indexes(size);
std::iota(indexes.begin(), indexes.end(), 0);
std::sort(indexes.begin(), indexes.end(),
[&] (size_t lhs, size_t rhs) { return values[lhs] < values[rhs]; });
size_t left = 0;
Float64 tie_numerator = 0;
RanksArray out(size);
while (left < size)
{
size_t right = left;
while (right < size && values[indexes[left]] == values[indexes[right]])
++right;
auto adjusted = (left + right + 1.) / 2.;
auto count_equal = right - left;
tie_numerator += std::pow(count_equal, 3) - count_equal;
for (size_t iter = left; iter < right; ++iter)
out[indexes[iter]] = adjusted;
left = right;
}
return {out, 1 - (tie_numerator / (std::pow(size, 3) - size))};
}
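A worked example of the tie correction: for values {1, 2, 2, 3} the ranks are {1, 2.5, 2.5, 4} (the tied values share (2 + 3) / 2), and the single tie group of size 2 gives a correction of 0.9.

#include <cstdio>

int main()
{
    const double size = 4, tie_group = 2;
    const double tie_numerator = tie_group * tie_group * tie_group - tie_group;
    std::printf("correction = %g\n", 1 - tie_numerator / (size * size * size - size)); // 0.9
}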
template <typename X, typename Y>
struct StatisticalSample
{
using AllocatorXSample = MixedAlignedArenaAllocator<alignof(X), 4096>;
using SampleX = PODArray<X, 32, AllocatorXSample>;
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
using SampleY = PODArray<Y, 32, AllocatorYSample>;
SampleX x{};
SampleY y{};
size_t size_x{0};
size_t size_y{0};
void addX(X value, Arena * arena)
{
++size_x;
x.push_back(value, arena);
}
void addY(Y value, Arena * arena)
{
++size_y;
y.push_back(value, arena);
}
void merge(const StatisticalSample & rhs, Arena * arena)
{
size_x += rhs.size_x;
size_y += rhs.size_y;
x.insert(rhs.x.begin(), rhs.x.end(), arena);
y.insert(rhs.y.begin(), rhs.y.end(), arena);
}
void write(WriteBuffer & buf) const
{
writeVarUInt(size_x, buf);
writeVarUInt(size_y, buf);
buf.write(reinterpret_cast<const char *>(x.data()), size_x * sizeof(x[0]));
buf.write(reinterpret_cast<const char *>(y.data()), size_y * sizeof(y[0]));
}
void read(ReadBuffer & buf, Arena * arena)
{
readVarUInt(size_x, buf);
readVarUInt(size_y, buf);
x.resize(size_x, arena);
y.resize(size_y, arena);
buf.read(reinterpret_cast<char *>(x.data()), size_x * sizeof(x[0]));
buf.read(reinterpret_cast<char *>(y.data()), size_y * sizeof(y[0]));
}
};
}

View File

@ -39,9 +39,10 @@ void registerAggregateFunctionSimpleLinearRegression(AggregateFunctionFactory &)
void registerAggregateFunctionMoving(AggregateFunctionFactory &);
void registerAggregateFunctionCategoricalIV(AggregateFunctionFactory &);
void registerAggregateFunctionAggThrow(AggregateFunctionFactory &);
void registerAggregateFunctionRankCorrelation(AggregateFunctionFactory &);
void registerAggregateFunctionMannWhitney(AggregateFunctionFactory &);
void registerAggregateFunctionWelchTTest(AggregateFunctionFactory &);
void registerAggregateFunctionStudentTTest(AggregateFunctionFactory &);
void registerAggregateFunctionRankCorrelation(AggregateFunctionFactory &);
class AggregateFunctionCombinatorFactory;
void registerAggregateFunctionCombinatorIf(AggregateFunctionCombinatorFactory &);
@ -94,6 +95,9 @@ void registerAggregateFunctions()
registerAggregateFunctionCategoricalIV(factory);
registerAggregateFunctionAggThrow(factory);
registerAggregateFunctionRankCorrelation(factory);
registerAggregateFunctionMannWhitney(factory);
registerAggregateFunctionWelchTTest(factory);
registerAggregateFunctionStudentTTest(factory);
}
{

View File

@ -0,0 +1,27 @@
#include <IO/WriteBufferFromString.h>
#include <IO/ReadBufferFromString.h>
#include <Common/PODArray.h>
#include <AggregateFunctions/StatCommon.h>
#include <iostream>
#include <gtest/gtest.h>
TEST(Ranks, Simple)
{
using namespace DB;
RanksArray sample = {310, 195, 480, 530, 155, 530, 245, 385, 450, 450, 465, 545, 170, 180, 125, 180, 230, 170, 75, 430, 480, 495, 295};
RanksArray ranks;
Float64 t = 0;
std::tie(ranks, t) = computeRanksAndTieCorrection(sample);
RanksArray expected{12.0, 8.0, 18.5, 21.5, 3.0, 21.5, 10.0, 13.0, 15.5, 15.5, 17.0, 23.0, 4.5, 6.5, 2.0, 6.5, 9.0, 4.5, 1.0, 14.0, 18.5, 20.0, 11.0};
ASSERT_EQ(ranks.size(), expected.size());
for (size_t i = 0; i < ranks.size(); ++i)
ASSERT_DOUBLE_EQ(ranks[i], expected[i]);
ASSERT_DOUBLE_EQ(t, 0.9975296442687747);
}

View File

@ -29,6 +29,7 @@ SRCS(
AggregateFunctionHistogram.cpp
AggregateFunctionIf.cpp
AggregateFunctionMLMethod.cpp
AggregateFunctionMannWhitney.cpp
AggregateFunctionMaxIntersections.cpp
AggregateFunctionMerge.cpp
AggregateFunctionMinMaxAny.cpp
@ -43,6 +44,7 @@ SRCS(
AggregateFunctionState.cpp
AggregateFunctionStatistics.cpp
AggregateFunctionStatisticsSimple.cpp
AggregateFunctionStudentTTest.cpp
AggregateFunctionSum.cpp
AggregateFunctionSumMap.cpp
AggregateFunctionTimeSeriesGroupSum.cpp
@ -50,6 +52,7 @@ SRCS(
AggregateFunctionUniq.cpp
AggregateFunctionUniqCombined.cpp
AggregateFunctionUniqUpTo.cpp
AggregateFunctionWelchTTest.cpp
AggregateFunctionWindowFunnel.cpp
UniqCombinedBiasData.cpp
UniqVariadicHash.cpp

View File

@ -382,6 +382,10 @@ if (USE_PROTOBUF)
dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${Protobuf_INCLUDE_DIR})
endif ()
if (USE_GRPC)
dbms_target_link_libraries (PUBLIC clickhouse_grpc_protos)
endif()
if (USE_HDFS)
target_link_libraries (clickhouse_common_io PUBLIC ${HDFS3_LIBRARY})
target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${HDFS3_INCLUDE_DIR})

View File

@ -207,6 +207,12 @@ void Connection::receiveHello()
/// Receive hello packet.
UInt64 packet_type = 0;
/// Prevent reading after EOF in readVarUInt in case of a reset connection
/// (Poco should throw such an exception while reading from the socket, but
/// sometimes it doesn't, for an unknown reason)
if (in->eof())
throw Poco::Net::NetException("Connection reset by peer");
readVarUInt(packet_type, *in);
if (packet_type == Protocol::Server::Hello)
{

View File

@ -523,6 +523,9 @@
M(554, LZMA_STREAM_DECODER_FAILED) \
M(555, ROCKSDB_ERROR) \
M(556, SYNC_MYSQL_USER_ACCESS_ERROR)\
M(557, UNKNOWN_UNION) \
M(558, EXPECTED_ALL_OR_DISTINCT) \
M(559, INVALID_GRPC_QUERY_INFO) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -0,0 +1,21 @@
#pragma once
namespace DB
{
// The runtime info we need to create new OpenTelemetry spans.
struct OpenTelemetryTraceContext
{
__uint128_t trace_id = 0;
UInt64 span_id = 0;
// The incoming tracestate header and the trace flags, we just pass them
// downstream. See https://www.w3.org/TR/trace-context/
String tracestate;
__uint8_t trace_flags = 0;
// Parse/compose OpenTelemetry traceparent header.
bool parseTraceparentHeader(const std::string & traceparent, std::string & error);
std::string composeTraceparentHeader() const;
};
}
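For context, the W3C trace-context header this struct parses and composes has the fixed layout "00-<32 hex trace-id>-<16 hex span-id>-<2 hex flags>". Here is a sketch of composing it; the field widths come from the spec, and this is not the ClickHouse implementation.

#include <cstdint>
#include <cstdio>

int main()
{
    const __uint128_t trace_id = (static_cast<__uint128_t>(0x0af7651916cd43ddULL) << 64) | 0x8448eb211c80319cULL;
    const uint64_t span_id = 0xb7ad6b7169203331ULL;
    const uint8_t trace_flags = 1;
    char buf[64];
    std::snprintf(buf, sizeof(buf), "00-%016llx%016llx-%016llx-%02x",
                  static_cast<unsigned long long>(trace_id >> 64),
                  static_cast<unsigned long long>(trace_id),
                  static_cast<unsigned long long>(span_id),
                  static_cast<unsigned>(trace_flags));
    std::puts(buf); // 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
}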

View File

@ -97,9 +97,6 @@
M(DistributedConnectionStaleReplica, "") \
M(DistributedConnectionFailAtAll, "Total count when distributed connection fails after all retries finished") \
\
M(CompileAttempt, "Number of times a compilation of generated C++ code was initiated.") \
M(CompileSuccess, "Number of times a compilation of generated C++ code was successful.") \
\
M(CompileFunction, "Number of times a compilation of generated LLVM code (to create fused function for complex expressions) was initiated.") \
M(CompiledFunctionExecute, "Number of times a compiled function was executed.") \
M(CompileExpressionsMicroseconds, "Total time spent for compilation of expressions to LLVM code.") \

View File

@ -176,7 +176,7 @@ template class QueryProfilerBase<QueryProfilerReal>;
template class QueryProfilerBase<QueryProfilerCpu>;
QueryProfilerReal::QueryProfilerReal(const UInt64 thread_id, const UInt32 period)
: QueryProfilerBase(thread_id, CLOCK_REALTIME, period, SIGUSR1)
: QueryProfilerBase(thread_id, CLOCK_MONOTONIC, period, SIGUSR1)
{}
void QueryProfilerReal::signalHandler(int sig, siginfo_t * info, void * context)

View File

@ -35,16 +35,16 @@
/** Used as a template parameter. See below.
*/
struct RadixSortMallocAllocator
struct RadixSortAllocator
{
void * allocate(size_t size)
{
return malloc(size);
return ::operator new(size);
}
void deallocate(void * ptr, size_t /*size*/)
void deallocate(void * ptr, size_t size)
{
return free(ptr);
::operator delete(ptr, size);
}
};
@ -100,7 +100,7 @@ struct RadixSortFloatTraits
/// An object with the functions allocate and deallocate.
/// Can be used, for example, to allocate memory for a temporary array on the stack.
/// To do this, the allocator itself is created on the stack.
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem; }
@ -139,7 +139,7 @@ struct RadixSortUIntTraits
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortIdentityTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
static Key & extractKey(Element & elem) { return elem; }
static Result & extractResult(Element & elem) { return elem; }
@ -173,7 +173,7 @@ struct RadixSortIntTraits
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortSignedTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
static Key & extractKey(Element & elem) { return elem; }
static Result & extractResult(Element & elem) { return elem; }

View File

@ -2,6 +2,7 @@
#include <Common/ThreadProfileEvents.h>
#include <Common/QueryProfiler.h>
#include <Common/ThreadStatus.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Poco/Logger.h>
#include <common/getThreadId.h>

View File

@ -3,6 +3,7 @@
#include <common/StringRef.h>
#include <Common/ProfileEvents.h>
#include <Common/MemoryTracker.h>
#include <Common/OpenTelemetryTraceContext.h>
#include <Core/SettingsEnums.h>
@ -31,6 +32,7 @@ class ThreadStatus;
class QueryProfilerReal;
class QueryProfilerCpu;
class QueryThreadLog;
struct OpenTelemetrySpanHolder;
class TasksStatsCounters;
struct RUsageCounters;
struct PerfEventsCounters;
@ -86,9 +88,6 @@ extern thread_local ThreadStatus * current_thread;
class ThreadStatus : public boost::noncopyable
{
public:
ThreadStatus();
~ThreadStatus();
/// Linux's PID (or TGID) (the same id is shown by ps util)
const UInt64 thread_id = 0;
/// Also called "nice" value. If it was changed to non-zero (when attaching query) - will be reset to zero when query is detached.
@ -110,6 +109,52 @@ public:
using Deleter = std::function<void()>;
Deleter deleter;
// This is the current most-derived OpenTelemetry span for this thread. It
// can be changed throughout the query execution, whenever we enter a new
// span or exit it. See OpenTelemetrySpanHolder that is normally responsible
// for these changes.
OpenTelemetryTraceContext thread_trace_context;
protected:
ThreadGroupStatusPtr thread_group;
std::atomic<int> thread_state{ThreadState::DetachedFromQuery};
/// Is set once
Context * global_context = nullptr;
/// Use it only from current thread
Context * query_context = nullptr;
String query_id;
/// A logs queue used by TCPHandler to pass logs to a client
InternalTextLogsQueueWeakPtr logs_queue_ptr;
bool performance_counters_finalized = false;
UInt64 query_start_time_nanoseconds = 0;
UInt64 query_start_time_microseconds = 0;
time_t query_start_time = 0;
size_t queries_started = 0;
// CPU and Real time query profilers
std::unique_ptr<QueryProfilerReal> query_profiler_real;
std::unique_ptr<QueryProfilerCpu> query_profiler_cpu;
Poco::Logger * log = nullptr;
friend class CurrentThread;
/// Use ptr not to add extra dependencies in the header
std::unique_ptr<RUsageCounters> last_rusage;
std::unique_ptr<TasksStatsCounters> taskstats;
/// Is used to send logs from logs_queue to client in case of fatal errors.
std::function<void()> fatal_error_callback;
public:
ThreadStatus();
~ThreadStatus();
ThreadGroupStatusPtr getThreadGroup() const
{
return thread_group;
@ -176,40 +221,6 @@ protected:
void assertState(const std::initializer_list<int> & permitted_states, const char * description = nullptr) const;
ThreadGroupStatusPtr thread_group;
std::atomic<int> thread_state{ThreadState::DetachedFromQuery};
/// Is set once
Context * global_context = nullptr;
/// Use it only from current thread
Context * query_context = nullptr;
String query_id;
/// A logs queue used by TCPHandler to pass logs to a client
InternalTextLogsQueueWeakPtr logs_queue_ptr;
bool performance_counters_finalized = false;
UInt64 query_start_time_nanoseconds = 0;
UInt64 query_start_time_microseconds = 0;
time_t query_start_time = 0;
size_t queries_started = 0;
// CPU and Real time query profilers
std::unique_ptr<QueryProfilerReal> query_profiler_real;
std::unique_ptr<QueryProfilerCpu> query_profiler_cpu;
Poco::Logger * log = nullptr;
friend class CurrentThread;
/// Use ptr not to add extra dependencies in the header
std::unique_ptr<RUsageCounters> last_rusage;
std::unique_ptr<TasksStatsCounters> taskstats;
/// Is used to send logs from logs_queue to client in case of fatal errors.
std::function<void()> fatal_error_callback;
private:
void setupState(const ThreadGroupStatusPtr & thread_group_);

View File

@ -326,6 +326,16 @@ struct ODBCBridgeMixin
cmd_args.push_back("--err-log-path");
cmd_args.push_back(config.getString("logger." + configPrefix() + "_errlog"));
}
if (config.has("logger." + configPrefix() + "_stdout"))
{
cmd_args.push_back("--stdout-path");
cmd_args.push_back(config.getString("logger." + configPrefix() + "_stdout"));
}
if (config.has("logger." + configPrefix() + "_stderr"))
{
cmd_args.push_back("--stderr-path");
cmd_args.push_back(config.getString("logger." + configPrefix() + "_stderr"));
}
if (config.has("logger." + configPrefix() + "_level"))
{
cmd_args.push_back("--log-level");

View File

@ -1114,6 +1114,7 @@ void ZooKeeper::sendThread()
info.request->probably_sent = true;
info.request->write(*out);
/// We sent close request, exit
if (info.request->xid == close_xid)
break;
}
@ -1342,21 +1343,25 @@ void ZooKeeper::receiveEvent()
void ZooKeeper::finalize(bool error_send, bool error_receive)
{
/// If some thread (send/receive) already finalizing session don't try to do it
if (finalization_started.exchange(true))
return;
auto expire_session_if_not_expired = [&]
{
std::lock_guard lock(push_request_mutex);
if (expired)
return;
expired = true;
}
active_session_metric_increment.destroy();
if (!expired)
{
expired = true;
active_session_metric_increment.destroy();
}
};
try
{
if (!error_send)
{
/// Send close event. This also signals sending thread to wakeup and then stop.
/// Send close event. This also signals sending thread to stop.
try
{
close();
@ -1364,12 +1369,18 @@ void ZooKeeper::finalize(bool error_send, bool error_receive)
catch (...)
{
/// This happens for example, when "Cannot push request to queue within operation timeout".
/// Just mark session expired in case of error on close request, otherwise sendThread may not stop.
expire_session_if_not_expired();
tryLogCurrentException(__PRETTY_FUNCTION__);
}
/// Send thread will exit after sending close request or on expired flag
send_thread.join();
}
/// Set expired flag after we sent close event
expire_session_if_not_expired();
try
{
/// This will also wakeup the receiving thread.

View File

@ -187,6 +187,9 @@ private:
std::atomic<XID> next_xid {1};
std::atomic<bool> expired {false};
/// Mark session finalization start. Used to avoid simultaneous
/// finalization from different threads. One-shot flag.
std::atomic<bool> finalization_started {false};
std::mutex push_request_mutex;
using clock = std::chrono::steady_clock;

View File

@ -65,6 +65,7 @@ class IColumn;
M(UInt64, distributed_connections_pool_size, DBMS_DEFAULT_DISTRIBUTED_CONNECTIONS_POOL_SIZE, "Maximum number of connections with one remote server in the pool.", 0) \
M(UInt64, connections_with_failover_max_tries, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES, "The maximum number of attempts to connect to replicas.", 0) \
M(UInt64, s3_min_upload_part_size, 512*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \
M(UInt64, s3_max_redirects, 10, "Max number of S3 redirect hops allowed.", 0) \
M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \
M(Bool, use_uncompressed_cache, true, "Whether to use the cache of uncompressed blocks.", 0) \
M(Bool, replace_running_query, false, "Whether the running request should be canceled with the same id as the new one.", 0) \
@ -406,6 +407,7 @@ class IColumn;
M(Bool, force_optimize_skip_unused_shards_no_nested, false, "Obsolete setting, does nothing. Will be removed after 2020-12-01. Use force_optimize_skip_unused_shards_nesting instead.", 0) \
M(Bool, enable_debug_queries, false, "Enabled debug queries, but now is obsolete", 0) \
M(Bool, allow_experimental_database_atomic, true, "Obsolete setting, does nothing. Will be removed after 2021-02-12", 0) \
M(UnionMode, union_default_mode, UnionMode::DISTINCT, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, a query without Union Mode will throw an exception.", 0)
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS below.

View File

@ -12,6 +12,7 @@ namespace ErrorCodes
extern const int UNKNOWN_JOIN;
extern const int BAD_ARGUMENTS;
extern const int UNKNOWN_MYSQL_DATATYPES_SUPPORT_LEVEL;
extern const int UNKNOWN_UNION;
}
@ -96,4 +97,9 @@ IMPLEMENT_SETTING_MULTI_ENUM(MySQLDataTypesSupport, ErrorCodes::UNKNOWN_MYSQL_DA
{{"decimal", MySQLDataTypesSupport::DECIMAL},
{"datetime64", MySQLDataTypesSupport::DATETIME64}})
IMPLEMENT_SETTING_ENUM(UnionMode, ErrorCodes::UNKNOWN_UNION,
{{"", UnionMode::Unspecified},
{"ALL", UnionMode::ALL},
{"DISTINCT", UnionMode::DISTINCT}})
}

View File

@ -129,4 +129,13 @@ enum class MySQLDataTypesSupport
DECLARE_SETTING_MULTI_ENUM(MySQLDataTypesSupport)
enum class UnionMode
{
Unspecified = 0, // Query UNION without UnionMode will throw exception
ALL, // Query UNION without UnionMode -> SELECT ... UNION ALL SELECT ...
DISTINCT // Query UNION without UnionMode -> SELECT ... UNION DISTINCT SELECT ...
};
DECLARE_SETTING_ENUM(UnionMode)
}

View File

@ -0,0 +1,13 @@
#pragma once
// .h autogenerated by cmake!
#define USE_ICU 1
#define USE_MYSQL 1
#define USE_RDKAFKA 1
#define USE_AMQPCPP 1
#define USE_EMBEDDED_COMPILER 0
#define USE_INTERNAL_LLVM_LIBRARY 0
#define USE_SSL 1
#define USE_OPENCL 0
#define USE_LDAP 1

View File

@ -27,6 +27,11 @@ RemoteBlockOutputStream::RemoteBlockOutputStream(Connection & connection_,
{
ClientInfo modified_client_info = client_info_;
modified_client_info.query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
if (CurrentThread::isInitialized())
{
modified_client_info.client_trace_context
= CurrentThread::get().thread_trace_context;
}
/** Send query and receive "header", that describes table structure.
* Header is needed to know, what structure is required for blocks to be passed to 'write' method.

View File

@ -151,11 +151,26 @@ void RemoteQueryExecutor::sendQuery()
if (settings.skip_unavailable_shards && 0 == multiplexed_connections->size())
return;
/// Query cannot be canceled in the middle of sending the query,
/// since there are multiple packets:
/// - Query
/// - Data (multiple times)
///
/// And after the Cancel packet no Data packet can be sent, otherwise the remote side will throw:
///
/// Unexpected packet Data received from client
///
std::lock_guard guard(was_cancelled_mutex);
established = true;
auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings);
ClientInfo modified_client_info = context.getClientInfo();
modified_client_info.query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
if (CurrentThread::isInitialized())
{
modified_client_info.client_trace_context = CurrentThread::get().thread_trace_context;
}
multiplexed_connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true);

View File

@ -10,6 +10,7 @@
#include <Common/randomSeed.h>
#include <Common/typeid_cast.h>
#include <Core/Defines.h>
#include <IO/WriteBufferFromOStream.h>
#include <ext/range.h>
#include <ext/size.h>
#include <Common/setThreadName.h>
@ -128,7 +129,7 @@ const IDictionarySource * CacheDictionary::getSource() const
void CacheDictionary::toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_value);
getItemsNumberImpl<UInt64, UInt64>(*hierarchical_attribute, ids, out, [&](const size_t) { return null_value; });
}
@ -153,7 +154,7 @@ void CacheDictionary::isInImpl(const PaddedPODArray<Key> & child_ids, const Ance
size_t out_size = out.size();
memset(out.data(), 0xFF, out_size); /// 0xFF means "not calculated"
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_value);
PaddedPODArray<Key> children(out_size, 0);
PaddedPODArray<Key> parents(child_ids.begin(), child_ids.end());
@ -225,7 +226,7 @@ void CacheDictionary::isInConstantVector(const Key child_id, const PaddedPODArra
{
/// Special case with single child value.
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_value);
PaddedPODArray<Key> child(1, child_id);
PaddedPODArray<Key> parent(1);
@ -253,7 +254,7 @@ void CacheDictionary::getString(const std::string & attribute_name, const Padded
auto & attribute = getAttribute(attribute_name);
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString);
const auto null_value = StringRef{std::get<String>(attribute.null_values)};
const auto null_value = StringRef{std::get<String>(attribute.null_value)};
getItemsString(attribute, ids, out, [&](const size_t) { return null_value; });
}
@ -276,19 +277,80 @@ void CacheDictionary::getString(
getItemsString(attribute, ids, out, [&](const size_t) { return StringRef{def}; });
}
template<class... Ts>
struct Overloaded : Ts... {using Ts::operator()...;};
/// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag
/// true false found and valid
/// false true not found (something outdated, maybe our cell)
/// false false not found (other id stored with valid data)
/// true true impossible
///
/// todo: split this func into two: find_for_get and find_for_set
CacheDictionary::FindResult CacheDictionary::findCellIdx(const Key & id, const CellMetadata::time_point_t now) const
template<class... Ts>
Overloaded(Ts...) -> Overloaded<Ts...>;
std::string CacheDictionary::AttributeValuesForKey::dump()
{
WriteBufferFromOwnString os;
for (auto & attr : values)
std::visit(Overloaded {
[&os](UInt8 arg) { os << "type: UInt8, value: " << std::to_string(arg) << "\n"; },
[&os](UInt16 arg) { os << "type: UInt16, value: " << std::to_string(arg) << "\n"; },
[&os](UInt32 arg) { os << "type: UInt32, value: " << std::to_string(arg) << "\n"; },
[&os](UInt64 arg) { os << "type: UInt64, value: " << std::to_string(arg) << "\n"; },
[&os](UInt128 arg) { os << "type: UInt128, value: " << arg.toHexString() << "\n"; },
[&os](Int8 arg) { os << "type: Int8, value: " << std::to_string(arg) << "\n"; },
[&os](Int16 arg) { os << "type: Int16, value: " << std::to_string(arg) << "\n"; },
[&os](Int32 arg) { os << "type: Int32, value: " << std::to_string(arg) << "\n"; },
[&os](Int64 arg) { os << "type: Int64, value: " << std::to_string(arg) << "\n"; },
[&os](Decimal32 arg) { os << "type: Decimal32, value: " << std::to_string(arg) << "\n"; },
[&os](Decimal64 arg) { os << "type: Decimal64, value: " << std::to_string(arg) << "\n"; },
[&os](Decimal128) { os << "type: Decimal128, value: ???" << "\n" ; },
[&os](Float32 arg) { os << "type: Float32, value: " << std::to_string(arg) << "\n"; },
[&os](Float64 arg) { os << "type: Float64, value: " << std::to_string(arg) << "\n"; },
[&os](String arg) { os << "type: String, value: " << arg + "\n"; }
}, attr);
return os.str();
};
std::string CacheDictionary::UpdateUnit::dumpFoundIds()
{
WriteBufferFromOwnString os;
for (auto it : found_ids)
{
os << "Key: " << std::to_string(it.first) << "\n";
if (it.second.found)
os << it.second.dump() << "\n";
}
return os.str();
};
/// Returns cell_idx in the handmade open-addressing cache table and the state of the cell storing the key.
CacheDictionary::FindResult CacheDictionary::findCellIdxForGet(const Key & id, const time_point_t now) const
{
auto pos = getCellIdx(id);
const auto stop = pos + max_collision_length;
for (; pos < stop; ++pos)
{
const auto cell_idx = pos & size_overlap_mask;
const auto & cell = cells[cell_idx];
if (cell.id != id)
continue;
if (isExpiredPermanently(now, cell.expiresAt()))
return {cell_idx, ResultState::FoundButExpiredPermanently};
if (isExpired(now, cell.expiresAt()))
return {cell_idx, ResultState::FoundButExpired};
return {cell_idx, ResultState::FoundAndValid};
}
return {pos & size_overlap_mask, ResultState::NotFound};
}
/// Returns cell_idx such that cells[cell_idx].id = id, or the oldest cell within max_collision_length probes.
size_t CacheDictionary::findCellIdxForSet(const Key & id) const
{
auto pos = getCellIdx(id);
auto oldest_id = pos;
auto oldest_time = CellMetadata::time_point_t::max();
auto oldest_time = time_point_t::max();
const auto stop = pos + max_collision_length;
for (; pos < stop; ++pos)
{
@ -298,7 +360,7 @@ CacheDictionary::FindResult CacheDictionary::findCellIdx(const Key & id, const C
if (cell.id != id)
{
/// maybe we already found the nearest expired cell (try to minimize collision_length on insert)
if (oldest_time > now && oldest_time > cell.expiresAt())
if (cell.expiresAt() < oldest_time)
{
oldest_time = cell.expiresAt();
oldest_id = cell_idx;
@ -306,15 +368,11 @@ CacheDictionary::FindResult CacheDictionary::findCellIdx(const Key & id, const C
continue;
}
if (cell.expiresAt() < now)
{
return {cell_idx, false, true};
}
return {cell_idx, true, false};
/// We found the exact place for id.
return cell_idx;
}
return {oldest_id, false, false};
return oldest_id;
}
void CacheDictionary::has(const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8> & out) const
@ -324,13 +382,19 @@ void CacheDictionary::has(const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8>
/// - CacheExpired ids. Ids that are in local cache, but their values are rotted (lifetime is expired).
/// - CacheNotFound ids. We have to go to external storage to know its value.
/// Mark everything as absent.
const auto rows = ext::size(ids);
for (const auto row : ext::range(0, rows))
out[row] = false;
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
std::unordered_map<Key, std::vector<size_t>> cache_expired_ids;
std::unordered_map<Key, std::vector<size_t>> cache_not_found_ids;
std::unordered_map<Key, std::vector<size_t>> cache_expired_or_not_found_ids;
size_t cache_hit = 0;
const auto rows = ext::size(ids);
size_t cache_expired_count = 0;
size_t cache_not_found_count = 0;
{
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
@ -339,66 +403,58 @@ void CacheDictionary::has(const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8>
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto find_result = findCellIdx(id, now);
const auto & cell_idx = find_result.cell_idx;
const auto [cell_idx, state] = findCellIdxForGet(id, now);
auto & cell = cells[cell_idx];
auto insert_to_answer_routine = [&] ()
{
out[row] = !cells[cell_idx].isDefault();
out[row] = !cell.isDefault();
};
if (!find_result.valid)
{
if (find_result.outdated)
{
/// Protection of reading very expired keys.
if (now > cells[find_result.cell_idx].strict_max)
{
cache_not_found_ids[id].push_back(row);
continue;
}
cache_expired_ids[id].push_back(row);
if (allow_read_expired_keys)
insert_to_answer_routine();
}
else
{
cache_not_found_ids[id].push_back(row);
}
}
else
if (state == ResultState::FoundAndValid)
{
++cache_hit;
insert_to_answer_routine();
}
else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently)
{
/// Permanently expired is semantically equal to not found.
++cache_not_found_count;
cache_expired_or_not_found_ids[id].push_back(row);
}
else if (state == ResultState::FoundButExpired)
{
cache_expired_count++;
cache_expired_or_not_found_ids[id].push_back(row);
if (allow_read_expired_keys)
insert_to_answer_routine();
}
}
}
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_ids.size());
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_ids.size());
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_count);
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - cache_expired_ids.size() - cache_not_found_ids.size(), std::memory_order_release);
hit_count.fetch_add(rows - cache_expired_count - cache_not_found_count, std::memory_order_release);
if (cache_not_found_ids.empty())
if (!cache_not_found_count)
{
/// Nothing to update - return;
if (cache_expired_ids.empty())
if (!cache_expired_count)
return;
if (allow_read_expired_keys)
{
std::vector<Key> required_expired_ids;
required_expired_ids.reserve(cache_expired_ids.size());
required_expired_ids.reserve(cache_expired_count);
std::transform(
std::begin(cache_expired_ids), std::end(cache_expired_ids),
std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids),
std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; });
/// Callbacks are empty because we don't want to receive them after an unknown period of time.
auto update_unit_ptr = std::make_shared<UpdateUnit>(required_expired_ids);
auto update_unit_ptr = std::make_shared<UpdateUnit>(std::move(required_expired_ids));
tryPushToUpdateQueueOrThrow(update_unit_ptr);
/// Update is async - no need to wait.
@ -407,38 +463,26 @@ void CacheDictionary::has(const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8>
}
/// At this point we have two situations.
/// There may be both types of keys: cache_expired_ids and cache_not_found_ids.
/// There may be both types of keys: expired and not_found.
/// We will update them all synchronously.
std::vector<Key> required_ids;
required_ids.reserve(cache_not_found_ids.size() + cache_expired_ids.size());
required_ids.reserve(cache_not_found_count + cache_expired_count);
std::transform(
std::begin(cache_not_found_ids), std::end(cache_not_found_ids),
std::back_inserter(required_ids), [](auto & pair) { return pair.first; });
std::transform(
std::begin(cache_expired_ids), std::end(cache_expired_ids),
std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids),
std::back_inserter(required_ids), [](auto & pair) { return pair.first; });
auto on_cell_updated = [&] (const Key id, const size_t)
{
for (const auto row : cache_not_found_ids[id])
out[row] = true;
for (const auto row : cache_expired_ids[id])
out[row] = true;
};
auto on_id_not_found = [&] (const Key id, const size_t)
{
for (const auto row : cache_not_found_ids[id])
out[row] = false;
for (const auto row : cache_expired_ids[id])
out[row] = true;
};
auto update_unit_ptr = std::make_shared<UpdateUnit>(required_ids, on_cell_updated, on_id_not_found);
auto update_unit_ptr = std::make_shared<UpdateUnit>(std::move(required_ids));
tryPushToUpdateQueueOrThrow(update_unit_ptr);
waitForCurrentUpdateFinish(update_unit_ptr);
for (auto & [key, value] : update_unit_ptr->found_ids)
{
if (value.found)
for (const auto row : cache_expired_or_not_found_ids[key])
out[row] = true;
}
}
@ -453,7 +497,7 @@ void CacheDictionary::createAttributes()
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
attributes.push_back(createAttributeWithTypeAndName(attribute.underlying_type, attribute.name, attribute.null_value));
if (attribute.hierarchical)
{
@ -465,18 +509,62 @@ void CacheDictionary::createAttributes()
}
}
CacheDictionary::Attribute CacheDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
/* For an unknown reason clang-tidy wants this function to be static, but it uses bytes_allocated, which is a class member.
* NOLINT(readability-convert-member-functions-to-static) */
CacheDictionary::Attribute CacheDictionary::createAttributeWithTypeAndName(const AttributeUnderlyingType type, const String & name, const Field & null_value)
{
Attribute attr{type, {}, {}};
Attribute attr{type, name, {}, {}};
switch (type)
{
#define DISPATCH(TYPE) \
case AttributeUnderlyingType::ut##TYPE: \
attr.null_values = TYPE(null_value.get<NearestFieldType<TYPE>>()); /* NOLINT */ \
attr.arrays = std::make_unique<ContainerType<TYPE>>(size); /* NOLINT */ \
bytes_allocated += size * sizeof(TYPE); \
break;
/* Macro argument should be enclosed in parentheses, but if we do so, we cannot initialize \
* NearestFieldType, which takes TYPE as a template parameter. */
#define DISPATCH(TYPE)\
case AttributeUnderlyingType::ut##TYPE:\
{\
attr.null_value = TYPE(null_value.get<NearestFieldType<TYPE>>()); /* NOLINT(bugprone-macro-parentheses) */ \
attr.arrays = std::make_unique<ContainerType<TYPE>>(size); /* NOLINT(bugprone-macro-parentheses) */ \
bytes_allocated += size * sizeof(TYPE);\
break;\
}
DISPATCH(UInt8)
DISPATCH(UInt16)
DISPATCH(UInt32)
DISPATCH(UInt64)
DISPATCH(UInt128)
DISPATCH(Int8)
DISPATCH(Int16)
DISPATCH(Int32)
DISPATCH(Int64)
DISPATCH(Decimal32)
DISPATCH(Decimal64)
DISPATCH(Decimal128)
DISPATCH(Float32)
DISPATCH(Float64)
#undef DISPATCH
case AttributeUnderlyingType::utString: {
attr.null_value = null_value.get<String>();
attr.arrays = std::make_unique<ContainerType<StringRef>>(size);
bytes_allocated += size * sizeof(StringRef);
if (!string_arena)
string_arena = std::make_unique<ArenaWithFreeLists>();
break;
}
}
return attr;
}
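As a hedged sketch, here is the DISPATCH pattern above in isolation: one macro expansion per supported type keeps the switch exhaustive while the per-type body stays identical. Tag, report and the two aliases are illustrative names, not part of ClickHouse:

#include <cstdint>
#include <iostream>

using Int32 = std::int32_t;
using Int64 = std::int64_t;

enum class Tag { Int32, Int64 };

/// The macro must mention TYPE both as an enumerator suffix and as a C++ type,
/// which is why the argument cannot be parenthesised (same constraint as above).
#define DISPATCH(TYPE) \
    case Tag::TYPE: \
        std::cout << "allocating " << sizeof(TYPE) << " bytes per cell\n"; \
        break;

void report(Tag tag)
{
    switch (tag)
    {
        DISPATCH(Int32)
        DISPATCH(Int64)
    }
}
#undef DISPATCH

int main() { report(Tag::Int64); }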
void CacheDictionary::setDefaultAttributeValue(Attribute & attribute, const Key idx) const
{
switch (attribute.type)
{
/* Macro argument should be enclosed in parentheses, but if we do so, we cannot initialize \
* NearestFieldType, which takes TYPE as a template parameter. */
#define DISPATCH(TYPE)\
case AttributeUnderlyingType::ut##TYPE:\
std::get<ContainerPtrType<TYPE>>(attribute.arrays)[idx] = std::get<TYPE>(attribute.null_value); /* NOLINT(bugprone-macro-parentheses) */ \
break;
DISPATCH(UInt8)
DISPATCH(UInt16)
DISPATCH(UInt32)
@ -492,69 +580,9 @@ CacheDictionary::Attribute CacheDictionary::createAttributeWithType(const Attrib
DISPATCH(Float32)
DISPATCH(Float64)
#undef DISPATCH
case AttributeUnderlyingType::utString:
attr.null_values = null_value.get<String>();
attr.arrays = std::make_unique<ContainerType<StringRef>>(size);
bytes_allocated += size * sizeof(StringRef);
if (!string_arena)
string_arena = std::make_unique<ArenaWithFreeLists>();
break;
}
return attr;
}
void CacheDictionary::setDefaultAttributeValue(Attribute & attribute, const Key idx) const
{
switch (attribute.type)
{
case AttributeUnderlyingType::utUInt8:
std::get<ContainerPtrType<UInt8>>(attribute.arrays)[idx] = std::get<UInt8>(attribute.null_values);
break;
case AttributeUnderlyingType::utUInt16:
std::get<ContainerPtrType<UInt16>>(attribute.arrays)[idx] = std::get<UInt16>(attribute.null_values);
break;
case AttributeUnderlyingType::utUInt32:
std::get<ContainerPtrType<UInt32>>(attribute.arrays)[idx] = std::get<UInt32>(attribute.null_values);
break;
case AttributeUnderlyingType::utUInt64:
std::get<ContainerPtrType<UInt64>>(attribute.arrays)[idx] = std::get<UInt64>(attribute.null_values);
break;
case AttributeUnderlyingType::utUInt128:
std::get<ContainerPtrType<UInt128>>(attribute.arrays)[idx] = std::get<UInt128>(attribute.null_values);
break;
case AttributeUnderlyingType::utInt8:
std::get<ContainerPtrType<Int8>>(attribute.arrays)[idx] = std::get<Int8>(attribute.null_values);
break;
case AttributeUnderlyingType::utInt16:
std::get<ContainerPtrType<Int16>>(attribute.arrays)[idx] = std::get<Int16>(attribute.null_values);
break;
case AttributeUnderlyingType::utInt32:
std::get<ContainerPtrType<Int32>>(attribute.arrays)[idx] = std::get<Int32>(attribute.null_values);
break;
case AttributeUnderlyingType::utInt64:
std::get<ContainerPtrType<Int64>>(attribute.arrays)[idx] = std::get<Int64>(attribute.null_values);
break;
case AttributeUnderlyingType::utFloat32:
std::get<ContainerPtrType<Float32>>(attribute.arrays)[idx] = std::get<Float32>(attribute.null_values);
break;
case AttributeUnderlyingType::utFloat64:
std::get<ContainerPtrType<Float64>>(attribute.arrays)[idx] = std::get<Float64>(attribute.null_values);
break;
case AttributeUnderlyingType::utDecimal32:
std::get<ContainerPtrType<Decimal32>>(attribute.arrays)[idx] = std::get<Decimal32>(attribute.null_values);
break;
case AttributeUnderlyingType::utDecimal64:
std::get<ContainerPtrType<Decimal64>>(attribute.arrays)[idx] = std::get<Decimal64>(attribute.null_values);
break;
case AttributeUnderlyingType::utDecimal128:
std::get<ContainerPtrType<Decimal128>>(attribute.arrays)[idx] = std::get<Decimal128>(attribute.null_values);
break;
case AttributeUnderlyingType::utString:
{
const auto & null_value_ref = std::get<String>(attribute.null_values);
const auto & null_value_ref = std::get<String>(attribute.null_value);
auto & string_ref = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
if (string_ref.data != null_value_ref.data())
@ -607,7 +635,6 @@ void CacheDictionary::setAttributeValue(Attribute & attribute, const Key idx, co
case AttributeUnderlyingType::utFloat64:
std::get<ContainerPtrType<Float64>>(attribute.arrays)[idx] = value.get<Float64>();
break;
case AttributeUnderlyingType::utDecimal32:
std::get<ContainerPtrType<Decimal32>>(attribute.arrays)[idx] = value.get<Decimal32>();
break;
@ -622,7 +649,7 @@ void CacheDictionary::setAttributeValue(Attribute & attribute, const Key idx, co
{
const auto & string = value.get<String>();
auto & string_ref = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
const auto & null_value_ref = std::get<String>(attribute.null_values);
const auto & null_value_ref = std::get<String>(attribute.null_value);
/// free memory unless it points to a null_value
if (string_ref.data && string_ref.data != null_value_ref.data())
@ -644,20 +671,26 @@ void CacheDictionary::setAttributeValue(Attribute & attribute, const Key idx, co
}
CacheDictionary::Attribute & CacheDictionary::getAttribute(const std::string & attribute_name) const
{
const size_t attr_index = getAttributeIndex(attribute_name);
return attributes[attr_index];
}
size_t CacheDictionary::getAttributeIndex(const std::string & attribute_name) const
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
return attributes[it->second];
return it->second;
}
bool CacheDictionary::isEmptyCell(const UInt64 idx) const
{
return (idx != zero_cell_idx && cells[idx].id == 0)
|| (cells[idx].data == ext::safe_bit_cast<CellMetadata::time_point_urep_t>(CellMetadata::time_point_t()));
return (idx != zero_cell_idx && cells[idx].id == 0) || (cells[idx].deadline == time_point_t());
}
PaddedPODArray<CacheDictionary::Key> CacheDictionary::getCachedIds() const
{
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
@ -667,9 +700,7 @@ PaddedPODArray<CacheDictionary::Key> CacheDictionary::getCachedIds() const
{
auto & cell = cells[idx];
if (!isEmptyCell(idx) && !cells[idx].isDefault())
{
array.push_back(cell.id);
}
}
return array;
}
@ -805,16 +836,6 @@ void CacheDictionary::waitForCurrentUpdateFinish(UpdateUnitPtr & update_unit_ptr
if (!result)
{
std::lock_guard<std::mutex> callback_lock(update_unit_ptr->callback_mutex);
/*
* We acquire a lock here and store false to special variable to avoid SEGFAULT's.
* Consider timeout for wait had expired and main query's thread ended with exception
* or some other error. But the UpdateUnit with callbacks is left in the queue.
* It has these callback that capture god knows what from the current thread
* (most of the variables lies on the stack of finished thread) that
* intended to do a synchronous update. AsyncUpdate thread can touch deallocated memory and explode.
* */
update_unit_ptr->can_use_callback = false;
throw DB::Exception(ErrorCodes::TIMEOUT_EXCEEDED,
"Dictionary {} source seems unavailable, because {}ms timeout exceeded.",
getDictionaryID().getNameForLogs(), toString(query_wait_timeout_milliseconds));
@ -823,8 +844,6 @@ void CacheDictionary::waitForCurrentUpdateFinish(UpdateUnitPtr & update_unit_ptr
if (update_unit_ptr->current_exception)
{
// There might have been a single update unit for multiple callers in
// independent threads, and current_exception will be the same for them.
// Don't just rethrow it, because sharing the same exception object
// between multiple threads can lead to weird effects if they decide to
// modify it, for example, by adding some error context.
@ -853,18 +872,54 @@ void CacheDictionary::tryPushToUpdateQueueOrThrow(UpdateUnitPtr & update_unit_pt
std::to_string(update_queue.size()));
}
void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) const
std::vector<CacheDictionary::AttributeValue> CacheDictionary::getAttributeValuesFromBlockAtPosition(const std::vector<const IColumn *> & column_ptrs, size_t position)
{
std::vector<AttributeValue> answer;
answer.reserve(column_ptrs.size());
for (const auto * pure_column : column_ptrs)
{
#define DISPATCH(TYPE) \
if (const auto * column = typeid_cast<const Column##TYPE *>(pure_column)) { \
answer.emplace_back(column->getElement(position)); \
continue; \
}
DISPATCH(UInt8)
DISPATCH(UInt16)
DISPATCH(UInt32)
DISPATCH(UInt64)
DISPATCH(UInt128)
DISPATCH(Int8)
DISPATCH(Int16)
DISPATCH(Int32)
DISPATCH(Int64)
DISPATCH(Decimal<Decimal32>)
DISPATCH(Decimal<Decimal64>)
DISPATCH(Decimal<Decimal128>)
DISPATCH(Float32)
DISPATCH(Float64)
#undef DISPATCH
if (const auto * column = typeid_cast<const ColumnString *>(pure_column))
{
answer.emplace_back(column->getDataAt(position).toString());
continue;
}
}
return answer;
}
void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr)
{
CurrentMetrics::Increment metric_increment{CurrentMetrics::DictCacheRequests};
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, update_unit_ptr->requested_ids.size());
std::unordered_map<Key, UInt8> remaining_ids{update_unit_ptr->requested_ids.size()};
for (const auto id : update_unit_ptr->requested_ids)
remaining_ids.insert({id, 0});
auto & map_ids = update_unit_ptr->found_ids;
size_t found_num = 0;
const auto now = std::chrono::system_clock::now();
if (now > backoff_end_time.load())
{
try
@ -894,17 +949,27 @@ void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) const
const auto column_ptrs = ext::map<std::vector>(
ext::range(0, attributes.size()), [&block](size_t i) { return block.safeGetByPosition(i + 1).column.get(); });
found_num += ids.size();
for (const auto i : ext::range(0, ids.size()))
{
/// Modifying cache with write lock
ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
const auto id = ids[i];
const auto find_result = findCellIdx(id, now);
const auto & cell_idx = find_result.cell_idx;
const auto cell_idx = findCellIdxForSet(id);
auto & cell = cells[cell_idx];
auto it = map_ids.find(id);
/// We have some extra keys from source. Won't add them to cache.
if (it == map_ids.end())
continue;
auto & all_attributes = it->second;
all_attributes.found = true;
all_attributes.values = getAttributeValuesFromBlockAtPosition(column_ptrs, i);
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *column_ptrs[attribute_idx];
@ -918,25 +983,27 @@ void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) const
element_count.fetch_add(1, std::memory_order_relaxed);
cell.id = id;
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
{
std::uniform_int_distribution<UInt64> distribution{dict_lifetime.min_sec, dict_lifetime.max_sec};
cell.setExpiresAt(now + std::chrono::seconds{distribution(rnd_engine)});
}
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
update_unit_ptr->callPresentIdHandler(id, cell_idx);
/// mark corresponding id as found
remaining_ids[id] = 1;
setLifetime(cell, now);
}
}
stream->readSuffix();
/// Lock just for last_exception safety
/// Lock for cache modification
ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
for (auto & [key, value] : update_unit_ptr->found_ids)
{
if (!value.found)
{
auto cell_idx = findCellIdxForSet(key);
auto & cell = cells[cell_idx];
cell.id = key;
setLifetime(cell, now);
cell.setDefault();
}
}
error_count = 0;
last_exception = std::exception_ptr{};
backoff_end_time = std::chrono::system_clock::time_point{};
@ -954,80 +1021,33 @@ void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) const
tryLogException(last_exception, log,
"Could not update cache dictionary '" + getDictionaryID().getNameForLogs() +
"', next update is scheduled at " + ext::to_string(backoff_end_time.load()));
}
}
/// Modifying cache state again with write lock
ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
size_t not_found_num = 0;
size_t found_num = 0;
/// Check which ids have not been found and require setting null_value
for (const auto & id_found_pair : remaining_ids)
{
if (id_found_pair.second)
{
++found_num;
continue;
}
++not_found_num;
const auto id = id_found_pair.first;
const auto find_result = findCellIdx(id, now);
const auto & cell_idx = find_result.cell_idx;
auto & cell = cells[cell_idx];
if (error_count)
{
if (find_result.outdated)
try
{
/// We have expired data for that `id` so we can continue using it.
bool was_default = cell.isDefault();
cell.setExpiresAt(backoff_end_time);
if (was_default)
cell.setDefault();
if (was_default)
update_unit_ptr->callAbsentIdHandler(id, cell_idx);
else
update_unit_ptr->callPresentIdHandler(id, cell_idx);
continue;
std::rethrow_exception(last_exception);
}
catch (...)
{
throw DB::Exception(ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL,
"Update failed for dictionary {} : {}",
getDictionaryID().getNameForLogs(),
getCurrentExceptionMessage(true /*with stack trace*/,
true /*check embedded stack trace*/));
}
/// We don't have expired data for that `id` so all we can do is to rethrow `last_exception`.
std::rethrow_exception(last_exception);
}
/// Check if cell had not been occupied before and increment element counter if it hadn't
if (cell.id == 0 && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
cell.id = id;
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
{
std::uniform_int_distribution<UInt64> distribution{dict_lifetime.min_sec, dict_lifetime.max_sec};
cell.setExpiresAt(now + std::chrono::seconds{distribution(rnd_engine)});
cell.strict_max = now + std::chrono::seconds{strict_max_lifetime_seconds};
}
else
{
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
cell.strict_max = now + std::chrono::seconds{strict_max_lifetime_seconds};
}
/// Set null_value for each attribute
cell.setDefault();
for (auto & attribute : attributes)
setDefaultAttributeValue(attribute, cell_idx);
/// inform caller that the cell has not been found
update_unit_ptr->callAbsentIdHandler(id, cell_idx);
}
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num);
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, update_unit_ptr->requested_ids.size() - found_num);
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num);
ProfileEvents::increment(ProfileEvents::DictCacheRequests);
}
else
{
/// Won't request source for keys
throw DB::Exception(ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL,
"Query contains keys that are not present in cache or expired. Could not update cache dictionary {} now, because nearest update is scheduled at {}. Try again later.",
getDictionaryID().getNameForLogs(),
ext::to_string(backoff_end_time.load()));
}
}
}
View File
@ -198,49 +198,44 @@ private:
template <typename Value>
using ContainerPtrType = std::unique_ptr<ContainerType<Value>>;
using time_point_t = std::chrono::system_clock::time_point;
struct CellMetadata final
{
using time_point_t = std::chrono::system_clock::time_point;
using time_point_rep_t = time_point_t::rep;
using time_point_urep_t = std::make_unsigned_t<time_point_rep_t>;
static constexpr UInt64 EXPIRES_AT_MASK = std::numeric_limits<time_point_rep_t>::max();
static constexpr UInt64 IS_DEFAULT_MASK = ~EXPIRES_AT_MASK;
UInt64 id;
/// Stores both expiration time and `is_default` flag in the most significant bit
time_point_urep_t data;
time_point_t deadline;
bool is_default{false};
time_point_t strict_max;
/// Sets expiration time, resets `is_default` flag to false
time_point_t expiresAt() const { return ext::safe_bit_cast<time_point_t>(data & EXPIRES_AT_MASK); }
void setExpiresAt(const time_point_t & t) { data = ext::safe_bit_cast<time_point_urep_t>(t); }
bool isDefault() const { return (data & IS_DEFAULT_MASK) == IS_DEFAULT_MASK; }
void setDefault() { data |= IS_DEFAULT_MASK; }
time_point_t expiresAt() const { return deadline; }
void setExpiresAt(const time_point_t & t) { deadline = t; is_default = false; }
bool isDefault() const { return is_default; }
void setDefault() { is_default = true; }
};
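For contrast, a standalone sketch of the bit-packed representation this commit removes, where the deadline and the is_default flag shared one word and the flag occupied the most significant bit; rep_t and PackedCell are local stand-ins, not the real CellMetadata:

#include <cstdint>
#include <limits>

using rep_t = std::int64_t;     /// time_point rep analogue
using urep_t = std::uint64_t;

constexpr urep_t EXPIRES_AT_MASK = std::numeric_limits<rep_t>::max(); /// low 63 bits
constexpr urep_t IS_DEFAULT_MASK = ~EXPIRES_AT_MASK;                  /// the top bit

struct PackedCell
{
    urep_t data = 0;
    urep_t expiresAt() const { return data & EXPIRES_AT_MASK; }
    void setExpiresAt(urep_t t) { data = t; }   /// also clears the is_default bit
    bool isDefault() const { return (data & IS_DEFAULT_MASK) == IS_DEFAULT_MASK; }
    void setDefault() { data |= IS_DEFAULT_MASK; }
};

int main()
{
    PackedCell cell;
    cell.setExpiresAt(12345);
    cell.setDefault();
    return (cell.isDefault() && cell.expiresAt() == 12345) ? 0 : 1;
}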
using AttributeValue = std::variant<
UInt8, UInt16, UInt32, UInt64, UInt128,
Int8, Int16, Int32, Int64,
Decimal32, Decimal64, Decimal128,
Float32, Float64, String>;
struct AttributeValuesForKey
{
bool found{false};
std::vector<AttributeValue> values;
std::string dump();
};
using FoundValuesForKeys = std::unordered_map<Key, AttributeValuesForKey>;
struct Attribute final
{
AttributeUnderlyingType type;
std::variant<
UInt8,
UInt16,
UInt32,
UInt64,
UInt128,
Int8,
Int16,
Int32,
Int64,
Decimal32,
Decimal64,
Decimal128,
Float32,
Float64,
String>
null_values;
String name;
/// Default value for each type. Could be defined in config.
AttributeValue null_value;
/// We store attribute values for all keys. It is a "row" in a hand-made open addressing hashtable,
/// where the "column" is the key.
std::variant<
ContainerPtrType<UInt8>,
ContainerPtrType<UInt16>,
@ -262,7 +257,8 @@ private:
void createAttributes();
Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
/* NOLINTNEXTLINE(readability-convert-member-functions-to-static) */
Attribute createAttributeWithTypeAndName(const AttributeUnderlyingType type, const String & name, const Field & null_value);
template <typename AttributeType, typename OutputType, typename DefaultGetter>
void getItemsNumberImpl(
@ -281,7 +277,10 @@ private:
void setAttributeValue(Attribute & attribute, const Key idx, const Field & value) const;
static std::vector<AttributeValue> getAttributeValuesFromBlockAtPosition(const std::vector<const IColumn *> & column_ptrs, size_t position);
Attribute & getAttribute(const std::string & attribute_name) const;
size_t getAttributeIndex(const std::string & attribute_name) const;
using SharedDictionarySourcePtr = std::shared_ptr<IDictionarySource>;
@ -303,14 +302,46 @@ private:
return source_ptr;
}
struct FindResult
inline void setLifetime(CellMetadata & cell, time_point_t now)
{
const size_t cell_idx;
const bool valid;
const bool outdated;
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
{
std::uniform_int_distribution<UInt64> distribution{dict_lifetime.min_sec, dict_lifetime.max_sec};
cell.setExpiresAt(now + std::chrono::seconds{distribution(rnd_engine)});
}
else
{
/// This may not be obvious, but when we decide whether this cell is expired or expired permanently, we add strict_max_lifetime_seconds
/// to the expiration time, and that addition overflows time_point::max() quite easily.
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max() - 2 * std::chrono::seconds(strict_max_lifetime_seconds));
}
}
inline bool isExpired(time_point_t now, time_point_t deadline) const
{
return now > deadline;
}
inline bool isExpiredPermanently(time_point_t now, time_point_t deadline) const
{
return now > deadline + std::chrono::seconds(strict_max_lifetime_seconds);
}
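A standalone illustration of the overflow that setLifetime guards against: isExpiredPermanently adds strict_max_lifetime_seconds to the deadline, so a deadline of plain time_point::max() would overflow the clock's representation (formally undefined behaviour; in practice it wraps negative and every cell looks permanently expired). The 10s value below is an arbitrary stand-in for the configured strict_max_lifetime_seconds:

#include <chrono>
#include <iostream>

int main()
{
    using clock = std::chrono::system_clock;
    const auto strict_max = std::chrono::seconds(10);
    const auto now = clock::now();

    const auto safe = clock::time_point::max() - 2 * strict_max;
    /// As in isExpiredPermanently(now, deadline): compare against deadline + strict_max.
    std::cout << (now > safe + strict_max) << '\n';   /// 0: never expires, as intended
    /// With a deadline of time_point::max() the same addition would overflow
    /// and the comparison could wrongly claim the cell is permanently expired.
}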
enum class ResultState
{
NotFound,
FoundAndValid,
FoundButExpired,
/// Between these two states there is a gap in which a key can still be read
/// if the allow_read_expired_keys setting is enabled in the config.
FoundButExpiredPermanently
};
FindResult findCellIdx(const Key & id, const CellMetadata::time_point_t now) const;
using FindResult = std::pair<size_t, ResultState>;
FindResult findCellIdxForGet(const Key & id, const time_point_t now) const;
size_t findCellIdxForSet(const Key & id) const;
template <typename AncestorType>
void isInImpl(const PaddedPODArray<Key> & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
@ -353,7 +384,7 @@ private:
std::unique_ptr<ArenaWithFreeLists> string_arena;
mutable std::exception_ptr last_exception;
mutable std::atomic<size_t> error_count = 0;
mutable std::atomic<size_t> error_count{0};
mutable std::atomic<std::chrono::system_clock::time_point> backoff_end_time{std::chrono::system_clock::time_point{}};
mutable pcg64 rnd_engine;
@ -363,62 +394,25 @@ private:
mutable std::atomic<size_t> hit_count{0};
mutable std::atomic<size_t> query_count{0};
/// Field and methods correlated with update expired and not found keys
using PresentIdHandler = std::function<void(Key, size_t)>;
using AbsentIdHandler = std::function<void(Key, size_t)>;
/*
* Disclaimer: this comment is written not for fun.
*
* How the update goes: we basically have a method like get(keys)->values. Values are cached, so sometimes we
* can return them from the cache. For values not in cache, we query them from the dictionary, and add to the
* cache. The cache is lossy, so we can't expect it to store all the keys, and we store them separately. Normally,
* they would be passed as a return value of get(), but for Unknown Reasons the dictionaries use a baroque
* interface where get() accepts two callback, one that it calls for found values, and one for not found.
*
* Now we make it even uglier by doing this from multiple threads. The missing values are retrieved from the
* dictionary in a background thread, and this thread calls the provided callback. So if you provide the callbacks,
* you MUST wait until the background update finishes, or god knows what happens. Unfortunately, we have no
* way to check that you did this right, so good luck.
* can return them from the cache. For values not in cache, we query them from the source, and add to the
* cache. The cache is lossy, so we can't expect it to store all the keys, and we store them separately.
* So, there is a map from found keys to all their attributes.
*/
struct UpdateUnit
{
UpdateUnit(std::vector<Key> requested_ids_,
PresentIdHandler present_id_handler_,
AbsentIdHandler absent_id_handler_) :
explicit UpdateUnit(std::vector<Key> && requested_ids_) :
requested_ids(std::move(requested_ids_)),
alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, requested_ids.size()),
present_id_handler(present_id_handler_),
absent_id_handler(absent_id_handler_){}
explicit UpdateUnit(std::vector<Key> requested_ids_) :
requested_ids(std::move(requested_ids_)),
alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, requested_ids.size()),
present_id_handler([](Key, size_t){}),
absent_id_handler([](Key, size_t){}){}
void callPresentIdHandler(Key key, size_t cell_idx)
alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, requested_ids.size())
{
std::lock_guard lock(callback_mutex);
if (can_use_callback)
present_id_handler(key, cell_idx);
}
void callAbsentIdHandler(Key key, size_t cell_idx)
{
std::lock_guard lock(callback_mutex);
if (can_use_callback)
absent_id_handler(key, cell_idx);
found_ids.reserve(requested_ids.size());
for (const auto id : requested_ids)
found_ids.insert({id, {}});
}
std::vector<Key> requested_ids;
/// It might seem that it is a leak of performance.
/// But acquiring a mutex without contention is rather cheap.
std::mutex callback_mutex;
bool can_use_callback{true};
FoundValuesForKeys found_ids;
std::atomic<bool> is_done{false};
std::exception_ptr current_exception{nullptr};
@ -427,9 +421,7 @@ private:
CurrentMetrics::Increment alive_batch{CurrentMetrics::CacheDictionaryUpdateQueueBatches};
CurrentMetrics::Increment alive_keys;
private:
PresentIdHandler present_id_handler;
AbsentIdHandler absent_id_handler;
std::string dumpFoundIds();
};
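A runnable miniature of the pull-based flow this struct enables: the caller enqueues the requested ids, the update thread fills found_ids, and the caller reads results back instead of being called back. All types below are local stand-ins for the real CacheDictionary members:

#include <cstdint>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

using Key = std::uint64_t;
using AttributeValue = std::variant<std::int64_t, std::string>;

struct AttributeValuesForKey
{
    bool found = false;
    std::vector<AttributeValue> values;
};

struct UpdateUnit
{
    explicit UpdateUnit(std::vector<Key> && requested_ids_)
        : requested_ids(std::move(requested_ids_))
    {
        found_ids.reserve(requested_ids.size());
        for (const auto id : requested_ids)
            found_ids.insert({id, {}});
    }
    std::vector<Key> requested_ids;
    std::unordered_map<Key, AttributeValuesForKey> found_ids;
};

int main()
{
    auto unit = std::make_shared<UpdateUnit>(std::vector<Key>{1, 2});
    /// The update thread would do this after querying the source:
    unit->found_ids[1] = {true, {std::int64_t{42}, std::string{"abc"}}};
    /// The caller reads results back; id 2 stays found == false (default).
    for (const auto & [key, value] : unit->found_ids)
        std::cout << key << " found=" << value.found << '\n';
}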
using UpdateUnitPtr = std::shared_ptr<UpdateUnit>;
@ -449,12 +441,12 @@ private:
* 0 - if set is empty, 1 - otherwise
*
* Only if there are no cache_not_found_ids and some cache_expired_ids
* (with allow_read_expired_keys_from_cache_dictionary setting) we can perform async update.
* (with allow_read_expired_keys setting) we can perform async update.
* Otherwise we have to concatenate the ids and update them synchronously.
*
*/
void updateThreadFunction();
void update(UpdateUnitPtr & update_unit_ptr) const;
void update(UpdateUnitPtr & update_unit_ptr);
void tryPushToUpdateQueueOrThrow(UpdateUnitPtr & update_unit_ptr) const;
View File
@ -1,6 +1,8 @@
#pragma once
#include "CacheDictionary.h"
#include <stdexcept>
#include "CacheDictionary.h"
#include <Columns/ColumnsNumber.h>
#include <Common/ProfilingScopedRWLock.h>
#include <Common/typeid_cast.h>
@ -10,6 +12,7 @@
#include <ext/range.h>
#include <ext/size.h>
namespace ProfileEvents
{
extern const Event DictCacheKeysRequested;
@ -44,18 +47,27 @@ void CacheDictionary::getItemsNumberImpl(
for (const auto row : ext::range(0, rows))
out[row] = get_default(row);
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
std::unordered_map<Key, std::vector<size_t>> cache_expired_ids;
std::unordered_map<Key, std::vector<size_t>> cache_not_found_ids;
/// Maybe there are duplicate keys, so we remember their indices.
std::unordered_map<Key, std::vector<size_t>> cache_expired_or_not_found_ids;
auto & attribute_array = std::get<ContainerPtrType<AttributeType>>(attribute.arrays);
size_t cache_hit = 0;
size_t cache_not_found_count = 0;
size_t cache_expired_count = 0;
{
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
const auto now = std::chrono::system_clock::now();
auto insert_to_answer_routine = [&](size_t row, size_t idx)
{
auto & cell = cells[idx];
if (!cell.isDefault())
out[row] = static_cast<OutputType>(attribute_array[idx]);
};
/// fetch up-to-date values, decide which ones require update
for (const auto row : ext::range(0, rows))
{
@ -66,68 +78,52 @@ void CacheDictionary::getItemsNumberImpl(
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
const auto find_result = findCellIdx(id, now);
const auto [cell_idx, state] = findCellIdxForGet(id, now);
auto update_routine = [&]()
{
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
if (!cell.isDefault())
out[row] = static_cast<OutputType>(attribute_array[cell_idx]);
};
if (!find_result.valid)
{
if (find_result.outdated)
{
/// Protection of reading very expired keys.
if (now > cells[find_result.cell_idx].strict_max)
{
cache_not_found_ids[id].push_back(row);
continue;
}
cache_expired_ids[id].push_back(row);
if (allow_read_expired_keys)
update_routine();
}
else
{
cache_not_found_ids[id].push_back(row);
}
}
else
if (state == ResultState::FoundAndValid)
{
++cache_hit;
update_routine();
insert_to_answer_routine(row, cell_idx);
}
else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently)
{
++cache_not_found_count;
cache_expired_or_not_found_ids[id].push_back(row);
}
else if (state == ResultState::FoundButExpired)
{
cache_expired_count++;
cache_expired_or_not_found_ids[id].push_back(row);
if (allow_read_expired_keys)
insert_to_answer_routine(row, cell_idx);
}
}
}
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_ids.size());
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_ids.size());
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_count);
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - cache_expired_ids.size() - cache_not_found_ids.size(), std::memory_order_release);
hit_count.fetch_add(rows - cache_not_found_count - cache_expired_count, std::memory_order_release);
if (cache_not_found_ids.empty())
if (!cache_not_found_count)
{
/// Nothing to update - return
if (cache_expired_ids.empty())
if (!cache_expired_count)
return;
/// Update async only if allow_read_expired_keys is enabled.
/// TODO: add condvar usage and better code.
if (allow_read_expired_keys)
{
std::vector<Key> required_expired_ids;
required_expired_ids.reserve(cache_expired_ids.size());
std::transform(std::begin(cache_expired_ids), std::end(cache_expired_ids), std::back_inserter(required_expired_ids),
[](auto & pair) { return pair.first; });
required_expired_ids.reserve(cache_expired_count);
std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids),
std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; });
/// request new values
auto update_unit_ptr = std::make_shared<UpdateUnit>(required_expired_ids);
auto update_unit_ptr = std::make_shared<UpdateUnit>(std::move(required_expired_ids));
tryPushToUpdateQueueOrThrow(update_unit_ptr);
@ -141,34 +137,28 @@ void CacheDictionary::getItemsNumberImpl(
/// and there are no cache_not_found_ids but some cache_expired.
std::vector<Key> required_ids;
required_ids.reserve(cache_not_found_ids.size() + cache_expired_ids.size());
std::transform(
std::begin(cache_not_found_ids), std::end(cache_not_found_ids),
std::back_inserter(required_ids), [](auto & pair) { return pair.first; });
std::transform(
std::begin(cache_expired_ids), std::end(cache_expired_ids),
std::back_inserter(required_ids), [](auto & pair) { return pair.first; });
auto on_cell_updated =
[&attribute_array, &cache_not_found_ids, &cache_expired_ids, &out]
(const auto id, const auto cell_idx)
{
const auto attribute_value = attribute_array[cell_idx];
for (const size_t row : cache_not_found_ids[id])
out[row] = static_cast<OutputType>(attribute_value);
for (const size_t row : cache_expired_ids[id])
out[row] = static_cast<OutputType>(attribute_value);
};
auto on_id_not_found = [&] (auto, auto) {};
required_ids.reserve(cache_not_found_count + cache_expired_count);
std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids),
std::back_inserter(required_ids), [](auto & pair) { return pair.first; });
/// Request new values
auto update_unit_ptr = std::make_shared<UpdateUnit>(required_ids, on_cell_updated, on_id_not_found);
auto update_unit_ptr = std::make_shared<UpdateUnit>(std::move(required_ids));
tryPushToUpdateQueueOrThrow(update_unit_ptr);
waitForCurrentUpdateFinish(update_unit_ptr);
/// Add updated keys to answer.
const size_t attribute_index = getAttributeIndex(attribute.name);
for (auto & [key, value] : update_unit_ptr->found_ids)
{
if (value.found)
{
for (const size_t row : cache_expired_or_not_found_ids[key])
out[row] = std::get<OutputType>(value.values[attribute_index]);
}
}
}
template <typename DefaultGetter>
@ -177,56 +167,59 @@ void CacheDictionary::getItemsString(
{
const auto rows = ext::size(ids);
/// save on some allocations
/// Save on some allocations.
out->getOffsets().reserve(rows);
auto & attribute_array = std::get<ContainerPtrType<StringRef>>(attribute.arrays);
auto found_outdated_values = false;
/// perform optimistic version, fallback to pessimistic if failed
/// Perform optimistic version, fallback to pessimistic if failed.
{
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, discard on fail
/// Fetch up-to-date values, discard on fail.
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto [cell_idx, state] = findCellIdxForGet(id, now);
const auto find_result = findCellIdx(id, now);
if (!find_result.valid)
if (state == ResultState::FoundAndValid)
{
auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
out->insertData(string_ref.data, string_ref.size);
}
else
{
found_outdated_values = true;
break;
}
else
{
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
out->insertData(string_ref.data, string_ref.size);
}
}
}
/// optimistic code completed successfully
/// Optimistic code completed successfully.
if (!found_outdated_values)
{
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows, std::memory_order_release);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, ids.size());
return;
}
/// now onto the pessimistic one, discard possible partial results from the optimistic path
/// Now onto the pessimistic one, discard possible partial results from the optimistic path.
out->getChars().resize_assume_reserved(0);
out->getOffsets().resize_assume_reserved(0);
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
std::unordered_map<Key, std::vector<size_t>> cache_expired_ids;
std::unordered_map<Key, std::vector<size_t>> cache_not_found_ids;
std::unordered_map<Key, std::vector<size_t>> cache_expired_or_not_found_ids;
/// we are going to store every string separately
std::unordered_map<Key, String> map;
std::unordered_map<Key, String> local_cache;
size_t cache_not_found_count = 0;
size_t cache_expired_count = 0;
size_t total_length = 0;
size_t cache_hit = 0;
@ -234,113 +227,141 @@ void CacheDictionary::getItemsString(
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
const auto now = std::chrono::system_clock::now();
auto insert_value_routine = [&](size_t row, size_t id, size_t cell_idx)
{
const auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
/// Do not store default, but count it in total length.
if (!cell.isDefault())
local_cache[id] = String{string_ref};
total_length += string_ref.size + 1;
};
for (const auto row : ext::range(0, ids.size()))
{
const auto id = ids[row];
const auto [cell_idx, state] = findCellIdxForGet(id, now);
const auto find_result = findCellIdx(id, now);
auto insert_value_routine = [&]()
{
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
if (!cell.isDefault())
map[id] = String{string_ref};
total_length += string_ref.size + 1;
};
if (!find_result.valid)
{
if (find_result.outdated)
{
/// Protection of reading very expired keys.
if (now > cells[find_result.cell_idx].strict_max)
{
cache_not_found_ids[id].push_back(row);
continue;
}
cache_expired_ids[id].push_back(row);
if (allow_read_expired_keys)
insert_value_routine();
} else
cache_not_found_ids[id].push_back(row);
} else
if (state == ResultState::FoundAndValid)
{
++cache_hit;
insert_value_routine();
insert_value_routine(row, id, cell_idx);
}
else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently)
{
++cache_not_found_count;
cache_expired_or_not_found_ids[id].push_back(row);
}
else if (state == ResultState::FoundButExpired)
{
++cache_expired_count;
cache_expired_or_not_found_ids[id].push_back(row);
if (allow_read_expired_keys)
insert_value_routine(row, id, cell_idx);
}
}
}
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_ids.size());
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_ids.size());
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_count);
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - cache_expired_ids.size() - cache_not_found_ids.size(), std::memory_order_release);
hit_count.fetch_add(rows - cache_expired_count - cache_not_found_count, std::memory_order_release);
/// Async update of expired keys.
if (cache_not_found_ids.empty())
if (!cache_not_found_count)
{
if (allow_read_expired_keys && !cache_expired_ids.empty())
if (allow_read_expired_keys && cache_expired_count)
{
std::vector<Key> required_expired_ids;
required_expired_ids.reserve(cache_not_found_ids.size());
std::transform(std::begin(cache_expired_ids), std::end(cache_expired_ids),
required_expired_ids.reserve(cache_expired_count);
std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids),
std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; });
auto update_unit_ptr = std::make_shared<UpdateUnit>(required_expired_ids);
auto update_unit_ptr = std::make_shared<UpdateUnit>(std::move(required_expired_ids));
tryPushToUpdateQueueOrThrow(update_unit_ptr);
/// Do not return at this point, because there some extra stuff to do at the end of this method.
/// Insert all found keys and defaults to output array.
out->getChars().reserve(total_length);
for (const auto row : ext::range(0, ext::size(ids)))
{
const auto id = ids[row];
StringRef value;
/// Found keys were stored in local_cache earlier in this method.
const auto it = local_cache.find(id);
if (it != local_cache.end())
value = StringRef(it->second);
else
value = get_default(row);
out->insertData(value.data, value.size);
}
/// Nothing else to do.
return;
}
}
/// Request new values sync.
/// We have request both cache_not_found_ids and cache_expired_ids.
/// We will request both cache_not_found_ids and cache_expired_ids sync.
std::vector<Key> required_ids;
required_ids.reserve(cache_not_found_ids.size() + cache_expired_ids.size());
required_ids.reserve(cache_not_found_count + cache_expired_count);
std::transform(
std::begin(cache_not_found_ids), std::end(cache_not_found_ids),
std::back_inserter(required_ids), [](auto & pair) { return pair.first; });
std::transform(
std::begin(cache_expired_ids), std::end(cache_expired_ids),
std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids),
std::back_inserter(required_ids), [](auto & pair) { return pair.first; });
auto on_cell_updated = [&] (const auto id, const auto cell_idx)
{
const auto attribute_value = attribute_array[cell_idx];
map[id] = String{attribute_value};
total_length += (attribute_value.size + 1) * cache_not_found_ids[id].size();
};
auto on_id_not_found = [&] (const auto id, const auto)
{
for (const auto row : cache_not_found_ids[id])
total_length += get_default(row).size + 1;
};
auto update_unit_ptr = std::make_shared<UpdateUnit>(required_ids, on_cell_updated, on_id_not_found);
auto update_unit_ptr = std::make_shared<UpdateUnit>(std::move(required_ids));
tryPushToUpdateQueueOrThrow(update_unit_ptr);
waitForCurrentUpdateFinish(update_unit_ptr);
const size_t attribute_index = getAttributeIndex(attribute.name);
/// Only calculate the total length.
for (auto & [key, value] : update_unit_ptr->found_ids)
{
if (value.found)
{
const auto & found_value_ref = std::get<String>(value.values[attribute_index]);
total_length += (found_value_ref.size() + 1) * cache_expired_or_not_found_ids[key].size();
}
else
{
for (const auto row : cache_expired_or_not_found_ids[key])
total_length += get_default(row).size + 1;
}
}
out->getChars().reserve(total_length);
for (const auto row : ext::range(0, ext::size(ids)))
{
const auto id = ids[row];
const auto it = map.find(id);
StringRef value;
const auto string_ref = it != std::end(map) ? StringRef{it->second} : get_default(row);
out->insertData(string_ref.data, string_ref.size);
/// We have two maps: found in cache and found in source.
const auto local_it = local_cache.find(id);
if (local_it != local_cache.end())
value = StringRef(local_it->second);
else
{
const auto found_it = update_unit_ptr->found_ids.find(id);
/// Previously we didn't store defaults in local cache.
if (found_it != update_unit_ptr->found_ids.end() && found_it->second.found)
value = std::get<String>(found_it->second.values[attribute_index]);
else
value = get_default(row);
}
out->insertData(value.data, value.size);
}
}
View File
@ -9,7 +9,7 @@ namespace DB
{ \
auto & attribute = getAttribute(attribute_name); \
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \
const auto null_value = std::get<TYPE>(attribute.null_values); \
const auto null_value = std::get<TYPE>(attribute.null_value); \
getItemsNumberImpl<TYPE, TYPE>(attribute, ids, out, [&](const size_t) { return null_value; }); \
}
View File
@ -132,7 +132,8 @@ void registerDiskS3(DiskFactory & factory)
uri.is_virtual_hosted_style,
config.getString(config_prefix + ".access_key_id", ""),
config.getString(config_prefix + ".secret_access_key", ""),
context.getRemoteHostFilter());
context.getRemoteHostFilter(),
context.getGlobalContext().getSettingsRef().s3_max_redirects);
String metadata_path = config.getString(config_prefix + ".metadata_path", context.getPath() + "disks/" + name + "/");
View File
@ -91,6 +91,9 @@ inline UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column)
return field.get<UInt32>();
}
/// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type.
struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; };
/** Conversion of number types to each other, enums to numbers, dates and datetimes to numbers and back: done by straight assignment.
* (Date is represented internally as number of days from some day; DateTime - as unix timestamp)
@ -111,6 +114,13 @@ struct ConvertImpl
using ColVecFrom = typename FromDataType::ColumnType;
using ColVecTo = typename ToDataType::ColumnType;
if (std::is_same_v<Name, NameToUnixTimestamp>)
{
if (isDate(named_from.type))
throw Exception("Illegal column " + named_from.column->getName() + " of first argument of function " + Name::name,
ErrorCodes::ILLEGAL_COLUMN);
}
if constexpr ((IsDataTypeDecimal<FromDataType> || IsDataTypeDecimal<ToDataType>)
&& !(std::is_same_v<DataTypeDateTime64, FromDataType> || std::is_same_v<DataTypeDateTime64, ToDataType>))
{
@ -923,9 +933,6 @@ struct ConvertImplGenericFromString
};
/// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type.
struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; };
template <>
struct ConvertImpl<DataTypeString, DataTypeUInt32, NameToUnixTimestamp>
: ConvertImpl<DataTypeString, DataTypeDateTime, NameToUnixTimestamp> {};
View File
@ -99,18 +99,40 @@ public:
ColumnString::Chars & chars_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
chars_to.resize(col_in->getChars().size());
// TODO: Maybe we can share `col_in->getOffsets()` to `offsets_to.resize` like clever pointers? They are same
offsets_to.resize(input_rows_count);
size_t col_in_rows = col_in->getOffsets().size();
const auto * ptr_in = col_in->getChars().data();
auto * ptr_to = chars_to.data();
fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability);
for (size_t i = 0; i < input_rows_count; ++i)
if (col_in_rows >= input_rows_count)
{
offsets_to[i] = col_in->getOffsets()[i];
ptr_to[offsets_to[i] - 1] = 0;
chars_to.resize(col_in->getChars().size());
// TODO: Maybe we can share `col_in->getOffsets()` with `offsets_to` via something like smart pointers? They are the same.
offsets_to.resize(input_rows_count);
const auto * ptr_in = col_in->getChars().data();
auto * ptr_to = chars_to.data();
fuzzBits(ptr_in, ptr_to, chars_to.size(), inverse_probability);
for (size_t i = 0; i < input_rows_count; ++i)
{
offsets_to[i] = col_in->getOffsets()[i];
ptr_to[offsets_to[i] - 1] = 0;
}
}
else
{
assert(col_in_rows == 1);
chars_to.resize(col_in->getChars().size() * input_rows_count);
offsets_to.resize(input_rows_count);
size_t offset = col_in->getOffsets()[0];
const auto * ptr_in = col_in->getChars().data();
auto * ptr_to = chars_to.data();
for (size_t i = 0; i < input_rows_count; ++i)
{
fuzzBits(ptr_in, ptr_to + i * offset, offset, inverse_probability);
offsets_to[i] = (i + 1) * offset;
ptr_to[offsets_to[i] - 1] = 0;
}
}
return col_to;
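A standalone sketch of what the constant-input branch above does: a single source row is fuzzed independently once per output row. fuzz_bits below is a hypothetical stand-in that flips each bit with probability prob (the real kernel takes an inverse probability), not the actual fuzzBits:

#include <cstddef>
#include <random>
#include <string>
#include <vector>

/// Flip each bit of src into dst with probability prob.
static void fuzz_bits(const unsigned char * src, unsigned char * dst,
                      size_t size, double prob, std::mt19937 & gen)
{
    std::bernoulli_distribution flip(prob);
    for (size_t i = 0; i < size; ++i)
    {
        unsigned char mask = 0;
        for (int bit = 0; bit < 8; ++bit)
            mask |= static_cast<unsigned char>(flip(gen)) << bit;
        dst[i] = src[i] ^ mask;
    }
}

int main()
{
    std::mt19937 gen(42);
    const std::string in = "abc";     /// single "row", as in the const-column case
    const size_t rows = 4;            /// block size
    std::vector<std::string> out(rows, std::string(in.size(), '\0'));
    for (size_t r = 0; r < rows; ++r) /// each output row is fuzzed independently
        fuzz_bits(reinterpret_cast<const unsigned char *>(in.data()),
                  reinterpret_cast<unsigned char *>(out[r].data()),
                  in.size(), 0.1, gen);
}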
View File
@ -237,7 +237,10 @@ void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPR
{
auto status = response.getStatus();
if (!(status == Poco::Net::HTTPResponse::HTTP_OK || (isRedirect(status) && allow_redirects)))
if (!(status == Poco::Net::HTTPResponse::HTTP_OK
|| status == Poco::Net::HTTPResponse::HTTP_CREATED
|| status == Poco::Net::HTTPResponse::HTTP_ACCEPTED
|| (isRedirect(status) && allow_redirects)))
{
std::stringstream error_message; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
error_message.exceptions(std::ios::failbit);
View File
@ -50,9 +50,11 @@ namespace DB::S3
PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
const Aws::Client::ClientConfiguration & cfg,
const RemoteHostFilter & remote_host_filter_)
const RemoteHostFilter & remote_host_filter_,
unsigned int s3_max_redirects_)
: Aws::Client::ClientConfiguration(cfg)
, remote_host_filter(remote_host_filter_)
, s3_max_redirects(s3_max_redirects_)
{
}
@ -83,6 +85,7 @@ PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & clientConfigu
Poco::Timespan(clientConfiguration.httpRequestTimeoutMs * 1000) /// receive timeout.
))
, remote_host_filter(clientConfiguration.remote_host_filter)
, s3_max_redirects(clientConfiguration.s3_max_redirects)
{
}
@ -157,10 +160,9 @@ void PocoHTTPClient::makeRequestInternal(
ProfileEvents::increment(select_metric(S3MetricType::Count));
static constexpr int max_redirect_attempts = 10;
try
{
for (int attempt = 0; attempt < max_redirect_attempts; ++attempt)
for (unsigned int attempt = 0; attempt <= s3_max_redirects; ++attempt)
{
Poco::URI poco_uri(uri);
View File
@ -11,14 +11,21 @@ namespace Aws::Http::Standard
class StandardHttpResponse;
}
namespace DB
{
class Context;
}
namespace DB::S3
{
struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration
{
const RemoteHostFilter & remote_host_filter;
unsigned int s3_max_redirects;
PocoHTTPClientConfiguration(const Aws::Client::ClientConfiguration & cfg, const RemoteHostFilter & remote_host_filter_);
PocoHTTPClientConfiguration(const Aws::Client::ClientConfiguration & cfg, const RemoteHostFilter & remote_host_filter_,
unsigned int s3_max_redirects_);
void updateSchemeAndRegion();
};
@ -48,6 +55,7 @@ private:
std::function<Aws::Client::ClientConfigurationPerRequest(const Aws::Http::HttpRequest &)> per_request_configuration;
ConnectionTimeouts timeouts;
const RemoteHostFilter & remote_host_filter;
unsigned int s3_max_redirects;
};
}
View File
@ -164,14 +164,15 @@ namespace S3
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
const RemoteHostFilter & remote_host_filter)
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects)
{
Aws::Client::ClientConfiguration cfg;
if (!endpoint.empty())
cfg.endpointOverride = endpoint;
return create(cfg, is_virtual_hosted_style, access_key_id, secret_access_key, remote_host_filter);
return create(cfg, is_virtual_hosted_style, access_key_id, secret_access_key, remote_host_filter, s3_max_redirects);
}
std::shared_ptr<Aws::S3::S3Client> ClientFactory::create( // NOLINT
@ -179,11 +180,12 @@ namespace S3
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
const RemoteHostFilter & remote_host_filter)
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects)
{
Aws::Auth::AWSCredentials credentials(access_key_id, secret_access_key);
PocoHTTPClientConfiguration client_configuration(cfg, remote_host_filter);
PocoHTTPClientConfiguration client_configuration(cfg, remote_host_filter, s3_max_redirects);
client_configuration.updateSchemeAndRegion();
@ -201,9 +203,10 @@ namespace S3
const String & access_key_id,
const String & secret_access_key,
HeaderCollection headers,
const RemoteHostFilter & remote_host_filter)
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects)
{
PocoHTTPClientConfiguration client_configuration({}, remote_host_filter);
PocoHTTPClientConfiguration client_configuration({}, remote_host_filter, s3_max_redirects);
if (!endpoint.empty())
client_configuration.endpointOverride = endpoint;
View File
@ -36,14 +36,16 @@ public:
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
const RemoteHostFilter & remote_host_filter);
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects);
std::shared_ptr<Aws::S3::S3Client> create(
Aws::Client::ClientConfiguration & cfg,
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
const RemoteHostFilter & remote_host_filter);
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects);
std::shared_ptr<Aws::S3::S3Client> create(
const String & endpoint,
@ -51,7 +53,8 @@ public:
const String & access_key_id,
const String & secret_access_key,
HeaderCollection headers,
const RemoteHostFilter & remote_host_filter);
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects);
private:
ClientFactory();
View File
@ -85,6 +85,8 @@ public:
void restart()
{
if (vector.empty())
vector.resize(initial_size);
set(reinterpret_cast<Position>(vector.data()), vector.size());
is_finished = false;
}
View File
@ -859,15 +859,15 @@ template <typename T>
inline std::enable_if_t<std::is_floating_point_v<T>, void>
writeText(const T & x, WriteBuffer & buf) { writeFloatText(x, buf); }
inline void writeText(const String & x, WriteBuffer & buf) { writeEscapedString(x, buf); }
inline void writeText(const String & x, WriteBuffer & buf) { writeString(x.c_str(), x.size(), buf); }
/// Implemented as template specialization (not function overload) to avoid preference over templates on arithmetic types above.
template <> inline void writeText<bool>(const bool & x, WriteBuffer & buf) { writeBoolText(x, buf); }
/// Unlike the method for std::string,
/// this one assumes that `x` is a null-terminated string.
inline void writeText(const char * x, WriteBuffer & buf) { writeEscapedString(x, strlen(x), buf); }
inline void writeText(const char * x, size_t size, WriteBuffer & buf) { writeEscapedString(x, size, buf); }
inline void writeText(const char * x, WriteBuffer & buf) { writeCString(x, buf); }
inline void writeText(const char * x, size_t size, WriteBuffer & buf) { writeString(x, size, buf); }
inline void writeText(const DayNum & x, WriteBuffer & buf) { writeDateText(LocalDate(x), buf); }
inline void writeText(const LocalDate & x, WriteBuffer & buf) { writeDateText(x, buf); }
View File
@ -62,16 +62,16 @@ void ClientInfo::write(WriteBuffer & out, const UInt64 server_protocol_revision)
if (server_protocol_revision >= DBMS_MIN_REVISION_WITH_OPENTELEMETRY)
{
if (opentelemetry_trace_id)
if (client_trace_context.trace_id)
{
// Have OpenTelemetry header.
writeBinary(uint8_t(1), out);
// No point writing these numbers with variable length, because they
// are random and will probably require the full length anyway.
writeBinary(opentelemetry_trace_id, out);
writeBinary(opentelemetry_span_id, out);
writeBinary(opentelemetry_tracestate, out);
writeBinary(opentelemetry_trace_flags, out);
writeBinary(client_trace_context.trace_id, out);
writeBinary(client_trace_context.span_id, out);
writeBinary(client_trace_context.tracestate, out);
writeBinary(client_trace_context.trace_flags, out);
}
else
{
@@ -139,10 +139,10 @@ void ClientInfo::read(ReadBuffer & in, const UInt64 client_protocol_revision)
readBinary(have_trace_id, in);
if (have_trace_id)
{
readBinary(opentelemetry_trace_id, in);
readBinary(opentelemetry_span_id, in);
readBinary(opentelemetry_tracestate, in);
readBinary(opentelemetry_trace_flags, in);
readBinary(client_trace_context.trace_id, in);
readBinary(client_trace_context.span_id, in);
readBinary(client_trace_context.tracestate, in);
readBinary(client_trace_context.trace_flags, in);
}
}
}
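
The comment above about not using variable-length encoding is easy to quantify: in a 7-bits-per-byte varint, a uniformly random 64-bit id almost always takes 9 or 10 bytes instead of a fixed 8. A small check:

#include <cstdint>
#include <cstdio>

// In a 7-bits-per-byte varint, the byte count is ceil(bits_used / 7).
static int varintSize(uint64_t x)
{
    int n = 1;
    while (x >= 0x80)
    {
        x >>= 7;
        ++n;
    }
    return n;
}

int main()
{
    // A random 64-bit id has its top bits set with high probability,
    // so the varint form is larger than the 8 fixed bytes, not smaller.
    std::printf("varint bytes for a typical random id: %d (fixed: 8)\n",
                varintSize(0x9e3779b97f4a7c15ULL));  // prints 10
}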
@@ -155,74 +155,6 @@ void ClientInfo::setInitialQuery()
client_name = (DBMS_NAME " ") + client_name;
}
bool ClientInfo::parseTraceparentHeader(const std::string & traceparent,
std::string & error)
{
uint8_t version = -1;
uint64_t trace_id_high = 0;
uint64_t trace_id_low = 0;
uint64_t trace_parent = 0;
uint8_t trace_flags = 0;
// Version 00, which is the only one we can parse, is fixed width. Use this
// fact for an additional sanity check.
const int expected_length = 2 + 1 + 32 + 1 + 16 + 1 + 2;
if (traceparent.length() != expected_length)
{
error = fmt::format("unexpected length {}, expected {}",
traceparent.length(), expected_length);
return false;
}
// clang-tidy doesn't like sscanf:
// error: 'sscanf' used to convert a string to an unsigned integer value,
// but function will not report conversion errors; consider using 'strtoul'
// instead [cert-err34-c,-warnings-as-errors]
// There is no other ready solution, and hand-rolling a more complicated
// parser for an HTTP header in C++ sounds like RCE.
// NOLINTNEXTLINE(cert-err34-c)
int result = sscanf(&traceparent[0],
"%2" SCNx8 "-%16" SCNx64 "%16" SCNx64 "-%16" SCNx64 "-%2" SCNx8,
&version, &trace_id_high, &trace_id_low, &trace_parent, &trace_flags);
if (result == EOF)
{
error = "EOF";
return false;
}
// We read uint128 as two uint64, so 5 parts and not 4.
if (result != 5)
{
error = fmt::format("could only read {} parts instead of the expected 5",
result);
return false;
}
if (version != 0)
{
error = fmt::format("unexpected version {}, expected 00", version);
return false;
}
opentelemetry_trace_id = static_cast<__uint128_t>(trace_id_high) << 64
| trace_id_low;
opentelemetry_span_id = trace_parent;
opentelemetry_trace_flags = trace_flags;
return true;
}
std::string ClientInfo::composeTraceparentHeader() const
{
// This span is a parent for its children, so we specify this span_id as a
// parent id.
return fmt::format("00-{:032x}-{:016x}-{:02x}", opentelemetry_trace_id,
opentelemetry_span_id,
// This cast is needed because fmt is being weird and complaining that
// "mixing character types is not allowed".
static_cast<uint8_t>(opentelemetry_trace_flags));
}
void ClientInfo::fillOSUserHostNameAndVersionInfo()
{

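For reference, the parse/compose pair removed here (per the header change below, presumably relocated alongside OpenTelemetryTraceContext) handles the W3C traceparent format 00-<32 hex trace id>-<16 hex parent id>-<2 hex flags>. A standalone round-trip using the same fixed-width sscanf approach:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main()
{
    const __uint128_t trace_id = (static_cast<__uint128_t>(0x0123456789abcdefULL) << 64) | 0xfedcba9876543210ULL;
    const uint64_t span_id = 0x1122334455667788ULL;
    const uint8_t flags = 1;  // sampled

    // Compose: same layout as composeTraceparentHeader() above.
    char buf[2 + 1 + 32 + 1 + 16 + 1 + 2 + 1];
    std::snprintf(buf, sizeof(buf), "00-%016" PRIx64 "%016" PRIx64 "-%016" PRIx64 "-%02x",
                  static_cast<uint64_t>(trace_id >> 64), static_cast<uint64_t>(trace_id),
                  span_id, static_cast<unsigned>(flags));

    // Parse: the uint128 is read as two uint64, hence 5 parts and not 4.
    uint8_t version = 0, parsed_flags = 0;
    uint64_t hi = 0, lo = 0, parent = 0;
    const int parts = std::sscanf(buf, "%2" SCNx8 "-%16" SCNx64 "%16" SCNx64 "-%16" SCNx64 "-%2" SCNx8,
                                  &version, &hi, &lo, &parent, &parsed_flags);
    std::printf("%s -> %d parts, version %u\n", buf, parts, static_cast<unsigned>(version));  // 5 parts, version 0
}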
View File

@@ -3,7 +3,7 @@
#include <Poco/Net/SocketAddress.h>
#include <Common/UInt128.h>
#include <common/types.h>
#include <Common/OpenTelemetryTraceContext.h>
namespace DB
{
@@ -25,6 +25,7 @@ public:
{
TCP = 1,
HTTP = 2,
GRPC = 3,
};
enum class HTTPMethod : uint8_t
@@ -59,16 +60,9 @@ public:
String initial_query_id;
Poco::Net::SocketAddress initial_address;
// OpenTelemetry trace information.
__uint128_t opentelemetry_trace_id = 0;
// The span id we get in the incoming client info becomes our parent span
// id, and the span id we send becomes the downstream parent span id.
UInt64 opentelemetry_span_id = 0;
UInt64 opentelemetry_parent_span_id = 0;
// The incoming tracestate header and the trace flags, we just pass them downstream.
// They are described at https://www.w3.org/TR/trace-context/
String opentelemetry_tracestate;
UInt8 opentelemetry_trace_flags = 0;
// OpenTelemetry trace context we received from client, or which we are going
// to send to server.
OpenTelemetryTraceContext client_trace_context;
/// All below are parameters related to initial query.
@@ -102,16 +96,6 @@ public:
/// Initialize parameters on client initiating query.
void setInitialQuery();
// Parse/compose OpenTelemetry traceparent header.
// Note that these functions use span_id field, not parent_span_id, same as
// in native protocol. The incoming traceparent corresponds to the upstream
// trace span, and the outgoing traceparent corresponds to our current span.
// We use the same ClientInfo structure first for incoming span, and then
// for our span: when we switch, we use old span_id as parent_span_id, and
// generate a new span_id (currently this happens in Context::setQueryId()).
bool parseTraceparentHeader(const std::string & traceparent, std::string & error);
std::string composeTraceparentHeader() const;
private:
void fillOSUserHostNameAndVersionInfo();
};

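Common/OpenTelemetryTraceContext.h itself is not shown in this diff; judging from the usages above, the struct presumably bundles the four fields removed from ClientInfo (shape inferred, not copied from the real header):

#include <cstdint>
#include <string>

// Inferred shape; the real header likely also hosts the traceparent
// parse/compose helpers that were removed from ClientInfo above.
struct OpenTelemetryTraceContext
{
    __uint128_t trace_id = 0;
    uint64_t span_id = 0;
    // The tracestate header and the trace flags are passed through unchanged;
    // they are described at https://www.w3.org/TR/trace-context/
    std::string tracestate;
    uint8_t trace_flags = 0;
};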
View File

@@ -36,7 +36,8 @@ public:
const SelectQueryInfo & query_info,
std::vector<QueryPlanPtr> & res,
Pipes & remote_pipes,
Pipes & delayed_pipes) = 0;
Pipes & delayed_pipes,
Poco::Logger * log) = 0;
};
}

View File

@@ -117,7 +117,8 @@ void SelectStreamFactory::createForShard(
const SelectQueryInfo &,
std::vector<QueryPlanPtr> & plans,
Pipes & remote_pipes,
Pipes & delayed_pipes)
Pipes & delayed_pipes,
Poco::Logger * log)
{
bool add_agg_info = processed_stage == QueryProcessingStage::WithMergeableState;
bool add_totals = false;
@@ -143,6 +144,8 @@ void SelectStreamFactory::createForShard(
{
auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
shard_info.pool, modified_query, header, context, nullptr, throttler, scalars, external_tables, processed_stage);
remote_query_executor->setLogger(log);
remote_query_executor->setPoolMode(PoolMode::GET_MANY);
if (!table_func_ptr)
remote_query_executor->setMainTable(main_table);

View File

@@ -41,7 +41,8 @@ public:
const SelectQueryInfo & query_info,
std::vector<QueryPlanPtr> & plans,
Pipes & remote_pipes,
Pipes & delayed_pipes) override;
Pipes & delayed_pipes,
Poco::Logger * log) override;
private:
const Block header;

View File

@@ -119,7 +119,11 @@ void executeQuery(
throttler = user_level_throttler;
for (const auto & shard_info : query_info.cluster->getShardsInfo())
stream_factory.createForShard(shard_info, query, query_ast, new_context, throttler, query_info, plans, remote_pipes, delayed_pipes);
{
stream_factory.createForShard(shard_info, query, query_ast,
new_context, throttler, query_info, plans,
remote_pipes, delayed_pipes, log);
}
if (!remote_pipes.empty())
{

View File

@@ -1127,8 +1127,14 @@ void Context::setCurrentQueryId(const String & query_id)
random.words.a = thread_local_rng(); //-V656
random.words.b = thread_local_rng(); //-V656
if (client_info.query_kind == ClientInfo::QueryKind::INITIAL_QUERY
&& client_info.opentelemetry_trace_id == 0)
if (client_info.client_trace_context.trace_id != 0)
{
// Use the OpenTelemetry trace context we received from the client, and
// create a new span for the query.
query_trace_context = client_info.client_trace_context;
query_trace_context.span_id = thread_local_rng();
}
else if (client_info.query_kind == ClientInfo::QueryKind::INITIAL_QUERY)
{
// If this is an initial query without any parent OpenTelemetry trace, we
// might start the trace ourselves, with some configurable probability.
@@ -1138,20 +1144,12 @@ void Context::setCurrentQueryId(const String & query_id)
if (should_start_trace(thread_local_rng))
{
// Use the randomly generated default query id as the new trace id.
client_info.opentelemetry_trace_id = random.uuid;
client_info.opentelemetry_parent_span_id = 0;
client_info.opentelemetry_span_id = thread_local_rng();
query_trace_context.trace_id = random.uuid;
query_trace_context.span_id = thread_local_rng();
// Mark this trace as sampled in the flags.
client_info.opentelemetry_trace_flags = 1;
query_trace_context.trace_flags = 1;
}
}
else
{
// The incoming request has an OpenTelemetry trace context. Its span id
// becomes our parent span id.
client_info.opentelemetry_parent_span_id = client_info.opentelemetry_span_id;
client_info.opentelemetry_span_id = thread_local_rng();
}
String query_id_to_set = query_id;
if (query_id_to_set.empty()) /// If the user did not submit his query_id, then we generate it ourselves.

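The configurable-probability branch above amounts to one Bernoulli draw per initial query. A sketch of the decision with assumed types (the real code draws from thread_local_rng against a start-trace probability setting):

#include <cstdint>
#include <random>

struct TraceContext
{
    __uint128_t trace_id = 0;
    uint64_t span_id = 0;
    uint8_t trace_flags = 0;
};

// For an initial query with no parent trace: start one with the configured
// probability (setting name and plumbing assumed).
TraceContext maybeStartTrace(double start_trace_probability, std::mt19937_64 & rng)
{
    TraceContext ctx;
    std::bernoulli_distribution should_start_trace{start_trace_probability};
    if (should_start_trace(rng))
    {
        ctx.trace_id = (static_cast<__uint128_t>(rng()) << 64) | rng();
        ctx.span_id = rng();
        ctx.trace_flags = 1;  // mark the trace as sampled
    }
    return ctx;
}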
View File

@@ -13,6 +13,7 @@
#include <Common/LRUCache.h>
#include <Common/MultiVersion.h>
#include <Common/ThreadPool.h>
#include <Common/OpenTelemetryTraceContext.h>
#include <Storages/IStorage_fwd.h>
#include <atomic>
#include <chrono>
@@ -198,6 +199,12 @@ private:
Context * session_context = nullptr; /// Session context or nullptr. Could be equal to this.
Context * global_context = nullptr; /// Global context. Could be equal to this.
public:
// Top-level OpenTelemetry trace context for the query. Makes sense only for
// a query context.
OpenTelemetryTraceContext query_trace_context;
private:
friend class NamedSessions;
using SampleBlockCache = std::unordered_map<std::string, Block>;

View File

@@ -364,7 +364,7 @@ void SelectQueryExpressionAnalyzer::makeSetsForIndex(const ASTPtr & node)
}
const auto * func = node->as<ASTFunction>();
if (func && functionIsInOperator(func->name))
if (func && functionIsInOrGlobalInOperator(func->name))
{
const IAST & args = *func->arguments;
const ASTPtr & left_in_operand = args.children.at(0);

View File

@@ -135,9 +135,28 @@ public:
ast = database_and_table_name;
external_tables[external_table_name] = external_storage_holder;
subqueries_for_sets[external_table_name].source = std::make_unique<QueryPlan>();
interpreter->buildQueryPlan(*subqueries_for_sets[external_table_name].source);
subqueries_for_sets[external_table_name].table = external_storage;
if (context.getSettingsRef().use_index_for_in_with_subqueries)
{
auto external_table = external_storage_holder->getTable();
auto table_out = external_table->write({}, external_table->getInMemoryMetadataPtr(), context);
auto stream = interpreter->execute().getInputStream();
table_out->writePrefix();
stream->readPrefix();
while (Block block = stream->read())
{
table_out->write(block);
}
table_out->writeSuffix();
stream->readSuffix();
}
else
{
subqueries_for_sets[external_table_name].source = std::make_unique<QueryPlan>();
interpreter->buildQueryPlan(*subqueries_for_sets[external_table_name].source);
subqueries_for_sets[external_table_name].table = external_storage;
}
/** NOTE If it was written IN tmp_table - the existing temporary (but not external) table,
* then a new temporary table will be created (for example, _data1),

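The new branch eagerly materializes the subquery into the external table so that index analysis can use the resulting set, instead of deferring the work to a QueryPlan. The writePrefix/read/write/writeSuffix choreography is the usual block-stream pumping loop; a toy version with stand-in interfaces:

struct Block
{
    bool empty = true;
    explicit operator bool() const { return !empty; }
};

// Stand-ins for IBlockInputStream / IBlockOutputStream (reduced to the calls used above).
struct InputStream
{
    virtual ~InputStream() = default;
    virtual void readPrefix() {}
    virtual Block read() = 0;       // returns an empty block at end of data
    virtual void readSuffix() {}
};

struct OutputStream
{
    virtual ~OutputStream() = default;
    virtual void writePrefix() {}
    virtual void write(const Block &) = 0;
    virtual void writeSuffix() {}
};

// Drain the subquery result into the external table, as in the branch above.
void materialize(InputStream & in, OutputStream & out)
{
    out.writePrefix();
    in.readPrefix();
    while (Block block = in.read())
        out.write(block);
    in.readSuffix();
    out.writeSuffix();
}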
View File

@@ -0,0 +1,39 @@
#pragma once
#include <Interpreters/Context.h>
#include <Interpreters/IInterpreter.h>
#include <Interpreters/SelectQueryOptions.h>
#include <Parsers/IAST_fwd.h>
namespace DB
{
class IInterpreterUnionOrSelectQuery : public IInterpreter
{
public:
IInterpreterUnionOrSelectQuery(const ASTPtr & query_ptr_, const Context & context_, const SelectQueryOptions & options_)
: query_ptr(query_ptr_)
, context(std::make_shared<Context>(context_))
, options(options_)
, max_streams(context->getSettingsRef().max_threads)
{
}
virtual void buildQueryPlan(QueryPlan & query_plan) = 0;
virtual void ignoreWithTotals() = 0;
virtual ~IInterpreterUnionOrSelectQuery() override = default;
Block getSampleBlock() { return result_header; }
size_t getMaxStreams() const { return max_streams; }
protected:
ASTPtr query_ptr;
std::shared_ptr<Context> context;
Block result_header;
SelectQueryOptions options;
size_t max_streams = 1;
};
}

View File

@@ -115,7 +115,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
auto ast = DatabaseOnDisk::parseQueryFromMetadata(nullptr, context, metadata_file_path);
create = ast->as<ASTCreateQuery &>();
if (!create.table.empty() || !create.storage)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Metadata file {} contains incorrect CREATE DATABASE query", metadata_file_path);
throw Exception(ErrorCodes::INCORRECT_QUERY, "Metadata file {} contains incorrect CREATE DATABASE query", metadata_file_path.string());
create.attach = true;
create.attach_short_syntax = true;
create.database = database_name;
@@ -149,7 +149,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
metadata_path = metadata_path / "store" / DatabaseCatalog::getPathForUUID(create.uuid);
if (!create.attach && fs::exists(metadata_path))
throw Exception(ErrorCodes::DATABASE_ALREADY_EXISTS, "Metadata directory {} already exists", metadata_path);
throw Exception(ErrorCodes::DATABASE_ALREADY_EXISTS, "Metadata directory {} already exists", metadata_path.string());
}
else
{

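The added .string() calls convert the std::filesystem::path explicitly before it reaches the message formatter. Relying on an implicit conversion would be neither portable (path::value_type is wchar_t on Windows) nor guaranteed to be accepted by the formatting library. A minimal illustration:

#include <cstdio>
#include <filesystem>
#include <string>

int main()
{
    const std::filesystem::path metadata_path = "/var/lib/clickhouse/store/abc";  // hypothetical path
    // Convert explicitly before formatting: on Windows path::value_type is
    // wchar_t, so handing the path itself to a narrow-string formatter is
    // not portable.
    const std::string s = metadata_path.string();
    std::printf("Metadata directory %s already exists\n", s.c_str());
}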
View File

@@ -66,6 +66,7 @@
#include <Interpreters/InterpreterUseQuery.h>
#include <Interpreters/InterpreterWatchQuery.h>
#include <Interpreters/InterpreterExternalDDLQuery.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Parsers/ASTSystemQuery.h>
@@ -93,6 +94,8 @@ namespace ErrorCodes
std::unique_ptr<IInterpreter> InterpreterFactory::get(ASTPtr & query, Context & context, QueryProcessingStage::Enum stage)
{
OpenTelemetrySpanHolder span("InterpreterFactory::get()");
ProfileEvents::increment(ProfileEvents::Query);
if (query->as<ASTSelectQuery>())

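OpenTelemetrySpanHolder is an RAII guard: construction opens a span named after the enclosing call, and destruction finishes it and (presumably) queues it for the span log. A reduced sketch of the pattern:

#include <chrono>
#include <cstdio>
#include <string>
#include <utility>

// Reduced sketch: the real holder records trace/span ids into the span log;
// here the destructor just reports the elapsed time.
class SpanHolder
{
public:
    explicit SpanHolder(std::string name_) : name(std::move(name_)), start(std::chrono::steady_clock::now()) {}
    ~SpanHolder()
    {
        const auto elapsed = std::chrono::steady_clock::now() - start;
        std::printf("span '%s': %lld us\n", name.c_str(),
                    static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count()));
    }
private:
    std::string name;
    std::chrono::steady_clock::time_point start;
};

int main()
{
    SpanHolder span("InterpreterFactory::get()");  // finished automatically on scope exit
    // ... dispatch on the query type ...
}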
View File

@@ -140,34 +140,39 @@ Block InterpreterInsertQuery::getSampleBlock(
/** A query that just reads all data without any complex computations or filtering.
* If we just pipe the result to INSERT, we don't have to use too many threads for read.
*/
static bool isTrivialSelect(const ASTSelectQuery & select_query)
static bool isTrivialSelect(const ASTPtr & select)
{
const auto & tables = select_query.tables();
if (auto * select_query = select->as<ASTSelectQuery>())
{
const auto & tables = select_query->tables();
if (!tables)
return false;
if (!tables)
return false;
const auto & tables_in_select_query = tables->as<ASTTablesInSelectQuery &>();
const auto & tables_in_select_query = tables->as<ASTTablesInSelectQuery &>();
if (tables_in_select_query.children.size() != 1)
return false;
if (tables_in_select_query.children.size() != 1)
return false;
const auto & child = tables_in_select_query.children.front();
const auto & table_element = child->as<ASTTablesInSelectQueryElement &>();
const auto & table_expr = table_element.table_expression->as<ASTTableExpression &>();
const auto & child = tables_in_select_query.children.front();
const auto & table_element = child->as<ASTTablesInSelectQueryElement &>();
const auto & table_expr = table_element.table_expression->as<ASTTableExpression &>();
if (table_expr.subquery)
return false;
if (table_expr.subquery)
return false;
/// Note: how to write it in a more generic way?
return (!select_query.distinct
&& !select_query.limit_with_ties
&& !select_query.prewhere()
&& !select_query.where()
&& !select_query.groupBy()
&& !select_query.having()
&& !select_query.orderBy()
&& !select_query.limitBy());
/// Note: how to write it in a more generic way?
return (!select_query->distinct
&& !select_query->limit_with_ties
&& !select_query->prewhere()
&& !select_query->where()
&& !select_query->groupBy()
&& !select_query->having()
&& !select_query->orderBy()
&& !select_query->limitBy());
}
/// This query is ASTSelectWithUnionQuery subquery
return false;
};
@@ -196,23 +201,25 @@ BlockIO InterpreterInsertQuery::execute()
auto new_query = std::dynamic_pointer_cast<ASTInsertQuery>(query.clone());
if (select.list_of_selects->children.size() == 1)
{
auto & select_query = select.list_of_selects->children.at(0)->as<ASTSelectQuery &>();
JoinedTables joined_tables(Context(context), select_query);
if (joined_tables.tablesCount() == 1)
if (auto * select_query = select.list_of_selects->children.at(0)->as<ASTSelectQuery>())
{
storage_src = std::dynamic_pointer_cast<StorageDistributed>(joined_tables.getLeftTableStorage());
if (storage_src)
JoinedTables joined_tables(Context(context), *select_query);
if (joined_tables.tablesCount() == 1)
{
const auto select_with_union_query = std::make_shared<ASTSelectWithUnionQuery>();
select_with_union_query->list_of_selects = std::make_shared<ASTExpressionList>();
storage_src = std::dynamic_pointer_cast<StorageDistributed>(joined_tables.getLeftTableStorage());
if (storage_src)
{
const auto select_with_union_query = std::make_shared<ASTSelectWithUnionQuery>();
select_with_union_query->list_of_selects = std::make_shared<ASTExpressionList>();
auto new_select_query = std::dynamic_pointer_cast<ASTSelectQuery>(select_query.clone());
select_with_union_query->list_of_selects->children.push_back(new_select_query);
auto new_select_query = std::dynamic_pointer_cast<ASTSelectQuery>(select_query->clone());
select_with_union_query->list_of_selects->children.push_back(new_select_query);
new_select_query->replaceDatabaseAndTable(storage_src->getRemoteDatabaseName(), storage_src->getRemoteTableName());
new_select_query->replaceDatabaseAndTable(storage_src->getRemoteDatabaseName(), storage_src->getRemoteTableName());
new_query->select = select_with_union_query;
new_query->select = select_with_union_query;
}
}
}
}
@@ -275,12 +282,17 @@ BlockIO InterpreterInsertQuery::execute()
if (settings.optimize_trivial_insert_select)
{
const auto & selects = query.select->as<ASTSelectWithUnionQuery &>().list_of_selects->children;
const auto & select_query = query.select->as<ASTSelectWithUnionQuery &>();
const auto & selects = select_query.list_of_selects->children;
const auto & union_modes = select_query.list_of_modes;
is_trivial_insert_select = std::all_of(selects.begin(), selects.end(), [](const ASTPtr & select)
{
return isTrivialSelect(select->as<ASTSelectQuery &>());
});
/// ASTSelectWithUnionQuery is not normalized now, so it may pass some queries which can be trivial select queries
is_trivial_insert_select
= std::all_of(
union_modes.begin(),
union_modes.end(),
[](const ASTSelectWithUnionQuery::Mode & mode) { return mode == ASTSelectWithUnionQuery::Mode::ALL; })
&& std::all_of(selects.begin(), selects.end(), [](const ASTPtr & select) { return isTrivialSelect(select); });
}
if (is_trivial_insert_select)

View File

@@ -29,6 +29,7 @@
#include <Interpreters/TableJoin.h>
#include <Interpreters/JoinSwitcher.h>
#include <Interpreters/JoinedTables.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Interpreters/QueryAliasesVisitor.h>
#include <Processors/Pipe.h>
@@ -216,10 +217,8 @@ InterpreterSelectQuery::InterpreterSelectQuery(
const SelectQueryOptions & options_,
const Names & required_result_column_names,
const StorageMetadataPtr & metadata_snapshot_)
: options(options_)
/// NOTE: the query almost always should be cloned because it will be modified during analysis.
, query_ptr(options.modify_inplace ? query_ptr_ : query_ptr_->clone())
, context(std::make_shared<Context>(context_))
: IInterpreterUnionOrSelectQuery(options_.modify_inplace ? query_ptr_ : query_ptr_->clone(), context_, options_)
, storage(storage_)
, input(input_)
, input_pipe(std::move(input_pipe_))
@@ -465,12 +464,6 @@ InterpreterSelectQuery::InterpreterSelectQuery(
sanitizeBlock(result_header, true);
}
Block InterpreterSelectQuery::getSampleBlock()
{
return result_header;
}
void InterpreterSelectQuery::buildQueryPlan(QueryPlan & query_plan)
{
executeImpl(query_plan, input, std::move(input_pipe));
@@ -502,6 +495,8 @@ BlockIO InterpreterSelectQuery::execute()
Block InterpreterSelectQuery::getSampleBlockImpl()
{
OpenTelemetrySpanHolder span(__PRETTY_FUNCTION__);
query_info.query = query_ptr;
if (storage && !options.only_analyze)
@@ -1182,7 +1177,7 @@ void InterpreterSelectQuery::executeFetchColumns(
const auto & func = desc.function;
std::optional<UInt64> num_rows{};
if (!query.prewhere() && !query.where())
num_rows = storage->totalRows();
num_rows = storage->totalRows(settings);
else // It's possible to optimize count() given only partition predicates
{
SelectQueryInfo temp_query_info;

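The surrounding hunk implements a count() fast path: with no WHERE or PREWHERE the storage answers from metadata, and with only partition predicates it can still sum the row counts of parts selected entirely, giving up as soon as any part is selected partially. A sketch of that idea with assumed interfaces:

#include <cstdint>
#include <optional>
#include <vector>

enum class PartMatch { None, Full, Partial };

struct Part
{
    uint64_t rows = 0;
    PartMatch match = PartMatch::Full;  // how the partition predicate selects this part
};

// Exact count without reading data: only possible if no part is cut in half
// by the predicate.
std::optional<uint64_t> totalRowsForCount(const std::vector<Part> & parts)
{
    uint64_t total = 0;
    for (const auto & part : parts)
    {
        if (part.match == PartMatch::Partial)
            return std::nullopt;  // would have to read the part to count it
        if (part.match == PartMatch::Full)
            total += part.rows;
    }
    return total;
}
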
Some files were not shown because too many files have changed in this diff.