Merge remote-tracking branch 'upstream/master' into HEAD

Anton Popov 2020-12-23 15:16:43 +03:00
commit 57857dda63
122 changed files with 5219 additions and 672 deletions


@ -61,6 +61,16 @@
# endif
#endif
#if defined(ADDRESS_SANITIZER)
# define BOOST_USE_ASAN 1
# define BOOST_USE_UCONTEXT 1
#endif
#if defined(THREAD_SANITIZER)
# define BOOST_USE_TSAN 1
# define BOOST_USE_UCONTEXT 1
#endif
/// TODO: Strangely enough, there is no way to detect UB sanitizer.
/// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute.
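Such an attribute could look like the following sketch (the macro name `NO_SANITIZE_UNDEFINED` and the exact spelling are illustrative assumptions, not taken from this diff):

``` cpp
// Hypothetical sketch of a "permit UB here" function attribute.
// Clang's UBSan honors __attribute__((no_sanitize("undefined"))).
#if defined(__clang__)
    #define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined")))
#else
    #define NO_SANITIZE_UNDEFINED
#endif

// Usage: the shift below may be UB for large `amount`, and that is deliberate.
NO_SANITIZE_UNDEFINED int unsafeShift(int x, int amount)
{
    return x << amount;
}
```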


@ -4,6 +4,11 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/resource.h>
#if defined(__linux__)
#include <sys/prctl.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <string.h>
@ -12,7 +17,6 @@
#include <unistd.h>
#include <typeinfo>
#include <sys/resource.h>
#include <iostream>
#include <fstream>
#include <sstream>
@ -22,7 +26,6 @@
#include <Poco/Observer.h>
#include <Poco/AutoPtr.h>
#include <Poco/PatternFormatter.h>
#include <Poco/TaskManager.h>
#include <Poco/File.h>
#include <Poco/Path.h>
#include <Poco/Message.h>
@ -470,7 +473,6 @@ BaseDaemon::~BaseDaemon()
void BaseDaemon::terminate()
{
getTaskManager().cancelAll();
if (::raise(SIGTERM) != 0)
throw Poco::SystemException("cannot terminate process");
}
@ -478,22 +480,11 @@ void BaseDaemon::terminate()
void BaseDaemon::kill()
{
dumpCoverageReportIfPossible();
pid.reset();
pid_file.reset();
if (::raise(SIGKILL) != 0)
throw Poco::SystemException("cannot kill process");
}
void BaseDaemon::sleep(double seconds)
{
wakeup_event.reset();
wakeup_event.tryWait(seconds * 1000);
}
void BaseDaemon::wakeup()
{
wakeup_event.set();
}
std::string BaseDaemon::getDefaultCorePath() const
{
return "/opt/cores/";
@ -564,7 +555,6 @@ void BaseDaemon::initialize(Application & self)
{
closeFDs();
task_manager = std::make_unique<Poco::TaskManager>();
ServerApplication::initialize(self);
/// now highest priority (lowest value) is PRIO_APPLICATION = -100, we want higher!
@ -648,10 +638,6 @@ void BaseDaemon::initialize(Application & self)
throw Poco::OpenFileException("Cannot attach stdout to " + stdout_path);
}
/// Create pid file.
if (config().has("pid"))
pid.emplace(config().getString("pid"), DB::StatusFile::write_pid);
/// Change path for logging.
if (!log_path.empty())
{
@ -667,9 +653,17 @@ void BaseDaemon::initialize(Application & self)
throw Poco::Exception("Cannot change directory to /tmp");
}
// sensitive data masking rules are not used here
/// sensitive data masking rules are not used here
buildLoggers(config(), logger(), self.commandName());
/// After initialized loggers but before initialized signal handling.
if (should_setup_watchdog)
setupWatchdog();
/// Create pid file.
if (config().has("pid"))
pid_file.emplace(config().getString("pid"), DB::StatusFile::write_pid);
if (is_daemon)
{
/** Change working directory to the directory to write core dumps.
@ -704,54 +698,71 @@ void BaseDaemon::initialize(Application & self)
}
static void addSignalHandler(const std::vector<int> & signals, signal_function handler, std::vector<int> * out_handled_signals)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO;
#if defined(OS_DARWIN)
sigemptyset(&sa.sa_mask);
for (auto signal : signals)
sigaddset(&sa.sa_mask, signal);
#else
if (sigemptyset(&sa.sa_mask))
throw Poco::Exception("Cannot set signal handler.");
for (auto signal : signals)
if (sigaddset(&sa.sa_mask, signal))
throw Poco::Exception("Cannot set signal handler.");
#endif
for (auto signal : signals)
if (sigaction(signal, &sa, nullptr))
throw Poco::Exception("Cannot set signal handler.");
if (out_handled_signals)
std::copy(signals.begin(), signals.end(), std::back_inserter(*out_handled_signals));
};
static void blockSignals(const std::vector<int> & signals)
{
sigset_t sig_set;
#if defined(OS_DARWIN)
sigemptyset(&sig_set);
for (auto signal : signals)
sigaddset(&sig_set, signal);
#else
if (sigemptyset(&sig_set))
throw Poco::Exception("Cannot block signal.");
for (auto signal : signals)
if (sigaddset(&sig_set, signal))
throw Poco::Exception("Cannot block signal.");
#endif
if (pthread_sigmask(SIG_BLOCK, &sig_set, nullptr))
throw Poco::Exception("Cannot block signal.");
};
void BaseDaemon::initializeTerminationAndSignalProcessing()
{
SentryWriter::initialize(config());
std::set_terminate(terminate_handler);
/// We want to avoid SIGPIPE when working with sockets and pipes, and just handle return value/errno instead.
{
sigset_t sig_set;
if (sigemptyset(&sig_set) || sigaddset(&sig_set, SIGPIPE) || pthread_sigmask(SIG_BLOCK, &sig_set, nullptr))
throw Poco::Exception("Cannot block signal.");
}
blockSignals({SIGPIPE});
/// Setup signal handlers.
auto add_signal_handler =
[this](const std::vector<int> & signals, signal_function handler)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO;
{
#if defined(OS_DARWIN)
sigemptyset(&sa.sa_mask);
for (auto signal : signals)
sigaddset(&sa.sa_mask, signal);
#else
if (sigemptyset(&sa.sa_mask))
throw Poco::Exception("Cannot set signal handler.");
for (auto signal : signals)
if (sigaddset(&sa.sa_mask, signal))
throw Poco::Exception("Cannot set signal handler.");
#endif
for (auto signal : signals)
if (sigaction(signal, &sa, nullptr))
throw Poco::Exception("Cannot set signal handler.");
std::copy(signals.begin(), signals.end(), std::back_inserter(handled_signals));
}
};
/// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at any time.
add_signal_handler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler);
add_signal_handler({SIGHUP, SIGUSR1}, closeLogsSignalHandler);
add_signal_handler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler);
addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler, &handled_signals);
addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals);
addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals);
#if defined(SANITIZER)
__sanitizer_set_death_callback(sanitizerDeathCallback);
@ -786,23 +797,6 @@ void BaseDaemon::logRevision() const
+ ", PID " + std::to_string(getpid()));
}
/// Makes server shutdown if at least one Poco::Task have failed.
void BaseDaemon::exitOnTaskError()
{
Poco::Observer<BaseDaemon, Poco::TaskFailedNotification> obs(*this, &BaseDaemon::handleNotification);
getTaskManager().addObserver(obs);
}
/// Used for exitOnTaskError()
void BaseDaemon::handleNotification(Poco::TaskFailedNotification *_tfn)
{
task_failed = true;
Poco::AutoPtr<Poco::TaskFailedNotification> fn(_tfn);
Poco::Logger * lg = &(logger());
LOG_ERROR(lg, "Task '{}' failed. Daemon is shutting down. Reason - {}", fn->task()->name(), fn->reason().displayText());
ServerApplication::terminate();
}
void BaseDaemon::defineOptions(Poco::Util::OptionSet & new_options)
{
new_options.addOption(
@ -863,13 +857,144 @@ void BaseDaemon::onInterruptSignals(int signal_id)
if (sigint_signals_counter >= 2)
{
LOG_INFO(&logger(), "Received second signal Interrupt. Immediately terminate.");
kill();
call_default_signal_handler(signal_id);
/// If the above did not help.
_exit(128 + signal_id);
}
}
void BaseDaemon::waitForTerminationRequest()
{
/// NOTE: as we already process signals via pipe, we don't have to block them with sigprocmask in threads
std::unique_lock<std::mutex> lock(signal_handler_mutex);
signal_event.wait(lock, [this](){ return terminate_signals_counter > 0; });
}
void BaseDaemon::shouldSetupWatchdog(char * argv0_)
{
should_setup_watchdog = true;
argv0 = argv0_;
}
void BaseDaemon::setupWatchdog()
{
/// Initialize in advance to avoid double initialization in forked processes.
DateLUT::instance();
std::string original_process_name;
if (argv0)
original_process_name = argv0;
while (true)
{
static pid_t pid = -1;
pid = fork();
if (-1 == pid)
throw Poco::Exception("Cannot fork");
if (0 == pid)
{
logger().information("Forked a child process to watch");
#if defined(__linux__)
if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL))
logger().warning("Cannot do prctl to ask termination with parent.");
#endif
return;
}
/// Change short thread name and process name.
setThreadName("clckhouse-watch"); /// 15 characters
if (argv0)
{
const char * new_process_name = "clickhouse-watchdog";
memset(argv0, 0, original_process_name.size());
memcpy(argv0, new_process_name, std::min(strlen(new_process_name), original_process_name.size()));
}
logger().information(fmt::format("Will watch for the process with pid {}", pid));
/// Forward signals to the child process.
addSignalHandler(
{SIGHUP, SIGUSR1, SIGINT, SIGQUIT, SIGTERM},
[](int sig, siginfo_t *, void *)
{
/// Forward all signals except INT, as it can be sent by the terminal to the process group when the user presses Ctrl+C,
/// and we treat double delivery of this signal as immediate termination.
if (sig == SIGINT)
return;
const char * error_message = "Cannot forward signal to the child process.\n";
if (0 != ::kill(pid, sig))
{
auto res = write(STDERR_FILENO, error_message, strlen(error_message));
(void)res;
}
},
nullptr);
int status = 0;
do
{
if (-1 != waitpid(pid, &status, WUNTRACED | WCONTINUED) || errno == ECHILD)
{
if (WIFSTOPPED(status))
logger().warning(fmt::format("Child process was stopped by signal {}.", WSTOPSIG(status)));
else if (WIFCONTINUED(status))
logger().warning(fmt::format("Child process was continued."));
else
break;
}
else if (errno != EINTR)
throw Poco::Exception("Cannot waitpid, errno: " + std::string(strerror(errno)));
} while (true);
if (errno == ECHILD)
{
logger().information("Child process no longer exists.");
_exit(status);
}
if (WIFEXITED(status))
{
logger().information(fmt::format("Child process exited normally with code {}.", WEXITSTATUS(status)));
_exit(status);
}
if (WIFSIGNALED(status))
{
int sig = WTERMSIG(status);
if (sig == SIGKILL)
{
logger().fatal(fmt::format("Child process was terminated by signal {} (KILL)."
" If it is not done by 'forcestop' command or manually,"
" the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", sig));
}
else
{
logger().fatal(fmt::format("Child process was terminated by signal {}.", sig));
if (sig == SIGINT || sig == SIGTERM || sig == SIGQUIT)
_exit(status);
}
}
else
{
logger().fatal("Child process was not exited normally by unknown reason.");
}
/// Automatic restart is not enabled but you can play with it.
#if 1
_exit(status);
#else
logger().information("Will restart.");
if (argv0)
memcpy(argv0, original_process_name.c_str(), original_process_name.size());
#endif
}
}


@ -12,7 +12,6 @@
#include <chrono>
#include <Poco/Process.h>
#include <Poco/ThreadPool.h>
#include <Poco/TaskNotification.h>
#include <Poco/Util/Application.h>
#include <Poco/Util/ServerApplication.h>
#include <Poco/Net/SocketAddress.h>
@ -26,9 +25,6 @@
#include <loggers/Loggers.h>
namespace Poco { class TaskManager; }
/// \brief Base class for applications that can run as daemons.
///
/// \code
@ -52,31 +48,26 @@ public:
BaseDaemon();
~BaseDaemon() override;
/// Loads the configuration and "builds" loggers that write to files
/// Load configuration, prepare loggers, etc.
void initialize(Poco::Util::Application &) override;
/// Reads the configuration
void reloadConfiguration();
/// Defines command line parameters
/// Process command line parameters
void defineOptions(Poco::Util::OptionSet & new_options) override;
/// Makes the daemon terminate if at least one task has failed
void exitOnTaskError();
/// Graceful shutdown
static void terminate();
/// Daemon termination ("soft")
void terminate();
/// Daemon termination ("hard")
/// Forceful shutdown
void kill();
/// Has a termination signal been received?
/// Cancellation request has been received.
bool isCancelled() const
{
return is_cancelled;
}
/// Get a reference to the daemon instance
static BaseDaemon & instance()
{
return dynamic_cast<BaseDaemon &>(Poco::Util::Application::instance());
@ -85,12 +76,6 @@ public:
/// return none if daemon doesn't exist, reference to the daemon otherwise
static std::optional<std::reference_wrapper<BaseDaemon>> tryGetInstance() { return tryGetInstance<BaseDaemon>(); }
/// Sleeps for the given number of seconds, or until a wakeup event
void sleep(double seconds);
/// Wake up
void wakeup();
/// In Graphite, path components (folders) are separated by a dot.
/// We use paths of the form root_path.hostname_yandex_ru.key
/// root_path is one_min by default
@ -131,24 +116,23 @@ public:
/// also doesn't close global internal pipes for signal handling
static void closeFDs();
/// If this method is called after initialization and before run,
/// it will fork a child process and set up a watchdog that will print diagnostic info if the child terminates.
/// argv0 is needed to change the process name (consequently, it is needed for scripts involving "pgrep" and "pidof" to work correctly).
void shouldSetupWatchdog(char * argv0_);
protected:
/// Returns the application's TaskManager
/// All task_manager methods must be called from a single thread,
/// otherwise a deadlock is possible, because joinAll runs under a lock and every other method also takes the lock
Poco::TaskManager & getTaskManager() { return *task_manager; }
virtual void logRevision() const;
/// Used by exitOnTaskError()
void handleNotification(Poco::TaskFailedNotification *);
/// thread safe
virtual void handleSignal(int signal_id);
/// initialize termination process and signal handlers
virtual void initializeTerminationAndSignalProcessing();
/// The pipe-based implementation of termination-signal handling does not require blocking the signal with sigprocmask in all threads
/// fork the main process and watch if it was killed
void setupWatchdog();
void waitForTerminationRequest()
#if defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION >= 0x02000000 // in old upstream poco this method is not virtual
override
@ -162,21 +146,13 @@ protected:
virtual std::string getDefaultCorePath() const;
std::unique_ptr<Poco::TaskManager> task_manager;
std::optional<DB::StatusFile> pid;
std::optional<DB::StatusFile> pid_file;
std::atomic_bool is_cancelled{false};
/// The flag is set by a message from a Task (on abnormal termination).
bool task_failed = false;
bool log_to_console = false;
/// An event used to wake up during the wait
Poco::Event wakeup_event;
/// A thread that receives the HUP/USR1 signal to close logs.
/// A thread that acts on HUP and USR1 signal (close logs).
Poco::Thread signal_listener_thread;
std::unique_ptr<Poco::Runnable> signal_listener;
@ -194,6 +170,9 @@ protected:
String build_id_info;
std::vector<int> handled_signals;
bool should_setup_watchdog = false;
char * argv0 = nullptr;
};


@ -0,0 +1,17 @@
#include <sys/timerfd.h>
#include "syscall.h"
int timerfd_create(int clockid, int flags)
{
return syscall(SYS_timerfd_create, clockid, flags);
}
int timerfd_settime(int fd, int flags, const struct itimerspec *new, struct itimerspec *old)
{
return syscall(SYS_timerfd_settime, fd, flags, new, old);
}
int timerfd_gettime(int fd, struct itimerspec *cur)
{
return syscall(SYS_timerfd_gettime, fd, cur);
}


@ -11,10 +11,11 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
iostreams
program_options
regex
context
)
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY)
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY)
set(EXTERNAL_BOOST_FOUND 1)
@ -27,18 +28,21 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (_boost_program_options INTERFACE)
add_library (_boost_regex INTERFACE)
add_library (_boost_system INTERFACE)
add_library (_boost_context INTERFACE)
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
target_link_libraries (_boost_program_options INTERFACE ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries (_boost_regex INTERFACE ${Boost_REGEX_LIBRARY})
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
add_library (boost::filesystem ALIAS _boost_filesystem)
add_library (boost::iostreams ALIAS _boost_iostreams)
add_library (boost::program_options ALIAS _boost_program_options)
add_library (boost::regex ALIAS _boost_regex)
add_library (boost::system ALIAS _boost_system)
add_library (boost::context ALIAS _boost_context)
else()
set(EXTERNAL_BOOST_FOUND 0)
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
@ -142,4 +146,57 @@ if (NOT EXTERNAL_BOOST_FOUND)
add_library (_boost_system ${SRCS_SYSTEM})
add_library (boost::system ALIAS _boost_system)
target_include_directories (_boost_system PRIVATE ${LIBRARY_DIR})
# context
enable_language(ASM)
SET(ASM_OPTIONS "-x assembler-with-cpp")
if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread"))
add_compile_definitions(BOOST_USE_UCONTEXT)
if (SANITIZE STREQUAL "address")
add_compile_definitions(BOOST_USE_ASAN)
elseif (SANITIZE STREQUAL "thread")
add_compile_definitions(BOOST_USE_TSAN)
endif()
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/fiber.cpp
${LIBRARY_DIR}/libs/context/src/continuation.cpp
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
elseif (ARCH_ARM)
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
elseif(OS_DARWIN)
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
else()
set (SRCS_CONTEXT
${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S
${LIBRARY_DIR}/libs/context/src/dummy.cpp
${LIBRARY_DIR}/libs/context/src/execution_context.cpp
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp
)
endif()
add_library (_boost_context ${SRCS_CONTEXT})
add_library (boost::context ALIAS _boost_context)
target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR})
endif ()


@ -54,6 +54,26 @@ else ()
set(CARES_SHARED ON CACHE BOOL "" FORCE)
endif ()
# Disable looking for libnsl on platforms that have gethostbyname in glibc
#
# c-ares searches for gethostbyname in the libnsl library; however, the
# version shipped with gRPC does this incorrectly [1], since it uses
# CHECK_LIBRARY_EXISTS(), which will return TRUE even if the function exists in
# another dependent library. Upstream already contains the correct macro [2],
# but it is not included in gRPC (even upstream gRPC, not only the one that is
# shipped with clickhouse).
#
# [1]: https://github.com/c-ares/c-ares/blob/e982924acee7f7313b4baa4ee5ec000c5e373c30/CMakeLists.txt#L125
# [2]: https://github.com/c-ares/c-ares/blob/44fbc813685a1fa8aa3f27fcd7544faf612d376a/CMakeLists.txt#L146
#
# As a result, if for some reason you have libnsl [3] installed, clickhouse
# will refuse to start without it, even though it is a completely different library.
#
# [3]: https://packages.debian.org/bullseye/libnsl2
if (NOT CMAKE_SYSTEM_NAME STREQUAL "SunOS")
set(HAVE_LIBNSL OFF CACHE BOOL "" FORCE)
endif()
# We don't want to build C# extensions.
set(gRPC_BUILD_CSHARP_EXT OFF)

debian/rules (vendored, 2 changed lines)

@ -62,7 +62,7 @@ ifndef DISABLE_NINJA
NINJA=$(shell which ninja)
ifneq ($(NINJA),)
CMAKE_FLAGS += -GNinja
export MAKE=$(NINJA)
export MAKE=$(NINJA) $(NINJA_FLAGS)
endif
endif


@ -21,6 +21,7 @@ RUN apt-get update \
libboost-thread-dev \
libboost-iostreams-dev \
libboost-regex-dev \
libboost-context-dev \
zlib1g-dev \
liblz4-dev \
libdouble-conversion-dev \


@ -12,7 +12,32 @@ dpkg -i package_folder/clickhouse-test_*.deb
# install test configs
/usr/share/clickhouse-test/config/install.sh
service clickhouse-server start && sleep 5
# For the flaky check we also enable the thread fuzzer
if [ "$NUM_TRIES" -gt "1" ]; then
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
# simplest way to forward env variables to the server
sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon
sleep 5
else
service clickhouse-server start && sleep 5
fi
if grep -q -- "--use-skip-list" /usr/bin/clickhouse-test; then
SKIP_LIST_OPT="--use-skip-list"


@ -1,78 +0,0 @@
#!/usr/bin/env python3
import subprocess
import requests
import os
import time
FNAME_START = "+++"
CLOUDFLARE_URL = "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache"
# we have changes in revision and commit sha on all pages
# so such changes have to be ignored
MIN_CHANGED_WORDS = 4
def collect_changed_files():
proc = subprocess.Popen("git diff HEAD~1 --word-diff=porcelain | grep -e '^+[^+]\|^\-[^\-]\|^\+\+\+'", stdout=subprocess.PIPE, shell=True)
changed_files = []
current_file_name = ""
changed_words = []
while True:
line = proc.stdout.readline().decode("utf-8").strip()
if not line:
break
if FNAME_START in line:
if changed_words:
if len(changed_words) > MIN_CHANGED_WORDS:
changed_files.append(current_file_name)
changed_words = []
current_file_name = line[6:]
else:
changed_words.append(line)
return changed_files
def filter_and_transform_changed_files(changed_files, base_domain):
result = []
for f in changed_files:
if f.endswith(".html"):
result.append(base_domain + f.replace("index.html", ""))
return result
def convert_to_dicts(changed_files, batch_size):
result = []
current_batch = {"files": []}
for f in changed_files:
if len(current_batch["files"]) >= batch_size:
result.append(current_batch)
current_batch = {"files": []}
current_batch["files"].append(f)
if current_batch["files"]:
result.append(current_batch)
return result
def post_data(prepared_batches, token):
headers = {"Authorization": "Bearer {}".format(token)}
for batch in prepared_batches:
print(("Pugring cache for", ", ".join(batch["files"])))
response = requests.post(CLOUDFLARE_URL, json=batch, headers=headers)
response.raise_for_status()
time.sleep(3)
if __name__ == "__main__":
token = os.getenv("CLOUDFLARE_TOKEN")
if not token:
raise Exception("Env variable CLOUDFLARE_TOKEN is empty")
base_domain = os.getenv("BASE_DOMAIN", "https://content.clickhouse.tech/")
changed_files = collect_changed_files()
print(("Found", len(changed_files), "changed files"))
filtered_files = filter_and_transform_changed_files(changed_files, base_domain)
print(("Files rest after filtering", len(filtered_files)))
prepared_batches = convert_to_dicts(filtered_files, 25)
post_data(prepared_batches, token)


@ -32,12 +32,14 @@ then
git add ".nojekyll"
# Push to GitHub rewriting the existing contents.
git commit -a -m "Add new release at $(date)"
git commit --quiet -m "Add new release at $(date)"
git push --force origin master
if [[ ! -z "${CLOUDFLARE_TOKEN}" ]]
then
sleep 1m
python3 "${BASE_DIR}/purge_cache_for_changed_files.py"
# https://api.cloudflare.com/#zone-purge-files-by-cache-tags,-host-or-prefix
POST_DATA='{"hosts":"clickhouse.tech"}'
curl -X POST "https://api.cloudflare.com/client/v4/zones/4fc6fb1d46e87851605aa7fa69ca6fe0/purge_cache" -H "Authorization: Bearer ${CLOUDFLARE_TOKEN}" -H "Content-Type:application/json" --data "${POST_DATA}"
fi
fi


@ -7,9 +7,9 @@ toc_title: DateTime64
# Datetime64 {#data_type-datetime64}
This type allows storing the value of an instant in time as a calendar date plus a time of day, with defined sub-second precision
Time tick size (precision): 10<sup>-precision</sup> seconds
Syntax:
@ -17,11 +17,11 @@ toc_title: DateTime64
DateTime64(precision, [timezone])
```
Internally, this type stores data as the number of time ticks since the start of the Unix epoch (1970-01-01 00:00:00 UTC), as an Int64. The resolution of the ticks is determined by the precision parameter. Additionally, the `DateTime64` type can store a time zone that applies to the whole column; the time zone affects how `DateTime64` values are displayed in text format and how values specified as strings are parsed (2020-01-01 05:00:01.000). The time zone is not stored in the rows of the table (nor in the resultset), but in the column metadata. For details, see the [DateTime](datetime.md) data type.
## Examples {#examples}
**1.** Create a table with a `DateTime64`-type column and insert data into it:
``` sql
CREATE TABLE dt
@ -47,10 +47,10 @@ SELECT * FROM dt
└─────────────────────────┴──────────┘
```
- When inserting a datetime as an integer, it is treated as an appropriately scaled Unix timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, because the `timestamp` column has the `Europe/Moscow` (UTC+3) time zone specified, the value will be displayed as `'2019-01-01 03:00:00'` when output as a string
- When inserting a string as a datetime, it is treated as being in the column's time zone. `'2019-01-01 00:00:00'` will be treated as `Europe/Moscow` time and stored as `1546290000000`.
**2.** Filter on `DateTime64`-typed values
``` sql
SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow')
@ -62,9 +62,9 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europ
└─────────────────────────┴──────────┘
```
Unlike `DateTime`, `DateTime64` values are not converted from `String` values automatically
**3.** Get the time zone information of a `DateTime64`-typed value:
``` sql
SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x
@ -97,8 +97,9 @@ FROM dt
- [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md)
- [Functions for working with dates and times](../../sql-reference/functions/date-time-functions.md)
- [Functions for working with arrays](../../sql-reference/functions/array-functions.md)
- [The `date_time_input_format` setting](../../operations/settings/settings.md#settings-date_time_input_format)
- [The `date_time_output_format` setting](../../operations/settings/settings.md#settings-date_time_output_format)
- [The `timezone` server configuration parameter](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone)
- [Operators for working with dates and times](../../sql-reference/operators/index.md#operators-datetime)
- [`Date` data type](date.md)
- [`DateTime` data type](datetime.md)


@ -949,6 +949,11 @@ private:
TestHint test_hint(test_mode, all_queries_text);
if (test_hint.clientError() || test_hint.serverError())
processTextAsSingleQuery("SET send_logs_level = 'none'");
// Echo all queries if asked; makes for a more readable reference
// file.
if (test_hint.echoQueries())
echo_queries = true;
}
/// Several queries separated by ';'.


@ -14,6 +14,7 @@
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTOrderByElement.h>
#include <Parsers/ASTQueryWithOutput.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSelectWithUnionQuery.h>
@ -28,6 +29,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int TOO_DEEP_RECURSION;
}
Field QueryFuzzer::getRandomField(int type)
{
switch (type)
@ -205,14 +211,88 @@ void QueryFuzzer::replaceWithTableLike(ASTPtr & ast)
ast = new_ast;
}
void QueryFuzzer::fuzzColumnLikeExpressionList(ASTPtr ast)
void QueryFuzzer::fuzzOrderByElement(ASTOrderByElement * elem)
{
switch (fuzz_rand() % 10)
{
case 0:
elem->direction = -1;
break;
case 1:
elem->direction = 1;
break;
case 2:
elem->nulls_direction = -1;
elem->nulls_direction_was_explicitly_specified = true;
break;
case 3:
elem->nulls_direction = 1;
elem->nulls_direction_was_explicitly_specified = true;
break;
case 4:
elem->nulls_direction = elem->direction;
elem->nulls_direction_was_explicitly_specified = false;
break;
default:
// do nothing
break;
}
}
void QueryFuzzer::fuzzOrderByList(IAST * ast)
{
if (!ast)
{
return;
}
auto * impl = assert_cast<ASTExpressionList *>(ast.get());
auto * list = assert_cast<ASTExpressionList *>(ast);
// Remove element
if (fuzz_rand() % 50 == 0 && list->children.size() > 1)
{
// Don't remove the last element -- this leads to questionable
// constructs such as an empty select.
list->children.erase(list->children.begin()
+ fuzz_rand() % list->children.size());
}
// Add element
if (fuzz_rand() % 50 == 0)
{
auto pos = list->children.empty()
? list->children.begin()
: list->children.begin() + fuzz_rand() % list->children.size();
auto col = getRandomColumnLike();
if (col)
{
auto elem = std::make_shared<ASTOrderByElement>();
elem->children.push_back(col);
elem->direction = 1;
elem->nulls_direction = 1;
elem->nulls_direction_was_explicitly_specified = false;
elem->with_fill = false;
list->children.insert(pos, elem);
}
else
{
fprintf(stderr, "no random col!\n");
}
}
// We don't have to recurse here to fuzz the children, this is handled by
// the generic recursion into IAST.children.
}
void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast)
{
if (!ast)
{
return;
}
auto * impl = assert_cast<ASTExpressionList *>(ast);
// Remove element
if (fuzz_rand() % 50 == 0 && impl->children.size() > 1)
@ -252,11 +332,44 @@ void QueryFuzzer::fuzz(ASTs & asts)
}
}
struct ScopedIncrement
{
size_t & counter;
explicit ScopedIncrement(size_t & counter_) : counter(counter_) { ++counter; }
~ScopedIncrement() { --counter; }
};
void QueryFuzzer::fuzz(ASTPtr & ast)
{
if (!ast)
return;
// Check for exceeding max depth.
ScopedIncrement depth_increment(current_ast_depth);
if (current_ast_depth > 500)
{
// The AST is too deep (see the comment for current_ast_depth). Throw
// an exception to fail fast and not use this query as an etalon, or we'll
// end up in a very slow and useless loop. It also makes sense to set it
// lower than the default max parse depth on the server (1000), so that
// we don't get the useless error about parse depth from the server either.
throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
"AST depth exceeded while fuzzing ({})", current_ast_depth);
}
// Check for loops.
auto [_, inserted] = debug_visited_nodes.insert(ast.get());
if (!inserted)
{
fmt::print(stderr, "The AST node '{}' was already visited before."
" Depth {}, {} visited nodes, current top AST:\n{}\n",
static_cast<void *>(ast.get()), current_ast_depth,
debug_visited_nodes.size(), (*debug_top_ast)->dumpTree());
assert(false);
}
// The fuzzing.
if (auto * with_union = typeid_cast<ASTSelectWithUnionQuery *>(ast.get()))
{
fuzz(with_union->list_of_selects);
@ -281,17 +394,28 @@ void QueryFuzzer::fuzz(ASTPtr & ast)
{
fuzz(expr_list->children);
}
else if (auto * order_by_element = typeid_cast<ASTOrderByElement *>(ast.get()))
{
fuzzOrderByElement(order_by_element);
}
else if (auto * fn = typeid_cast<ASTFunction *>(ast.get()))
{
fuzzColumnLikeExpressionList(fn->arguments);
fuzzColumnLikeExpressionList(fn->parameters);
fuzzColumnLikeExpressionList(fn->arguments.get());
fuzzColumnLikeExpressionList(fn->parameters.get());
if (fn->is_window_function)
{
fuzzColumnLikeExpressionList(fn->window_partition_by);
fuzzOrderByList(fn->window_order_by);
}
fuzz(fn->children);
}
else if (auto * select = typeid_cast<ASTSelectQuery *>(ast.get()))
{
fuzzColumnLikeExpressionList(select->select());
fuzzColumnLikeExpressionList(select->groupBy());
fuzzColumnLikeExpressionList(select->select().get());
fuzzColumnLikeExpressionList(select->groupBy().get());
fuzzOrderByList(select->orderBy().get());
fuzz(select->children);
}
@ -416,6 +540,10 @@ void QueryFuzzer::collectFuzzInfoRecurse(const ASTPtr ast)
void QueryFuzzer::fuzzMain(ASTPtr & ast)
{
current_ast_depth = 0;
debug_visited_nodes.clear();
debug_top_ast = &ast;
collectFuzzInfoMain(ast);
fuzz(ast);


@ -12,6 +12,9 @@
namespace DB
{
class ASTExpressionList;
class ASTOrderByElement;
/*
* This is an AST-based query fuzzer that makes random modifications to query
* AST, changing numbers, list of columns, functions, etc. It remembers part of
@ -23,6 +26,13 @@ struct QueryFuzzer
{
pcg64 fuzz_rand{randomSeed()};
// We add elements to expression lists with a fixed probability. Some lists
// are so large that the expected number of elements we add to them is
// one or higher, hence this process might never finish. Put some limit on the
// total depth of the AST to prevent this.
// This field is reset for each fuzzMain() call.
size_t current_ast_depth = 0;
// These arrays hold parts of queries that we can substitute into the query
// we are currently fuzzing. We add some part from each new query we are asked
// to fuzz, and keep this state between queries, so the fuzzing output becomes
@ -36,6 +46,12 @@ struct QueryFuzzer
std::unordered_map<std::string, ASTPtr> table_like_map;
std::vector<ASTPtr> table_like;
// Some debug fields for detecting problematic ASTs with loops.
// These are reset for each fuzzMain call.
std::unordered_set<const IAST *> debug_visited_nodes;
ASTPtr * debug_top_ast;
// This is the only function you have to call -- it will modify the passed
// ASTPtr to point to new AST with some random changes.
void fuzzMain(ASTPtr & ast);
@ -46,7 +62,9 @@ struct QueryFuzzer
ASTPtr getRandomColumnLike();
void replaceWithColumnLike(ASTPtr & ast);
void replaceWithTableLike(ASTPtr & ast);
void fuzzColumnLikeExpressionList(ASTPtr ast);
void fuzzOrderByElement(ASTOrderByElement * elem);
void fuzzOrderByList(IAST * ast);
void fuzzColumnLikeExpressionList(IAST * ast);
void fuzz(ASTs & asts);
void fuzz(ASTPtr & ast);
void collectFuzzInfoMain(const ASTPtr ast);


@ -19,6 +19,7 @@ namespace ErrorCodes
/// Checks expected server and client error codes in testmode.
/// To enable it, add a special comment after the query: "-- { serverError 60 }" or "-- { clientError 20 }".
/// Also you can enable echoing all queries by writing "-- { echo }".
class TestHint
{
public:
@ -84,12 +85,14 @@ public:
int serverError() const { return server_error; }
int clientError() const { return client_error; }
bool echoQueries() const { return echo; }
private:
bool enabled = false;
const String & query;
int server_error = 0;
int client_error = 0;
bool echo = false;
void parse(const String & hint)
{
@ -107,6 +110,8 @@ private:
ss >> server_error;
else if (item == "clientError")
ss >> client_error;
else if (item == "echo")
echo = true;
}
}


@ -10,6 +10,10 @@
#include <linux/capability.h>
#endif
#if defined(OS_DARWIN)
#include <mach-o/dyld.h>
#endif
#include <Common/Exception.h>
#include <Common/ShellCommand.h>
#include <Common/formatReadable.h>
@ -147,9 +151,24 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
try
{
/// We need to copy the binary to the binary directory.
/// The binary is currently running. We need to obtain its path from procfs.
/// The binary is currently running. We need to obtain its path from procfs (on Linux).
#if defined(OS_DARWIN)
uint32_t path_length = 0;
_NSGetExecutablePath(nullptr, &path_length);
if (path_length <= 1)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary");
std::string path(path_length, std::string::value_type());
auto res = _NSGetExecutablePath(&path[0], &path_length);
if (res != 0)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary");
fs::path binary_self_path(path);
#else
fs::path binary_self_path = "/proc/self/exe";
#endif
if (!fs::exists(binary_self_path))
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary from {}, file doesn't exist",
binary_self_path.string());


@ -4,6 +4,7 @@
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <pwd.h>
#include <unistd.h>
@ -103,6 +104,7 @@ namespace CurrentMetrics
int mainEntryClickHouseServer(int argc, char ** argv)
{
DB::Server app;
app.shouldSetupWatchdog(argc ? argv[0] : nullptr);
try
{
return app.run(argc, argv);
@ -366,6 +368,7 @@ void checkForUsersNotInMainConfig(
int Server::main(const std::vector<std::string> & /*args*/)
{
Poco::Logger * log = &logger();
UseSSL use_ssl;
MainThreadStatus::getInstance();


@ -127,10 +127,10 @@ public:
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
if constexpr (IsDecimalNumber<Numerator> || IsDecimalNumber<Denominator>)
static_cast<ColumnVector<Float64> &>(to).getData().push_back(
assert_cast<ColumnVector<Float64> &>(to).getData().push_back(
this->data(place).divideIfAnyDecimal(num_scale, denom_scale));
else
static_cast<ColumnVector<Float64> &>(to).getData().push_back(this->data(place).divide());
assert_cast<ColumnVector<Float64> &>(to).getData().push_back(this->data(place).divide());
}
private:
UInt32 num_scale;
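For context, the change above replaces `static_cast` with `assert_cast`, which checks the dynamic type in debug builds and degrades to a plain `static_cast` in release builds. A simplified sketch of the idea (not ClickHouse's exact implementation):

``` cpp
#include <cassert>
#include <typeinfo>
#include <type_traits>

// Simplified sketch: a downcast that is verified in debug builds
// and free of overhead in release builds.
template <typename To, typename From>
To assert_cast(From & from)
{
#ifndef NDEBUG
    assert(typeid(from) == typeid(std::remove_reference_t<To>));
#endif
    return static_cast<To>(from);
}
```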


@ -104,9 +104,12 @@ public:
return false;
}
/// Inserts results into a column.
/// This method must be called once, from a single thread.
/// After this method has been called for a state, you can't do anything with the state but destroy it.
/// Inserts results into a column. This method might modify the state (e.g.
/// sort an array), so must be called once, from single thread. The state
/// must remain valid though, and the subsequent calls to add/merge/
/// insertResultInto must work correctly. This kind of call sequence occurs
/// in `runningAccumulate`, or when calculating an aggregate function as a
/// window function.
virtual void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * arena) const = 0;
/// Used for machine learning methods. Predict result from trained model.


@ -436,6 +436,8 @@ if (USE_ROCKSDB)
dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ROCKSDB_INCLUDE_DIR})
endif()
dbms_target_link_libraries(PRIVATE _boost_context)
if (ENABLE_TESTS AND USE_GTEST)
macro (grep_gtest_sources BASE_DIR DST_VAR)
# Could match files that are not in tests/ directories


@ -742,8 +742,11 @@ std::optional<UInt64> Connection::checkPacket(size_t timeout_microseconds)
}
Packet Connection::receivePacket()
Packet Connection::receivePacket(std::function<void(Poco::Net::Socket &)> async_callback)
{
in->setAsyncCallback(std::move(async_callback));
SCOPE_EXIT(in->setAsyncCallback({}));
try
{
Packet res;


@ -18,6 +18,7 @@
#include <DataStreams/BlockStreamProfileInfo.h>
#include <IO/ConnectionTimeouts.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <Interpreters/TablesStatus.h>
@ -171,7 +172,8 @@ public:
std::optional<UInt64> checkPacket(size_t timeout_microseconds = 0);
/// Receive packet from server.
Packet receivePacket();
/// Each time the read blocks and async_callback is set, it will be called. You can poll the socket inside it.
Packet receivePacket(std::function<void(Poco::Net::Socket &)> async_callback = {});
/// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception.
void forceConnected(const ConnectionTimeouts & timeouts);
@ -226,7 +228,7 @@ private:
String server_display_name;
std::unique_ptr<Poco::Net::StreamSocket> socket;
std::shared_ptr<ReadBuffer> in;
std::shared_ptr<ReadBufferFromPocoSocket> in;
std::shared_ptr<WriteBuffer> out;
std::optional<UInt64> last_input_packet_type;


@ -237,7 +237,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const
return buf.str();
}
Packet MultiplexedConnections::receivePacketUnlocked()
Packet MultiplexedConnections::receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback)
{
if (!sent_query)
throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR);
@ -249,7 +249,7 @@ Packet MultiplexedConnections::receivePacketUnlocked()
if (current_connection == nullptr)
throw Exception("Logical error: no available replica", ErrorCodes::NO_AVAILABLE_REPLICA);
Packet packet = current_connection->receivePacket();
Packet packet = current_connection->receivePacket(std::move(async_callback));
switch (packet.type)
{


@ -69,7 +69,7 @@ public:
private:
/// Internal version of `receivePacket` function without locking.
Packet receivePacketUnlocked();
Packet receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback = {});
/// Internal version of `dumpAddresses` function without locking.
std::string dumpAddressesUnlocked() const;
@ -105,6 +105,8 @@ private:
/// A mutex for the sendCancel function to execute safely
/// in separate thread.
mutable std::mutex cancel_mutex;
friend class RemoteQueryExecutorReadContext;
};
}

src/Common/Fiber.h (new file, 5 lines)

@ -0,0 +1,5 @@
#pragma once
#include <common/defines.h>
#include <boost/context/fiber.hpp>
using Fiber = boost::context::fiber;

src/Common/FiberStack.h (new file, 74 lines)

@ -0,0 +1,74 @@
#pragma once
#include <common/defines.h>
#include <boost/context/stack_context.hpp>
#include <Common/formatReadable.h>
#include <Common/MemoryTracker.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/mman.h>
#if defined(BOOST_USE_VALGRIND)
#include <valgrind/valgrind.h>
#endif
namespace DB::ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
}
/// This is an implementation of a stack allocator for fibers.
/// The reference implementation is protected_fixedsize_stack from boost::context.
/// This implementation additionally tracks memory usage, which is the main reason it is needed.
class FiberStack
{
private:
size_t stack_size;
size_t page_size = 0;
public:
static constexpr size_t default_stack_size = 128 * 1024; /// 64KB was not enough for tests
explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_)
{
page_size = ::sysconf(_SC_PAGESIZE);
}
boost::context::stack_context allocate()
{
size_t num_pages = 1 + (stack_size - 1) / page_size;
size_t num_bytes = (num_pages + 1) * page_size; /// Add one page at the bottom that will be used as a guard page
void * vp = ::mmap(nullptr, num_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (MAP_FAILED == vp)
DB::throwFromErrno(fmt::format("FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
if (-1 == ::mprotect(vp, page_size, PROT_NONE))
{
::munmap(vp, num_bytes);
DB::throwFromErrno("FiberStack: cannot protect guard page", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
}
/// Do not count guard page in memory usage.
CurrentMemoryTracker::alloc(num_pages * page_size);
boost::context::stack_context sctx;
sctx.size = num_bytes;
sctx.sp = static_cast< char * >(vp) + sctx.size;
#if defined(BOOST_USE_VALGRIND)
sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp);
#endif
return sctx;
}
void deallocate(boost::context::stack_context & sctx)
{
#if defined(BOOST_USE_VALGRIND)
VALGRIND_STACK_DEREGISTER(sctx.valgrind_stack_id);
#endif
void * vp = static_cast< char * >(sctx.sp) - sctx.size;
::munmap(vp, sctx.size);
/// Do not count guard page in memory usage.
CurrentMemoryTracker::free(sctx.size - page_size);
}
};
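A minimal usage sketch, assuming the `Fiber` alias from src/Common/Fiber.h above (boost::context accepts a custom stack allocator via `std::allocator_arg`):

``` cpp
#include <memory>

// Run a fiber on a FiberStack-allocated, guard-paged stack.
void example()
{
    FiberStack stack; // default 128 KiB stack plus one guard page
    Fiber fiber(std::allocator_arg, std::move(stack), [](Fiber && sink) -> Fiber
    {
        // ... work that can be suspended/resumed goes here ...
        return std::move(sink);
    });
    fiber = std::move(fiber).resume();
}
```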


@ -10,6 +10,7 @@
#include <common/sleep.h>
#include <IO/ReadHelpers.h>
#include <common/logger_useful.h>
#include <Common/Exception.h>
#include <Common/thread_local_rng.h>


@ -0,0 +1,84 @@
#if defined(OS_LINUX)
#include <Common/TimerDescriptor.h>
#include <Common/Exception.h>
#include <sys/timerfd.h>
#include <fcntl.h>
#include <unistd.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_CREATE_TIMER;
extern const int CANNOT_SET_TIMER_PERIOD;
extern const int CANNOT_FCNTL;
extern const int CANNOT_READ_FROM_SOCKET;
}
TimerDescriptor::TimerDescriptor(int clockid, int flags)
{
timer_fd = timerfd_create(clockid, flags);
if (timer_fd == -1)
throw Exception(ErrorCodes::CANNOT_CREATE_TIMER, "Cannot create timer_fd descriptor");
if (-1 == fcntl(timer_fd, F_SETFL, O_NONBLOCK))
throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL);
}
TimerDescriptor::~TimerDescriptor()
{
/// Do not check the result, because we cannot throw an exception from the destructor.
close(timer_fd);
}
void TimerDescriptor::reset() const
{
itimerspec spec;
spec.it_interval.tv_nsec = 0;
spec.it_interval.tv_sec = 0;
spec.it_value.tv_sec = 0;
spec.it_value.tv_nsec = 0;
if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr))
throwFromErrno("Cannot reset timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);
/// Drain the descriptor.
/// The timer may already have expired, making the descriptor readable.
drain();
}
void TimerDescriptor::drain() const
{
/// The descriptor is expected to return 8 bytes when readable.
/// Read in a loop anyway, because a signal may interrupt the read call.
uint64_t buf;
while (true)
{
ssize_t res = ::read(timer_fd, &buf, sizeof(buf));
if (res < 0)
{
if (errno == EAGAIN)
break;
if (errno != EINTR)
throwFromErrno("Cannot drain timer_fd", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
}
}
void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const
{
itimerspec spec;
spec.it_interval.tv_nsec = 0;
spec.it_interval.tv_sec = 0;
spec.it_value.tv_sec = timespan.totalSeconds();
spec.it_value.tv_nsec = timespan.useconds();
if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr))
throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);
}
}
#endif


@ -0,0 +1,31 @@
#pragma once
#if defined(OS_LINUX)
#include <Poco/Timespan.h>
namespace DB
{
/// Wrapper over timerfd.
class TimerDescriptor
{
private:
int timer_fd;
public:
explicit TimerDescriptor(int clockid, int flags);
~TimerDescriptor();
TimerDescriptor(const TimerDescriptor &) = delete;
TimerDescriptor & operator=(const TimerDescriptor &) = delete;
TimerDescriptor(TimerDescriptor &&) = default;
TimerDescriptor & operator=(TimerDescriptor &&) = default;
int getDescriptor() const { return timer_fd; }
void reset() const;
void drain() const;
void setRelative(const Poco::Timespan & timespan) const;
};
}
#endif
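A hedged usage sketch (Linux only); the epoll plumbing is illustrative and not part of this diff:

``` cpp
#include <sys/epoll.h>
#include <unistd.h>
#include <ctime>
#include <Poco/Timespan.h>

// Arm the timer for 5 seconds and wait for it to expire via epoll.
void waitFiveSeconds()
{
    DB::TimerDescriptor timer(CLOCK_MONOTONIC, 0);
    timer.setRelative(Poco::Timespan(5, 0)); // 5 s, 0 us

    int epoll_fd = epoll_create1(0);
    epoll_event event{};
    event.events = EPOLLIN;
    event.data.fd = timer.getDescriptor();
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer.getDescriptor(), &event);

    epoll_event ready{};
    if (epoll_wait(epoll_fd, &ready, 1, /* timeout = */ -1) == 1)
        timer.drain(); // consume the expiration so the descriptor can be reused

    ::close(epoll_fd);
}
```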


@ -4,9 +4,6 @@ target_link_libraries(zkutil_test_commands PRIVATE clickhouse_common_zookeeper)
add_executable(zkutil_test_commands_new_lib zkutil_test_commands_new_lib.cpp)
target_link_libraries(zkutil_test_commands_new_lib PRIVATE clickhouse_common_zookeeper string_utils)
add_executable(zkutil_expiration_test zkutil_expiration_test.cpp)
target_link_libraries(zkutil_expiration_test PRIVATE clickhouse_common_zookeeper)
add_executable(zkutil_test_async zkutil_test_async.cpp)
target_link_libraries(zkutil_test_async PRIVATE clickhouse_common_zookeeper)


@ -1,15 +0,0 @@
#!/usr/bin/env bash
# Adds firewall rules that drop packets to the ZooKeeper servers.
# Used to test how programs behave when the connection to ZooKeeper is lost.
# yeszk.sh reverts these changes.
# To see which rules currently exist, use sudo iptables -L and sudo ip6tables -L
sudo iptables -A OUTPUT -p tcp --dport 2181 -j DROP
sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j DROP
# You could also test random drops:
#sudo iptables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1
#sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1


@ -1,6 +0,0 @@
#!/usr/bin/env bash
# Performs the actions inverse to nozk.sh
cat nozk.sh | sed 's/-A/-D/g' | bash


@ -1,70 +0,0 @@
#include <iostream>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Poco/ConsoleChannel.h>
#include <Common/Exception.h>
/// Checks which errors ZooKeeper returns when you try to perform an operation at various times after the session has expired.
/// Spoiler: multi sometimes crashes with a segfault, and before that it fails with a marshalling error.
/// create always fails with an invalid zhandle state.
int main(int argc, char ** argv)
{
try
{
if (argc != 2)
{
std::cerr << "usage: " << argv[0] << " hosts" << std::endl;
return 2;
}
Poco::AutoPtr<Poco::ConsoleChannel> channel = new Poco::ConsoleChannel(std::cerr);
Poco::Logger::root().setChannel(channel);
Poco::Logger::root().setLevel("trace");
zkutil::ZooKeeper zk(argv[1]);
std::string unused;
zk.tryCreate("/test", "", zkutil::CreateMode::Persistent, unused);
std::cerr << "Please run `./nozk.sh && sleep 40s && ./yeszk.sh`" << std::endl;
time_t time0 = time(nullptr);
while (true)
{
{
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCreateRequest("/test/zk_expiration_test", "hello", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeRemoveRequest("/test/zk_expiration_test", -1));
Coordination::Responses responses;
Coordination::Error code = zk.tryMultiNoThrow(ops, responses);
std::cout << time(nullptr) - time0 << "s: " << Coordination::errorMessage(code) << std::endl;
try
{
if (code != Coordination::Error::ZOK)
std::cout << "Path: " << zkutil::KeeperMultiException(code, ops, responses).getPathForFirstFailedOp() << std::endl;
}
catch (...)
{
std::cout << DB::getCurrentExceptionMessage(false) << std::endl;
}
}
sleep(1);
}
}
catch (Coordination::Exception &)
{
std::cerr << "KeeperException: " << DB::getCurrentExceptionMessage(true) << std::endl;
return 1;
}
catch (...)
{
std::cerr << "Some exception: " << DB::getCurrentExceptionMessage(true) << std::endl;
return 2;
}
}


@ -75,6 +75,7 @@ SRCS(
ThreadPool.cpp
ThreadProfileEvents.cpp
ThreadStatus.cpp
TimerDescriptor.cpp
TraceCollector.cpp
UTF8Helpers.cpp
UnicodeBar.cpp


@ -406,9 +406,12 @@ class IColumn;
M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \
M(Bool, optimize_on_insert, true, "Do the same transformation for inserted block of data as if merge was done on this block.", 0) \
M(Bool, allow_experimental_map_type, false, "Allow data type Map", 0) \
M(Bool, allow_experimental_window_functions, false, "Allow experimental window functions", 0) \
\
M(Bool, use_antlr_parser, false, "Parse incoming queries using ANTLR-generated parser", 0) \
\
M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \
\
/** Obsolete settings that do nothing but are left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
\
M(UInt64, max_memory_usage_for_all_queries, 0, "Obsolete. Will be removed after 2020-10-20", 0) \


@ -37,5 +37,12 @@ void dumpSortDescription(const SortDescription & description, const Block & head
}
}
std::string dumpSortDescription(const SortDescription & description)
{
WriteBufferFromOwnString wb;
dumpSortDescription(description, Block{}, wb);
return wb.str();
}
}


@ -72,4 +72,6 @@ class Block;
/// Outputs a user-readable description into `out`.
void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out);
std::string dumpSortDescription(const SortDescription & description);
}


@ -1,4 +1,5 @@
#include <DataStreams/RemoteQueryExecutor.h>
#include <DataStreams/RemoteQueryExecutorReadContext.h>
#include <Columns/ColumnConst.h>
#include <Common/CurrentThread.h>
@ -11,6 +12,7 @@
#include <Interpreters/Context.h>
#include <Interpreters/InternalTextLogsQueue.h>
#include <IO/ConnectionTimeoutsContext.h>
#include <Common/FiberStack.h>
namespace DB
{
@ -192,68 +194,119 @@ Block RemoteQueryExecutor::read()
Packet packet = multiplexed_connections->receivePacket();
switch (packet.type)
{
case Protocol::Server::Data:
/// If the block is not empty and is not a header block
if (packet.block && (packet.block.rows() > 0))
return adaptBlockStructure(packet.block, header);
break; /// If the block is empty - we will receive other packets before EndOfStream.
case Protocol::Server::Exception:
got_exception_from_replica = true;
packet.exception->rethrow();
break;
case Protocol::Server::EndOfStream:
if (!multiplexed_connections->hasActiveConnections())
{
finished = true;
return Block();
}
break;
case Protocol::Server::Progress:
/** We use the progress from a remote server.
* We also include it in ProcessList,
* and we use it to check
* constraints (for example, the minimum speed of query execution)
* and quotas (for example, the number of lines to read).
*/
if (progress_callback)
progress_callback(packet.progress);
break;
case Protocol::Server::ProfileInfo:
/// Use our own (client-side) info about read bytes; it is more correct than the server-side info.
if (profile_info_callback)
profile_info_callback(packet.profile_info);
break;
case Protocol::Server::Totals:
totals = packet.block;
break;
case Protocol::Server::Extremes:
extremes = packet.block;
break;
case Protocol::Server::Log:
/// Pass logs from remote server to client
if (auto log_queue = CurrentThread::getInternalTextLogsQueue())
log_queue->pushBlock(std::move(packet.block));
break;
default:
got_unknown_packet_from_replica = true;
throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
toString(packet.type),
multiplexed_connections->dumpAddresses());
}
if (auto block = processPacket(std::move(packet)))
return *block;
}
}
void RemoteQueryExecutor::finish()
std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext> & read_context [[maybe_unused]])
{
#if defined(OS_LINUX)
if (!sent_query)
{
sendQuery();
if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size()))
return Block();
}
if (!read_context)
{
std::lock_guard lock(was_cancelled_mutex);
if (was_cancelled)
return Block();
read_context = std::make_unique<ReadContext>(*multiplexed_connections);
}
do
{
if (!read_context->resumeRoutine())
return Block();
if (read_context->is_read_in_progress)
{
read_context->setTimer();
return read_context->epoll_fd;
}
else
{
if (auto data = processPacket(std::move(read_context->packet)))
return std::move(*data);
}
}
while (true);
#else
return read();
#endif
}
std::optional<Block> RemoteQueryExecutor::processPacket(Packet packet)
{
switch (packet.type)
{
case Protocol::Server::Data:
/// If the block is not empty and is not a header block
if (packet.block && (packet.block.rows() > 0))
return adaptBlockStructure(packet.block, header);
break; /// If the block is empty - we will receive other packets before EndOfStream.
case Protocol::Server::Exception:
got_exception_from_replica = true;
packet.exception->rethrow();
break;
case Protocol::Server::EndOfStream:
if (!multiplexed_connections->hasActiveConnections())
{
finished = true;
return Block();
}
break;
case Protocol::Server::Progress:
/** We use the progress from a remote server.
* We also include it in ProcessList,
* and we use it to check
* constraints (for example, the minimum speed of query execution)
* and quotas (for example, the number of lines to read).
*/
if (progress_callback)
progress_callback(packet.progress);
break;
case Protocol::Server::ProfileInfo:
/// Use our own (client-side) info about read bytes; it is more correct than the server-side info.
if (profile_info_callback)
profile_info_callback(packet.profile_info);
break;
case Protocol::Server::Totals:
totals = packet.block;
break;
case Protocol::Server::Extremes:
extremes = packet.block;
break;
case Protocol::Server::Log:
/// Pass logs from remote server to client
if (auto log_queue = CurrentThread::getInternalTextLogsQueue())
log_queue->pushBlock(std::move(packet.block));
break;
default:
got_unknown_packet_from_replica = true;
throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
toString(packet.type),
multiplexed_connections->dumpAddresses());
}
return {};
}
void RemoteQueryExecutor::finish(std::unique_ptr<ReadContext> * read_context)
{
/** If one of:
* - nothing started to do;
@ -270,7 +323,7 @@ void RemoteQueryExecutor::finish()
*/
/// Send the request to abort the execution of the request, if not already sent.
tryCancel("Cancelling query because enough data has been read");
tryCancel("Cancelling query because enough data has been read", read_context);
/// Get the remaining packets so that the connections to the replicas do not get out of sync.
Packet packet = multiplexed_connections->drain();
@ -299,7 +352,7 @@ void RemoteQueryExecutor::finish()
}
}
void RemoteQueryExecutor::cancel()
void RemoteQueryExecutor::cancel(std::unique_ptr<ReadContext> * read_context)
{
{
std::lock_guard lock(external_tables_mutex);
@ -313,7 +366,7 @@ void RemoteQueryExecutor::cancel()
if (!isQueryPending() || hasThrownException())
return;
tryCancel("Cancelling query");
tryCancel("Cancelling query", read_context);
}
void RemoteQueryExecutor::sendScalars()
@ -365,7 +418,7 @@ void RemoteQueryExecutor::sendExternalTables()
multiplexed_connections->sendExternalTablesData(external_tables_data);
}
void RemoteQueryExecutor::tryCancel(const char * reason)
void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context)
{
{
/// Flag was_cancelled is atomic because it is checked in read().
@ -375,6 +428,10 @@ void RemoteQueryExecutor::tryCancel(const char * reason)
return;
was_cancelled = true;
if (read_context && *read_context)
(*read_context)->cancel();
multiplexed_connections->sendCancel();
}
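For context, a minimal sketch of how a caller might drive the new asynchronous read() (a hypothetical driver; `executor` and `process` are assumed names, not part of this change). A returned Block is a result; a returned int is a file descriptor to poll before calling read() again:
#include <poll.h>
std::unique_ptr<RemoteQueryExecutor::ReadContext> ctx;
while (true)
{
    auto res = executor.read(ctx);
    if (std::holds_alternative<Block>(res))
    {
        Block block = std::get<Block>(res);
        if (!block)
            break;                      /// empty block - the query is finished
        process(block);                 /// user-defined consumer (assumed)
    }
    else
    {
        pollfd pfd{std::get<int>(res), POLLIN, 0};
        ::poll(&pfd, 1, -1);            /// wait until the executor's epoll fd is ready
    }
}
executor.finish(&ctx);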
View File
@ -5,6 +5,9 @@
#include <Storages/IStorage_fwd.h>
#include <Interpreters/Context.h>
#include <Interpreters/StorageID.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h>
#include <variant>
namespace DB
{
@ -20,10 +23,14 @@ using ProgressCallback = std::function<void(const Progress & progress)>;
struct BlockStreamProfileInfo;
using ProfileInfoCallback = std::function<void(const BlockStreamProfileInfo & info)>;
class RemoteQueryExecutorReadContext;
/// This class allows one to launch queries on remote replicas of one shard and get results
class RemoteQueryExecutor
{
public:
using ReadContext = RemoteQueryExecutorReadContext;
/// Takes already set connection.
RemoteQueryExecutor(
Connection & connection,
@ -53,13 +60,17 @@ public:
/// Read next block of data. Returns empty block if query is finished.
Block read();
/// Async variant of read. Returns ready block or file descriptor which may be used for polling.
/// ReadContext is the internal read state. Pass an empty ptr the first time; reuse the created one for subsequent calls.
std::variant<Block, int> read(std::unique_ptr<ReadContext> & read_context);
/// Receive all remaining packets and finish the query.
/// It should be called after read() has returned an empty block.
void finish();
void finish(std::unique_ptr<ReadContext> * read_context = nullptr);
/// Cancel query execution. Sends Cancel packet and ignore others.
/// This method may be called from separate thread.
void cancel();
void cancel(std::unique_ptr<ReadContext> * read_context = nullptr);
/// Get totals and extremes if any.
Block getTotals() { return std::move(totals); }
@ -153,13 +164,16 @@ private:
void sendExternalTables();
/// If not sent yet, send a request to cancel query execution on all replica connections.
void tryCancel(const char * reason);
void tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context);
/// Returns true if query was sent
bool isQueryPending() const;
/// Returns true if exception was thrown
bool hasThrownException() const;
/// Process packet for read and return data block if possible.
std::optional<Block> processPacket(Packet packet);
};
}
View File
@ -0,0 +1,272 @@
#pragma once
#if defined(OS_LINUX)
#include <sys/epoll.h>
#include <Common/Fiber.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_READ_FROM_SOCKET;
extern const int CANNOT_OPEN_FILE;
extern const int SOCKET_TIMEOUT;
}
class RemoteQueryExecutorReadContext
{
public:
using Self = RemoteQueryExecutorReadContext;
bool is_read_in_progress = false;
Packet packet;
std::exception_ptr exception;
FiberStack stack;
boost::context::fiber fiber;
/// This mutex is needed because the fiber could be destroyed by the cancel() method from another thread.
std::mutex fiber_lock;
Poco::Timespan receive_timeout;
MultiplexedConnections & connections;
Poco::Net::Socket * last_used_socket = nullptr;
/// Here we have three descriptors we are going to wait on:
/// * socket_fd is the descriptor of the connection. It may change when reading from several replicas.
/// * timer is a timerfd descriptor used to manually check the socket timeout.
/// * pipe_fd is a pipe used by the executor to cancel the query and the socket polling.
/// We put these descriptors into our own epoll_fd, which is polled by the external executor.
TimerDescriptor timer{CLOCK_MONOTONIC, 0};
int socket_fd = -1;
int epoll_fd;
int pipe_fd[2];
explicit RemoteQueryExecutorReadContext(MultiplexedConnections & connections_) : connections(connections_)
{
epoll_fd = epoll_create(2);
if (-1 == epoll_fd)
throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE);
if (-1 == pipe2(pipe_fd, O_NONBLOCK))
throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE);
{
epoll_event pipe_event;
pipe_event.events = EPOLLIN | EPOLLPRI;
pipe_event.data.fd = pipe_fd[0];
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &pipe_event))
throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
{
epoll_event timer_event;
timer_event.events = EPOLLIN | EPOLLPRI;
timer_event.data.fd = timer.getDescriptor();
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_event.data.fd, &timer_event))
throwFromErrno("Cannot add timer descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
auto routine = Routine{connections, *this};
fiber = boost::context::fiber(std::allocator_arg_t(), stack, std::move(routine));
}
void setSocket(Poco::Net::Socket & socket)
{
int fd = socket.impl()->sockfd();
if (fd == socket_fd)
return;
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.fd = fd;
if (socket_fd != -1)
{
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, socket_fd, &socket_event))
throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
socket_fd = fd;
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &socket_event))
throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
receive_timeout = socket.impl()->getReceiveTimeout();
}
bool checkTimeout() const
{
try
{
return checkTimeoutImpl();
}
catch (DB::Exception & e)
{
if (last_used_socket)
e.addMessage(" while reading from socket ({})", last_used_socket->peerAddress().toString());
throw;
}
}
bool checkTimeoutImpl() const
{
epoll_event events[3];
events[0].data.fd = events[1].data.fd = events[2].data.fd = -1;
/// Waiting on epoll_fd will not block if it was polled externally.
int num_events = epoll_wait(epoll_fd, events, 3, 0);
if (num_events == -1)
throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
bool is_socket_ready = false;
bool is_pipe_alarmed = false;
bool has_timer_alarm = false;
for (int i = 0; i < num_events; ++i)
{
if (events[i].data.fd == socket_fd)
is_socket_ready = true;
if (events[i].data.fd == timer.getDescriptor())
has_timer_alarm = true;
if (events[i].data.fd == pipe_fd[0])
is_pipe_alarmed = true;
}
if (is_pipe_alarmed)
return false;
if (has_timer_alarm && !is_socket_ready)
{
/// Socket receive timeout. Drain the timer in case of error, or the error may be hidden by the timeout exception.
timer.drain();
throw NetException("Timeout exceeded", ErrorCodes::SOCKET_TIMEOUT);
}
return true;
}
void setTimer() const
{
/// Did not get a packet yet. Initialize the timeout for the next async read.
timer.reset();
if (receive_timeout.totalMicroseconds())
timer.setRelative(receive_timeout);
}
bool resumeRoutine()
{
if (is_read_in_progress && !checkTimeout())
return false;
{
std::lock_guard guard(fiber_lock);
if (!fiber)
return false;
fiber = std::move(fiber).resume();
}
if (exception)
std::rethrow_exception(std::move(exception));
return true;
}
void cancel()
{
std::lock_guard guard(fiber_lock);
/// It is safe to just destroy fiber - we are not in the process of reading from socket.
boost::context::fiber to_destroy = std::move(fiber);
/// Write something to the pipe to wake up the executor and cancel its waiting.
uint64_t buf = 0;
while (-1 == write(pipe_fd[1], &buf, sizeof(buf)))
{
if (errno == EAGAIN)
break;
if (errno != EINTR)
throwFromErrno("Cannot write to pipe", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
}
~RemoteQueryExecutorReadContext()
{
/// socket_fd is closed by Poco::Net::Socket
/// timer_fd is closed by TimerDescriptor
close(epoll_fd);
}
struct Routine
{
MultiplexedConnections & connections;
Self & read_context;
struct ReadCallback
{
Self & read_context;
Fiber & fiber;
void operator()(Poco::Net::Socket & socket)
{
try
{
read_context.setSocket(socket);
}
catch (DB::Exception & e)
{
e.addMessage(" while reading from socket ({})", socket.peerAddress().toString());
throw;
}
read_context.is_read_in_progress = true;
fiber = std::move(fiber).resume();
read_context.is_read_in_progress = false;
}
};
Fiber operator()(Fiber && sink) const
{
try
{
while (true)
{
read_context.packet = connections.receivePacketUnlocked(ReadCallback{read_context, sink});
sink = std::move(sink).resume();
}
}
catch (const boost::context::detail::forced_unwind &)
{
/// This exception is thrown by the fiber implementation when the fiber is being deleted but hasn't exited.
/// It must not be caught (rethrow it), or the program will segfault.
/// All other exceptions must be caught.
throw;
}
catch (...)
{
read_context.exception = std::current_exception();
}
return std::move(sink);
}
};
};
}
#else
namespace DB
{
class RemoteQueryExecutorReadContext
{
public:
void cancel() {}
};
}
#endif
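For reference, a standalone reduction of the descriptor setup above: one epoll set watching a socket, a timerfd for the receive timeout, and a pipe for cancellation (assumes Linux; `socket_fd` stands for an already connected socket, error handling and cleanup omitted):
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <fcntl.h>
#include <unistd.h>
int epfd = epoll_create(2);
int tfd = timerfd_create(CLOCK_MONOTONIC, 0);
int pfd[2];
pipe2(pfd, O_NONBLOCK);
for (int fd : {socket_fd, tfd, pfd[0]})
{
    epoll_event ev{};
    ev.events = EPOLLIN | EPOLLPRI;
    ev.data.fd = fd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}
/// epoll_wait(epfd, ...) now wakes up on incoming data, timer expiry, or cancellation.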
View File
@ -0,0 +1,166 @@
#pragma once
#include <Common/PODArray.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
#include <algorithm>
#include <climits>
#include <cstring>
#include <memory>
#include <utility>
#ifdef __SSE4_2__
# include <nmmintrin.h>
#endif
namespace DB
{
// Used by FunctionsStringSimilarity and FunctionsStringHash.
// Includes extraction of ASCII ngrams, UTF8 ngrams, ASCII words and UTF8 words.
template <size_t N, bool CaseInsensitive>
struct ExtractStringImpl
{
/// Padding from ColumnString. It is the number of bytes we can always read starting from pos if pos < end.
static constexpr size_t default_padding = 16;
/// Functions read `default_padding - (N - 1)` bytes into the buffer; a window of size N is used.
/// Each read copies the last `N - 1` bytes of the buffer to its beginning and then reads new bytes.
static constexpr size_t buffer_size = default_padding + N - 1;
// The length of code_points equals buffer_size.
// pos: the current position in the data we copy from.
// end: the end of the string.
static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
{
/// Offset before which we copy some data.
constexpr size_t padding_offset = default_padding - N + 1;
/// We have an array like this for ASCII (N == 4, other cases are similar)
/// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
/// And we copy ^^^^^^^^^^^^^^^ these bytes to the start
/// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8));
/// Now we have an array
/// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
/// Doing unaligned read of 16 bytes and copy them like above
/// 16 is also chosen to do two `movups`.
/// Such copying allows us to have 3 codepoints from the previous read to produce 4-grams with them.
memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8));
if constexpr (CaseInsensitive)
{
/// We really need template lambdas with C++20 to do it inline
unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
}
pos += padding_offset;
if (pos > end)
return default_padding - (pos - end);
return default_padding;
}
// Read one ASCII word.
static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
{
// skip separators
while (pos < end && !isAlphaNumericASCII(*pos))
++pos;
// the word starts here
const char * word_start = pos;
while (pos < end && isAlphaNumericASCII(*pos))
++pos;
word_buf.assign(word_start, pos);
if constexpr (CaseInsensitive)
{
std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); });
}
return word_buf.size();
}
static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end)
{
memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32));
size_t num = N - 1;
while (num < default_padding && pos < end)
{
code_points[num++] = readOneUTF8Code(pos, end);
}
return num;
}
// Read one UTF8 word starting at pos into word_buf.
static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
{
// skip UTF8 separators
while (pos < end && isUTF8Sep(*pos))
++pos;
word_buf.clear();
// read the word's UTF8 code points
while (pos < end && !isUTF8Sep(*pos))
{
word_buf.push_back(readOneUTF8Code(pos, end));
}
return word_buf.size();
}
private:
template <size_t Offset, typename Container, size_t... I>
static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
{
((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
}
// We treat any ASCII non-alphanumeric character as a UTF8 separator.
static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }
// read one UTF8 character and return it
static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
{
size_t length = UTF8::seqLength(*pos);
if (pos + length > end)
length = end - pos;
UInt32 res;
switch (length)
{
case 1:
res = 0;
memcpy(&res, pos, 1);
break;
case 2:
res = 0;
memcpy(&res, pos, 2);
break;
case 3:
res = 0;
memcpy(&res, pos, 3);
break;
default:
memcpy(&res, pos, 4);
}
if constexpr (CaseInsensitive)
{
switch (length)
{
case 4:
res &= ~(1u << (5 + 3 * CHAR_BIT));
[[fallthrough]];
case 3:
res &= ~(1u << (5 + 2 * CHAR_BIT));
[[fallthrough]];
case 2:
res &= ~(1u);
res &= ~(1u << (5 + CHAR_BIT));
[[fallthrough]];
default:
res &= ~(1u << 5);
}
}
pos += length;
return res;
}
};
}
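To illustrate the word-splitting rules above, a small standalone sketch that tokenizes a buffer with the same isAlphaNumericASCII-based logic as readOneASCIIWord (`s` is assumed to be any std::string):
std::vector<std::string> words;
const char * pos = s.data();
const char * end = pos + s.size();
while (pos < end)
{
    while (pos < end && !isAlphaNumericASCII(*pos))
        ++pos;                                    /// skip separators
    const char * word_start = pos;
    while (pos < end && isAlphaNumericASCII(*pos))
        ++pos;
    if (pos > word_start)
        words.emplace_back(word_start, pos);      /// "Hello, world 42" -> {"Hello", "world", "42"}
}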
View File
@ -0,0 +1,626 @@
#include <Functions/FunctionsStringHash.h>
#include <Functions/ExtractString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsHashing.h>
#include <Common/HashTable/ClearableHashMap.h>
#include <Common/HashTable/Hash.h>
#include <Common/PODArray.h>
#include <Core/Defines.h>
#include <bitset>
#include <functional>
#include <memory>
#include <tuple>
#include <vector>
#include <common/unaligned.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
struct Hash
{
static UInt64 crc32u64(UInt64 crc [[maybe_unused]], UInt64 val [[maybe_unused]])
{
#ifdef __SSE4_2__
return _mm_crc32_u64(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cd(crc, val);
#else
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
static UInt64 crc32u32(UInt64 crc [[maybe_unused]], UInt32 val [[maybe_unused]])
{
#ifdef __SSE4_2__
return _mm_crc32_u32(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cw(crc, val);
#else
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
static UInt64 crc32u8(UInt64 crc [[maybe_unused]], UInt8 val [[maybe_unused]])
{
#ifdef __SSE4_2__
return _mm_crc32_u8(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cb(crc, val);
#else
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points)
{
return crc32u64(-1ULL, unalignedLoad<UInt32>(code_points));
}
static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points)
{
UInt64 crc = -1ULL;
crc = crc32u64(crc, code_points[0]);
crc = crc32u64(crc, code_points[1]);
crc = crc32u64(crc, code_points[2]);
return crc;
}
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = offset; i < size; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
for (size_t i = offset + 1; i < size; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
if ((size - offset) & 1)
{
for (size_t i = 0; i < offset; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
for (size_t i = 1; i < offset; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
}
else
{
for (size_t i = 0; i < offset; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
for (size_t i = 1; i < offset; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
}
return crc1 | (crc2 << 32u);
}
static ALWAYS_INLINE inline UInt64 hashSum(const UInt8 * hashes [[maybe_unused]], size_t K [[maybe_unused]])
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = 0; i < K; i += 2)
crc1 = crc32u8(crc1, hashes[i]);
for (size_t i = 1; i < K; i += 2)
crc2 = crc32u8(crc2, hashes[i]);
return crc1 | (crc2 << 32u);
}
static ALWAYS_INLINE inline UInt64 hashSum(const UInt32 * hashes [[maybe_unused]], size_t K [[maybe_unused]])
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = 0; i < K; i += 2)
crc1 = crc32u32(crc1, hashes[i]);
for (size_t i = 1; i < K; i += 2)
crc2 = crc32u32(crc2, hashes[i]);
return crc1 | (crc2 << 32u);
}
static ALWAYS_INLINE inline UInt64 hashSum(const UInt64 * hashes, size_t K)
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
for (size_t i = 0; i < K; i += 2)
crc1 = crc32u64(crc1, hashes[i]);
for (size_t i = 1; i < K; i += 2)
crc2 = crc32u64(crc2, hashes[i]);
return crc1 | (crc2 << 32u);
}
};
// Simhash: String -> UInt64
// N: the length of the ngrams or word shingles
// CodePoint: UInt8 (ASCII) or UInt32 (UTF8)
// UTF8: ASCII or UTF8; the pair (CodePoint, UTF8) can only be (UInt8, false) or (UInt32, true)
// Ngram: ngrams (true) or word shingles (false)
// CaseInsensitive: whether letter case should be ignored
template <size_t N, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensitive>
struct SimhashImpl
{
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
// We assume that the size of one word cannot exceed 128, which may not be true.
// If a word's size exceeds 128, it is cut into several words.
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
// Simhash ngram calculation function: String -> UInt64
// This function extracts ngrams from the input string and maintains a 64-dimension vector.
// For each ngram it calculates a 64-bit hash value and updates the vector according to that hash.
// Finally it returns a 64-bit value (UInt64): the i-th bit is 1 if vector[i] > 0, otherwise 0.
static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue(
const char * data,
size_t size,
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
UInt64 (*hash_functor)(const CodePoint *))
{
const char * start = data;
const char * end = data + size;
// fingerprint vector, all dimensions initialized to zero at first
Int64 finger_vec[64] = {};
CodePoint cp[simultaneously_codepoints_num] = {};
size_t found = read_code_points(cp, start, end);
size_t iter = N - 1;
do
{
for (; iter + N <= found; ++iter)
{
// For each ngram we calculate a 64-bit hash,
// then update finger_vec according to this hash value:
// if the i-th bit is 1, finger_vec[i] is incremented by 1, otherwise decremented by 1.
UInt64 hash_value = hash_functor(cp + iter);
std::bitset<64> bits(hash_value);
for (size_t i = 0; i < 64; ++i)
{
finger_vec[i] += ((bits.test(i)) ? 1 : -1);
}
}
iter = 0;
} while (start < end && (found = read_code_points(cp, start, end)));
// Finally, we return a 64-bit value built from finger_vec:
// if finger_vec[i] > 0, the i-th bit of the value is 1, otherwise 0.
std::bitset<64> res_bit(0u);
for (size_t i = 0; i < 64; ++i)
{
if (finger_vec[i] > 0)
res_bit.set(i);
}
return res_bit.to_ullong();
}
// Simhash word shingle calculation function: String -> UInt64
// This function extracts N-word shingles from the input string and maintains a 64-dimension vector as well.
// For each word shingle it calculates a 64-bit hash value and updates the vector according to that hash.
// Finally it returns a 64-bit value (UInt64): the i-th bit is 1 if vector[i] > 0, otherwise 0.
//
// Word shingle hash value calculation:
// 1. First, extract N words, calculate N hash values, store them into an array, and use these N hash
//    values to calculate the first word shingle hash value.
// 2. Then extract one word at a time, calculate the hash value of the new word, and use the latest N hash
//    values to calculate the next word shingle hash value.
static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue(
const char * data,
size_t size,
size_t (*read_one_word)(PaddedPODArray<CodePoint> &, const char *&, const char *),
UInt64 (*hash_functor)(const UInt64 *, size_t, size_t))
{
const char * start = data;
const char * end = data + size;
// Again a 64-dimension vector, initialized to zero.
Int64 finger_vec[64] = {};
// an array to store N word hash values
UInt64 nword_hashes[N] = {};
// word buffer to store one word
PaddedPODArray<CodePoint> word_buf;
// get first word shingle
for (size_t i = 0; i < N && start < end;)
{
read_one_word(word_buf, start, end);
if (!word_buf.empty())
{
// for each word, calculate a hash value and store it into the array
// (advance i only when a word was actually read)
nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size());
}
}
// calculate the first word shingle hash value
UInt64 hash_value = hash_functor(nword_hashes, N, 0);
std::bitset<64> first_bits(hash_value);
for (size_t i = 0; i < 64; ++i)
{
finger_vec[i] += ((first_bits.test(i)) ? 1 : -1);
}
size_t offset = 0;
while (start < end && read_one_word(word_buf, start, end))
{
// We need to store the new word hash value in the oldest location.
// For example, with N = 5 and array |a0|a1|a2|a3|a4|, a0 is the oldest location,
// so we store the new word hash into the location of a0 and the array becomes
// |a5|a1|a2|a3|a4|. Next time a1 becomes the oldest location, we store the new
// word hash value into the location of a1, and the array becomes |a5|a6|a2|a3|a4|.
nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size());
offset = (offset + 1) % N;
// Because of the way word hashes are stored, in order not to lose the word shingle's
// sequence information, when calculating the word shingle hash value we need to provide
// the offset, i.e. the position of the first word's hash value within the shingle.
hash_value = hash_functor(nword_hashes, N, offset);
std::bitset<64> bits(hash_value);
for (size_t i = 0; i < 64; ++i)
{
finger_vec[i] += ((bits.test(i)) ? 1 : -1);
}
}
std::bitset<64> res_bit(0u);
for (size_t i = 0; i < 64; ++i)
{
if (finger_vec[i] > 0)
res_bit.set(i);
}
return res_bit.to_ullong();
}
static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt64> & res)
{
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * one_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
const size_t data_size = offsets[i] - offsets[i - 1] - 1;
if (data_size <= max_string_size)
{
if constexpr (Ngram)
{
if constexpr (!UTF8)
res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash);
else
res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash);
}
else
{
if constexpr (!UTF8)
res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash);
else
res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash);
}
}
else
res[i] = -1ull;
}
}
};
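To make the fingerprint voting above concrete, a reduced 4-bit sketch (illustrative numbers only, not part of the implementation): each hash votes on every bit, and a fingerprint bit is set iff the sum of votes is positive.
Int64 finger[4] = {};
for (UInt32 h : {0b1011u, 0b1001u, 0b0001u})   /// toy per-ngram hashes
    for (size_t i = 0; i < 4; ++i)
        finger[i] += (h >> i & 1) ? 1 : -1;
UInt32 fingerprint = 0;
for (size_t i = 0; i < 4; ++i)
    if (finger[i] > 0)
        fingerprint |= 1u << i;                /// votes are +3, -1, -3, +1 -> 0b1001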
template <typename F, size_t K, size_t v>
class FixedHeap
{
public:
FixedHeap() = delete;
explicit FixedHeap(F f_) : f(f_), data_t(std::make_shared<std::vector<UInt64>>(K, v))
{
std::make_heap(data_t->begin(), data_t->end(), f);
}
void insertAndReplace(UInt64 new_v)
{
data_t->push_back(new_v);
std::push_heap(data_t->begin(), data_t->end(), f);
std::pop_heap(data_t->begin(), data_t->end(), f);
data_t->pop_back();
}
const UInt64 * data() { return data_t->data(); }
private:
F f;
std::shared_ptr<std::vector<UInt64>> data_t;
};
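A short usage sketch of the FixedHeap contract above (toy values): a max-heap (std::less) seeded with K sentinels of -1 keeps the K smallest values ever inserted; the std::greater instantiation mirrors this for the K largest.
FixedHeap<std::less<size_t>, 3, -1ULL> k_min(std::less<size_t>{});
for (UInt64 v : {7, 2, 9, 4, 1})
    k_min.insertAndReplace(v);
/// k_min.data() now holds the three smallest values inserted: {1, 2, 4} (in heap order).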
// Minhash: String -> Tuple(UInt64, UInt64)
// For each string we extract ngrams or word shingles,
// and for each ngram or word shingle we calculate a hash value.
// Then we take the K minimum hash values to calculate one hashsum,
// and the K maximum hash values to calculate another hashsum,
// and return both of them: Tuple(hashsum1, hashsum2).
//
// N: the length of the ngrams or word shingles
// K: the number of minimum and maximum hashes that we keep
// CodePoint: UInt8 (ASCII) or UInt32 (UTF8)
// UTF8: ASCII or UTF8; the pair (CodePoint, UTF8) can only be (UInt8, false) or (UInt32, true)
// Ngram: ngrams (true) or word shingles (false)
// CaseInsensitive: whether letter case should be ignored
template <size_t N, size_t K, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensitive>
struct MinhashImpl
{
using Less = std::less<size_t>;
using Greater = std::greater<size_t>;
using MaxHeap = FixedHeap<std::less<size_t>, K, -1ULL>;
using MinHeap = FixedHeap<std::greater<size_t>, K, 0>;
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
// Minhash ngram calculation function: String -> Tuple(UInt64, UInt64)
// We extract ngrams from the input string and calculate a hash value for each ngram,
// then we take the K minimum hash values to calculate one hashsum,
// and the K maximum hash values to calculate another hashsum,
// and return both of them: Tuple(hashsum1, hashsum2).
static ALWAYS_INLINE inline std::tuple<UInt64, UInt64> ngramCalculateHashValue(
const char * data,
size_t size,
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
UInt64 (*hash_functor)(const CodePoint *))
{
const char * start = data;
const char * end = data + size;
// We only maintain the K minimum and K maximum hash values.
MaxHeap k_minimum_hashes(Less{});
MinHeap k_maximum_hashes(Greater{});
CodePoint cp[simultaneously_codepoints_num] = {};
size_t found = read_code_points(cp, start, end);
size_t iter = N - 1;
do
{
for (; iter + N <= found; ++iter)
{
auto new_hash = hash_functor(cp + iter);
// insert the new hash value into the arrays used to store the K minimum
// and K maximum values
k_minimum_hashes.insertAndReplace(new_hash);
k_maximum_hashes.insertAndReplace(new_hash);
}
iter = 0;
} while (start < end && (found = read_code_points(cp, start, end)));
// calculate hashsum of the K minimum hash values and K maximum hash values
UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K);
UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K);
return std::make_tuple(res1, res2);
}
// Minhash word shingle hash value calculation function: String -> Tuple(UInt64, UInt64)
// For each word shingle we calculate a hash value, but in fact we only maintain the
// K minimum and the K maximum hash values.
static ALWAYS_INLINE inline std::tuple<UInt64, UInt64> wordShinglesCalculateHashValue(
const char * data,
size_t size,
size_t (*read_one_word)(PaddedPODArray<CodePoint> &, const char *&, const char *),
UInt64 (*hash_functor)(const UInt64 *, size_t, size_t))
{
const char * start = data;
const char * end = start + size;
// again we only store the K minimum and K maximum hash values
MaxHeap k_minimum_hashes(Less{});
MinHeap k_maximum_hashes(Greater{});
// array to store N word hashes
UInt64 nword_hashes[N] = {};
// word buffer to store one word
PaddedPODArray<CodePoint> word_buf;
// Word shingle hash value calculation and word hash storage work the same way as
// described for the Simhash wordShinglesCalculateHashValue function above.
for (size_t i = 0; i < N && start < end;)
{
read_one_word(word_buf, start, end);
if (!word_buf.empty())
{
/// advance i only when a word was actually read
nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size());
}
}
auto new_hash = hash_functor(nword_hashes, N, 0);
k_minimum_hashes.insertAndReplace(new_hash);
k_maximum_hashes.insertAndReplace(new_hash);
size_t offset = 0;
while (start < end && read_one_word(word_buf, start, end))
{
nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size());
offset = (offset + 1) % N;
new_hash = hash_functor(nword_hashes, N, offset);
k_minimum_hashes.insertAndReplace(new_hash);
k_maximum_hashes.insertAndReplace(new_hash);
}
// calculate hashsum
UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K);
UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K);
return std::make_tuple(res1, res2);
}
static void apply(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
PaddedPODArray<UInt64> & res1,
PaddedPODArray<UInt64> & res2)
{
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * one_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
const size_t data_size = offsets[i] - offsets[i - 1] - 1;
if (data_size <= max_string_size)
{
if constexpr (Ngram)
{
if constexpr (!UTF8)
std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash);
else
std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash);
}
else
{
if constexpr (!UTF8)
std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash);
else
std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash);
}
}
else
std::tie(res1[i], res2[i]) = std::make_tuple(-1ull, -1ull);
}
}
};
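A reduced illustration of the Minhash digest above (toy hashes; std::sort stands in for the fixed-size heaps for clarity, and needs <algorithm>):
UInt64 toy[] = {7, 2, 9, 4, 1};                /// per-shingle hashes, K = 2
std::sort(std::begin(toy), std::end(toy));
UInt64 mins[] = {toy[0], toy[1]};              /// {1, 2}
UInt64 maxs[] = {toy[3], toy[4]};              /// {7, 9}
auto digest = std::make_tuple(Hash::hashSum(mins, 2), Hash::hashSum(maxs, 2));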
struct NameNgramSimhash
{
static constexpr auto name = "ngramSimhash";
};
struct NameNgramSimhashCaseInsensitive
{
static constexpr auto name = "ngramSimhashCaseInsensitive";
};
struct NameNgramSimhashUTF8
{
static constexpr auto name = "ngramSimhashUTF8";
};
struct NameNgramSimhashCaseInsensitiveUTF8
{
static constexpr auto name = "ngramSimhashCaseInsensitiveUTF8";
};
struct NameWordShingleSimhash
{
static constexpr auto name = "wordShingleSimhash";
};
struct NameWordShingleSimhashCaseInsensitive
{
static constexpr auto name = "wordShingleSimhashCaseInsensitive";
};
struct NameWordShingleSimhashUTF8
{
static constexpr auto name = "wordShingleSimhashUTF8";
};
struct NameWordShingleSimhashCaseInsensitiveUTF8
{
static constexpr auto name = "wordShingleSimhashCaseInsensitiveUTF8";
};
struct NameNgramMinhash
{
static constexpr auto name = "ngramMinhash";
};
struct NameNgramMinhashCaseInsensitive
{
static constexpr auto name = "ngramMinhashCaseInsensitive";
};
struct NameNgramMinhashUTF8
{
static constexpr auto name = "ngramMinhashUTF8";
};
struct NameNgramMinhashCaseInsensitiveUTF8
{
static constexpr auto name = "ngramMinhashCaseInsensitiveUTF8";
};
struct NameWordShingleMinhash
{
static constexpr auto name = "wordShingleMinhash";
};
struct NameWordShingleMinhashCaseInsensitive
{
static constexpr auto name = "wordShingleMinhashCaseInsensitive";
};
struct NameWordShingleMinhashUTF8
{
static constexpr auto name = "wordShingleMinhashUTF8";
};
struct NameWordShingleMinhashCaseInsensitiveUTF8
{
static constexpr auto name = "wordShingleMinhashCaseInsensitiveUTF8";
};
// Simhash
using FunctionNgramSimhash = FunctionsStringHash<SimhashImpl<4, UInt8, false, true, false>, NameNgramSimhash, true>;
using FunctionNgramSimhashCaseInsensitive
= FunctionsStringHash<SimhashImpl<4, UInt8, false, true, true>, NameNgramSimhashCaseInsensitive, true>;
using FunctionNgramSimhashUTF8 = FunctionsStringHash<SimhashImpl<3, UInt32, true, true, false>, NameNgramSimhashUTF8, true>;
using FunctionNgramSimhashCaseInsensitiveUTF8
= FunctionsStringHash<SimhashImpl<3, UInt32, true, true, true>, NameNgramSimhashCaseInsensitiveUTF8, true>;
using FunctionWordShingleSimhash = FunctionsStringHash<SimhashImpl<3, UInt8, false, false, false>, NameWordShingleSimhash, true>;
using FunctionWordShingleSimhashCaseInsensitive
= FunctionsStringHash<SimhashImpl<3, UInt8, false, false, true>, NameWordShingleSimhashCaseInsensitive, true>;
using FunctionWordShingleSimhashUTF8 = FunctionsStringHash<SimhashImpl<3, UInt32, true, false, false>, NameWordShingleSimhashUTF8, true>;
using FunctionWordShingleSimhashCaseInsensitiveUTF8
= FunctionsStringHash<SimhashImpl<3, UInt32, true, false, true>, NameWordShingleSimhashCaseInsensitiveUTF8, true>;
// Minhash
using FunctionNgramMinhash = FunctionsStringHash<MinhashImpl<4, 6, UInt8, false, true, false>, NameNgramMinhash, false>;
using FunctionNgramMinhashCaseInsensitive
= FunctionsStringHash<MinhashImpl<4, 6, UInt8, false, true, true>, NameNgramMinhashCaseInsensitive, false>;
using FunctionNgramMinhashUTF8 = FunctionsStringHash<MinhashImpl<4, 6, UInt32, true, true, false>, NameNgramMinhashUTF8, false>;
using FunctionNgramMinhashCaseInsensitiveUTF8
= FunctionsStringHash<MinhashImpl<4, 6, UInt32, true, true, true>, NameNgramMinhashCaseInsensitiveUTF8, false>;
using FunctionWordShingleMinhash = FunctionsStringHash<MinhashImpl<3, 6, UInt8, false, false, false>, NameWordShingleMinhash, false>;
using FunctionWordShingleMinhashCaseInsensitive
= FunctionsStringHash<MinhashImpl<3, 6, UInt8, false, false, true>, NameWordShingleMinhashCaseInsensitive, false>;
using FunctionWordShingleMinhashUTF8
= FunctionsStringHash<MinhashImpl<3, 6, UInt32, true, false, false>, NameWordShingleMinhashUTF8, false>;
using FunctionWordShingleMinhashCaseInsensitiveUTF8
= FunctionsStringHash<MinhashImpl<3, 6, UInt32, true, false, true>, NameWordShingleMinhashCaseInsensitiveUTF8, false>;
void registerFunctionsStringHash(FunctionFactory & factory)
{
factory.registerFunction<FunctionNgramSimhash>();
factory.registerFunction<FunctionNgramSimhashCaseInsensitive>();
factory.registerFunction<FunctionNgramSimhashUTF8>();
factory.registerFunction<FunctionNgramSimhashCaseInsensitiveUTF8>();
factory.registerFunction<FunctionWordShingleSimhash>();
factory.registerFunction<FunctionWordShingleSimhashCaseInsensitive>();
factory.registerFunction<FunctionWordShingleSimhashUTF8>();
factory.registerFunction<FunctionWordShingleSimhashCaseInsensitiveUTF8>();
factory.registerFunction<FunctionNgramMinhash>();
factory.registerFunction<FunctionNgramMinhashCaseInsensitive>();
factory.registerFunction<FunctionNgramMinhashUTF8>();
factory.registerFunction<FunctionNgramMinhashCaseInsensitiveUTF8>();
factory.registerFunction<FunctionWordShingleMinhash>();
factory.registerFunction<FunctionWordShingleMinhashCaseInsensitive>();
factory.registerFunction<FunctionWordShingleMinhashUTF8>();
factory.registerFunction<FunctionWordShingleMinhashCaseInsensitiveUTF8>();
}
}
View File
@ -0,0 +1,83 @@
#pragma once
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunctionImpl.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
// FunctionStringHash
// Simhash: String -> UInt64
// Minhash: String -> (UInt64, UInt64)
template <typename Impl, typename Name, bool is_simhash>
class FunctionsStringHash : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringHash>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Function {} expect single String argument, got {}", getName(), arguments[0]->getName());
auto type = std::make_shared<DataTypeUInt64>();
if constexpr (is_simhash)
return type;
return std::make_shared<DataTypeTuple>(DataTypes{type, type});
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
{
const ColumnPtr & column = arguments[0].column;
if constexpr (is_simhash)
{
// non const string, const case is handled by useDefaultImplementationForConstants.
auto col_res = ColumnVector<UInt64>::create();
auto & vec_res = col_res->getData();
vec_res.resize(column->size());
const ColumnString * col_str_vector = checkAndGetColumn<ColumnString>(&*column);
Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res);
return col_res;
}
else // Minhash
{
// non const string
auto col_h1 = ColumnVector<UInt64>::create();
auto col_h2 = ColumnVector<UInt64>::create();
auto & vec_h1 = col_h1->getData();
auto & vec_h2 = col_h2->getData();
vec_h1.resize(column->size());
vec_h2.resize(column->size());
const ColumnString * col_str_vector = checkAndGetColumn<ColumnString>(&*column);
Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2);
MutableColumns tuple_columns;
tuple_columns.emplace_back(std::move(col_h1));
tuple_columns.emplace_back(std::move(col_h2));
return ColumnTuple::create(std::move(tuple_columns));
}
}
};
}
View File
@ -0,0 +1,160 @@
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/castTypeToEither.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
template <typename A, typename B>
struct BitHammingDistanceImpl
{
using ResultType = UInt8;
static void NO_INLINE vectorVector(const PaddedPODArray<A> & a, const PaddedPODArray<B> & b, PaddedPODArray<ResultType> & c)
{
size_t size = a.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a[i], b[i]);
}
static void NO_INLINE vectorConstant(const PaddedPODArray<A> & a, B b, PaddedPODArray<ResultType> & c)
{
size_t size = a.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a[i], b);
}
static void NO_INLINE constantVector(A a, const PaddedPODArray<B> & b, PaddedPODArray<ResultType> & c)
{
size_t size = b.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a, b[i]);
}
private:
static inline UInt8 apply(UInt64 a, UInt64 b)
{
UInt64 res = a ^ b;
return __builtin_popcountll(res);
}
};
template <typename F>
bool castType(const IDataType * type, F && f)
{
return castTypeToEither<
DataTypeInt8,
DataTypeInt16,
DataTypeInt32,
DataTypeInt64,
DataTypeUInt8,
DataTypeUInt16,
DataTypeUInt32,
DataTypeUInt64>(type, std::forward<F>(f));
}
template <typename F>
static bool castBothTypes(const IDataType * left, const IDataType * right, F && f)
{
return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); });
}
// bitHammingDistance function: (Integer, Integer) -> UInt8
class FunctionBitHammingDistance : public IFunction
{
public:
static constexpr auto name = "bitHammingDistance";
using ResultType = UInt8;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitHammingDistance>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isInteger(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isInteger(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeUInt8>();
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const auto * left_generic = arguments[0].type.get();
const auto * right_generic = arguments[1].type.get();
ColumnPtr result_column;
bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right)
{
using LeftDataType = std::decay_t<decltype(left)>;
using RightDataType = std::decay_t<decltype(right)>;
using T0 = typename LeftDataType::FieldType;
using T1 = typename RightDataType::FieldType;
using ColVecT0 = ColumnVector<T0>;
using ColVecT1 = ColumnVector<T1>;
using ColVecResult = ColumnVector<ResultType>;
using OpImpl = BitHammingDistanceImpl<T0, T1>;
const auto * const col_left_raw = arguments[0].column.get();
const auto * const col_right_raw = arguments[1].column.get();
typename ColVecResult::MutablePtr col_res = nullptr;
col_res = ColVecResult::create();
auto & vec_res = col_res->getData();
vec_res.resize(input_rows_count);
if (auto col_left_const = checkAndGetColumnConst<ColVecT0>(col_left_raw))
{
if (auto col_right = checkAndGetColumn<ColVecT1>(col_right_raw))
{
// constant integer - non-constant integer
OpImpl::constantVector(col_left_const->template getValue<T0>(), col_right->getData(), vec_res);
}
else
return false;
}
else if (auto col_left = checkAndGetColumn<ColVecT0>(col_left_raw))
{
if (auto col_right = checkAndGetColumn<ColVecT1>(col_right_raw))
// non-constant integer - non-constant integer
OpImpl::vectorVector(col_left->getData(), col_right->getData(), vec_res);
else if (auto col_right_const = checkAndGetColumnConst<ColVecT1>(col_right_raw))
// non-constant integer - constant integer
OpImpl::vectorConstant(col_left->getData(), col_right_const->template getValue<T1>(), vec_res);
else
return false;
}
else
return false;
result_column = std::move(col_res);
return true;
});
if (!valid)
throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN);
return result_column;
}
};
void registerFunctionBitHammingDistance(FunctionFactory & factory)
{
factory.registerFunction<FunctionBitHammingDistance>();
}
}
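A quick sanity sketch for the scalar kernel above: the Hamming distance of two integers is the popcount of their XOR (assert used purely for illustration):
#include <cassert>
assert(__builtin_popcountll(0b1011ULL ^ 0b0010ULL) == 2);   /// XOR is 0b1001: two differing bits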
View File
@ -42,7 +42,9 @@ void registerFunctionsNull(FunctionFactory &);
void registerFunctionsJSON(FunctionFactory &);
void registerFunctionsConsistentHashing(FunctionFactory & factory);
void registerFunctionsUnixTimestamp64(FunctionFactory & factory);
void registerFunctionBitHammingDistance(FunctionFactory & factory);
void registerFunctionTupleHammingDistance(FunctionFactory & factory);
void registerFunctionsStringHash(FunctionFactory & factory);
#if !defined(ARCADIA_BUILD)
void registerFunctionBayesAB(FunctionFactory &);
#endif
@ -57,7 +59,6 @@ void registerFunctionAESDecryptMysql(FunctionFactory & factory);
#endif
void registerFunctions()
{
auto & factory = FunctionFactory::instance();
@ -99,6 +100,9 @@ void registerFunctions()
registerFunctionsIntrospection(factory);
registerFunctionsConsistentHashing(factory);
registerFunctionsUnixTimestamp64(factory);
registerFunctionBitHammingDistance(factory);
registerFunctionTupleHammingDistance(factory);
registerFunctionsStringHash(factory);
#if !defined(ARCADIA_BUILD)
registerFunctionBayesAB(factory);
View File
@ -0,0 +1,220 @@
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/castTypeToEither.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
template <typename A, typename B>
struct TupleHammingDistanceImpl
{
using ResultType = UInt8;
static void NO_INLINE vectorVector(
const PaddedPODArray<A> & a1,
const PaddedPODArray<A> & b1,
const PaddedPODArray<B> & a2,
const PaddedPODArray<B> & b2,
PaddedPODArray<ResultType> & c)
{
size_t size = a1.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a1[i], a2[i]) + apply(b1[i], b2[i]);
}
static void NO_INLINE
vectorConstant(const PaddedPODArray<A> & a1, const PaddedPODArray<A> & b1, UInt64 a2, UInt64 b2, PaddedPODArray<ResultType> & c)
{
size_t size = a1.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a1[i], a2) + apply(b1[i], b2);
}
static void NO_INLINE
constantVector(UInt64 a1, UInt64 b1, const PaddedPODArray<B> & a2, const PaddedPODArray<B> & b2, PaddedPODArray<ResultType> & c)
{
size_t size = a2.size();
for (size_t i = 0; i < size; ++i)
c[i] = apply(a1, a2[i]) + apply(b1, b2[i]);
}
static ResultType constantConstant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); }
private:
static inline UInt8 apply(UInt64 a, UInt64 b) { return a != b; }
};
template <typename F>
bool castType(const IDataType * type, F && f)
{
return castTypeToEither<
DataTypeInt8,
DataTypeInt16,
DataTypeInt32,
DataTypeInt64,
DataTypeUInt8,
DataTypeUInt16,
DataTypeUInt32,
DataTypeUInt64>(type, std::forward<F>(f));
}
template <typename F>
static bool castBothTypes(const IDataType * left, const IDataType * right, F && f)
{
return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); });
}
// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer)) -> 0/1/2
// In order to avoid code bloat, for non-constant tuples we require that the elements
// of a tuple have the same data type; for constant tuples the elements can be of any
// integer data type, and we cast all of them to UInt64.
class FunctionTupleHammingDistance : public IFunction
{
public:
static constexpr auto name = "tupleHammingDistance";
using ResultType = UInt8;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionTupleHammingDistance>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isTuple(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isTuple(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeUInt8>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const ColumnWithTypeAndName & arg1 = arguments[0];
const ColumnWithTypeAndName & arg2 = arguments[1];
const DataTypeTuple & type1 = static_cast<const DataTypeTuple &>(*arg1.type);
const DataTypeTuple & type2 = static_cast<const DataTypeTuple &>(*arg2.type);
const auto & left_elems = type1.getElements();
const auto & right_elems = type2.getElements();
if (left_elems.size() != 2 || right_elems.size() != 2)
throw Exception(
"Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.",
ErrorCodes::ILLEGAL_COLUMN);
ColumnPtr result_column;
bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right)
{
using LeftDataType = std::decay_t<decltype(left)>;
using RightDataType = std::decay_t<decltype(right)>;
using T0 = typename LeftDataType::FieldType;
using T1 = typename RightDataType::FieldType;
using ColVecT0 = ColumnVector<T0>;
using ColVecT1 = ColumnVector<T1>;
using ColVecResult = ColumnVector<ResultType>;
using OpImpl = TupleHammingDistanceImpl<T0, T1>;
// We cannot use useDefaultImplementationForConstants, because then
// tupleHammingDistance((10, 300), (10, 20)) would not work,
// since 10 has data type UInt8 while 300 has data type UInt16.
if (const ColumnConst * const_col_left = checkAndGetColumnConst<ColumnTuple>(arg1.column.get()))
{
if (const ColumnConst * const_col_right = checkAndGetColumnConst<ColumnTuple>(arg2.column.get()))
{
auto cols1 = convertConstTupleToConstantElements(*const_col_left);
auto cols2 = convertConstTupleToConstantElements(*const_col_right);
Field a1, b1, a2, b2;
cols1[0]->get(0, a1);
cols1[1]->get(0, b1);
cols2[0]->get(0, a2);
cols2[1]->get(0, b2);
auto res = OpImpl::constantConstant(a1.get<UInt64>(), b1.get<UInt64>(), a2.get<UInt64>(), b2.get<UInt64>());
result_column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res));
return true;
}
}
typename ColVecResult::MutablePtr col_res = nullptr;
col_res = ColVecResult::create();
auto & vec_res = col_res->getData();
vec_res.resize(input_rows_count);
// constant tuple - non-constant tuple
if (const ColumnConst * const_col_left = checkAndGetColumnConst<ColumnTuple>(arg1.column.get()))
{
if (const ColumnTuple * col_right = typeid_cast<const ColumnTuple *>(arg2.column.get()))
{
auto const_cols = convertConstTupleToConstantElements(*const_col_left);
Field a1, b1;
const_cols[0]->get(0, a1);
const_cols[1]->get(0, b1);
auto col_r1 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(0));
auto col_r2 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(1));
if (col_r1 && col_r2)
OpImpl::constantVector(a1.get<UInt64>(), b1.get<UInt64>(), col_r1->getData(), col_r2->getData(), vec_res);
else
return false;
}
else
return false;
}
else if (const ColumnTuple * col_left = typeid_cast<const ColumnTuple *>(arg1.column.get()))
{
auto col_l1 = checkAndGetColumn<ColVecT0>(&col_left->getColumn(0));
auto col_l2 = checkAndGetColumn<ColVecT0>(&col_left->getColumn(1));
if (col_l1 && col_l2)
{
// non-constant tuple - constant tuple
if (const ColumnConst * const_col_right = checkAndGetColumnConst<ColumnTuple>(arg2.column.get()))
{
auto const_cols = convertConstTupleToConstantElements(*const_col_right);
Field a2, b2;
const_cols[0]->get(0, a2);
const_cols[1]->get(0, b2);
OpImpl::vectorConstant(col_l1->getData(), col_l2->getData(), a2.get<UInt64>(), b2.get<UInt64>(), vec_res);
}
// non-constant tuple - non-constant tuple
else if (const ColumnTuple * col_right = typeid_cast<const ColumnTuple *>(arg2.column.get()))
{
auto col_r1 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(0));
auto col_r2 = checkAndGetColumn<ColVecT1>(&col_right->getColumn(1));
if (col_r1 && col_r2)
OpImpl::vectorVector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res);
else
return false;
}
else
return false;
}
else
return false;
}
else
return false;
result_column = std::move(col_res);
return true;
});
if (!valid)
throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN);
return result_column;
}
};
void registerFunctionTupleHammingDistance(FunctionFactory & factory)
{
factory.registerFunction<FunctionTupleHammingDistance>();
}
}
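A similar sanity sketch for the tuple kernel above: the result counts mismatching positions, so ((1, 2), (1, 3)) yields 0 + 1 = 1 (assert used purely for illustration):
#include <cassert>
assert(TupleHammingDistanceImpl<UInt8, UInt8>::constantConstant(1, 2, 1, 3) == 1);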
View File
@ -53,6 +53,7 @@ SRCS(
FunctionsRandom.cpp
FunctionsRound.cpp
FunctionsStringArray.cpp
FunctionsStringHash.cpp
FunctionsStringSimilarity.cpp
GatherUtils/concat.cpp
GatherUtils/createArraySink.cpp
@ -185,6 +186,7 @@ SRCS(
bitBoolMaskAnd.cpp
bitBoolMaskOr.cpp
bitCount.cpp
bitHammingDistance.cpp
bitNot.cpp
bitOr.cpp
bitRotateLeft.cpp
@ -504,6 +506,7 @@ SRCS(
tryBase64Decode.cpp
tuple.cpp
tupleElement.cpp
tupleHammingDistance.cpp
upper.cpp
upperUTF8.cpp
uptime.cpp
View File
@ -28,10 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl()
ssize_t bytes_read = 0;
Stopwatch watch;
int flags = 0;
if (async_callback)
flags |= MSG_DONTWAIT;
/// Add more details to exceptions.
try
{
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size());
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
/// If async_callback is specified and the read would block, run async_callback and try again later.
/// It is expected that file descriptor may be polled externally.
/// Note that receive timeout is not checked here. External code should check it while polling.
while (bytes_read < 0 && async_callback && errno == EAGAIN)
{
async_callback(socket);
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
}
}
catch (const Poco::Net::NetException & e)
{
View File
@ -5,7 +5,6 @@
#include <IO/ReadBuffer.h>
#include <IO/BufferWithOwnMemory.h>
namespace DB
{
@ -28,6 +27,11 @@ public:
ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
bool poll(size_t timeout_microseconds);
void setAsyncCallback(std::function<void(Poco::Net::Socket &)> async_callback_) { async_callback = std::move(async_callback_); }
private:
std::function<void(Poco::Net::Socket &)> async_callback;
};
}
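A minimal sketch of the async-callback contract (hypothetical caller; `socket` is assumed to be an already connected Poco::Net::Socket): while the non-blocking read has no data, the callback gets a chance to suspend the current fiber or poll externally.
ReadBufferFromPocoSocket in(socket);
in.setAsyncCallback([](Poco::Net::Socket & s)
{
    /// e.g. register s in an epoll set and yield the fiber,
    /// as RemoteQueryExecutorReadContext::Routine::ReadCallback does.
});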
View File
@ -624,7 +624,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
{
auto & input = inputs[res_elem.name];
if (input.empty())
throw Exception("Cannot find column " + backQuoteIfNeed(res_elem.name) + " in source stream",
throw Exception("Cannot find column " + backQuote(res_elem.name) + " in source stream",
ErrorCodes::THERE_IS_NO_COLUMN);
src_node = actions_dag->inputs[input.front()];
@ -641,12 +641,12 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
if (ignore_constant_values)
src_node = const_cast<Node *>(&actions_dag->addColumn(res_elem, true));
else if (res_const->getField() != src_const->getField())
throw Exception("Cannot convert column " + backQuoteIfNeed(res_elem.name) + " because "
throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because "
"it is constant but values of constants are different in source and result",
ErrorCodes::ILLEGAL_COLUMN);
}
else
throw Exception("Cannot convert column " + backQuoteIfNeed(res_elem.name) + " because "
throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because "
"it is non constant in source stream but must be constant in result",
ErrorCodes::ILLEGAL_COLUMN);
}
View File
@ -735,6 +735,28 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
}
}
if (node.is_window_function)
{
// Also add columns from PARTITION BY and ORDER BY of window functions.
// Requiring a constant reference to a shared pointer to non-const AST
// doesn't really look sane, but the visitor does indeed require it.
if (node.window_partition_by)
{
visit(node.window_partition_by->clone(), data);
}
if (node.window_order_by)
{
visit(node.window_order_by->clone(), data);
}
// Don't need to do anything more for window functions here -- the
// resulting column is added in ExpressionAnalyzer, similar to the
// aggregate functions.
return;
}
// An aggregate function can also be calculated as a window function, but we
// checked for it above, so no need to do anything more.
if (AggregateFunctionFactory::instance().isAggregateFunctionName(node.name))
return;
View File
@ -1,6 +1,7 @@
#include <Interpreters/AggregateDescription.h>
#include <Common/FieldVisitors.h>
#include <IO/Operators.h>
#include <Parsers/ASTFunction.h>
namespace DB
{
@ -99,4 +100,31 @@ void AggregateDescription::explain(WriteBuffer & out, size_t indent) const
}
}
std::string WindowFunctionDescription::dump() const
{
WriteBufferFromOwnString ss;
ss << "window function '" << column_name << "\n";
ss << "function node " << function_node->dumpTree() << "\n";
ss << "aggregate function '" << aggregate_function->getName() << "'\n";
if (!function_parameters.empty())
{
ss << "parameters " << toString(function_parameters) << "\n";
}
return ss.str();
}
std::string WindowDescription::dump() const
{
WriteBufferFromOwnString ss;
ss << "window '" << window_name << "'\n";
ss << "partition_by " << dumpSortDescription(partition_by) << "\n";
ss << "order_by " << dumpSortDescription(order_by) << "\n";
ss << "full_sort_description " << dumpSortDescription(full_sort_description) << "\n";
return ss.str();
}
}
View File
@ -1,13 +1,18 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <DataTypes/IDataType.h>
#include <Core/ColumnNumbers.h>
#include <Core/Names.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Core/SortDescription.h>
#include <Parsers/IAST_fwd.h>
namespace DB
{
class ASTFunction;
struct AggregateDescription
{
AggregateFunctionPtr function;
@ -21,4 +26,44 @@ struct AggregateDescription
using AggregateDescriptions = std::vector<AggregateDescription>;
struct WindowFunctionDescription
{
std::string column_name;
const ASTFunction * function_node;
AggregateFunctionPtr aggregate_function;
Array function_parameters;
DataTypes argument_types;
Names argument_names;
std::string dump() const;
};
struct WindowDescription
{
std::string window_name;
// We don't care about the particular order of keys for PARTITION BY, only
// that they are sorted. For now we always require ASC, but we could be more
// flexible and match any direction, or even different order of columns.
SortDescription partition_by;
SortDescription order_by;
// To calculate the window function, we sort input data first by PARTITION BY,
// then by ORDER BY. This field holds this combined sort order.
SortDescription full_sort_description;
// No frame info as of yet.
// The window functions that are calculated for this window.
std::vector<WindowFunctionDescription> window_functions;
std::string dump() const;
};
using WindowFunctionDescriptions = std::vector<WindowFunctionDescription>;
using WindowDescriptions = std::unordered_map<std::string, WindowDescription>;
}
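To make the combined order concrete, here is a minimal sketch (not part of the commit) of how full_sort_description is assembled, assuming the SortColumnDescription(name, direction, nulls_direction) constructor:

    #include <Core/SortDescription.h>

    // For OVER (PARTITION BY a ORDER BY b DESC): partition keys come first,
    // always ascending, followed by the in-partition ORDER BY columns.
    DB::SortDescription makeFullSortDescription()
    {
        DB::SortDescription partition_by{{"a", 1, 1}};
        DB::SortDescription order_by{{"b", -1, -1}};

        DB::SortDescription full = partition_by;
        full.insert(full.end(), order_by.begin(), order_by.end());
        return full;  // (a ASC, b DESC)
    }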

View File

@ -212,18 +212,18 @@ void AsynchronousMetrics::update()
{
Int64 amount = total_memory_tracker.get();
Int64 peak = total_memory_tracker.getPeak();
Int64 new_peak = data.resident;
Int64 new_amount = data.resident;
LOG_DEBUG(&Poco::Logger::get("AsynchronousMetrics"),
"MemoryTracking: was {}, peak {}, will set to {} (RSS), difference: {}",
ReadableSize(amount),
ReadableSize(peak),
ReadableSize(new_peak),
ReadableSize(new_peak - peak)
ReadableSize(new_amount),
ReadableSize(new_amount - amount)
);
total_memory_tracker.set(new_peak);
CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_peak);
total_memory_tracker.set(new_amount);
CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_amount);
}
}
#endif
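Illustrative numbers for the log line above: if the tracker's amount is 10.00 GiB, its peak is 11.00 GiB, and the OS reports an RSS (data.resident) of 12.00 GiB, the message reads "MemoryTracking: was 10.00 GiB, peak 11.00 GiB, will set to 12.00 GiB (RSS), difference: 2.00 GiB", after which both total_memory_tracker and the MemoryTracking metric hold 12 GiB. The rename from new_peak to new_amount also fixes the logged difference: it is now computed against the current amount rather than the peak.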

View File

@ -126,6 +126,7 @@ void SelectStreamFactory::createForShard(
bool add_agg_info = processed_stage == QueryProcessingStage::WithMergeableState;
bool add_totals = false;
bool add_extremes = false;
bool async_read = context_ptr->getSettingsRef().async_socket_for_remote;
if (processed_stage == QueryProcessingStage::Complete)
{
add_totals = query_ast->as<ASTSelectQuery &>().group_by_with_totals;
@ -153,7 +154,7 @@ void SelectStreamFactory::createForShard(
if (!table_func_ptr)
remote_query_executor->setMainTable(main_table);
remote_pipes.emplace_back(createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes));
remote_pipes.emplace_back(createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read));
remote_pipes.back().addInterpreterContext(context_ptr);
};
@ -249,7 +250,7 @@ void SelectStreamFactory::createForShard(
pool = shard_info.pool, shard_num = shard_info.shard_num, modified_query, header = header, modified_query_ast,
&context, context_ptr, throttler,
main_table = main_table, table_func_ptr = table_func_ptr, scalars = scalars, external_tables = external_tables,
stage = processed_stage, local_delay, add_agg_info, add_totals, add_extremes]()
stage = processed_stage, local_delay, add_agg_info, add_totals, add_extremes, async_read]()
-> Pipe
{
auto current_settings = context.getSettingsRef();
@ -295,7 +296,7 @@ void SelectStreamFactory::createForShard(
auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
std::move(connections), modified_query, header, context, throttler, scalars, external_tables, stage);
return createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes);
return createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read);
}
};

View File

@ -50,6 +50,9 @@
#include <Interpreters/GlobalSubqueriesVisitor.h>
#include <Interpreters/GetAggregatesVisitor.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
namespace DB
{
@ -58,12 +61,14 @@ using LogAST = DebugASTLog<false>; /// set to true to enable logs
namespace ErrorCodes
{
extern const int UNKNOWN_TYPE_OF_AST_NODE;
extern const int UNKNOWN_IDENTIFIER;
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_PREWHERE;
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
extern const int UNKNOWN_IDENTIFIER;
extern const int UNKNOWN_TYPE_OF_AST_NODE;
}
namespace
@ -283,6 +288,8 @@ void ExpressionAnalyzer::analyzeAggregation()
{
aggregated_columns = temp_actions->getNamesAndTypesList();
}
has_window = makeWindowDescriptions(temp_actions);
}
@ -444,7 +451,11 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions)
auto it = index.find(name);
if (it == index.end())
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, "Unknown identifier (in aggregate function '{}'): {}", node->name, name);
{
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
"Unknown identifier '{}' in aggregate function '{}'",
name, node->formatForErrorMessage());
}
types[i] = (*it)->result_type;
aggregate.argument_names[i] = name;
@ -461,6 +472,128 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions)
}
bool ExpressionAnalyzer::makeWindowDescriptions(ActionsDAGPtr & actions)
{
// Convenient to check here because at least we have the Context.
if (!syntax->window_function_asts.empty() &&
!context.getSettingsRef().allow_experimental_window_functions)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"Window functions are not implemented (while processing '{}')",
syntax->window_function_asts[0]->formatForErrorMessage());
}
for (const ASTFunction * function_node : syntax->window_function_asts)
{
assert(function_node->is_window_function);
WindowDescription window_description;
window_description.window_name = function_node->getWindowDescription();
if (function_node->window_partition_by)
{
for (const auto & column_ast
: function_node->window_partition_by->children)
{
const auto * with_alias = dynamic_cast<const ASTWithAlias *>(
column_ast.get());
if (!with_alias)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Expected a column in PARTITION BY for window '{}',"
" got '{}'", window_description.window_name,
column_ast->formatForErrorMessage());
}
window_description.partition_by.push_back(
SortColumnDescription(
with_alias->getColumnName(), 1 /* direction */,
1 /* nulls_direction */));
}
}
if (function_node->window_order_by)
{
for (const auto & column_ast
: function_node->window_order_by->children)
{
// Parser should have checked that we have a proper element here.
const auto & order_by_element
= column_ast->as<ASTOrderByElement &>();
// Ignore collation for now.
window_description.order_by.push_back(
SortColumnDescription(
order_by_element.children.front()->getColumnName(),
order_by_element.direction,
order_by_element.nulls_direction));
}
}
window_description.full_sort_description = window_description.partition_by;
window_description.full_sort_description.insert(
window_description.full_sort_description.end(),
window_description.order_by.begin(),
window_description.order_by.end());
WindowFunctionDescription window_function;
window_function.function_node = function_node;
window_function.column_name
= window_function.function_node->getColumnName();
window_function.function_parameters
= window_function.function_node->parameters
? getAggregateFunctionParametersArray(
window_function.function_node->parameters)
: Array();
// Requiring a constant reference to a shared pointer to non-const AST
// doesn't really look sane, but the visitor does indeed require it.
// Hence we clone the node (not very sane either, I know).
getRootActionsNoMakeSet(window_function.function_node->clone(),
true, actions);
const ASTs & arguments
= window_function.function_node->arguments->children;
window_function.argument_types.resize(arguments.size());
window_function.argument_names.resize(arguments.size());
const auto & index = actions->getIndex();
for (size_t i = 0; i < arguments.size(); ++i)
{
const std::string & name = arguments[i]->getColumnName();
auto it = index.find(name);
if (it == index.end())
{
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
"Unknown identifier '{}' in window function '{}'",
name, window_function.function_node->formatForErrorMessage());
}
window_function.argument_types[i] = (*it)->result_type;
window_function.argument_names[i] = name;
}
AggregateFunctionProperties properties;
window_function.aggregate_function
= AggregateFunctionFactory::instance().get(
window_function.function_node->name,
window_function.argument_types,
window_function.function_parameters, properties);
auto [it, inserted] = window_descriptions.insert(
{window_description.window_name, window_description});
if (!inserted)
{
assert(it->second.full_sort_description
== window_description.full_sort_description);
}
it->second.window_functions.push_back(window_function);
}
return !syntax->window_function_asts.empty();
}
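A worked illustration (not commit code) of how identical windows are grouped by the map insert above:

    // SELECT sum(x) OVER (PARTITION BY a), count() OVER (PARTITION BY a) FROM t
    //
    // Both functions produce window_name == "PARTITION BY a", so the insert into
    // window_descriptions succeeds only for the first one. The second iteration
    // takes the '!inserted' branch, asserts that the full_sort_description
    // matches, and its WindowFunctionDescription is appended to the same
    // entry's window_functions, so both functions share one sorting pass.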
const ASTSelectQuery * ExpressionAnalyzer::getSelectQuery() const
{
const auto * select_query = query->as<ASTSelectQuery>();
@ -831,6 +964,65 @@ void SelectQueryExpressionAnalyzer::appendAggregateFunctionsArguments(Expression
getRootActions(argument, only_types, step.actions());
}
void SelectQueryExpressionAnalyzer::appendWindowFunctionsArguments(
ExpressionActionsChain & chain, bool /* only_types */)
{
ExpressionActionsChain::Step & step = chain.lastStep(aggregated_columns);
// 1) Add actions for window functions and their arguments;
// 2) Mark the columns that are really required.
for (const auto & [_, w] : window_descriptions)
{
for (const auto & f : w.window_functions)
{
// 1.1) arguments of window functions;
// Requiring a constant reference to a shared pointer to non-const AST
// doesn't really look sane, but the visitor does indeed require it.
getRootActionsNoMakeSet(f.function_node->clone(),
true /* no_subqueries */, step.actions());
// 1.2) result of window function: an empty INPUT.
// It is an aggregate function, so it won't be added by getRootActions.
// This is something of a hack. Other options:
// a] do it like aggregate function -- break the chain of actions
// and manually add window functions to the starting list of
// input columns. Logically this is similar to what we're doing
now, but would require splitting the window function processing
// into a full-fledged step after plain functions. This would be
// somewhat cumbersome. With INPUT hack we can avoid a separate
// step and pretend that window functions are almost "normal"
// select functions. The limitation of both these ways is that
// we can't reference window functions in other SELECT
// expressions.
// b] add a WINDOW action type, then sort, then split the chain on
// each WINDOW action and insert the Window pipeline between the
// Expression pipelines. This is a "proper" way that would allow
// us to depend on window functions in other functions. But it's
// complicated so I avoid doing it for now.
ColumnWithTypeAndName col;
col.type = f.aggregate_function->getReturnType();
col.column = col.type->createColumn();
col.name = f.column_name;
step.actions()->addInput(col);
for (const auto & a : f.function_node->arguments->children)
{
// 2.1) function arguments;
step.required_output.push_back(a->getColumnName());
}
// 2.2) function result;
step.required_output.push_back(f.column_name);
}
// 2.3) PARTITION BY and ORDER BY columns.
for (const auto & c : w.full_sort_description)
{
step.required_output.push_back(c.column_name);
}
}
}
bool SelectQueryExpressionAnalyzer::appendHaving(ExpressionActionsChain & chain, bool only_types)
{
const auto * select_query = getAggregatingQuery();
@ -855,7 +1047,9 @@ void SelectQueryExpressionAnalyzer::appendSelect(ExpressionActionsChain & chain,
getRootActions(select_query->select(), only_types, step.actions());
for (const auto & child : select_query->select()->children)
{
step.required_output.push_back(child->getColumnName());
}
}
ActionsDAGPtr SelectQueryExpressionAnalyzer::appendOrderBy(ExpressionActionsChain & chain, bool only_types, bool optimize_read_in_order,
@ -1076,6 +1270,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
: first_stage(first_stage_)
, second_stage(second_stage_)
, need_aggregate(query_analyzer.hasAggregation())
, has_window(query_analyzer.hasWindow())
{
/// first_stage: Do I need to perform the first part of the pipeline - running on remote servers during distributed processing.
/// second_stage: Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing.
@ -1225,6 +1420,9 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
/// If there is aggregation, we execute expressions in SELECT and ORDER BY on the initiating server, otherwise on the source servers.
query_analyzer.appendSelect(chain, only_types || (need_aggregate ? !second_stage : !first_stage));
query_analyzer.appendWindowFunctionsArguments(chain, only_types || !first_stage);
selected_columns = chain.getLastStep().required_output;
has_order_by = query.orderBy() != nullptr;
before_order_and_select = query_analyzer.appendOrderBy(
@ -1321,4 +1519,75 @@ void ExpressionAnalysisResult::checkActions() const
}
}
std::string ExpressionAnalysisResult::dump() const
{
WriteBufferFromOwnString ss;
ss << "need_aggregate " << need_aggregate << "\n";
ss << "has_order_by " << has_order_by << "\n";
ss << "has_window " << has_window << "\n";
if (before_array_join)
{
ss << "before_array_join " << before_array_join->dumpDAG() << "\n";
}
if (array_join)
{
ss << "array_join " << "FIXME doesn't have dump" << "\n";
}
if (before_join)
{
ss << "before_join " << before_join->dumpDAG() << "\n";
}
if (before_where)
{
ss << "before_where " << before_where->dumpDAG() << "\n";
}
if (prewhere_info)
{
ss << "prewhere_info " << prewhere_info->dump() << "\n";
}
if (filter_info)
{
ss << "filter_info " << filter_info->dump() << "\n";
}
if (before_aggregation)
{
ss << "before_aggregation " << before_aggregation->dumpDAG() << "\n";
}
if (before_having)
{
ss << "before_having " << before_having->dumpDAG() << "\n";
}
if (before_window)
{
ss << "before_window " << before_window->dumpDAG() << "\n";
}
if (before_order_and_select)
{
ss << "before_order_and_select " << before_order_and_select->dumpDAG() << "\n";
}
if (before_limit_by)
{
ss << "before_limit_by " << before_limit_by->dumpDAG() << "\n";
}
if (final_projection)
{
ss << "final_projection " << final_projection->dumpDAG() << "\n";
}
return ss.str();
}
}

View File

@ -60,6 +60,10 @@ struct ExpressionAnalyzerData
NamesAndTypesList aggregation_keys;
AggregateDescriptions aggregate_descriptions;
bool has_window = false;
WindowDescriptions window_descriptions;
NamesAndTypesList window_columns;
bool has_global_subqueries = false;
/// All new temporary tables obtained by performing the GLOBAL IN/JOIN subqueries.
@ -116,6 +120,9 @@ public:
/// Get intermediates for tests
const ExpressionAnalyzerData & getAnalyzedData() const { return *this; }
/// A list of windows for window functions.
const WindowDescriptions & windowDescriptions() const { return window_descriptions; }
protected:
ExpressionAnalyzer(
const ASTPtr & query_,
@ -159,6 +166,8 @@ protected:
void analyzeAggregation();
bool makeAggregateDescriptions(ActionsDAGPtr & actions);
bool makeWindowDescriptions(ActionsDAGPtr & actions);
const ASTSelectQuery * getSelectQuery() const;
bool isRemoteStorage() const { return syntax->is_remote_storage; }
@ -169,6 +178,8 @@ class SelectQueryExpressionAnalyzer;
/// Result of SelectQueryExpressionAnalyzer: expressions for InterpreterSelectQuery
struct ExpressionAnalysisResult
{
std::string dump() const;
/// Do I need to perform the first part of the pipeline - running on remote servers during distributed processing.
bool first_stage = false;
/// Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing.
@ -176,6 +187,7 @@ struct ExpressionAnalysisResult
bool need_aggregate = false;
bool has_order_by = false;
bool has_window = false;
bool remove_where_filter = false;
bool optimize_read_in_order = false;
@ -189,6 +201,7 @@ struct ExpressionAnalysisResult
ActionsDAGPtr before_where;
ActionsDAGPtr before_aggregation;
ActionsDAGPtr before_having;
ActionsDAGPtr before_window;
ActionsDAGPtr before_order_and_select;
ActionsDAGPtr before_limit_by;
ActionsDAGPtr final_projection;
@ -256,6 +269,7 @@ public:
/// Does the expression have aggregate functions or a GROUP BY or HAVING section.
bool hasAggregation() const { return has_aggregation; }
bool hasWindow() const { return has_window; }
bool hasGlobalSubqueries() { return has_global_subqueries; }
bool hasTableJoin() const { return syntax->ast_join; }
@ -326,6 +340,7 @@ private:
bool appendWhere(ExpressionActionsChain & chain, bool only_types);
bool appendGroupBy(ExpressionActionsChain & chain, bool only_types, bool optimize_aggregation_in_order, ManyExpressionActions &);
void appendAggregateFunctionsArguments(ExpressionActionsChain & chain, bool only_types);
void appendWindowFunctionsArguments(ExpressionActionsChain & chain, bool only_types);
/// After aggregation:
bool appendHaving(ExpressionActionsChain & chain, bool only_types);

View File

@ -19,9 +19,18 @@ void ExpressionInfoMatcher::visit(const ASTPtr & ast, Data & data)
void ExpressionInfoMatcher::visit(const ASTFunction & ast_function, const ASTPtr &, Data & data)
{
if (ast_function.name == "arrayJoin")
{
data.is_array_join = true;
else if (AggregateFunctionFactory::instance().isAggregateFunctionName(ast_function.name))
}
// "is_aggregate_function" doesn't mean much by itself. Apparently here it is
// used to move filters from HAVING to WHERE, and probably for this purpose
// an aggregate function calculated as a window function is not relevant.
else if (!ast_function.is_window_function
&& AggregateFunctionFactory::instance().isAggregateFunctionName(
ast_function.name))
{
data.is_aggregate_function = true;
}
else
{
const auto & function = FunctionFactory::instance().tryGet(ast_function.name, data.context);

View File

@ -19,8 +19,12 @@ public:
struct Data
{
const char * assert_no_aggregates = nullptr;
std::unordered_set<String> uniq_names;
std::vector<const ASTFunction *> aggregates;
const char * assert_no_windows = nullptr;
// Explicit empty initializers are needed to make designated initializers
// work on GCC 10.
std::unordered_set<String> uniq_names {};
std::vector<const ASTFunction *> aggregates {};
std::vector<const ASTFunction *> window_functions {};
};
static bool needChildVisit(const ASTPtr & node, const ASTPtr & child)
@ -28,8 +32,13 @@ public:
if (child->as<ASTSubquery>() || child->as<ASTSelectQuery>())
return false;
if (auto * func = node->as<ASTFunction>())
if (isAggregateFunction(func->name))
{
if (isAggregateFunction(*func)
|| func->is_window_function)
{
return false;
}
}
return true;
}
@ -42,33 +51,56 @@ public:
private:
static void visit(const ASTFunction & node, const ASTPtr &, Data & data)
{
if (!isAggregateFunction(node.name))
return;
if (isAggregateFunction(node))
{
if (data.assert_no_aggregates)
throw Exception("Aggregate function " + node.getColumnName() + " is found " + String(data.assert_no_aggregates) + " in query",
ErrorCodes::ILLEGAL_AGGREGATION);
if (data.assert_no_aggregates)
throw Exception("Aggregate function " + node.getColumnName() + " is found " + String(data.assert_no_aggregates) + " in query",
ErrorCodes::ILLEGAL_AGGREGATION);
String column_name = node.getColumnName();
if (data.uniq_names.count(column_name))
return;
String column_name = node.getColumnName();
if (data.uniq_names.count(column_name))
return;
data.uniq_names.insert(column_name);
data.aggregates.push_back(&node);
}
else if (node.is_window_function)
{
if (data.assert_no_windows)
throw Exception("Window function " + node.getColumnName() + " is found " + String(data.assert_no_windows) + " in query",
ErrorCodes::ILLEGAL_AGGREGATION);
data.uniq_names.insert(column_name);
data.aggregates.push_back(&node);
String column_name = node.getColumnName();
if (data.uniq_names.count(column_name))
return;
data.uniq_names.insert(column_name);
data.window_functions.push_back(&node);
}
}
static bool isAggregateFunction(const String & name)
static bool isAggregateFunction(const ASTFunction & node)
{
return AggregateFunctionFactory::instance().isAggregateFunctionName(name);
// Aggregate functions can also be calculated as window functions, but
// here we are interested in aggregate functions calculated in GROUP BY.
return !node.is_window_function
&& AggregateFunctionFactory::instance().isAggregateFunctionName(
node.name);
}
};
using GetAggregatesVisitor = GetAggregatesMatcher::Visitor;
inline void assertNoWindows(const ASTPtr & ast, const char * description)
{
GetAggregatesVisitor::Data data{.assert_no_windows = description};
GetAggregatesVisitor(data).visit(ast);
}
inline void assertNoAggregates(const ASTPtr & ast, const char * description)
{
GetAggregatesVisitor::Data data{description, {}, {}};
GetAggregatesVisitor::Data data{.assert_no_aggregates = description};
GetAggregatesVisitor(data).visit(ast);
}
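The explicit {} member initializers in Data exist so that call sites can use C++20 designated initializers, as the two helpers above now do; the old positional form Data{description, {}, {}} breaks silently when members are reordered or added. A usage sketch (where_ast is a placeholder ASTPtr):

    GetAggregatesVisitor::Data data{.assert_no_windows = "in WHERE"};
    GetAggregatesVisitor(data).visit(where_ast);
    // Throws ILLEGAL_AGGREGATION if 'where_ast' contains any window function;
    // plain aggregates are still collected into data.aggregates.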

View File

@ -35,36 +35,37 @@
#include <Interpreters/QueryAliasesVisitor.h>
#include <Processors/Pipe.h>
#include <Processors/Sources/SourceFromInputStream.h>
#include <Processors/Sources/NullSource.h>
#include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/Transforms/JoiningTransform.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <Processors/Transforms/FilterTransform.h>
#include <Processors/QueryPlan/ArrayJoinStep.h>
#include <Processors/QueryPlan/SettingQuotaAndLimitsStep.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/FilterStep.h>
#include <Processors/QueryPlan/ReadNothingStep.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/PartialSortingStep.h>
#include <Processors/QueryPlan/MergeSortingStep.h>
#include <Processors/QueryPlan/MergingSortedStep.h>
#include <Processors/QueryPlan/DistinctStep.h>
#include <Processors/QueryPlan/LimitByStep.h>
#include <Processors/QueryPlan/LimitStep.h>
#include <Processors/QueryPlan/MergingAggregatedStep.h>
#include <Processors/QueryPlan/AddingDelayedSourceStep.h>
#include <Processors/QueryPlan/AggregatingStep.h>
#include <Processors/QueryPlan/ArrayJoinStep.h>
#include <Processors/QueryPlan/CreatingSetsStep.h>
#include <Processors/QueryPlan/TotalsHavingStep.h>
#include <Processors/QueryPlan/RollupStep.h>
#include <Processors/QueryPlan/CubeStep.h>
#include <Processors/QueryPlan/FillingStep.h>
#include <Processors/QueryPlan/DistinctStep.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/ExtremesStep.h>
#include <Processors/QueryPlan/OffsetStep.h>
#include <Processors/QueryPlan/FillingStep.h>
#include <Processors/QueryPlan/FilterStep.h>
#include <Processors/QueryPlan/FinishSortingStep.h>
#include <Processors/QueryPlan/LimitByStep.h>
#include <Processors/QueryPlan/LimitStep.h>
#include <Processors/QueryPlan/MergeSortingStep.h>
#include <Processors/QueryPlan/MergingAggregatedStep.h>
#include <Processors/QueryPlan/MergingSortedStep.h>
#include <Processors/QueryPlan/OffsetStep.h>
#include <Processors/QueryPlan/PartialSortingStep.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/ReadNothingStep.h>
#include <Processors/QueryPlan/RollupStep.h>
#include <Processors/QueryPlan/SettingQuotaAndLimitsStep.h>
#include <Processors/QueryPlan/TotalsHavingStep.h>
#include <Processors/QueryPlan/WindowStep.h>
#include <Processors/Sources/NullSource.h>
#include <Processors/Sources/SourceFromInputStream.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/Transforms/FilterTransform.h>
#include <Processors/Transforms/JoiningTransform.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeWhereOptimizer.h>
@ -958,6 +959,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
else
{
executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
executeWindow(query_plan);
executeDistinct(query_plan, true, expressions.selected_columns, true);
}
@ -1004,6 +1006,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
executeHaving(query_plan, expressions.before_having);
executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
executeWindow(query_plan);
executeDistinct(query_plan, true, expressions.selected_columns, true);
}
@ -1749,6 +1752,58 @@ void InterpreterSelectQuery::executeExpression(QueryPlan & query_plan, const Act
}
void InterpreterSelectQuery::executeWindow(QueryPlan & query_plan)
{
for (const auto & [_, w] : query_analyzer->windowDescriptions())
{
const Settings & settings = context->getSettingsRef();
auto partial_sorting = std::make_unique<PartialSortingStep>(
query_plan.getCurrentDataStream(),
w.full_sort_description,
0 /* LIMIT */,
SizeLimits(settings.max_rows_to_sort, settings.max_bytes_to_sort,
settings.sort_overflow_mode));
partial_sorting->setStepDescription("Sort each block for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(partial_sorting));
auto merge_sorting_step = std::make_unique<MergeSortingStep>(
query_plan.getCurrentDataStream(),
w.full_sort_description,
settings.max_block_size,
0 /* LIMIT */,
settings.max_bytes_before_remerge_sort,
settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort,
context->getTemporaryVolume(),
settings.min_free_disk_space_for_temporary_data);
merge_sorting_step->setStepDescription("Merge sorted blocks for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(merge_sorting_step));
// First MergeSorted, now MergingSorted.
auto merging_sorted = std::make_unique<MergingSortedStep>(
query_plan.getCurrentDataStream(),
w.full_sort_description,
settings.max_block_size,
0 /* LIMIT */);
merging_sorted->setStepDescription("Merge sorted streams for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(merging_sorted));
auto window_step = std::make_unique<WindowStep>(
query_plan.getCurrentDataStream(),
w,
w.window_functions);
window_step->setStepDescription("Window step for window '"
+ w.window_name + "'");
query_plan.addStep(std::move(window_step));
}
}
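For each window, the steps added above chain into the query plan as follows (a sketch assembled from the step descriptions in the code):

    ... -> PartialSortingStep   "Sort each block for window 'W'"
        -> MergeSortingStep     "Merge sorted blocks for window 'W'"
        -> MergingSortedStep    "Merge sorted streams for window 'W'"
        -> WindowStep           "Window step for window 'W'"

That is, a full sort by full_sort_description, the same three-step scheme executeOrder uses for ORDER BY, followed by the window transform itself.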
void InterpreterSelectQuery::executeOrderOptimized(QueryPlan & query_plan, InputOrderInfoPtr input_sorting_info, UInt64 limit, SortDescription & output_order_descr)
{
const Settings & settings = context->getSettingsRef();
@ -1795,9 +1850,13 @@ void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfo
/// Merge the sorted blocks.
auto merge_sorting_step = std::make_unique<MergeSortingStep>(
query_plan.getCurrentDataStream(),
output_order_descr, settings.max_block_size, limit,
settings.max_bytes_before_remerge_sort, settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort, context->getTemporaryVolume(),
output_order_descr,
settings.max_block_size,
limit,
settings.max_bytes_before_remerge_sort,
settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort,
context->getTemporaryVolume(),
settings.min_free_disk_space_for_temporary_data);
merge_sorting_step->setStepDescription("Merge sorted blocks for ORDER BY");

View File

@ -120,6 +120,8 @@ private:
void executeTotalsAndHaving(QueryPlan & query_plan, bool has_having, const ActionsDAGPtr & expression, bool overflow_row, bool final);
void executeHaving(QueryPlan & query_plan, const ActionsDAGPtr & expression);
static void executeExpression(QueryPlan & query_plan, const ActionsDAGPtr & expression, const std::string & description);
/// FIXME should go through ActionsDAG to behave as a proper function
void executeWindow(QueryPlan & query_plan);
void executeOrder(QueryPlan & query_plan, InputOrderInfoPtr sorting_info);
void executeOrderOptimized(QueryPlan & query_plan, InputOrderInfoPtr sorting_info, UInt64 limit, SortDescription & output_order_descr);
void executeWithFill(QueryPlan & query_plan);

View File

@ -43,9 +43,14 @@ public:
if (group_by_function_hashes.count(key))
return false;
/// if ORDER BY contains aggregate function it shouldn't be optimized
if (AggregateFunctionFactory::instance().isAggregateFunctionName(ast_function.name))
/// if ORDER BY contains an aggregate or a window function, it
/// shouldn't be optimized
if (ast_function.is_window_function
|| AggregateFunctionFactory::instance().isAggregateFunctionName(
ast_function.name))
{
return false;
}
return true;
}

View File

@ -38,8 +38,16 @@ bool extractIdentifiers(const ASTFunction & func, std::unordered_set<ASTPtr *> &
if (arg_func->name == "lambda")
return false;
if (AggregateFunctionFactory::instance().isAggregateFunctionName(arg_func->name))
// We are looking for identifiers inside a function calculated inside
// the aggregate function `any()`. A window or aggregate function can't
// be inside `any`, but this check in GetAggregatesMatcher happens
// later, so we have to explicitly skip these nested functions here.
if (arg_func->is_window_function
|| AggregateFunctionFactory::instance().isAggregateFunctionName(
arg_func->name))
{
return false;
}
if (!extractIdentifiers(*arg_func, identifiers))
return false;

View File

@ -439,12 +439,46 @@ std::vector<const ASTFunction *> getAggregates(ASTPtr & query, const ASTSelectQu
/// There cannot be other aggregate functions or window functions within the aggregate functions.
for (const ASTFunction * node : data.aggregates)
{
if (node->arguments)
{
for (auto & arg : node->arguments->children)
{
assertNoAggregates(arg, "inside another aggregate function");
assertNoWindows(arg, "inside an aggregate function");
}
}
}
return data.aggregates;
}
std::vector<const ASTFunction *> getWindowFunctions(ASTPtr & query, const ASTSelectQuery & select_query)
{
/// There cannot be window functions inside the WHERE and PREWHERE.
if (select_query.where())
assertNoWindows(select_query.where(), "in WHERE");
if (select_query.prewhere())
assertNoWindows(select_query.prewhere(), "in PREWHERE");
GetAggregatesVisitor::Data data;
GetAggregatesVisitor(data).visit(query);
/// There cannot be aggregate functions or other window functions within the window functions.
for (const ASTFunction * node : data.window_functions)
{
if (node->arguments)
{
for (auto & arg : node->arguments->children)
{
assertNoAggregates(arg, "inside a window function");
assertNoWindows(arg, "inside another window function");
}
}
}
return data.window_functions;
}
}
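For illustration, here are queries these nesting checks reject (hedged examples, not from the commit):

    // SELECT count() OVER () FROM t WHERE count() OVER () > 0  -- window function in WHERE
    // SELECT sum(count() OVER ()) FROM t                       -- window function inside an aggregate function
    // SELECT sum(max(x)) OVER () FROM t                        -- aggregate function inside a window function
    // SELECT sum(count() OVER ()) OVER () FROM t               -- window function inside another window function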
TreeRewriterResult::TreeRewriterResult(
@ -640,14 +674,24 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
for (const auto & name : columns_context.requiredColumns())
ss << " '" << name << "'";
if (!source_column_names.empty())
if (storage)
{
ss << ", source columns:";
for (const auto & name : source_column_names)
ss << " '" << name << "'";
ss << ", maybe you meant: ";
for (const auto & name : columns_context.requiredColumns())
{
auto hints = storage->getHints(name);
if (!hints.empty())
ss << " '" << toString(hints) << "'";
}
}
else
ss << ", no source columns";
{
if (!source_column_names.empty())
for (const auto & name : columns_context.requiredColumns())
ss << " '" << name << "'";
else
ss << ", no source columns";
}
if (columns_context.has_table_join)
{
@ -733,6 +777,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
collectJoinedColumns(*result.analyzed_join, *select_query, tables_with_columns, result.aliases);
result.aggregates = getAggregates(query, *select_query);
result.window_function_asts = getWindowFunctions(query, *select_query);
result.collectUsedColumns(query, true);
result.ast_join = select_query->join();

View File

@ -35,6 +35,8 @@ struct TreeRewriterResult
Aliases aliases;
std::vector<const ASTFunction *> aggregates;
std::vector<const ASTFunction *> window_function_asts;
/// Which column is needed to be ARRAY-JOIN'ed to get the specified.
/// For example, for `SELECT s.v ... ARRAY JOIN a AS s` will get "s.v" -> "a.v".
NameToNameMap array_join_result_to_source;

View File

@ -32,6 +32,9 @@ target_link_libraries (string_hash_map_aggregation PRIVATE dbms)
add_executable (string_hash_set string_hash_set.cpp)
target_link_libraries (string_hash_set PRIVATE dbms)
add_executable (context context.cpp)
target_link_libraries (context PRIVATE dbms)
add_executable (two_level_hash_map two_level_hash_map.cpp)
target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR})
target_link_libraries (two_level_hash_map PRIVATE dbms)

View File

@ -0,0 +1,90 @@
#include <iostream>
/// #define BOOST_USE_UCONTEXT
#include <Common/Fiber.h>
// #include <boost/context/pooled_fixedsize_stack.hpp>
// #include <boost/context/segmented_stack.hpp>
#include <Common/Exception.h>
#include <Common/FiberStack.h>
void __attribute__((__noinline__)) foo(std::exception_ptr exception)
{
if (exception)
std::rethrow_exception(exception);
}
void __attribute__((__noinline__)) bar(int a)
{
std::cout << StackTrace().toString() << std::endl;
if (a > 0)
throw DB::Exception(0, "hello");
}
void __attribute__((__noinline__)) gar(int a)
{
char buf[1024];
buf[1023] = a & 255;
if (a > 2)
return gar(a - 1);
else
bar(a);
}
int main(int, char **)
try {
namespace ctx=boost::context;
int a;
std::exception_ptr exception;
// ctx::protected_fixedsize allocator
// ctx::pooled_fixedsize_stack(1024 * 64 + 2 * 2 * 1024 * 1024 * 16, 1)
ctx::fiber source{std::allocator_arg_t(), FiberStack(), [&](ctx::fiber&& sink)
{
a = 0;
int b = 1;
for (size_t i = 0; i < 9; ++i)
{
sink = std::move(sink).resume();
int next = a + b;
a = b;
b = next;
}
try
{
gar(1024);
}
catch (...)
{
std::cout << "Saving exception\n";
exception = std::current_exception();
}
return std::move(sink);
}};
for (int j = 0; j < 10; ++j)
{
try
{
source = std::move(source).resume();
}
catch (DB::Exception & e)
{
std::cout << "Caught exception in resume " << e.getStackTraceString() << std::endl;
}
std::cout << a << " ";
}
std::cout << std::endl;
try
{
foo(exception);
}
catch (const DB::Exception & e)
{
std::cout << e.getStackTraceString() << std::endl;
}
}
catch (...)
{
std::cerr << "Uncaught exception\n";
}

View File

@ -1,14 +1,15 @@
#include <Common/typeid_cast.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTWithAlias.h>
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTExpressionList.h>
#include <IO/WriteHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <Common/SipHash.h>
#include <IO/Operators.h>
#include <Common/SipHash.h>
#include <Common/typeid_cast.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTWithAlias.h>
namespace DB
{
@ -54,6 +55,21 @@ ASTPtr ASTFunction::clone() const
if (arguments) { res->arguments = arguments->clone(); res->children.push_back(res->arguments); }
if (parameters) { res->parameters = parameters->clone(); res->children.push_back(res->parameters); }
if (window_name)
{
res->set(res->window_name, window_name->clone());
}
if (window_partition_by)
{
res->set(res->window_partition_by, window_partition_by->clone());
}
if (window_order_by)
{
res->set(res->window_order_by, window_order_by->clone());
}
return res;
}
@ -411,44 +427,91 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format
}
}
if (!written)
if (written)
{
settings.ostr << (settings.hilite ? hilite_function : "") << name;
return;
}
if (parameters)
settings.ostr << (settings.hilite ? hilite_function : "") << name;
if (parameters)
{
settings.ostr << '(' << (settings.hilite ? hilite_none : "");
parameters->formatImpl(settings, state, nested_dont_need_parens);
settings.ostr << (settings.hilite ? hilite_function : "") << ')';
}
if ((arguments && !arguments->children.empty()) || !no_empty_args)
settings.ostr << '(' << (settings.hilite ? hilite_none : "");
if (arguments)
{
bool special_hilite_regexp = settings.hilite
&& (name == "match" || name == "extract" || name == "extractAll" || name == "replaceRegexpOne"
|| name == "replaceRegexpAll");
for (size_t i = 0, size = arguments->children.size(); i < size; ++i)
{
settings.ostr << '(' << (settings.hilite ? hilite_none : "");
parameters->formatImpl(settings, state, nested_dont_need_parens);
settings.ostr << (settings.hilite ? hilite_function : "") << ')';
if (i != 0)
settings.ostr << ", ";
bool special_hilite = false;
if (i == 1 && special_hilite_regexp)
special_hilite = highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-");
if (!special_hilite)
arguments->children[i]->formatImpl(settings, state, nested_dont_need_parens);
}
}
if ((arguments && !arguments->children.empty()) || !no_empty_args)
settings.ostr << '(' << (settings.hilite ? hilite_none : "");
if ((arguments && !arguments->children.empty()) || !no_empty_args)
settings.ostr << (settings.hilite ? hilite_function : "") << ')';
if (arguments)
{
bool special_hilite_regexp = settings.hilite
&& (name == "match" || name == "extract" || name == "extractAll" || name == "replaceRegexpOne"
|| name == "replaceRegexpAll");
settings.ostr << (settings.hilite ? hilite_none : "");
for (size_t i = 0, size = arguments->children.size(); i < size; ++i)
{
if (i != 0)
settings.ostr << ", ";
if (!is_window_function)
{
return;
}
bool special_hilite = false;
if (i == 1 && special_hilite_regexp)
special_hilite = highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-");
settings.ostr << " OVER (";
appendWindowDescription(settings, state, nested_dont_need_parens);
settings.ostr << ")";
}
if (!special_hilite)
arguments->children[i]->formatImpl(settings, state, nested_dont_need_parens);
}
}
std::string ASTFunction::getWindowDescription() const
{
WriteBufferFromOwnString ostr;
FormatSettings settings{ostr, true /* one_line */};
FormatState state;
FormatStateStacked frame;
appendWindowDescription(settings, state, frame);
return ostr.str();
}
if ((arguments && !arguments->children.empty()) || !no_empty_args)
settings.ostr << (settings.hilite ? hilite_function : "") << ')';
void ASTFunction::appendWindowDescription(const FormatSettings & settings,
FormatState & state, FormatStateStacked frame) const
{
if (!is_window_function)
{
return;
}
settings.ostr << (settings.hilite ? hilite_none : "");
if (window_partition_by)
{
settings.ostr << "PARTITION BY ";
window_partition_by->formatImpl(settings, state, frame);
}
if (window_partition_by && window_order_by)
{
settings.ostr << " ";
}
if (window_order_by)
{
settings.ostr << "ORDER BY ";
window_order_by->formatImpl(settings, state, frame);
}
}

View File

@ -8,6 +8,8 @@
namespace DB
{
class ASTIdentifier;
/** AST for function application or operator.
*/
class ASTFunction : public ASTWithAlias
@ -18,6 +20,11 @@ public:
/// parameters - for parametric aggregate function. Example: quantile(0.9)(x) - what in first parens are 'parameters'.
ASTPtr parameters;
bool is_window_function = false;
ASTIdentifier * window_name = nullptr;
ASTExpressionList * window_partition_by = nullptr;
ASTExpressionList * window_order_by = nullptr;
/// do not print empty parentheses if there are no args - compatibility with new AST for data types and engine names.
bool no_empty_args = false;
@ -32,6 +39,11 @@ public:
ASTPtr toLiteral() const; // Try to convert functions like Array or Tuple to a literal form.
void appendWindowDescription(const FormatSettings & settings,
FormatState & state, FormatStateStacked frame) const;
std::string getWindowDescription() const;
protected:
void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
void appendColumnNameImpl(WriteBuffer & ostr) const override;

View File

@ -1,6 +1,8 @@
#include <Parsers/ASTIndexDeclaration.h>
#include <Common/quoteString.h>
#include <IO/Operators.h>
#include <Parsers/ASTFunction.h>
namespace DB

View File

@ -1,12 +1,12 @@
#pragma once
#include <Parsers/ASTFunction.h>
#include <Parsers/IAST.h>
namespace DB
{
class ASTFunction;
/** name BY expr TYPE typename(args) GRANULARITY int in create query
*/
class ASTIndexDeclaration : public IAST

View File

@ -263,6 +263,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
ParserKeyword distinct("DISTINCT");
ParserExpressionList contents(false);
ParserSelectWithUnionQuery select;
ParserKeyword over("OVER");
bool has_distinct_modifier = false;
@ -382,10 +383,96 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
function_node->children.push_back(function_node->parameters);
}
if (over.ignore(pos, expected))
{
function_node->is_window_function = true;
// We are slightly breaking the parser interface by parsing the window
// definition into an existing ASTFunction. Normally it would take a
// reference to ASTPtr and assign it the new node. We only have a pointer
// of a different type, hence this workaround with a temporary pointer.
ASTPtr function_node_as_iast = function_node;
ParserWindowDefinition window_definition;
if (!window_definition.parse(pos, function_node_as_iast, expected))
{
return false;
}
}
node = function_node;
return true;
}
bool ParserWindowDefinition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
ASTFunction * function = dynamic_cast<ASTFunction *>(node.get());
// Variant 1:
// function_name ( * ) OVER window_name
// FIXME doesn't work anyway for now -- never used anywhere, window names
// can't be defined, and TreeRewriter thinks the window name is a column so
// the query fails.
if (pos->type != TokenType::OpeningRoundBracket)
{
ASTPtr window_name_ast;
ParserIdentifier window_name_parser;
if (window_name_parser.parse(pos, window_name_ast, expected))
{
function->set(function->window_name, window_name_ast);
return true;
}
else
{
return false;
}
}
++pos;
// Variant 2:
// function_name ( * ) OVER ( window_definition )
ParserKeyword keyword_partition_by("PARTITION BY");
ParserNotEmptyExpressionList columns_partition_by(
false /* we don't allow declaring aliases here */);
ParserKeyword keyword_order_by("ORDER BY");
ParserOrderByExpressionList columns_order_by;
if (keyword_partition_by.ignore(pos, expected))
{
ASTPtr partition_by_ast;
if (columns_partition_by.parse(pos, partition_by_ast, expected))
{
function->set(function->window_partition_by, partition_by_ast);
}
else
{
return false;
}
}
if (keyword_order_by.ignore(pos, expected))
{
ASTPtr order_by_ast;
if (columns_order_by.parse(pos, order_by_ast, expected))
{
function->set(function->window_order_by, order_by_ast);
}
else
{
return false;
}
}
if (pos->type != TokenType::ClosingRoundBracket)
{
expected.add(pos, "')'");
return false;
}
++pos;
return true;
}
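A hedged driver sketch of the OVER branch end to end; the Tokens and IParser::Pos constructor signatures are assumed from the parser test utilities and may differ:

    String query = "sum(x) OVER (PARTITION BY a ORDER BY b DESC)";
    Tokens tokens(query.data(), query.data() + query.size());
    IParser::Pos pos(tokens, /* max_depth = */ 1000);
    Expected expected;
    ASTPtr ast;

    ParserFunction parser;
    bool ok = parser.parse(pos, ast, expected);  // true on success

    const auto & fn = ast->as<ASTFunction &>();
    // fn.is_window_function  == true
    // fn.window_partition_by -> expression list with 'a'
    // fn.window_order_by     -> order-by list with 'b DESC'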
bool ParserCodecDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
return ParserList(std::make_unique<ParserIdentifierWithOptionalParameters>(),

View File

@ -156,6 +156,13 @@ protected:
bool allow_function_parameters;
};
// Window definition (the thing that goes after OVER) for window function.
class ParserWindowDefinition : public IParserBase
{
const char * getName() const override { return "window definition"; }
bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
};
class ParserCodecDeclarationList : public IParserBase
{
protected:

View File

@ -161,4 +161,11 @@ void IAST::dumpTree(WriteBuffer & ostr, size_t indent) const
}
}
std::string IAST::dumpTree(size_t indent) const
{
WriteBufferFromOwnString wb;
dumpTree(wb, indent);
return wb.str();
}
}

View File

@ -19,9 +19,6 @@ namespace DB
namespace ErrorCodes
{
extern const int NOT_A_COLUMN;
extern const int UNKNOWN_TYPE_OF_AST_NODE;
extern const int UNKNOWN_ELEMENT_IN_AST;
extern const int LOGICAL_ERROR;
}
@ -46,7 +43,7 @@ public:
String getColumnName() const;
virtual void appendColumnName(WriteBuffer &) const
{
throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::NOT_A_COLUMN);
throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::LOGICAL_ERROR);
}
/** Get the alias, if any, or the canonical name of the column, if it is not. */
@ -58,7 +55,7 @@ public:
/** Set the alias. */
virtual void setAlias(const String & /*to*/)
{
throw Exception("Can't set alias of " + getColumnName(), ErrorCodes::UNKNOWN_TYPE_OF_AST_NODE);
throw Exception("Can't set alias of " + getColumnName(), ErrorCodes::LOGICAL_ERROR);
}
/** Get the text that identifies this element. */
@ -77,6 +74,7 @@ public:
virtual void updateTreeHashImpl(SipHash & hash_state) const;
void dumpTree(WriteBuffer & ostr, size_t indent = 0) const;
std::string dumpTree(size_t indent = 0) const;
/** Check the depth of the tree.
* If max_depth is specified and the depth is greater - throw an exception.
@ -160,6 +158,7 @@ public:
bool always_quote_identifiers = false;
IdentifierQuotingStyle identifier_quoting_style = IdentifierQuotingStyle::Backticks;
// Newline or whitespace.
char nl_or_ws;
FormatSettings(WriteBuffer & ostr_, bool one_line_)
@ -208,7 +207,7 @@ public:
virtual void formatImpl(const FormatSettings & /*settings*/, FormatState & /*state*/, FormatStateStacked /*frame*/) const
{
throw Exception("Unknown element in AST: " + getID(), ErrorCodes::UNKNOWN_ELEMENT_IN_AST);
throw Exception("Unknown element in AST: " + getID(), ErrorCodes::LOGICAL_ERROR);
}
// A simple way to add some user-readable context to an error message.

View File

@ -2,6 +2,7 @@
#include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/ASTConstraintDeclaration.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIndexDeclaration.h>
#include <Parsers/New/AST/ColumnExpr.h>
#include <Parsers/New/AST/ColumnTypeExpr.h>

View File

@ -164,7 +164,7 @@ bool PipelineExecutor::expandPipeline(Stack & stack, UInt64 pid)
return true;
}
bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, size_t thread_number)
bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, size_t thread_number)
{
/// In this method we have ownership on edge, but node can be concurrently accessed.
@ -185,7 +185,7 @@ bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & ed
if (status == ExecutingGraph::ExecStatus::Idle)
{
node.status = ExecutingGraph::ExecStatus::Preparing;
return prepareProcessor(edge.to, thread_number, queue, std::move(lock));
return prepareProcessor(edge.to, thread_number, queue, async_queue, std::move(lock));
}
else
graph->nodes[edge.to]->processor->onUpdatePorts();
@ -193,7 +193,7 @@ bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & ed
return true;
}
bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, std::unique_lock<std::mutex> node_lock)
bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, Queue & async_queue, std::unique_lock<std::mutex> node_lock)
{
/// In this method we have ownership on node.
auto & node = *graph->nodes[pid];
@ -248,15 +248,9 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue
}
case IProcessor::Status::Async:
{
throw Exception("Async is temporary not supported.", ErrorCodes::LOGICAL_ERROR);
// node.status = ExecStatus::Executing;
// addAsyncJob(pid);
// break;
}
case IProcessor::Status::Wait:
{
throw Exception("Wait is temporary not supported.", ErrorCodes::LOGICAL_ERROR);
node.status = ExecutingGraph::ExecStatus::Executing;
async_queue.push(&node);
break;
}
case IProcessor::Status::ExpandPipeline:
{
@ -288,13 +282,13 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue
{
for (auto & edge : updated_direct_edges)
{
if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number))
if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_number))
return false;
}
for (auto & edge : updated_back_edges)
{
if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number))
if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_number))
return false;
}
}
@ -325,7 +319,7 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue
while (!stack.empty())
{
auto item = stack.top();
if (!prepareProcessor(item, thread_number, queue, std::unique_lock<std::mutex>(graph->nodes[item]->status_mutex)))
if (!prepareProcessor(item, thread_number, queue, async_queue, std::unique_lock<std::mutex>(graph->nodes[item]->status_mutex)))
return false;
stack.pop();
@ -378,6 +372,7 @@ void PipelineExecutor::finish()
{
std::lock_guard lock(task_queue_mutex);
finished = true;
async_task_queue.finish();
}
std::lock_guard guard(executor_contexts_mutex);
@ -502,11 +497,21 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
{
std::unique_lock lock(task_queue_mutex);
if (!task_queue.empty())
if (!context->async_tasks.empty())
{
node = context->async_tasks.front();
context->async_tasks.pop();
--num_waiting_async_tasks;
if (context->async_tasks.empty())
context->has_async_tasks = false;
}
else if (!task_queue.empty())
node = task_queue.pop(thread_num);
if (!task_queue.empty() && !threads_queue.empty() /*&& task_queue.quota() > threads_queue.size()*/)
if (node)
{
if (!task_queue.empty() && !threads_queue.empty())
{
auto thread_to_wake = task_queue.getAnyThreadWithTasks(thread_num + 1 == num_threads ? 0 : (thread_num + 1));
@ -522,13 +527,26 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
break;
}
if (threads_queue.size() + 1 == num_threads)
if (threads_queue.size() + 1 == num_threads && async_task_queue.empty() && num_waiting_async_tasks == 0)
{
lock.unlock();
finish();
break;
}
#if defined(OS_LINUX)
if (num_threads == 1)
{
/// If we execute in a single thread, wait for async tasks here.
auto res = async_task_queue.wait(lock);
if (!res)
throw Exception("Empty task was returned from async task queue", ErrorCodes::LOGICAL_ERROR);
node = static_cast<ExecutingGraph::Node *>(res.data);
break;
}
#endif
threads_queue.push(thread_num);
}
@ -579,6 +597,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
/// Try to execute neighbour processor.
{
Queue queue;
Queue async_queue;
++num_processing_executors;
while (auto * task = expand_pipeline_task.load())
@ -587,31 +606,39 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
/// Prepare processor after execution.
{
auto lock = std::unique_lock<std::mutex>(node->status_mutex);
if (!prepareProcessor(node->processors_id, thread_num, queue, std::move(lock)))
if (!prepareProcessor(node->processors_id, thread_num, queue, async_queue, std::move(lock)))
finish();
}
node = nullptr;
/// Take local task from queue if has one.
if (!queue.empty())
if (!queue.empty() && !context->has_async_tasks)
{
node = queue.front();
queue.pop();
}
/// Push other tasks to global queue.
if (!queue.empty())
if (!queue.empty() || !async_queue.empty())
{
std::unique_lock lock(task_queue_mutex);
#if defined(OS_LINUX)
while (!async_queue.empty() && !finished)
{
async_task_queue.addTask(thread_num, async_queue.front(), async_queue.front()->processor->schedule());
async_queue.pop();
}
#endif
while (!queue.empty() && !finished)
{
task_queue.push(queue.front(), thread_num);
queue.pop();
}
if (!threads_queue.empty() && !finished /* && task_queue.quota() > threads_queue.size()*/)
if (!threads_queue.empty() && !task_queue.empty() && !finished)
{
auto thread_to_wake = task_queue.getAnyThreadWithTasks(thread_num + 1 == num_threads ? 0 : (thread_num + 1));
@ -669,6 +696,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads)
std::lock_guard lock(task_queue_mutex);
Queue queue;
Queue async_queue;
size_t next_thread = 0;
while (!stack.empty())
@ -676,7 +704,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads)
UInt64 proc = stack.top();
stack.pop();
prepareProcessor(proc, 0, queue, std::unique_lock<std::mutex>(graph->nodes[proc]->status_mutex));
prepareProcessor(proc, 0, queue, async_queue, std::unique_lock<std::mutex>(graph->nodes[proc]->status_mutex));
while (!queue.empty())
{
@ -687,6 +715,10 @@ void PipelineExecutor::initializeExecution(size_t num_threads)
if (next_thread >= num_threads)
next_thread = 0;
}
while (!async_queue.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Async is only possible after work() call. Processor {}",
async_queue.front()->processor->getName());
}
}
}
@ -747,6 +779,26 @@ void PipelineExecutor::executeImpl(size_t num_threads)
});
}
#if defined(OS_LINUX)
{
/// Wait for async tasks.
std::unique_lock lock(task_queue_mutex);
while (auto task = async_task_queue.wait(lock))
{
auto * node = static_cast<ExecutingGraph::Node *>(task.data);
executor_contexts[task.thread_num]->async_tasks.push(node);
executor_contexts[task.thread_num]->has_async_tasks = true;
++num_waiting_async_tasks;
if (threads_queue.has(task.thread_num))
{
threads_queue.pop(task.thread_num);
wakeUpExecutor(task.thread_num);
}
}
}
#endif
for (auto & thread : threads)
if (thread.joinable())
thread.join();
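Putting the pieces together, the async task life cycle introduced in this file looks like this (a sketch):

    prepare() returns Async
        -> the node goes to the thread-local async_queue
        -> async_task_queue.addTask(thread_num, node, processor->schedule())
    epoll reports the descriptor ready
        -> async_task_queue.wait() returns the TaskData (in the loop above, or
           directly in executeStepImpl when num_threads == 1)
        -> the node is pushed to executor_contexts[thread_num]->async_tasks
        -> the owning thread is woken up and eventually calls work()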

View File

@ -1,6 +1,7 @@
#pragma once
#include <Processors/IProcessor.h>
#include <Processors/Executors/PollingQueue.h>
#include <Processors/Executors/ThreadsQueue.h>
#include <Processors/Executors/TasksQueue.h>
#include <Processors/Executors/ExecutingGraph.h>
@ -57,6 +58,12 @@ private:
/// Stores processors need to be prepared. Preparing status is already set for them.
TaskQueue<ExecutingGraph::Node> task_queue;
/// Queue which stores tasks where processors returned Async status after prepare.
/// If multiple threads are used, the main thread waits for async tasks.
/// With a single thread, the executor waits for async tasks only when task_queue is empty.
PollingQueue async_task_queue;
size_t num_waiting_async_tasks = 0;
ThreadsQueue threads_queue;
std::mutex task_queue_mutex;
@ -90,6 +97,9 @@ private:
/// This can be solved by using atomic shard ptr.
std::list<ExpandPipelineTask> task_list;
std::queue<ExecutingGraph::Node *> async_tasks;
std::atomic_bool has_async_tasks = false;
std::condition_variable condvar;
std::mutex mutex;
bool wake_flag = false;
@ -126,14 +136,14 @@ private:
/// Pipeline execution related methods.
void addChildlessProcessorsToStack(Stack & stack);
bool tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, size_t thread_number);
bool tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, size_t thread_number);
static void addJob(ExecutingGraph::Node * execution_state);
// TODO: void addAsyncJob(UInt64 pid);
/// Prepare processor with pid number.
/// Check parents and children of current processor and push them to stacks if they also need to be prepared.
/// If processor wants to be expanded, ExpandPipelineTask from thread_number's execution context will be used.
bool prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, std::unique_lock<std::mutex> node_lock);
bool prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, Queue & async_queue, std::unique_lock<std::mutex> node_lock);
bool doExpandPipeline(ExpandPipelineTask * task, bool processing);
/// Continue executor (in case there are tasks in queue).

View File

@ -0,0 +1,115 @@
#include <Processors/Executors/PollingQueue.h>
#if defined(OS_LINUX)
#include <Common/Exception.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_OPEN_FILE;
extern const int CANNOT_READ_FROM_SOCKET;
extern const int LOGICAL_ERROR;
}
PollingQueue::PollingQueue()
{
epoll_fd = epoll_create(1);
if (-1 == epoll_fd)
throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE);
if (-1 == pipe2(pipe_fd, O_NONBLOCK))
throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE);
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.ptr = pipe_fd;
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event))
throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
PollingQueue::~PollingQueue()
{
close(epoll_fd);
close(pipe_fd[0]);
close(pipe_fd[1]);
}
void PollingQueue::addTask(size_t thread_number, void * data, int fd)
{
std::uintptr_t key = reinterpret_cast<uintptr_t>(data);
if (tasks.count(key))
throw Exception("Task was already added to task queue", ErrorCodes::LOGICAL_ERROR);
tasks[key] = TaskData{thread_number, data, fd};
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.ptr = data;
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &socket_event))
throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
PollingQueue::TaskData PollingQueue::wait(std::unique_lock<std::mutex> & lock)
{
if (is_finished)
return {};
lock.unlock();
epoll_event event;
event.data.ptr = nullptr;
int num_events = 0;
while (num_events == 0)
{
num_events = epoll_wait(epoll_fd, &event, 1, 0);
if (num_events == -1)
throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
lock.lock();
if (event.data.ptr == pipe_fd)
return {};
std::uintptr_t key = reinterpret_cast<uintptr_t>(event.data.ptr);
auto it = tasks.find(key);
if (it == tasks.end())
throw Exception("Task was not found in task queue", ErrorCodes::LOGICAL_ERROR);
auto res = it->second;
tasks.erase(it);
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, res.fd, &event))
throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
return res;
}
void PollingQueue::finish()
{
is_finished = true;
tasks.clear();
uint64_t buf = 0;
while (-1 == write(pipe_fd[1], &buf, sizeof(buf)))
{
if (errno == EAGAIN)
break;
if (errno != EINTR)
throwFromErrno("Cannot write to pipe", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
}
}
#endif
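A minimal standalone sketch of the same epoll-plus-pipe wakeup pattern used above (assuming Linux; the single-shot flow and all names are illustrative, not part of this diff):

#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
#include <cstdint>
#include <cstdio>

int main()
{
    int epoll_fd = epoll_create1(0);
    int wakeup[2];
    if (epoll_fd == -1 || pipe2(wakeup, O_NONBLOCK) == -1)
        return 1;

    epoll_event ev{};
    ev.events = EPOLLIN;
    ev.data.ptr = wakeup;  /// Sentinel: the pipe itself, like pipe_fd above.
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, wakeup[0], &ev);

    /// Another thread would interrupt the wait by writing a token to the pipe,
    /// exactly what finish() does above.
    uint64_t token = 1;
    write(wakeup[1], &token, sizeof(token));

    epoll_event out{};
    int num_events = epoll_wait(epoll_fd, &out, 1, -1);  /// Blocks until a descriptor is readable.
    if (num_events == 1 && out.data.ptr == wakeup)
        puts("woken up: finish() was called, nothing to return");

    close(wakeup[0]);
    close(wakeup[1]);
    close(epoll_fd);
    return 0;
}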

View File

@ -0,0 +1,60 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <atomic>
#include <unordered_map>
namespace DB
{
#if defined(OS_LINUX)
/// This queue is used to poll descriptors. Generally, just a wrapper over epoll.
class PollingQueue
{
public:
struct TaskData
{
size_t thread_num;
void * data = nullptr;
int fd = -1;
explicit operator bool() const { return data; }
};
private:
int epoll_fd;
int pipe_fd[2];
std::atomic_bool is_finished = false;
std::unordered_map<std::uintptr_t, TaskData> tasks;
public:
PollingQueue();
~PollingQueue();
size_t size() const { return tasks.size(); }
bool empty() const { return tasks.empty(); }
/// Add new task to queue.
void addTask(size_t thread_number, void * data, int fd);
/// Wait for any descriptor. Blocks if the queue has no descriptors.
/// Returns the task which was inserted into the queue, or an empty TaskData if finish() was called.
/// Lock is unlocked during waiting.
TaskData wait(std::unique_lock<std::mutex> & lock);
/// Interrupt waiting.
void finish();
};
#else
class PollingQueue
{
public:
bool empty() { return true; }
void finish() {}
};
#endif
}

View File

@ -56,7 +56,6 @@ protected:
case IProcessor::Status::NeedData:
case IProcessor::Status::Async:
case IProcessor::Status::Wait:
case IProcessor::Status::ExpandPipeline:
throw Exception("Source processor returned status " + IProcessor::statusToName(status), ErrorCodes::LOGICAL_ERROR);
}

View File

@ -33,8 +33,6 @@ std::string IProcessor::statusToName(Status status)
return "Ready";
case Status::Async:
return "Async";
case Status::Wait:
return "Wait";
case Status::ExpandPipeline:
return "ExpandPipeline";
}

View File

@ -146,13 +146,10 @@ public:
/// You may call 'work' method and processor will do some work synchronously.
Ready,
/// You may call 'schedule' method and processor will initiate some background work.
/// You may call the 'schedule' method and the processor will return a descriptor.
/// You need to poll this descriptor and call work() afterwards.
Async,
/// Processor is doing some work in background.
/// You may wait for next event or do something else and then you should call 'prepare' again.
Wait,
/// Processor wants to add other processors to pipeline.
/// New processors must be obtained by expandPipeline() call.
ExpandPipeline,
@ -198,16 +195,21 @@ public:
throw Exception("Method 'work' is not implemented for " + getName() + " processor", ErrorCodes::NOT_IMPLEMENTED);
}
/** You may call this method if 'prepare' returned Async.
/** The executor must call this method when 'prepare' returned Async.
* This method cannot access any ports. It should use only data that was prepared by 'prepare' method.
*
* This method should return instantly and fire an event (or many events) when asynchronous job will be done.
* When the job is not done, method 'prepare' will return Wait and the user may block and wait for next event before checking again.
* This method should instantly return an epollable file descriptor, which will become readable when the asynchronous job is done.
* When the descriptor is readable, the method `work` is called to continue data processing.
*
* Note that it can fire many events in EventCounter while doing its job,
* and you have to wait for next event (or do something else) every time when 'prepare' returned Wait.
* NOTE: it would be more logical to let `work()` return the ASYNC status instead of prepare(). That would give the
* prepare() -> work() -> schedule() -> work() -> schedule() -> .. -> work() -> prepare()
* chain instead of
* prepare() -> work() -> prepare() -> schedule() -> work() -> prepare() -> schedule() -> .. -> work() -> prepare()
*
* It is expected that the executor uses epoll with level-triggered notifications.
* Read all available data from the descriptor before returning ASYNC.
*/
virtual void schedule(EventCounter & /*watch*/)
virtual int schedule()
{
throw Exception("Method 'schedule' is not implemented for " + getName() + " processor", ErrorCodes::NOT_IMPLEMENTED);
}
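A hypothetical processor-side illustration of the new contract (not part of this diff; lifetime handling elided): the background job signals completion through an eventfd, which a schedule() override would hand to the executor.

#include <sys/eventfd.h>
#include <unistd.h>
#include <thread>
#include <cstdint>

class AsyncJobExample
{
public:
    AsyncJobExample() : event_fd(eventfd(0, EFD_NONBLOCK)) {}
    ~AsyncJobExample() { close(event_fd); }

    /// What a schedule() override would return: an epollable descriptor.
    int descriptor() const { return event_fd; }

    void start()
    {
        std::thread([fd = event_fd]
        {
            /// ... do the real asynchronous work here ...
            uint64_t one = 1;
            write(fd, &one, sizeof(one));  /// Makes the fd readable: job done.
        }).detach();
    }

    /// Per the level-triggered contract above: drain the descriptor fully
    /// before returning Async again.
    void drain() const
    {
        uint64_t value;
        while (read(event_fd, &value, sizeof(value)) > 0)
            ;
    }

private:
    int event_fd;
};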

View File

@ -4,6 +4,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
ISource::ISource(Block header)
: IProcessor({}, {std::move(header)}), output(outputs.front())
{
@ -45,11 +50,17 @@ void ISource::work()
{
try
{
current_chunk.chunk = generate();
if (!current_chunk.chunk || isCancelled())
finished = true;
if (auto chunk = tryGenerate())
{
current_chunk.chunk = std::move(*chunk);
if (current_chunk.chunk)
has_input = true;
}
else
has_input = true;
finished = true;
if (isCancelled())
finished = true;
}
catch (...)
{
@ -58,5 +69,19 @@ void ISource::work()
}
}
Chunk ISource::generate()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "generate is not implemented for {}", getName());
}
std::optional<Chunk> ISource::tryGenerate()
{
auto chunk = generate();
if (!chunk)
return {};
return chunk;
}
}
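A standalone analogue of the new split (simplified types, not the ClickHouse API): tryGenerate() can distinguish "no more data" (std::nullopt) from an intentionally empty chunk, which a plain generate() returning an empty Chunk could not express.

#include <optional>
#include <vector>

struct ToySource
{
    std::vector<int> rows{1, 2, 3};
    size_t pos = 0;

    /// std::nullopt means the source is finished; an empty vector would be
    /// just an empty chunk, and the source would be asked again.
    std::optional<std::vector<int>> tryGenerate()
    {
        if (pos >= rows.size())
            return std::nullopt;
        return std::vector<int>{rows[pos++]};
    }
};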

View File

@ -15,7 +15,8 @@ protected:
bool got_exception = false;
Port::Data current_chunk;
virtual Chunk generate() = 0;
virtual Chunk generate();
virtual std::optional<Chunk> tryGenerate();
public:
ISource(Block header);

View File

@ -0,0 +1,111 @@
#include <Processors/QueryPlan/WindowStep.h>
#include <Processors/Transforms/WindowTransform.h>
#include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/QueryPipeline.h>
#include <Interpreters/ExpressionActions.h>
#include <IO/Operators.h>
namespace DB
{
static ITransformingStep::Traits getTraits()
{
return ITransformingStep::Traits
{
{
.preserves_distinct_columns = true,
.returns_single_stream = false,
.preserves_number_of_streams = true,
.preserves_sorting = true,
},
{
.preserves_number_of_rows = true
}
};
}
static Block addWindowFunctionResultColumns(const Block & block,
const std::vector<WindowFunctionDescription> & window_functions)
{
auto result = block;
for (const auto & f : window_functions)
{
ColumnWithTypeAndName column_with_type;
column_with_type.name = f.column_name;
column_with_type.type = f.aggregate_function->getReturnType();
column_with_type.column = column_with_type.type->createColumn();
result.insert(column_with_type);
}
return result;
}
WindowStep::WindowStep(const DataStream & input_stream_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> & window_functions_)
: ITransformingStep(
input_stream_,
addWindowFunctionResultColumns(input_stream_.header,
window_functions_),
getTraits())
, window_description(window_description_)
, window_functions(window_functions_)
, input_header(input_stream_.header)
{
// We don't remove any columns, only add, so probably we don't have to update
// the output DataStream::distinct_columns.
}
void WindowStep::transformPipeline(QueryPipeline & pipeline)
{
pipeline.addSimpleTransform([&](const Block & /*header*/)
{
return std::make_shared<WindowTransform>(input_header,
output_stream->header, window_description, window_functions);
});
assertBlocksHaveEqualStructure(pipeline.getHeader(), output_stream->header,
"WindowStep transform for '" + window_description.window_name + "'");
}
void WindowStep::describeActions(FormatSettings & settings) const
{
String prefix(settings.offset, ' ');
settings.out << prefix << "Window: (";
if (!window_description.partition_by.empty())
{
settings.out << "PARTITION BY ";
for (size_t i = 0; i < window_description.partition_by.size(); ++i)
{
if (i > 0)
{
settings.out << ", ";
}
settings.out << window_description.partition_by[i].column_name;
}
}
if (!window_description.partition_by.empty()
&& !window_description.order_by.empty())
{
settings.out << " ";
}
if (!window_description.order_by.empty())
{
settings.out << "ORDER BY "
<< dumpSortDescription(window_description.order_by);
}
settings.out << ")\n";
for (size_t i = 0; i < window_functions.size(); ++i)
{
settings.out << prefix << (i == 0 ? "Functions: "
: " ");
settings.out << window_functions[i].column_name << "\n";
}
}
}
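For a window such as (PARTITION BY a ORDER BY b) with a single function count(x), describeActions() above would print roughly the following into EXPLAIN output (the exact sort formatting depends on dumpSortDescription):

    Window: (PARTITION BY a ORDER BY b ASC)
    Functions: count(x)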

View File

@ -0,0 +1,33 @@
#pragma once
#include <Processors/QueryPlan/ITransformingStep.h>
#include <Interpreters/AggregateDescription.h>
namespace DB
{
class ActionsDAG;
using ActionsDAGPtr = std::shared_ptr<ActionsDAG>;
class WindowTransform;
class WindowStep : public ITransformingStep
{
public:
explicit WindowStep(const DataStream & input_stream_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> & window_functions_);
String getName() const override { return "Window"; }
void transformPipeline(QueryPipeline & pipeline) override;
void describeActions(FormatSettings & settings) const override;
private:
WindowDescription window_description;
std::vector<WindowFunctionDescription> window_functions;
Block input_header;
};
}

View File

@ -1,14 +1,16 @@
#include <Processors/Sources/RemoteSource.h>
#include <DataStreams/RemoteQueryExecutor.h>
#include <DataStreams/RemoteQueryExecutorReadContext.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <DataTypes/DataTypeAggregateFunction.h>
namespace DB
{
RemoteSource::RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_)
RemoteSource::RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_, bool async_read_)
: SourceWithProgress(executor->getHeader(), false)
, add_aggregation_info(add_aggregation_info_), query_executor(std::move(executor))
, async_read(async_read_)
{
/// Add AggregatedChunkInfo if we expect DataTypeAggregateFunction as a result.
const auto & sample = getPort().getHeader();
@ -21,15 +23,28 @@ RemoteSource::~RemoteSource() = default;
ISource::Status RemoteSource::prepare()
{
/// Check if the query was cancelled before returning Async status. Otherwise it may lead to an infinite loop.
if (was_query_canceled)
{
getPort().finish();
return Status::Finished;
}
if (is_async_state)
return Status::Async;
Status status = SourceWithProgress::prepare();
/// To avoid resetting the connection in the RemoteQueryExecutor (because of an
/// "unfinished" query), it should be finished explicitly.
if (status == Status::Finished)
query_executor->finish();
{
query_executor->finish(&read_context);
is_async_state = false;
}
return status;
}
Chunk RemoteSource::generate()
std::optional<Chunk> RemoteSource::tryGenerate()
{
/// onCancel() will perform the cancellation if the query was sent.
if (was_query_canceled)
@ -52,11 +67,28 @@ Chunk RemoteSource::generate()
was_query_sent = true;
}
auto block = query_executor->read();
Block block;
if (async_read)
{
auto res = query_executor->read(read_context);
if (std::holds_alternative<int>(res))
{
fd = std::get<int>(res);
is_async_state = true;
return Chunk();
}
is_async_state = false;
block = std::get<Block>(std::move(res));
}
else
block = query_executor->read();
if (!block)
{
query_executor->finish();
query_executor->finish(&read_context);
return {};
}
@ -77,7 +109,18 @@ Chunk RemoteSource::generate()
void RemoteSource::onCancel()
{
was_query_canceled = true;
query_executor->cancel();
query_executor->cancel(&read_context);
// is_async_state = false;
}
void RemoteSource::onUpdatePorts()
{
if (getPort().isFinished())
{
was_query_canceled = true;
query_executor->finish(&read_context);
// is_async_state = false;
}
}
@ -123,9 +166,9 @@ Chunk RemoteExtremesSource::generate()
Pipe createRemoteSourcePipe(
RemoteQueryExecutorPtr query_executor,
bool add_aggregation_info, bool add_totals, bool add_extremes)
bool add_aggregation_info, bool add_totals, bool add_extremes, bool async_read)
{
Pipe pipe(std::make_shared<RemoteSource>(query_executor, add_aggregation_info));
Pipe pipe(std::make_shared<RemoteSource>(query_executor, add_aggregation_info, async_read));
if (add_totals)
pipe.addTotalsSource(std::make_shared<RemoteTotalsSource>(query_executor));

View File

@ -11,6 +11,8 @@ namespace DB
class RemoteQueryExecutor;
using RemoteQueryExecutorPtr = std::shared_ptr<RemoteQueryExecutor>;
class RemoteQueryExecutorReadContext;
/// Source from RemoteQueryExecutor. Executes remote query and returns query result chunks.
class RemoteSource : public SourceWithProgress
{
@ -18,7 +20,7 @@ public:
/// Flag add_aggregation_info tells if AggregatedChunkInfo should be added to result chunk.
/// AggregatedChunkInfo stores the bucket number used for two-level aggregation.
/// This flag should be typically enabled for queries with GROUP BY which are executed till WithMergeableState.
RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_);
RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation_info_, bool async_read_);
~RemoteSource() override;
Status prepare() override;
@ -27,14 +29,12 @@ public:
void setRowsBeforeLimitCounter(RowsBeforeLimitCounterPtr counter) { rows_before_limit.swap(counter); }
/// Stop reading from stream if output port is finished.
void onUpdatePorts() override
{
if (getPort().isFinished())
cancel();
}
void onUpdatePorts() override;
int schedule() override { return fd; }
protected:
Chunk generate() override;
std::optional<Chunk> tryGenerate() override;
void onCancel() override;
private:
@ -43,6 +43,11 @@ private:
bool add_aggregation_info = false;
RemoteQueryExecutorPtr query_executor;
RowsBeforeLimitCounterPtr rows_before_limit;
const bool async_read;
bool is_async_state = false;
std::unique_ptr<RemoteQueryExecutorReadContext> read_context;
int fd = -1;
};
/// Totals source from RemoteQueryExecutor.
@ -80,6 +85,6 @@ private:
/// Create pipe with remote sources.
Pipe createRemoteSourcePipe(
RemoteQueryExecutorPtr query_executor,
bool add_aggregation_info, bool add_totals, bool add_extremes);
bool add_aggregation_info, bool add_totals, bool add_extremes, bool async_read);
}
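A sketch of the executor-side protocol these changes assume (simplified; the stubs stand in for the real IProcessor interface, and a real executor would use PollingQueue rather than poll(2)):

#include <poll.h>

enum class Status { Ready, Async, Finished };

Status prepare();   /// RemoteSource::prepare() returns Async while is_async_state.
int schedule();     /// Returns the fd saved from query_executor->read(read_context).
void work();        /// Calls tryGenerate(), which resumes the remote read.

void driveAsyncSource()
{
    while (true)
    {
        Status status = prepare();
        if (status == Status::Finished)
            break;

        if (status == Status::Async)
        {
            pollfd pfd{schedule(), POLLIN, 0};
            ::poll(&pfd, 1, -1);  /// Wait until the remote data is readable.
        }

        work();
    }
}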

View File

@ -0,0 +1,184 @@
#include <Processors/Transforms/WindowTransform.h>
#include <Interpreters/ExpressionActions.h>
#include <Common/Arena.h>
namespace DB
{
WindowTransform::WindowTransform(const Block & input_header_,
const Block & output_header_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> & window_function_descriptions
)
: ISimpleTransform(input_header_, output_header_,
false /* skip_empty_chunks */)
, input_header(input_header_)
, window_description(window_description_)
{
workspaces.reserve(window_function_descriptions.size());
for (const auto & f : window_function_descriptions)
{
WindowFunctionWorkspace workspace;
workspace.window_function = f;
const auto & aggregate_function
= workspace.window_function.aggregate_function;
if (!arena && aggregate_function->allocatesMemoryInArena())
{
arena = std::make_unique<Arena>();
}
workspace.argument_column_indices.reserve(
workspace.window_function.argument_names.size());
workspace.argument_columns.reserve(
workspace.window_function.argument_names.size());
for (const auto & argument_name : workspace.window_function.argument_names)
{
workspace.argument_column_indices.push_back(
input_header.getPositionByName(argument_name));
}
workspace.aggregate_function_state.reset(aggregate_function->sizeOfData(),
aggregate_function->alignOfData());
aggregate_function->create(workspace.aggregate_function_state.data());
workspaces.push_back(std::move(workspace));
}
partition_by_indices.reserve(window_description.partition_by.size());
for (const auto & column : window_description.partition_by)
{
partition_by_indices.push_back(
input_header.getPositionByName(column.column_name));
}
partition_start_columns.resize(partition_by_indices.size(), nullptr);
partition_start_row = 0;
}
WindowTransform::~WindowTransform()
{
// Some states may not have been created yet if the creation failed.
for (auto & ws : workspaces)
{
ws.window_function.aggregate_function->destroy(
ws.aggregate_function_state.data());
}
}
void WindowTransform::transform(Chunk & chunk)
{
const size_t num_rows = chunk.getNumRows();
auto columns = chunk.detachColumns();
for (auto & ws : workspaces)
{
ws.argument_columns.clear();
for (const auto column_index : ws.argument_column_indices)
{
ws.argument_columns.push_back(columns[column_index].get());
}
ws.result_column = ws.window_function.aggregate_function->getReturnType()
->createColumn();
}
// We loop over all window functions for each row. Switching the loops might
// be more efficient, because we would run less code and access less data in
// the inner loop. If you change this, don't forget to fix the calculation of
// partition boundaries. Probably it has to be precalculated and stored as
// an array of offsets. An interesting optimization would be to pass it as
// an extra column from the previous sorting step -- that step might need to
// make a similar comparison anyway, if it's sorting only by the PARTITION BY
// columns.
for (size_t row = 0; row < num_rows; row++)
{
// Check whether the new partition has started. We have to reset the
// aggregate functions when the new partition starts.
assert(partition_start_columns.size() == partition_by_indices.size());
bool new_partition = false;
if (partition_start_columns.empty())
{
// No PARTITION BY at all, do nothing.
}
else if (partition_start_columns[0] == nullptr)
{
// This is the first partition.
new_partition = true;
partition_start_columns.clear();
for (const auto i : partition_by_indices)
{
partition_start_columns.push_back(columns[i]);
}
partition_start_row = row;
}
else
{
// Check whether the new partition started, by comparing all the
// PARTITION BY columns.
size_t first_inequal_column = 0;
for (; first_inequal_column < partition_start_columns.size();
++first_inequal_column)
{
const auto * current_column = columns[
partition_by_indices[first_inequal_column]].get();
if (current_column->compareAt(row, partition_start_row,
*partition_start_columns[first_inequal_column],
1 /* nan_direction_hint */) != 0)
{
break;
}
}
if (first_inequal_column < partition_start_columns.size())
{
// The new partition has started. Remember where.
new_partition = true;
partition_start_columns.clear();
for (const auto i : partition_by_indices)
{
partition_start_columns.push_back(columns[i]);
}
partition_start_row = row;
}
}
for (auto & ws : workspaces)
{
const auto & f = ws.window_function;
const auto * a = f.aggregate_function.get();
auto * buf = ws.aggregate_function_state.data();
if (new_partition)
{
// Reset the aggregate function states.
a->destroy(buf);
a->create(buf);
}
// Update the aggregate function state and save the result.
a->add(buf,
ws.argument_columns.data(),
row,
arena.get());
a->insertResultInto(buf,
*ws.result_column,
arena.get());
}
}
// We have to release the mutable reference to the result column before we
// return this block, or else extra copying may occur when the subsequent
// processors modify the block. Workspaces live longer than individual blocks.
for (auto & ws : workspaces)
{
columns.push_back(std::move(ws.result_column));
}
chunk.setColumns(std::move(columns), num_rows);
}
}
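A standalone sketch of the partition-boundary check in the loop above (plain ints instead of IColumn; illustrative only): a new partition starts at a row when any PARTITION BY column differs from its value at the row where the current partition started.

#include <vector>
#include <cstddef>

bool isNewPartition(
    const std::vector<std::vector<int>> & partition_columns,
    size_t row,
    size_t partition_start_row)
{
    for (const auto & column : partition_columns)
        if (column[row] != column[partition_start_row])
            return true;  /// Found the first unequal column.
    return false;
}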

View File

@ -0,0 +1,77 @@
#pragma once
#include <Processors/ISimpleTransform.h>
#include <Interpreters/AggregateDescription.h>
#include <Common/AlignedBuffer.h>
namespace DB
{
class ExpressionActions;
using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;
class Arena;
// Runtime data for computing one window function
struct WindowFunctionWorkspace
{
WindowFunctionDescription window_function;
AlignedBuffer aggregate_function_state;
std::vector<size_t> argument_column_indices;
// Argument and result columns. Be careful, they are per-chunk.
std::vector<const IColumn *> argument_columns;
MutableColumnPtr result_column;
};
/*
* Computes several window functions that share the same window. The input must
* be sorted correctly for this window (PARTITION BY, then ORDER BY).
*/
class WindowTransform : public ISimpleTransform
{
public:
WindowTransform(
const Block & input_header_,
const Block & output_header_,
const WindowDescription & window_description_,
const std::vector<WindowFunctionDescription> &
window_function_descriptions);
~WindowTransform() override;
String getName() const override
{
return "WindowTransform";
}
static Block transformHeader(Block header, const ExpressionActionsPtr & expression);
void transform(Chunk & chunk) override;
public:
Block input_header;
WindowDescription window_description;
// Indices of the PARTITION BY columns in block.
std::vector<size_t> partition_by_indices;
// The columns for PARTITION BY and the row in these columns where the
// current partition started. They might be in some of the previous blocks,
// so we have to keep the shared ownership of the columns. We don't keep the
// entire block to save memory, only the needed columns, in the same order
// as the partition_by_indices array.
// Can be empty if there is no PARTITION BY.
// Columns are nullptr when it is the first partition.
std::vector<ColumnPtr> partition_start_columns;
size_t partition_start_row = 0;
// Data for computing the window functions.
std::vector<WindowFunctionWorkspace> workspaces;
std::unique_ptr<Arena> arena;
};
}

View File

@ -17,6 +17,7 @@ SRCS(
Executors/ExecutingGraph.cpp
Executors/PipelineExecutingBlockInputStream.cpp
Executors/PipelineExecutor.cpp
Executors/PollingQueue.cpp
Executors/PullingAsyncPipelineExecutor.cpp
Executors/PullingPipelineExecutor.cpp
ForkProcessor.cpp
@ -123,6 +124,7 @@ SRCS(
QueryPlan/SettingQuotaAndLimitsStep.cpp
QueryPlan/TotalsHavingStep.cpp
QueryPlan/UnionStep.cpp
QueryPlan/WindowStep.cpp
ResizeProcessor.cpp
Sources/DelayedSource.cpp
Sources/RemoteSource.cpp
@ -155,6 +157,7 @@ SRCS(
Transforms/RollupTransform.cpp
Transforms/SortingTransform.cpp
Transforms/TotalsHavingTransform.cpp
Transforms/WindowTransform.cpp
printPipeline.cpp
)

View File

@ -3,21 +3,22 @@
#include <sparsehash/dense_hash_map>
#include <sparsehash/dense_hash_set>
#include <Storages/AlterCommands.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/quoteString.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Processors/Pipe.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Interpreters/Context.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/quoteString.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Storages/AlterCommands.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TABLE_IS_DROPPED;
@ -32,17 +33,18 @@ bool IStorage::isVirtualColumn(const String & column_name, const StorageMetadata
}
RWLockImpl::LockHolder IStorage::tryLockTimed(
const RWLock & rwlock, RWLockImpl::Type type, const String & query_id, const std::chrono::milliseconds & acquire_timeout) const
const RWLock & rwlock, RWLockImpl::Type type, const String & query_id, const std::chrono::milliseconds & acquire_timeout) const
{
auto lock_holder = rwlock->getLock(type, query_id, acquire_timeout);
if (!lock_holder)
{
const String type_str = type == RWLockImpl::Type::Read ? "READ" : "WRITE";
throw Exception(
type_str + " locking attempt on \"" + getStorageID().getFullTableName() +
"\" has timed out! (" + std::to_string(acquire_timeout.count()) + "ms) "
"Possible deadlock avoided. Client should retry.",
ErrorCodes::DEADLOCK_AVOIDED);
type_str + " locking attempt on \"" + getStorageID().getFullTableName() + "\" has timed out! ("
+ std::to_string(acquire_timeout.count())
+ "ms) "
"Possible deadlock avoided. Client should retry.",
ErrorCodes::DEADLOCK_AVOIDED);
}
return lock_holder;
}
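A hypothetical caller-side pattern for DEADLOCK_AVOIDED, as the message suggests (lockForShare and the retry budget are assumptions, shown only as a comment sketch):

/// for (size_t attempt = 0; attempt < max_attempts; ++attempt)
/// {
///     try
///     {
///         return storage->lockForShare(query_id, acquire_timeout);
///     }
///     catch (const Exception & e)
///     {
///         if (e.code() != ErrorCodes::DEADLOCK_AVOIDED)
///             throw;
///     }
/// }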
@ -82,26 +84,26 @@ TableExclusiveLockHolder IStorage::lockExclusively(const String & query_id, cons
}
Pipe IStorage::read(
const Names & /*column_names*/,
const StorageMetadataPtr & /*metadata_snapshot*/,
SelectQueryInfo & /*query_info*/,
const Context & /*context*/,
QueryProcessingStage::Enum /*processed_stage*/,
size_t /*max_block_size*/,
unsigned /*num_streams*/)
const Names & /*column_names*/,
const StorageMetadataPtr & /*metadata_snapshot*/,
SelectQueryInfo & /*query_info*/,
const Context & /*context*/,
QueryProcessingStage::Enum /*processed_stage*/,
size_t /*max_block_size*/,
unsigned /*num_streams*/)
{
throw Exception("Method read is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void IStorage::read(
QueryPlan & query_plan,
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot,
SelectQueryInfo & query_info,
const Context & context,
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
unsigned num_streams)
QueryPlan & query_plan,
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot,
SelectQueryInfo & query_info,
const Context & context,
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
unsigned num_streams)
{
auto pipe = read(column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams);
if (pipe.empty())
@ -117,15 +119,12 @@ void IStorage::read(
}
Pipe IStorage::alterPartition(
const StorageMetadataPtr & /* metadata_snapshot */,
const PartitionCommands & /* commands */,
const Context & /* context */)
const StorageMetadataPtr & /* metadata_snapshot */, const PartitionCommands & /* commands */, const Context & /* context */)
{
throw Exception("Partition operations are not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void IStorage::alter(
const AlterCommands & params, const Context & context, TableLockHolder &)
void IStorage::alter(const AlterCommands & params, const Context & context, TableLockHolder &)
{
auto table_id = getStorageID();
StorageInMemoryMetadata new_metadata = getInMemoryMetadata();
@ -146,7 +145,8 @@ void IStorage::checkAlterIsPossible(const AlterCommands & commands, const Settin
}
}
void IStorage::checkAlterPartitionIsPossible(const PartitionCommands & /*commands*/, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & /*settings*/) const
void IStorage::checkAlterPartitionIsPossible(
const PartitionCommands & /*commands*/, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & /*settings*/) const
{
throw Exception("Table engine " + getName() + " doesn't support partitioning", ErrorCodes::NOT_IMPLEMENTED);
}
@ -168,4 +168,52 @@ NamesAndTypesList IStorage::getVirtuals() const
return {};
}
Names IStorage::getAllRegisteredNames() const
{
Names result;
auto getter = [](const auto & column) { return column.name; };
const NamesAndTypesList & available_columns = getInMemoryMetadata().getColumns().getAllPhysical();
std::transform(available_columns.begin(), available_columns.end(), std::back_inserter(result), getter);
return result;
}
std::string PrewhereDAGInfo::dump() const
{
WriteBufferFromOwnString ss;
ss << "PrewhereDagInfo\n";
if (alias_actions)
{
ss << "alias_actions " << alias_actions->dumpDAG() << "\n";
}
if (prewhere_actions)
{
ss << "prewhere_actions " << prewhere_actions->dumpDAG() << "\n";
}
if (remove_columns_actions)
{
ss << "remove_columns_actions " << remove_columns_actions->dumpDAG() << "\n";
}
ss << "remove_prewhere_column " << remove_prewhere_column
<< ", need_filter " << need_filter << "\n";
return ss.str();
}
std::string FilterInfo::dump() const
{
WriteBufferFromOwnString ss;
ss << "FilterInfo for column '" << column_name <<"', do_remove_column "
<< do_remove_column << "\n";
if (actions_dag)
{
ss << "actions_dag " << actions_dag->dumpDAG() << "\n";
}
return ss.str();
}
}
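Deriving from IHints (via getAllRegisteredNames above) is what enables "maybe you meant" suggestions for misspelled column names; a hypothetical use, assuming the IHints::getHints interface:

/// Hypothetical caller (table and column names are illustrative):
/// auto candidates = storage->getHints("user_idd");
/// /// candidates might be {"user_id"}, ready for an error message like
/// /// "No such column user_idd. Maybe you meant: user_id".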

View File

@ -78,7 +78,7 @@ struct ColumnSize
* - data storage structure (compression, etc.)
* - concurrent access to data (locks, etc.)
*/
class IStorage : public std::enable_shared_from_this<IStorage>, public TypePromotion<IStorage>
class IStorage : public std::enable_shared_from_this<IStorage>, public TypePromotion<IStorage>, public IHints<1, IStorage>
{
public:
IStorage() = delete;
@ -87,7 +87,6 @@ public:
: storage_id(std::move(storage_id_))
, metadata(std::make_unique<StorageInMemoryMetadata>()) {} //-V730
virtual ~IStorage() = default;
IStorage(const IStorage &) = delete;
IStorage & operator=(const IStorage &) = delete;
@ -172,6 +171,7 @@ public:
/// By default return empty list of columns.
virtual NamesAndTypesList getVirtuals() const;
Names getAllRegisteredNames() const override;
protected:
/// Returns whether the column is virtual - by default all columns are real.

View File

@ -2,6 +2,7 @@
#include <Interpreters/TreeRewriter.h>
#include <Storages/IndicesDescription.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIndexDeclaration.h>
#include <Parsers/formatAST.h>
#include <Parsers/ParserCreateQuery.h>

View File

@ -125,7 +125,6 @@ Block KafkaBlockInputStream::readImpl()
}
case IProcessor::Status::NeedData:
case IProcessor::Status::Async:
case IProcessor::Status::Wait:
case IProcessor::Status::ExpandPipeline:
throw Exception("Source processor returned status " + IProcessor::statusToName(status), ErrorCodes::LOGICAL_ERROR);
}

View File

@ -10,7 +10,6 @@
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MarkRange.h>
#include <Interpreters/ExpressionActions.h>
#include <Parsers/ASTIndexDeclaration.h>
#include <DataTypes/DataTypeLowCardinality.h>
constexpr auto INDEX_FILE_PREFIX = "skp_idx_";

View File

@ -111,9 +111,6 @@ struct Settings;
M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \
M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \
\
/** Settings for testing purposes */ \
M(Bool, randomize_part_type, false, "For testing purposes only. Randomizes part type between wide and compact", 0) \
\
/** Obsolete settings. Kept for backward compatibility only. */ \
M(UInt64, min_relative_delay_to_yield_leadership, 120, "Obsolete setting, does nothing.", 0) \
M(UInt64, check_delay_period, 60, "Obsolete setting, does nothing.", 0) \

View File

@ -234,25 +234,6 @@ If you use the Replicated version of engines, see https://clickhouse.tech/docs/e
}
static void randomizePartTypeSettings(const std::unique_ptr<MergeTreeSettings> & storage_settings)
{
static constexpr auto MAX_THRESHOLD_FOR_ROWS = 100000;
static constexpr auto MAX_THRESHOLD_FOR_BYTES = 1024 * 1024 * 10;
/// Create all parts in wide format with probability 1/3.
if (thread_local_rng() % 3 == 0)
{
storage_settings->min_rows_for_wide_part = 0;
storage_settings->min_bytes_for_wide_part = 0;
}
else
{
storage_settings->min_rows_for_wide_part = std::uniform_int_distribution{0, MAX_THRESHOLD_FOR_ROWS}(thread_local_rng);
storage_settings->min_bytes_for_wide_part = std::uniform_int_distribution{0, MAX_THRESHOLD_FOR_BYTES}(thread_local_rng);
}
}
static StoragePtr create(const StorageFactory::Arguments & args)
{
/** [Replicated][|Summing|Collapsing|Aggregating|Replacing|Graphite]MergeTree (2 * 7 combinations) engines
@ -737,20 +718,6 @@ static StoragePtr create(const StorageFactory::Arguments & args)
++arg_num;
}
/// Allow to randomize part type for tests to cover more cases.
/// But if settings were set explicitly restrict it.
if (storage_settings->randomize_part_type
&& !storage_settings->min_rows_for_wide_part.changed
&& !storage_settings->min_bytes_for_wide_part.changed)
{
randomizePartTypeSettings(storage_settings);
LOG_INFO(&Poco::Logger::get(args.table_id.getNameForLogs() + " (registerStorageMergeTree)"),
"Applied setting 'randomize_part_type'. "
"Setting 'min_rows_for_wide_part' changed to {}. "
"Setting 'min_bytes_for_wide_part' changed to {}.",
storage_settings->min_rows_for_wide_part, storage_settings->min_bytes_for_wide_part);
}
if (arg_num != arg_cnt)
throw Exception("Wrong number of engine arguments.", ErrorCodes::BAD_ARGUMENTS);

Some files were not shown because too many files have changed in this diff.